In [None]:
import sys
import os
from pathlib import Path
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
import torch
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
from tqdm.auto import tqdm
import numpy as np

import shutil


In [None]:
# Directory the GeoLife archive is unpacked into.
RAW_DATA_DIR = "raw/geolife"

# Directory the pickled, preprocessed routes are written to.
PROCESSED_DATA_DIR = "processed"

# Transportation modes kept from each participant's labels file.
# (The misspelled name is kept because other cells reference it.)
LABLES_TO_EXTRACT = [
    'run',
    'walk',
    'bus',
    'car',
    'taxi',
    'subway',
    'train',
    'bike',
    'motorcycle',
]

# Folder inside the unpacked archive that holds one sub-folder per participant.
raw_data = Path(RAW_DATA_DIR).joinpath("Geolife Trajectories 1.3", "Data")

Download the dataset if it does not already exist

In [None]:
# Download and unpack the GeoLife archive, unless RAW_DATA_DIR is already present.
if not os.path.exists(RAW_DATA_DIR):
    url = "https://download.microsoft.com/download/F/4/8/F4894AA5-FDBC-481E-9285-D5F8C4C4F039/Geolife%20Trajectories%201.3.zip"
    with tqdm(total=1) as pbar:
        # Fetch the archive BEFORE creating the directory: the existence of
        # RAW_DATA_DIR is what marks the data as downloaded, so it must not
        # exist if the download itself failed.
        with urlopen(url) as zipresp:
            archive_bytes = zipresp.read()

        os.makedirs(RAW_DATA_DIR)
        try:
            with ZipFile(BytesIO(archive_bytes)) as zfile:
                zfile.extractall(RAW_DATA_DIR)
        except BaseException:
            # A partial extraction (corrupt archive, interrupted kernel, full
            # disk) would otherwise be mistaken for a finished download.
            shutil.rmtree(RAW_DATA_DIR, ignore_errors=True)
            raise
        pbar.update()
else:
    print("Data already downloaded")

In [None]:
# Delete every participant folder that does not consist of exactly two entries,
# at least one of which is a file and at least one of which is a directory.
if os.path.exists(RAW_DATA_DIR):
    raw_data = Path(RAW_DATA_DIR) / "Geolife Trajectories 1.3" / "Data"

    for participant_folder in raw_data.iterdir():
        if not participant_folder.is_dir():
            continue

        entries = list(participant_folder.iterdir())
        has_file = any(entry.is_file() for entry in entries)
        has_subfolder = any(entry.is_dir() for entry in entries)

        # Keep the folder only when all three expectations hold.
        if len(entries) == 2 and has_file and has_subfolder:
            continue

        print(f"Removing folder: {participant_folder}")
        shutil.rmtree(participant_folder)

In [None]:
# Rename the participant folders to consecutive numbers "1", "2", ...
# The cell is safe to re-run: already-numbered folders keep their name.
if os.path.exists(RAW_DATA_DIR):
    raw_data = Path(RAW_DATA_DIR) / "Geolife Trajectories 1.3" / "Data"

    def _participant_order(folder):
        """Order numeric folder names by value; any other name sorts after them."""
        name = folder.name
        return (0, int(name), name) if name.isdecimal() else (1, 0, name)

    # Sorting numerically (not lexicographically) keeps "2" ahead of "10", so a
    # second run maps every folder onto its current name instead of colliding.
    # Only directories are participants; stray files are left alone.
    participants = sorted(
        (entry for entry in raw_data.iterdir() if entry.is_dir()),
        key=_participant_order,
    )

    pending = [
        (folder, raw_data / f"{idx}")
        for idx, folder in enumerate(participants, start=1)
        if folder.name != f"{idx}"
    ]

    # Phase 1: move every folder that changes name out of the way, so that no
    # target name can still be occupied by a folder waiting to be renamed.
    staged = []
    for position, (folder, target) in enumerate(pending):
        parked = raw_data / f"__renaming_{position}"
        folder.rename(parked)
        staged.append((parked, target))

    # Phase 2: move the parked folders onto their final numeric names.
    for parked, target in staged:
        parked.rename(target)

In [None]:
# Multi threaded version
import threading

import concurrent.futures

# Guards writes to the shared PROCESSED_DATA dict made by the worker threads.
lock = threading.Lock()

def process_trajectory_file(file_path):
    """Load one trajectory ``.plt`` file into a DataFrame.

    The first six lines of the file are skipped and only columns 0, 1, 5 and 6
    are read (latitude, longitude, date, time).

    Args:
        file_path: Path (or path string) of the ``.plt`` file to read.

    Returns:
        A DataFrame with the columns ``latitude``, ``longitude``,
        ``date_time`` (datetime64) and ``timestamp`` (float seconds since
        1970-01-01 00:00:00, in the same clock as ``date_time``).

    Raises:
        ValueError: If a date/time pair does not match ``YYYY-MM-DD HH:MM:SS``.
    """
    points = pd.read_csv(
        file_path,
        skiprows=6,
        header=None,
        usecols=[0, 1, 5, 6],
        names=['latitude', 'longitude', 'date', 'time'],
    )

    # An explicit format avoids per-file format inference and makes a malformed
    # row fail loudly instead of being guessed at.
    points['date_time'] = pd.to_datetime(
        points['date'] + ' ' + points['time'],
        format='%Y-%m-%d %H:%M:%S',
    )
    points = points.drop(columns=['date', 'time'])

    # Dividing a timedelta by one second is correct whatever resolution the
    # datetime column has, whereas a raw int64 cast divided by 10**9 is only
    # correct for nanosecond resolution.
    points['timestamp'] = (points['date_time'] - pd.Timestamp(0)) / pd.Timedelta(seconds=1)
    return points

# Shared result store filled by process_participant_folder:
# participant folder name -> {"<label row index>_<mode>": DataFrame of points}.
PROCESSED_DATA = {}

def process_participant_folder(participant_folder):
    """Extract the labelled routes of one participant into ``PROCESSED_DATA``.

    Reads ``labels.txt`` and every ``Trajectory/*.plt`` file of the folder,
    keeps the label rows whose mode is in ``LABLES_TO_EXTRACT`` and, for each
    of them, collects all trajectory points whose ``date_time`` lies within
    the label's start and end time (both inclusive). Label rows that match no
    point are dropped.

    The result is stored under ``PROCESSED_DATA[participant_folder.name]`` as
    a dict mapping ``"<label row index>_<mode>"`` to a DataFrame of points.
    Entries that are not directories, or that lack ``labels.txt`` or the
    ``Trajectory`` folder, are skipped without storing anything.

    Args:
        participant_folder: ``pathlib.Path`` of one participant's folder.

    Returns:
        None. The result is written to the module-level ``PROCESSED_DATA``.
    """
    print(f"Processing folder: {participant_folder.name}")

    if not participant_folder.is_dir():
        return

    labels_file = participant_folder / "labels.txt"
    trajectory_folder = participant_folder / "Trajectory"
    if not (labels_file.exists() and trajectory_folder.exists()):
        return

    print(f"Processing {participant_folder.name} labels")
    # Read the labels file and keep only the transportation modes of interest
    labels_df = pd.read_csv(labels_file, sep='\t')
    labels_df['Start Time'] = pd.to_datetime(labels_df['Start Time'])
    labels_df['End Time'] = pd.to_datetime(labels_df['End Time'])
    labels_df['Transportation Mode'] = labels_df['Transportation Mode'].astype(str)
    labels_df = labels_df[labels_df['Transportation Mode'].isin(LABLES_TO_EXTRACT)].reset_index(drop=True)

    # Load every trajectory file, in file-name order
    trajectory_files = sorted(trajectory_folder.glob("*.plt"))
    trajectory_dataframes = [process_trajectory_file(file) for file in trajectory_files]

    print(f"Checking {participant_folder.name} labels for null values")
    # Report null values and empty dataframes among the loaded trajectories
    for i, df in enumerate(trajectory_dataframes):
        if df.isnull().values.any():
            print(f"DataFrame at index {i} contains null values.")
        if df.empty:
            print(f"DataFrame at index {i} is empty.")

    print(f"Extracting {participant_folder.name} data")
    extracted_dataframes = {}

    # The set of points does not depend on the label row, so build it once
    # instead of re-scanning every trajectory file for every label. Filtering
    # the concatenated frame yields the same rows, in the same order and with
    # the same index, as concatenating the per-file filter results.
    non_empty_trajectories = [df for df in trajectory_dataframes if not df.empty]
    if non_empty_trajectories:
        all_points = pd.concat(non_empty_trajectories)
        point_times = all_points['date_time']

        label_rows = zip(
            labels_df.index,
            labels_df['Start Time'],
            labels_df['End Time'],
            labels_df['Transportation Mode'],
        )
        for index, start_time, end_time, label in label_rows:
            # A plain boolean array sidesteps index alignment: the per-file
            # row numbers repeat in the concatenated frame.
            in_window = ((point_times >= start_time) & (point_times <= end_time)).to_numpy()
            if in_window.any():
                extracted_dataframes[f"{index}_{label}"] = all_points[in_window]

    print(f"Checking {participant_folder.name} data for null values")
    # Only null values need checking here: empty selections are never stored
    for i, df in extracted_dataframes.items():
        if df.isnull().values.any():
            print(f"DataFrame at index {i} contains null values.")

    with lock:
        print(f"Adding {participant_folder.name} data to final result")
        PROCESSED_DATA[participant_folder.name] = extracted_dataframes
        print(f"Done adding {participant_folder.name} data to final result")

# Use ThreadPoolExecutor to process participant folders in parallel.
# Every future is inspected: an exception raised inside a worker is otherwise
# stored on the future and never seen, leaving participants silently missing.
failed_participants = []
with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
    futures = {
        executor.submit(process_participant_folder, folder): folder
        for folder in sorted(raw_data.iterdir())
    }
    for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
            failed_participants.append((futures[future].name, error))

# Report all failures together, then stop so that an incomplete dataset is
# not carried into the following cells.
if failed_participants:
    for folder_name, error in failed_participants:
        print(f"Failed to process {folder_name}: {error!r}")
    raise RuntimeError(f"{len(failed_participants)} participant folder(s) could not be processed")

In [None]:
# Participants for which no labelled route was extracted
participants_with_no_data = [
    name
    for name, routes in PROCESSED_DATA.items()
    if not routes
]
print("Participants with no data:", participants_with_no_data)

# Keep only the participants that have at least one route
PROCESSED_DATA = {
    name: routes
    for name, routes in PROCESSED_DATA.items()
    if routes
}

In [None]:
# Put the points of every route into chronological order with a fresh 0..n-1 index
for routes in PROCESSED_DATA.values():
    routes.update({
        route_key: route.sort_values(by='date_time').reset_index(drop=True)
        for route_key, route in routes.items()
    })

In [None]:

def _participant_sort_key(participant):
    """Order numeric participant keys by value; any other key sorts after them."""
    text = str(participant)
    return (0, int(text), text) if text.isdecimal() else (1, 0, text)


# Re-key the participants as 1..N. PROCESSED_DATA was filled by worker threads,
# so its insertion order is whatever order the threads finished in; sorting the
# keys first makes the new IDs identical from run to run.
tmp_processed_data = {}

for new_id, participant in enumerate(sorted(PROCESSED_DATA, key=_participant_sort_key), start=1):
    tmp_processed_data[new_id] = PROCESSED_DATA[participant]

PROCESSED_DATA = tmp_processed_data


In [None]:
import pickle

# Make sure the output directory is there, then pickle the whole result dict
output_dir = Path(PROCESSED_DATA_DIR)
if not output_dir.exists():
    os.makedirs(output_dir)

with (output_dir / 'geolife_processed_data.pkl').open('wb') as pickle_file:
    pickle.dump(PROCESSED_DATA, pickle_file)

In [None]:
# Load data from the processed data file
import pickle

LOADED_DATA = {}

# Load the processed data.
# NOTE: unpickling can execute arbitrary code; only load a file that this
# notebook wrote itself, never one obtained from an untrusted source.
with open(os.path.join(PROCESSED_DATA_DIR, 'geolife_processed_data.pkl'), 'rb') as f:
    LOADED_DATA = pickle.load(f)

# Test the dataframes
for participant, dataframes in LOADED_DATA.items():
    for label, df in dataframes.items():
        print(f"Participant: {participant}, Label: {label}")
        # info() writes its report itself and returns None, so wrapping it in
        # print() would add a stray "None" line to the output.
        df.info()
        print(df.head())
        break  # Remove this break to print all dataframes
    break  # Remove this break to print all participants

In [None]:
import folium

# Participant whose routes are drawn (a key of LOADED_DATA)
MAP_PARTICIPANT_ID = 60

if MAP_PARTICIPANT_ID not in LOADED_DATA:
    raise KeyError(
        f"Participant {MAP_PARTICIPANT_ID} not found; LOADED_DATA holds {len(LOADED_DATA)} participants"
    )
participant_dataframes = LOADED_DATA[MAP_PARTICIPANT_ID]

# Create a map centered around Beijing
beijing_map = folium.Map(location=[39.9042, 116.4074], zoom_start=12)

# Plot each route of the selected participant
for label, df in participant_dataframes.items():
    route = folium.PolyLine(
        # Plain nested lists of [latitude, longitude] pairs
        locations=df[['latitude', 'longitude']].values.tolist(),
        color='blue',
        weight=2.5,
        opacity=1
    )
    route.add_to(beijing_map)

# Display the map
beijing_map

In [None]:
# Number of top-level entries (one per participant) in the loaded data
print(len(LOADED_DATA))

In [None]:
import matplotlib.pyplot as plt

# Count how many routes each participant has
participant_entries = {
    participant_id: len(routes)
    for participant_id, routes in LOADED_DATA.items()
}

# Bar chart: one bar per participant, bar height = number of routes
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(list(participant_entries), list(participant_entries.values()))
ax.set_xlabel('Participant')
ax.set_ylabel('Number of Routes')
ax.set_title('Number of Routes for Each Participant')
plt.show()

In [None]:
from collections import Counter

# Route keys look like "<index>_<mode>"; tally the mode part over all participants
label_distribution = Counter(
    route_key.split('_')[1]
    for routes in LOADED_DATA.values()
    for route_key in routes
)

print(label_distribution)

# Bar chart: one bar per transportation mode
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(list(label_distribution), list(label_distribution.values()))
ax.set_xlabel('Transportation Mode')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Transportation Modes')
plt.xticks(rotation=45)
plt.show()

# Because the run and motorcycle modes have very few samples, I would either
# remove them or relabel them as walk and car, respectively.

In [None]:
import pprint
# Inspect one route of participant 1. The key '0_bus' only exists when that
# participant's first retained label is a bus trip, so fall back to the
# participant's first available route instead of failing with a KeyError.
participant_routes = LOADED_DATA[1]
route_key = '0_bus' if '0_bus' in participant_routes else next(iter(participant_routes))
sample_route = participant_routes[route_key]

print(type(sample_route))
pprint.pprint(sample_route)