In [18]:
import sys
import os
from pathlib import Path
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
import torch
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
from tqdm.auto import tqdm
import numpy as np

import shutil

In [19]:
# Locations of the raw download and of the artefacts this notebook writes.
RAW_DATA_DIR = "raw/geolife"
PROCESSED_DATA_DIR = "processed"

# Directory whose children are iterated as per-participant folders.
data = Path(RAW_DATA_DIR).joinpath("Geolife Trajectories 1.3", "RawData")

# Switch guarding the one-off folder renumbering step.
PREPROCESS_DATA_DIR = False

In [None]:
if PREPROCESS_DATA_DIR:
    # One-off renumbering: give the participant folders the names "1", "2", ...
    # following their sorted-name order.
    participants = sorted(data.iterdir())

    # Pass 1: park every folder under a temporary "old_<idx>" name. Renaming
    # directly to "<idx>" can target a name that is still occupied by a folder
    # that has not been renamed yet, which makes Path.rename fail (or replace
    # the target, depending on the platform).
    # NOTE(review): assumes no existing folder is already called "old_<n>".
    staged = []
    for idx, participant_folder in enumerate(participants, start=1):
        temporary_name = data / f"old_{idx}"
        participant_folder.rename(temporary_name)
        staged.append((idx, temporary_name))

    # Pass 2: all original names are free now, so the final names cannot clash.
    for idx, temporary_name in staged:
        new_name = data / f"{idx}"
        temporary_name.rename(new_name)

In [20]:
import threading
import concurrent.futures
# Guards writes to the shared DATA dict from the worker threads.
lock = threading.Lock()

def process_trajectory_file(file_path):
    """Load one trajectory file into a DataFrame.

    The first six lines of the file are skipped and columns 0, 1, 5 and 6 are
    read as latitude, longitude, date and time.

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of the file.

    Returns:
        DataFrame with the columns ``latitude``, ``longitude``, ``date_time``
        (parsed from "<date> <time>") and ``timestamp`` (``date_time`` as float
        seconds since 1970-01-01 00:00:00).
    """
    raw = pd.read_csv(
        file_path,
        skiprows=6,
        header=None,
        usecols=[0, 1, 5, 6],
        names=['latitude', 'longitude', 'date', 'time'],
    )
    date_time = pd.to_datetime(raw['date'] + ' ' + raw['time'])

    # Build a new frame instead of mutating in place; column order stays
    # latitude, longitude, date_time, timestamp.
    df = raw.drop(columns=['date', 'time'])
    df['date_time'] = date_time
    # Divide the offset from the epoch by one second rather than casting to
    # int64 and dividing by 10**9: the cast is only correct when the column
    # happens to have nanosecond resolution.
    df['timestamp'] = (date_time - pd.Timestamp(0)) / pd.Timedelta(seconds=1)
    return df

# Shared result: participant folder name -> {trajectory index (str) -> DataFrame}.
DATA = {}

def process_participant_folder(participant_folder):
    """Load all trajectory files of one participant into the shared DATA dict.

    Every ``*.plt`` file in ``<participant_folder>/Trajectory`` is read with
    ``process_trajectory_file`` (in sorted path order) and stored under its
    position, as a string key, in ``DATA[participant_folder.name]``. Frames
    that are empty or contain nulls are only reported, not removed. Nothing is
    stored when the path is not a directory or has no ``Trajectory`` subfolder.

    Args:
        participant_folder: Path-like object of one participant's folder.
    """
    name = participant_folder.name
    print(f"Processing folder: {name}")

    # Guard clauses: bail out unless there is a Trajectory directory to read.
    if not participant_folder.is_dir():
        return
    trajectory_folder = participant_folder / "Trajectory"
    if not trajectory_folder.exists():
        return

    print(f"Processing {name} labels")

    # Read the trajectory files in sorted path order.
    plt_paths = list(trajectory_folder.glob("*.plt"))
    plt_paths.sort()
    frames = list(map(process_trajectory_file, plt_paths))
    print(f"Found {len(frames)} trajectory files in {name}")

    def report_problems(key, frame):
        # Print a line for each quality issue found in one frame.
        if frame.isnull().values.any():
            print(f"DataFrame at index {key} contains null values.")
        if frame.empty:
            print(f"DataFrame at index {key} is empty.")

    print(f"Checking participant {name} trajectory dataframes for null values")
    for position, frame in enumerate(frames):
        report_problems(position, frame)

    # Key each frame by its position rendered as a string.
    extracted_dataframes = {f"{position}": frame for position, frame in enumerate(frames)}
    print(f"Extracted {len(extracted_dataframes)} dataframes from {name}")

    print(f"Checking participant {name} data for null values")
    for key, frame in extracted_dataframes.items():
        report_problems(key, frame)

    # Publish the result while holding the lock shared by the worker threads.
    with lock:
        print(f"Adding {name} data to final result")
        DATA[name] = extracted_dataframes
        print(f"Done adding {name} data to final result")

# Process all participant folders on a pool of 16 worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
    # Executor.map only re-raises a worker's exception when its result is
    # retrieved from the returned iterator. Consume the iterator so a folder
    # that fails to load stops the cell instead of silently going missing
    # from DATA.
    list(executor.map(process_participant_folder, data.iterdir()))

Processing folder: 1
Processing 1 labels
Processing folder: 10
Processing 10 labels
Processing folder: 100
Processing 100 labels
Processing folder: 101
Processing 101 labels
Processing folder: 102
Processing 102 labels
Processing folder: 103
Processing folder: 104
Processing 104 labels
Processing 103 labels
Processing folder: 105
Processing 105 labels
Processing folder: 106
Processing 106 labels
Processing folder: 107
Processing folder: 108
Processing 108 labels
Processing 107 labels
Processing folder: 109
Processing 109 labels
Processing folder: 11
Processing 11 labels
Processing folder: 110
Processing 110 labels
Processing folder: 111
Processing 111 labels
Found 2 trajectory files in 102
Checking participant 102 trajectory dataframes for null values
Extracted 2 dataframes from 102
Checking participant 102 data for null values
Adding 102 data to final result
Done adding 102 data to final result
Processing folder: 112
Processing 112 labels
Processing folder: 113
Processing 113 labels
F

  df['date_time'] = pd.to_datetime(df['date'] + ' ' + df['time'])


Found 8 trajectory files in 59
Checking participant 59 trajectory dataframes for null values
Extracted 8 dataframes from 59
Checking participant 59 data for null values
Adding 59 data to final result
Done adding 59 data to final result
Processing folder: 6
Processing 6 labels
Found 25 trajectory files in 57
Checking participant 57 trajectory dataframes for null values
Extracted 25 dataframes from 57
Checking participant 57 data for null values
Adding 57 data to final result
Done adding 57 data to final result
Processing folder: 60
Processing 60 labels
Found 28 trajectory files in 58
Checking participant 58 trajectory dataframes for null values
Extracted 28 dataframes from 58
Checking participant 58 data for null values
Adding 58 data to final result
Done adding 58 data to final result
Processing folder: 61
Processing 61 labels
Found 322 trajectory files in 21
Checking participant 21 trajectory dataframes for null values
Found 11 trajectory files in 60
Checking participant 60 trajectory

In [21]:
# Report the participants whose trajectory mapping is empty ...
participants_with_no_data = [name for name in DATA if not DATA[name]]
print("Participants with no data:", participants_with_no_data)

# ... and rebuild DATA with only the participants that have trajectories.
DATA = {name: frames for name, frames in DATA.items() if frames}

Participants with no data: []


In [22]:
# Put every trajectory in chronological order and renumber its rows from 0.
for participant, dataframes in DATA.items():
    for index, df in dataframes.items():
        # kind='stable' keeps rows that share the same date_time in their
        # original file order; the default sort algorithm gives no such
        # guarantee for ties.
        dataframes[index] = df.sort_values(by='date_time', kind='stable').reset_index(drop=True)

In [23]:
import pickle

# Create the output directory if needed. exist_ok=True replaces the separate
# existence check, so there is no window between checking and creating.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

# Persist the participant -> trajectories mapping for the cells that follow.
with open(os.path.join(PROCESSED_DATA_DIR, 'geolife_next_point_data.pkl'), 'wb') as f:
    pickle.dump(DATA, f)

In [24]:
# Load data from the processed data file
import pickle

LOADED_DATA = {}

# Load the processed data.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook
# (or another trusted source) has written.
with open(os.path.join(PROCESSED_DATA_DIR, 'geolife_next_point_data.pkl'), 'rb') as f:
    LOADED_DATA = pickle.load(f)

# Smoke test: show the first trajectory of the first participant.
for participant, dataframes in LOADED_DATA.items():
    print(len(dataframes))
    for label, df in dataframes.items():
        print(f"Participant: {participant}, Label: {label}")
        # DataFrame.info() prints its report itself and returns None, so
        # wrapping it in print() only adds a stray "None" line.
        df.info()
        print(df.head())
        break  # Remove this break to print all dataframes
    break  # Remove this break to print all participants

# Number of participants loaded.
print(len(LOADED_DATA))

2
Participant: 102, Label: 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8218 entries, 0 to 8217
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   latitude   8218 non-null   float64       
 1   longitude  8218 non-null   float64       
 2   date_time  8218 non-null   datetime64[ns]
 3   timestamp  8218 non-null   float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 256.9 KB
None
    latitude   longitude           date_time     timestamp
0  39.982375  116.320442 2009-09-19 07:11:37  1.253344e+09
1  39.982380  116.320448 2009-09-19 07:11:48  1.253344e+09
2  39.982380  116.320455 2009-09-19 07:11:50  1.253344e+09
3  39.982397  116.320455 2009-09-19 07:11:52  1.253344e+09
4  39.982425  116.320460 2009-09-19 07:11:54  1.253344e+09
182


In [None]:
# Smallest "largest gap" seen over all trajectories of all participants.
# It is initialised once, before both loops: resetting it per participant
# would make the single summary print below reflect only the last participant.
# float('inf') replaces an arbitrary large sentinel.
min_max_time_diff = float('inf')

for participant, dataframes in LOADED_DATA.items():
    for label, df in dataframes.items():
        # Gaps in seconds between consecutive points (the first row has none).
        time_diffs = df['timestamp'].diff().dropna()
        min_time_diff = time_diffs.min()
        max_time_diff = time_diffs.max()
        avg_time_diff = time_diffs.mean()
        print(f"Participant: {participant}, Label: {label}")
        print(f"Min time diff: {min_time_diff}, Max time diff: {max_time_diff}, Avg time diff: {avg_time_diff}")
        var_time_diff = time_diffs.var()
        print(f"Variance in time diff: {var_time_diff}")
        # A trajectory with fewer than two points yields NaN here; the
        # comparison is then False and the running minimum is left untouched.
        if max_time_diff < min_max_time_diff:
            min_max_time_diff = max_time_diff
print(f"Min max time diff: {min_max_time_diff}")

Participant: 102, Label: 0
Min time diff: 1.0, Max time diff: 7273.0, Avg time diff: 6.157965194109773
Variance in time diff: 11880.893252334228
Participant: 102, Label: 1
Min time diff: 0.0, Max time diff: 901.0, Avg time diff: 4.713341112407623
Variance in time diff: 866.9827752528577
Participant: 103, Label: 0
Min time diff: 1.0, Max time diff: 980.0, Avg time diff: 6.545844044558698
Variance in time diff: 1259.7129479790874
Participant: 103, Label: 1
Min time diff: 2.0, Max time diff: 2238.0, Avg time diff: 9.558875219683655
Variance in time diff: 9057.165981831235
Participant: 103, Label: 2
Min time diff: 2.0, Max time diff: 5105.0, Avg time diff: 12.896586345381525
Variance in time diff: 29373.127988335247
Participant: 103, Label: 3
Min time diff: 2.0, Max time diff: 115.0, Avg time diff: 5.454022988505747
Variance in time diff: 70.19151551391938
Participant: 103, Label: 4
Min time diff: 2.0, Max time diff: 17988.0, Avg time diff: 14.77198697068404
Variance in time diff: 132856.9

In [38]:
# Maximum gap, in seconds, allowed between consecutive points of one segment.
# Defined once instead of on every loop iteration; the name is kept as-is.
time = 300

for participant, dataframes in LOADED_DATA.items():
    for label, df in dataframes.items():
        # Row positions whose gap to the previous row exceeds the threshold;
        # each one starts a new segment. Row 0 never qualifies because its
        # diff is NaN.
        split_indices = df['timestamp'].diff().gt(time).to_numpy().nonzero()[0]
        # Slice with iloc instead of np.split(df, ...): np.split on a
        # DataFrame goes through a deprecated pandas code path and emits a
        # warning. The resulting pieces are the same: [0:i1], [i1:i2], ..., [ik:].
        boundaries = [0, *split_indices, len(df)]
        segments = [df.iloc[start:stop] for start, stop in zip(boundaries[:-1], boundaries[1:])]
        # Print the number of segments created for this label
        print(f"Participant: {participant}, Label: {label}, Segments: {len(segments)}, Indices: {split_indices}")

  return bound(*args, **kwds)


Participant: 102, Label: 0, Segments: 11, Indices: [1870 2177 3638 7369 7445 7449 7468 7666 7791 8003]
Participant: 102, Label: 1, Segments: 7, Indices: [ 381  679 1548 1849 2467 2570]
Participant: 103, Label: 0, Segments: 4, Indices: [191 491 619]
Participant: 103, Label: 1, Segments: 3, Indices: [179 367]
Participant: 103, Label: 2, Segments: 4, Indices: [744 809 993]
Participant: 103, Label: 3, Segments: 1, Indices: []
Participant: 103, Label: 4, Segments: 8, Indices: [ 334  438  847 1914 2186 2191 2204]
Participant: 105, Label: 0, Segments: 1, Indices: []
Participant: 105, Label: 1, Segments: 10, Indices: [ 712  954  961 1000 1462 1601 1850 2214 2315]
Participant: 105, Label: 2, Segments: 9, Indices: [   7  271  777  944 1875 1876 2072 2111]
Participant: 105, Label: 3, Segments: 5, Indices: [242 435 534 630]
Participant: 105, Label: 4, Segments: 10, Indices: [ 154  183  425  566  723  798  814 1177 1521]
Participant: 1, Label: 0, Segments: 5, Indices: [ 1  6  7 10]
Participant: 1, 