In [102]:
import os

directory_path = './Data/'
directory_content = os.listdir(directory_path)
if '.DS_Store' in directory_content:
    directory_content.remove('.DS_Store')


In [103]:
import pandas as pd

def get_trajecotries_and_labels_datasets(user_path):
    ## Get trajectories Dataset
    df_trajectories = pd.read_csv(user_path + 'trajectories.csv',
                                  names=['Latitude', 'Longitude', '0', 'Altitude', 'Date', 'Date_str', 'Time_str'],
                                  parse_dates=True)
    df_trajectories['DateTime'] = df_trajectories['Date_str'] + ' ' + df_trajectories['Time_str']
    df_trajectories['DateTime'] = pd.to_datetime(df_trajectories['DateTime'])
    df_trajectories = df_trajectories.drop(columns=['Date_str', 'Time_str', '0', 'Date'])

    ## Get Labels Dataset
    df_labels = pd.read_csv(user_path + 'labels.txt', delim_whitespace=True, skiprows=1, header=None)
    df_labels['StartTime'] = df_labels[0] + ' ' + df_labels[1]
    df_labels['EndTime'] = df_labels[2] + ' ' + df_labels[3]
    df_labels['StartTime'] = pd.to_datetime(df_labels['StartTime'])
    df_labels['EndTime'] = pd.to_datetime(df_labels['EndTime'])
    df_labels = df_labels.drop(columns=[0, 1, 2, 3])
    df_labels = df_labels.rename(columns={4: "TransportMode"})
    
    labels_to_drop = list()
    for idx, row in df_labels.iterrows():
        if idx != 0:
            if (df_labels.loc[idx - 1, 'EndTime'] - row['StartTime']).total_seconds() > 0:
                labels_to_drop.append(idx - 1)

    df_labels.drop(labels_to_drop, inplace=True)
    df_labels.reset_index(drop=True, inplace=True)
    
    return df_trajectories, df_labels

In [104]:
import math

def haversine_distance(lat1, lon1, lat2, lon2):
    # Convert coordinates to radians
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    r = 6371 # radius of the Earth in kilometers
    distance = r * c
    return distance * 1000

In [105]:
from tqdm import tqdm

def make_labeled_trajectories_data(trajectories, labels):
    trip_end_row_idxs = list()

    for idx in tqdm(range(len(labels.index))):
        mask = (trajectories['DateTime'] >= labels.loc[idx, 'StartTime']) & (trajectories['DateTime'] <= labels.loc[idx, 'EndTime'])
        df_trip = trajectories[mask]
        trip_indices = df_trip.index

        if (len(trip_indices) > 1):
            trajectories.loc[trip_indices, 'TransportMode'] = labels.loc[idx, 'TransportMode']

            if (not df_trip.iloc[0].equals(trajectories.iloc[0])):
                if len(trip_end_row_idxs) == 0:
                    trajectories.drop(list(range(0, trip_indices[0])), inplace=True)
                else:
                    trajectories.drop(list(range(trip_end_row_idxs[-1] + 1, trip_indices[0])), inplace=True)

            trip_end_row_idxs.append(trip_indices[-1])

    if len(trip_end_row_idxs) > 0:
        trajectories.drop(list(range(trip_end_row_idxs[-1] + 1, trajectories.index[-1] + 1)), inplace=True)
    
        df_trajectories_sub = trajectories.iloc[1:]
        trajectories.drop(trajectories.index[-1], inplace=True)
        trajectories['LatEnd'] = df_trajectories_sub['Latitude'].tolist()
        trajectories['LongEnd'] = df_trajectories_sub['Longitude'].tolist()
        trajectories['AltEnd'] = df_trajectories_sub['Altitude'].tolist()
        trajectories['TimeEnd'] = df_trajectories_sub['DateTime'].tolist()
        trajectories.drop(trip_end_row_idxs[:-1], inplace=True)

        trajectories['Altitude_avg'] = trajectories.apply(lambda row: float(row[2] + row[7]) / 2.0, axis=1)
        trajectories['Duration(s)'] = trajectories.apply(lambda row: row[8] - row[3], axis=1)
        trajectories["Duration(s)"] = trajectories["Duration(s)"].apply(lambda x: x.total_seconds())
        trajectories.drop(trajectories[(trajectories['Duration(s)'] == 0) | (trajectories['Altitude'] == -777)].index, inplace=True)
        trajectories['Distance(m)'] = trajectories.apply(lambda row: haversine_distance(row[0], row[1], row[5], row[6]), axis=1)
        trajectories["Velocity(m/s)"] = trajectories["Distance(m)"] / trajectories["Duration(s)"]
        trajectories["Altitude_avg(m)"] = trajectories["Altitude_avg"] / 3.2808
    
        return trajectories.loc[:, ["Distance(m)", "Duration(s)", "Velocity(m/s)", "Altitude_avg(m)", "TransportMode"]]
    
    else:
        
        return pd.DataFrame()
    

In [106]:
import random
from collections import Counter

random.seed(123)

total_dataset_size = 1000000
max_data_per_user = 200000
curr_dataset_size = 0
users_visited = list()

list_dfs = list()

while curr_dataset_size < total_dataset_size:
    if max_data_per_user > (total_dataset_size - curr_dataset_size):
        max_data_per_user = total_dataset_size - curr_dataset_size
    
    random_user = random.choice(directory_content)
    directory_content.remove(random_user)
    users_visited.append(random_user)
    print('Processing User: ', random_user)
    
    df_trajectories, df_labels = get_trajecotries_and_labels_datasets('Data/' + random_user + '/')
    df_labeled_trajectories = make_labeled_trajectories_data(df_trajectories, df_labels)
    user_data_length = len(df_labeled_trajectories)
        
    if user_data_length > 0:

        if user_data_length > max_data_per_user:
            df_samples = df_labeled_trajectories.sample(n=200000)
            list_dfs.append(df_samples)
            curr_dataset_size += 200000

            transport_modes = df_samples['TransportMode']
            modes_count = dict(Counter(transport_modes))
            modes_percentage = dict()
            for key, val in modes_count.items():
                modes_percentage[key] = val / len(transport_modes) * 100
            modes_analytics = {key: [modes_count[key], modes_percentage[key]] for key in modes_count}
            print(modes_analytics)
        else:
            list_dfs.append(df_labeled_trajectories)
            curr_dataset_size += user_data_length

            transport_modes = df_labeled_trajectories['TransportMode']
            modes_count = dict(Counter(transport_modes))
            modes_percentage = dict()
            for key, val in modes_count.items():
                modes_percentage[key] = val / len(transport_modes) * 100
            modes_analytics = {key: [modes_count[key], modes_percentage[key]] for key in modes_count}
            print(modes_analytics)
    
    
    

Processing User:  085


100%|██████████| 1275/1275 [01:17<00:00, 16.40it/s] 


{'bus': [89859, 44.9295], 'car': [5542, 2.771], 'subway': [32010, 16.005], 'taxi': [1917, 0.9585], 'walk': [70257, 35.1285], 'bike': [415, 0.2075]}
Processing User:  138


100%|██████████| 36/36 [00:00<00:00, 388.62it/s]


{'bike': [791, 49.4375], 'walk': [809, 50.5625]}
Processing User:  096


100%|██████████| 112/112 [00:02<00:00, 40.39it/s]


{'walk': [8501, 20.670119386291244], 'bike': [28870, 70.19719405743186], 'subway': [2284, 5.553529311644419], 'bus': [1472, 3.5791572446324795]}
Processing User:  056


100%|██████████| 33/33 [00:00<00:00, 351.30it/s]


{'bike': [735, 58.47255369928401], 'taxi': [158, 12.569610182975339], 'walk': [364, 28.957836117740655]}
Processing User:  106


100%|██████████| 3/3 [00:00<00:00, 222.49it/s]


{'car': [1982, 100.0]}
Processing User:  098


100%|██████████| 25/25 [00:00<00:00, 371.30it/s]


{'taxi': [278, 45.203252032520325], 'bike': [8, 1.3008130081300813], 'walk': [193, 31.382113821138212], 'bus': [101, 16.422764227642276], 'train': [35, 5.691056910569105]}
Processing User:  060


100%|██████████| 2/2 [00:00<00:00, 284.78it/s]

{'walk': [15, 100.0]}
Processing User:  141



100%|██████████| 103/103 [00:02<00:00, 35.31it/s]


{'walk': [48391, 74.80676477862973], 'subway': [1257, 1.943173386099431], 'bus': [15040, 23.250061835270838]}
Processing User:  124


100%|██████████| 1/1 [00:00<00:00, 253.83it/s]

Processing User:  102



100%|██████████| 57/57 [00:00<00:00, 289.89it/s]


{'walk': [2866, 48.07918134541184], 'bus': [1314, 22.043281328636137], 'bike': [1506, 25.26421741318571], 'taxi': [275, 4.613319912766315]}
Processing User:  128


100%|██████████| 935/935 [02:02<00:00,  7.63it/s]


{'car': [99240, 49.62], 'walk': [38834, 19.417], 'subway': [29787, 14.893500000000001], 'run': [614, 0.307], 'train': [4010, 2.005], 'bus': [4568, 2.284], 'taxi': [5584, 2.792], 'airplane': [2480, 1.24], 'bike': [14192, 7.095999999999999], 'boat': [691, 0.34550000000000003]}
Processing User:  089


100%|██████████| 34/34 [00:00<00:00, 182.24it/s]


{'car': [15521, 87.95262650875503], 'walk': [2126, 12.04737349124497]}
Processing User:  073


100%|██████████| 54/54 [00:00<00:00, 211.16it/s]


{'walk': [9898, 93.91782901603568], 'bus': [641, 6.082170983964323]}
Processing User:  175


100%|██████████| 12/12 [00:00<00:00, 320.44it/s]

{'taxi': [59, 20.774647887323944], 'walk': [125, 44.014084507042256], 'bus': [98, 34.50704225352113], 'subway': [2, 0.7042253521126761]}
Processing User:  126



100%|██████████| 460/460 [00:17<00:00, 26.74it/s]


{'walk': [83350, 41.675000000000004], 'bus': [45970, 22.985], 'train': [20471, 10.2355], 'car': [13741, 6.8705], 'bike': [31866, 15.933], 'taxi': [4396, 2.198], 'motorcycle': [147, 0.0735], 'subway': [59, 0.029500000000000002]}
Processing User:  167


100%|██████████| 936/936 [00:51<00:00, 18.19it/s]


{'bike': [65957, 32.9785], 'walk': [56791, 28.395500000000002], 'train': [27677, 13.838500000000002], 'bus': [33549, 16.7745], 'car': [8324, 4.162], 'taxi': [6174, 3.087], 'subway': [1453, 0.7264999999999999], 'motorcycle': [75, 0.0375]}
Processing User:  053


100%|██████████| 19/19 [00:00<00:00, 362.66it/s]

{'bus': [25, 9.057971014492754], 'walk': [193, 69.92753623188406], 'car': [58, 21.014492753623188]}
Processing User:  091



100%|██████████| 169/169 [00:00<00:00, 511.78it/s]


{'bus': [120, 17.49271137026239], 'walk': [521, 75.94752186588921], 'train': [2, 0.2915451895043732], 'subway': [43, 6.2682215743440235]}
Processing User:  154


100%|██████████| 54/54 [00:00<00:00, 320.89it/s]


{'taxi': [150, 5.9008654602675055], 'bus': [199, 7.8284815106215575], 'walk': [1733, 68.17466561762392], 'car': [276, 10.85759244689221], 'bike': [184, 7.238394964594807]}
Processing User:  010


100%|██████████| 434/434 [00:40<00:00, 10.75it/s]


{'train': [138111, 69.05550000000001], 'taxi': [21880, 10.94], 'bus': [14505, 7.2525], 'subway': [8598, 4.299], 'walk': [15347, 7.6735], 'car': [1009, 0.5045], 'airplane': [550, 0.27499999999999997]}


In [111]:
final_trajectories = pd.concat(list_dfs, ignore_index=True)

In [112]:
final_trajectories

Unnamed: 0,Distance(m),Duration(s),Velocity(m/s),Altitude_avg(m),TransportMode
0,2.352338,5.0,0.470468,37.033650,bus
1,19.703794,2.0,9.851897,22.098269,car
2,15.366449,2.0,7.683225,102.261644,bus
3,3.633798,5.0,0.726760,32.613997,bus
4,0.344299,2.0,0.172149,53.950256,bus
...,...,...,...,...,...
1149214,5.209176,1.0,5.209176,54.864667,taxi
1149215,28.433548,1.0,28.433548,49.378200,train
1149216,29.896226,1.0,29.896226,677.426238,train
1149217,0.375776,1.0,0.375776,-17.983419,train


In [113]:
transport_modes = combined_trajectories['TransportMode']
modes_count = dict(Counter(transport_modes))
modes_percentage = dict()
for key, val in modes_count.items():
    modes_percentage[key] = val / len(transport_modes) * 100
modes_analytics = {key: [modes_count[key], modes_percentage[key]] for key in modes_count}
print(modes_analytics)

{'bus': [207461, 18.05234685469001], 'car': [145693, 12.677566242813597], 'subway': [75493, 6.569069950984104], 'taxi': [40871, 3.55641526984848], 'walk': [340314, 29.612632579168984], 'bike': [144524, 12.575844986899797], 'train': [190306, 16.559593950326263], 'run': [614, 0.053427588649334896], 'airplane': [3030, 0.26365731857896535], 'boat': [691, 0.06012779113467494], 'motorcycle': [222, 0.01931746690578558]}


In [114]:
final_trajectories.to_csv('./labeled_data/final_traj.csv')