In [1]:
import os

directory_path = './Data/'
directory_content = os.listdir(directory_path)
if '.DS_Store' in directory_content:
    directory_content.remove('.DS_Store')


In [2]:
import pandas as pd

def get_trajecotries_and_labels_datasets(user_path):
    ## Get trajectories Dataset
    df_trajectories = pd.read_csv(user_path + 'trajectories.csv',
                                  names=['Latitude', 'Longitude', '0', 'Altitude', 'Date', 'Date_str', 'Time_str'],
                                  parse_dates=True)
    df_trajectories['DateTime'] = df_trajectories['Date_str'] + ' ' + df_trajectories['Time_str']
    df_trajectories['DateTime'] = pd.to_datetime(df_trajectories['DateTime'])
    df_trajectories = df_trajectories.drop(columns=['Date_str', 'Time_str', '0', 'Date'])

    ## Get Labels Dataset
    df_labels = pd.read_csv(user_path + 'labels.txt', delim_whitespace=True, skiprows=1, header=None)
    df_labels['StartTime'] = df_labels[0] + ' ' + df_labels[1]
    df_labels['EndTime'] = df_labels[2] + ' ' + df_labels[3]
    df_labels['StartTime'] = pd.to_datetime(df_labels['StartTime'])
    df_labels['EndTime'] = pd.to_datetime(df_labels['EndTime'])
    df_labels = df_labels.drop(columns=[0, 1, 2, 3])
    df_labels = df_labels.rename(columns={4: "TransportMode"})
    
    labels_to_drop = list()
    for idx, row in df_labels.iterrows():
        if idx != 0:
            if (df_labels.loc[idx - 1, 'EndTime'] - row['StartTime']).total_seconds() > 0:
                labels_to_drop.append(idx - 1)

    df_labels.drop(labels_to_drop, inplace=True)
    df_labels.reset_index(drop=True, inplace=True)
    
    return df_trajectories, df_labels

In [3]:
import math

def haversine_distance(lat1, lon1, lat2, lon2):
    # Convert coordinates to radians
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    r = 6371 # radius of the Earth in kilometers
    distance = r * c
    return distance * 1000

In [4]:
from tqdm import tqdm

def make_labeled_trajectories_data(trajectories, labels):
    trip_end_row_idxs = list()

    for idx in tqdm(range(len(labels.index))):
        mask = (trajectories['DateTime'] >= labels.loc[idx, 'StartTime']) & (trajectories['DateTime'] <= labels.loc[idx, 'EndTime'])
        df_trip = trajectories[mask]
        trip_indices = df_trip.index

        if (len(trip_indices) > 1):
            trajectories.loc[trip_indices, 'TransportMode'] = labels.loc[idx, 'TransportMode']

            if (not df_trip.iloc[0].equals(trajectories.iloc[0])):
                if len(trip_end_row_idxs) == 0:
                    trajectories.drop(list(range(0, trip_indices[0])), inplace=True)
                else:
                    trajectories.drop(list(range(trip_end_row_idxs[-1] + 1, trip_indices[0])), inplace=True)

            trip_end_row_idxs.append(trip_indices[-1])

    if len(trip_end_row_idxs) > 0:
        trajectories.drop(list(range(trip_end_row_idxs[-1] + 1, trajectories.index[-1] + 1)), inplace=True)
    
        df_trajectories_sub = trajectories.iloc[1:]
        trajectories.drop(trajectories.index[-1], inplace=True)
        trajectories['LatEnd'] = df_trajectories_sub['Latitude'].tolist()
        trajectories['LongEnd'] = df_trajectories_sub['Longitude'].tolist()
        trajectories['AltEnd'] = df_trajectories_sub['Altitude'].tolist()
        trajectories['TimeEnd'] = df_trajectories_sub['DateTime'].tolist()
        trajectories.drop(trip_end_row_idxs[:-1], inplace=True)

        trajectories['Altitude_avg'] = trajectories.apply(lambda row: float(row[2] + row[7]) / 2.0, axis=1)
        trajectories['Duration(s)'] = trajectories.apply(lambda row: row[8] - row[3], axis=1)
        trajectories["Duration(s)"] = trajectories["Duration(s)"].apply(lambda x: x.total_seconds())
        trajectories.drop(trajectories[(trajectories['Duration(s)'] == 0) | (trajectories['Altitude'] == -777)].index, inplace=True)
        trajectories['Distance(m)'] = trajectories.apply(lambda row: haversine_distance(row[0], row[1], row[5], row[6]), axis=1)
        trajectories["Velocity(m/s)"] = trajectories["Distance(m)"] / trajectories["Duration(s)"]
        trajectories["Altitude_avg(m)"] = trajectories["Altitude_avg"] / 3.2808
    
        return trajectories.loc[:, ["Distance(m)", "Duration(s)", "Velocity(m/s)", "Altitude_avg(m)", "TransportMode"]]
    
    else:
        
        return pd.DataFrame()
    

In [5]:
import random
from collections import Counter

random.seed(123)

total_dataset_size = 1000000
max_data_per_user = 200000
curr_dataset_size = 0
users_visited = list()

list_dfs = list()

while curr_dataset_size < total_dataset_size:
    if max_data_per_user > (total_dataset_size - curr_dataset_size):
        max_data_per_user = total_dataset_size - curr_dataset_size
    
    random_user = random.choice(directory_content)
    directory_content.remove(random_user)
    users_visited.append(random_user)
    print('Processing User: ', random_user)
    
    df_trajectories, df_labels = get_trajecotries_and_labels_datasets('Data/' + random_user + '/')
    df_labeled_trajectories = make_labeled_trajectories_data(df_trajectories, df_labels)
    user_data_length = len(df_labeled_trajectories)
        
    if user_data_length > 0:

        if user_data_length > max_data_per_user:
            df_samples = df_labeled_trajectories.sample(n=200000)
            list_dfs.append(df_samples)
            curr_dataset_size += 200000

            transport_modes = df_samples['TransportMode']
            modes_count = dict(Counter(transport_modes))
            modes_percentage = dict()
            for key, val in modes_count.items():
                modes_percentage[key] = val / len(transport_modes) * 100
            modes_analytics = {key: [modes_count[key], modes_percentage[key]] for key in modes_count}
            print(modes_analytics)
        else:
            list_dfs.append(df_labeled_trajectories)
            curr_dataset_size += user_data_length

            transport_modes = df_labeled_trajectories['TransportMode']
            modes_count = dict(Counter(transport_modes))
            modes_percentage = dict()
            for key, val in modes_count.items():
                modes_percentage[key] = val / len(transport_modes) * 100
            modes_analytics = {key: [modes_count[key], modes_percentage[key]] for key in modes_count}
            print(modes_analytics)
    
    
    

Processing User:  085


100%|██████████| 1275/1275 [00:42<00:00, 30.27it/s] 


{'bus': [89863, 44.9315], 'subway': [32287, 16.1435], 'walk': [70096, 35.048], 'car': [5518, 2.759], 'bike': [407, 0.2035], 'taxi': [1829, 0.9145000000000001]}
Processing User:  138


100%|██████████| 36/36 [00:00<00:00, 382.73it/s]


{'bike': [791, 49.4375], 'walk': [809, 50.5625]}
Processing User:  096


100%|██████████| 112/112 [00:02<00:00, 54.85it/s]


{'walk': [8501, 20.670119386291244], 'bike': [28870, 70.19719405743186], 'subway': [2284, 5.553529311644419], 'bus': [1472, 3.5791572446324795]}
Processing User:  056


100%|██████████| 33/33 [00:00<00:00, 347.54it/s]

{'bike': [693, 57.84641068447412], 'taxi': [146, 12.186978297161936], 'walk': [359, 29.966611018363942]}
Processing User:  106



100%|██████████| 3/3 [00:00<00:00, 228.36it/s]


{'car': [1982, 100.0]}
Processing User:  098


100%|██████████| 25/25 [00:00<00:00, 263.84it/s]


{'taxi': [278, 45.203252032520325], 'bike': [8, 1.3008130081300813], 'walk': [193, 31.382113821138212], 'bus': [101, 16.422764227642276], 'train': [35, 5.691056910569105]}
Processing User:  060


100%|██████████| 2/2 [00:00<00:00, 417.01it/s]


Processing User:  141


100%|██████████| 103/103 [00:01<00:00, 56.63it/s]


{'walk': [48278, 74.77425849918686], 'subway': [1257, 1.946875242004182], 'bus': [15030, 23.278866258808954]}
Processing User:  124


100%|██████████| 1/1 [00:00<00:00, 239.52it/s]

Processing User:  102



100%|██████████| 57/57 [00:00<00:00, 322.10it/s]


{'walk': [2866, 48.07918134541184], 'bus': [1314, 22.043281328636137], 'bike': [1506, 25.26421741318571], 'taxi': [275, 4.613319912766315]}
Processing User:  128


100%|██████████| 935/935 [01:25<00:00, 10.96it/s]


{'car': [99372, 49.686], 'subway': [29653, 14.826500000000001], 'bike': [14116, 7.058000000000001], 'walk': [38949, 19.4745], 'boat': [680, 0.33999999999999997], 'train': [3972, 1.986], 'taxi': [5494, 2.7470000000000003], 'bus': [4616, 2.308], 'airplane': [2505, 1.2525], 'run': [643, 0.3215]}
Processing User:  089


100%|██████████| 34/34 [00:00<00:00, 249.56it/s]


{'car': [15521, 87.95262650875503], 'walk': [2126, 12.04737349124497]}
Processing User:  073


100%|██████████| 54/54 [00:00<00:00, 243.62it/s]


{'walk': [9898, 93.91782901603568], 'bus': [641, 6.082170983964323]}
Processing User:  175


100%|██████████| 12/12 [00:00<00:00, 368.50it/s]

{'taxi': [59, 20.774647887323944], 'walk': [125, 44.014084507042256], 'bus': [98, 34.50704225352113], 'subway': [2, 0.7042253521126761]}
Processing User:  126



100%|██████████| 460/460 [00:13<00:00, 34.01it/s]


{'bus': [46004, 23.002], 'walk': [83364, 41.682], 'train': [20543, 10.2715], 'bike': [31757, 15.8785], 'car': [13717, 6.858499999999999], 'taxi': [4406, 2.2030000000000003], 'subway': [60, 0.03], 'motorcycle': [149, 0.0745]}
Processing User:  167


100%|██████████| 936/936 [00:38<00:00, 24.35it/s]


{'walk': [56749, 28.3745], 'bike': [65951, 32.975500000000004], 'bus': [33813, 16.906499999999998], 'train': [27460, 13.73], 'taxi': [6211, 3.1055], 'car': [8251, 4.1255], 'subway': [1489, 0.7445], 'motorcycle': [76, 0.038]}
Processing User:  053


100%|██████████| 19/19 [00:00<00:00, 319.68it/s]

{'bus': [25, 9.057971014492754], 'walk': [193, 69.92753623188406], 'car': [58, 21.014492753623188]}
Processing User:  091



100%|██████████| 169/169 [00:00<00:00, 190.20it/s]


{'bus': [120, 17.49271137026239], 'walk': [521, 75.94752186588921], 'train': [2, 0.2915451895043732], 'subway': [43, 6.2682215743440235]}
Processing User:  154


100%|██████████| 54/54 [00:00<00:00, 242.59it/s]


{'taxi': [150, 5.9008654602675055], 'bus': [199, 7.8284815106215575], 'walk': [1733, 68.17466561762392], 'car': [276, 10.85759244689221], 'bike': [184, 7.238394964594807]}
Processing User:  010


100%|██████████| 434/434 [00:34<00:00, 12.71it/s]


{'train': [138042, 69.021], 'taxi': [21685, 10.8425], 'subway': [8554, 4.277], 'walk': [15577, 7.7885], 'bus': [14597, 7.2985], 'car': [1010, 0.505], 'airplane': [535, 0.26749999999999996]}


In [6]:
final_trajectories = pd.concat(list_dfs, ignore_index=True)

In [7]:
final_trajectories

Unnamed: 0,Distance(m),Duration(s),Velocity(m/s),Altitude_avg(m),TransportMode
0,48.768254,5.0,9.753651,147.220190,bus
1,0.085222,2.0,0.042611,24.079493,bus
2,26.096515,1.0,26.096515,105.919288,subway
3,2.341303,2.0,1.170652,47.549378,walk
4,2.249866,2.0,1.124933,33.223604,walk
...,...,...,...,...,...
1149017,20.418209,1.0,20.418209,1.524019,train
1149018,18.119181,1.0,18.119181,3283.955133,train
1149019,29.091872,1.0,29.091872,2050.109729,train
1149020,66.686709,1.0,66.686709,52.578639,train


In [8]:
transport_modes = final_trajectories['TransportMode']
modes_count = dict(Counter(transport_modes))
modes_percentage = dict()
for key, val in modes_count.items():
    modes_percentage[key] = val / len(transport_modes) * 100
modes_analytics = {key: [modes_count[key], modes_percentage[key]] for key in modes_count}
print(modes_analytics)

{'bus': [207893, 18.093039123706944], 'subway': [75629, 6.582032371877997], 'walk': [340337, 29.619711371931956], 'car': [145705, 12.680784179937374], 'bike': [144283, 12.557026758408455], 'taxi': [40533, 3.5276086967873552], 'train': [190054, 16.540501400321318], 'boat': [680, 0.05918076416291421], 'airplane': [3040, 0.26457282802244], 'run': [643, 0.0559606343481674], 'motorcycle': [225, 0.019581870495081904]}


In [10]:
final_trajectories.to_csv('./labeled_data/final_traj.csv')