In [106]:
# Input files for one user: the GPS points CSV and that user's transport-mode labels.
# Both paths are relative to the notebook's working directory.
user_trajectories_path = 'Data/096/trajectories.csv'
trajectories_labels_path = 'Data/096/labels.txt'

In [112]:
import pandas as pd

## Load the raw GPS points (the file has no header row, so column names are supplied here)
df_trajectories = pd.read_csv(
    user_trajectories_path,
    names=['Latitude', 'Longitude', '0', 'Altitude', 'Date', 'Date_str', 'Time_str'],
    parse_dates=True,
)
## Join the textual date and time into one parsed DateTime column, then discard
## the helper columns that are not used afterwards.
df_trajectories = (
    df_trajectories
    .assign(DateTime=lambda points: pd.to_datetime(points['Date_str'] + ' ' + points['Time_str']))
    .drop(columns=['Date_str', 'Time_str', '0', 'Date'])
)

## Get Labels Dataset
## The first line of the labels file is skipped; the remaining lines are split on
## whitespace into five fields: start date, start time, end date, end time, mode.
## `sep=r'\s+'` is the documented replacement for the deprecated `delim_whitespace=True`.
df_labels = pd.read_csv(trajectories_labels_path, sep=r'\s+', skiprows=1, header=None)
df_labels['StartTime'] = pd.to_datetime(df_labels[0] + ' ' + df_labels[1])
df_labels['EndTime'] = pd.to_datetime(df_labels[2] + ' ' + df_labels[3])
df_labels = df_labels.drop(columns=[0, 1, 2, 3]).rename(columns={4: "TransportMode"})

## Every label must end no later than the next label starts. A label whose EndTime
## is after the following row's StartTime is dropped (the following row is kept).
## The last row has no successor: the comparison with NaT is False, so it is kept.
ends_after_next_start = df_labels['EndTime'] > df_labels['StartTime'].shift(-1)
labels_to_drop = df_labels.index[ends_after_next_start.to_numpy()].tolist()
df_labels = df_labels.drop(index=labels_to_drop).reset_index(drop=True)

In [49]:
# Preview the parsed GPS points (Latitude, Longitude, Altitude, DateTime).
df_trajectories

Unnamed: 0,Latitude,Longitude,Altitude,DateTime
0,39.976200,116.330320,0.0,2008-02-13 13:56:47
1,39.976188,116.330322,0.0,2008-02-13 13:56:48
2,39.975307,116.330622,0.0,2008-02-13 14:08:25
3,39.975332,116.330570,0.0,2008-02-13 14:08:28
4,39.975395,116.330513,0.0,2008-02-13 14:08:31
...,...,...,...,...
601866,40.066387,116.404935,215.0,2009-09-28 14:27:47
601867,40.066386,116.404935,215.0,2009-09-28 14:27:49
601868,40.066343,116.404958,215.0,2009-09-28 14:27:54
601869,40.066285,116.404985,215.0,2009-09-28 14:27:59


In [87]:
# Preview the cleaned label intervals (TransportMode, StartTime, EndTime).
df_labels

Unnamed: 0,TransportMode,StartTime,EndTime
0,bike,2008-07-10 00:38:24,2008-07-10 01:05:44
1,walk,2008-07-10 10:09:29,2008-07-10 10:10:12
2,bike,2008-07-10 10:10:12,2008-07-10 10:18:48
3,walk,2008-07-10 10:18:48,2008-07-10 11:30:38
4,bike,2008-07-10 11:30:38,2008-07-10 11:41:20
...,...,...,...
114,bike,2008-08-22 01:38:43,2008-08-22 01:59:16
115,walk,2008-08-22 01:59:16,2008-08-22 02:00:18
116,walk,2008-12-10 10:11:32,2008-12-10 10:28:18
117,bike,2008-12-10 10:28:18,2008-12-10 10:30:46


In [51]:
import math

def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in metres between two points.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Distance along a sphere of radius 6371 km, expressed in metres.
    """
    # Convert coordinates to radians
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) raise ValueError; clamp it to 1.
    a = min(1.0, a)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    r = 6371 # radius of the Earth in kilometers
    # Kilometres -> metres
    return r * c * 1000

In [111]:
from tqdm import tqdm

def make_labeled_trajectories_data(trajectories, labels):
    """Build per-segment features for the GPS points that fall inside labelled trips.

    Points whose ``DateTime`` lies within a label's ``[StartTime, EndTime]``
    interval receive that label's ``TransportMode``; points outside every kept
    trip are discarded. Each remaining point is then paired with the next
    remaining point to form a segment, and the distance, duration, velocity and
    mean altitude of that segment are computed.

    The input frame is not modified: all work happens on a copy, so calling the
    function repeatedly with the same inputs gives the same result.

    Parameters
    ----------
    trajectories : pandas.DataFrame
        Must provide the columns ``Latitude``, ``Longitude``, ``Altitude`` and
        ``DateTime``. Assumes rows are in chronological order with a unique,
        ascending index -- TODO confirm for inputs other than a fresh
        ``read_csv`` result.
    labels : pandas.DataFrame
        Must provide the columns ``StartTime``, ``EndTime`` and
        ``TransportMode``. Rows are processed in the order given.

    Returns
    -------
    pandas.DataFrame
        Columns ``Distance(m)``, ``Duration(s)``, ``Velocity(m/s)``,
        ``Altitude_avg(m)`` and ``TransportMode``, indexed by the index label of
        the segment's first point. If no label covers at least two points the
        frame is empty (but still carries these columns).
    """
    output_columns = ["Distance(m)", "Duration(s)", "Velocity(m/s)", "Altitude_avg(m)", "TransportMode"]

    # Work on a private copy so the caller's frame is never trimmed as a side effect.
    trajectories = trajectories.copy()
    trip_end_row_idxs = list()

    # Iterate over the label columns directly so the labels' own index values do not matter.
    trips = zip(labels['StartTime'], labels['EndTime'], labels['TransportMode'])
    for start_time, end_time, transport_mode in tqdm(trips, total=len(labels.index)):
        in_trip = (trajectories['DateTime'] >= start_time) & (trajectories['DateTime'] <= end_time)
        trip_indices = trajectories.index[in_trip.to_numpy()]

        # A trip needs at least two points to yield one segment.
        if len(trip_indices) > 1:
            trajectories.loc[trip_indices, 'TransportMode'] = transport_mode

            # Discard the points lying between the previous kept trip (or the start
            # of the data) and the first point of this trip. Selecting by index mask
            # only touches labels that actually exist, so no KeyError can occur.
            row_labels = trajectories.index
            in_gap = row_labels < trip_indices[0]
            if trip_end_row_idxs:
                in_gap &= row_labels > trip_end_row_idxs[-1]
            if in_gap.any():
                trajectories = trajectories.drop(index=row_labels[in_gap])

            trip_end_row_idxs.append(trip_indices[-1])

    if not trip_end_row_idxs:
        return pd.DataFrame(columns=output_columns)

    # Discard everything recorded after the end of the last kept trip.
    row_labels = trajectories.index
    trajectories = trajectories.drop(index=row_labels[row_labels > trip_end_row_idxs[-1]])

    # Pair every point with the next retained point (the segment's end point).
    following = trajectories[['Latitude', 'Longitude', 'Altitude', 'DateTime']].shift(-1)
    trajectories['LatEnd'] = following['Latitude']
    trajectories['LongEnd'] = following['Longitude']
    trajectories['AltEnd'] = following['Altitude']
    trajectories['TimeEnd'] = following['DateTime']

    # The last point of a trip would be paired with the first point of the next
    # trip (or with nothing at all), so those rows do not describe a real segment.
    row_labels = trajectories.index
    trajectories = trajectories.drop(index=row_labels[row_labels.isin(trip_end_row_idxs)])

    trajectories['Altitude_avg'] = (trajectories['Altitude'] + trajectories['AltEnd']) / 2.0
    trajectories['Duration(s)'] = (trajectories['TimeEnd'] - trajectories['DateTime']).dt.total_seconds()

    # Zero-length segments cannot produce a velocity. -777 is presumably the
    # dataset's marker for a missing altitude -- TODO confirm.
    # NOTE(review): only the segment's start altitude is checked; an end altitude of
    # -777 still enters Altitude_avg -- confirm whether that is intended.
    unusable = (trajectories['Duration(s)'] == 0) | (trajectories['Altitude'] == -777)
    trajectories = trajectories.loc[~unusable].copy()

    trajectories['Distance(m)'] = [
        haversine_distance(lat_start, lon_start, lat_end, lon_end)
        for lat_start, lon_start, lat_end, lon_end in zip(
            trajectories['Latitude'], trajectories['Longitude'],
            trajectories['LatEnd'], trajectories['LongEnd'],
        )
    ]
    trajectories["Velocity(m/s)"] = trajectories["Distance(m)"] / trajectories["Duration(s)"]
    # 3.2808 is the number of feet per metre; assumes the source altitude is in feet -- TODO confirm.
    trajectories["Altitude_avg(m)"] = trajectories["Altitude_avg"] / 3.2808

    return trajectories.loc[:, output_columns]
    

In [110]:
# Build the per-segment feature table from the parsed GPS points and the cleaned labels.
df_labeled_trajectories = make_labeled_trajectories_data(df_trajectories, df_labels)

100%|██████████| 112/112 [00:02<00:00, 48.79it/s]


[22, 212, 1221, 1498, 2266, 14305, 14973, 15375, 15411, 16013, 16054, 16365, 16486, 17023, 17432, 20973, 21425, 22383, 22549, 23131, 23863, 24626, 31374, 31671, 32206, 32450, 33435, 34370, 36451, 37250, 37285, 37698, 40480, 40921, 41807, 41965, 42766, 43702, 43985, 44139, 44787, 45172, 45849, 46239, 46811, 47027, 47155, 47333, 48011, 48536, 48701, 48737, 49584, 49766, 50373, 50542, 51026, 51664, 52118, 52151, 52890, 52993, 53300, 54684, 55220, 55583, 55983, 56064, 56750, 57499, 57705, 58278, 58305, 58617, 59012, 59055, 59769, 60351, 61325, 61895, 62613, 64316, 64356, 64553, 64971, 65004, 65574, 65608, 66530, 67979, 67994, 68495, 69378, 69428, 69534, 69845, 70080, 70624, 70670, 71213, 71812, 71849, 85928, 86636, 86667, 230192, 230261, 231063]


In [32]:
# Inspect the resulting per-segment features and their transport-mode labels.
df_labeled_trajectories

Unnamed: 0,Distance(m),Duration(s),Velocity(m/s),Altitude_avg(m),TransportMode
183896,4.689863,2.0,2.344931,1248.171178,walk
183897,3.444495,1.0,3.444495,1247.866374,walk
183898,2.327486,1.0,2.327486,1247.866374,walk
183899,6.255833,38.0,0.164627,1248.933187,walk
183900,5.955012,2.0,2.977506,1250.000000,walk
...,...,...,...,...,...
720010,12.582622,2.0,6.291311,56.541088,taxi
720011,4.357512,1.0,4.357512,55.474275,taxi
720012,3.066865,1.0,3.066865,54.407462,taxi
720013,1.381074,1.0,1.381074,53.493050,taxi


In [40]:
# Persist the feature table as CSV (by default to_csv also writes the index as the first column).
# NOTE(review): the file name says "traj_10" while the paths cell loads Data/096 -- confirm the intended output name.
df_labeled_trajectories.to_csv("./labeled_data/traj_10.csv")

In [41]:
from collections import Counter

# Count how many segments carry each transport mode.
transport_modes = df_labeled_trajectories['TransportMode']
modes_count = dict(Counter(transport_modes))
print(modes_count)

# Express each count as a percentage of all segments.
modes_percentage = {
    mode: count / len(transport_modes) * 100
    for mode, count in modes_count.items()
}

print(modes_percentage)

{'walk': 36801, 'taxi': 52215, 'bus': 34937, 'train': 330273, 'subway': 20338, 'airplane': 1323, 'car': 2371}
{'walk': 7.6948007142588315, 'taxi': 10.917747324665767, 'bus': 7.305052921226618, 'train': 69.05749616315879, 'subway': 4.252516424189454, 'airplane': 0.27662893250086773, 'car': 0.49575751999966544}


In [44]:
# Combine both summaries into one mapping: mode -> [count, percentage].
merged_dict = {
    mode: [count, modes_percentage[mode]]
    for mode, count in modes_count.items()
}

In [45]:
# Show the combined per-mode summary.
merged_dict

{'walk': [36801, 7.6948007142588315],
 'taxi': [52215, 10.917747324665767],
 'bus': [34937, 7.305052921226618],
 'train': [330273, 69.05749616315879],
 'subway': [20338, 4.252516424189454],
 'airplane': [1323, 0.27662893250086773],
 'car': [2371, 0.49575751999966544]}