In [1]:
import pandas as pd

df_trajectories = pd.read_csv('Data/010/trajectories.csv',
                              names=['Latitude', 'Longitude', '0', 'Altitude', 'Date', 'Date_str', 'Time_str'],
                              parse_dates=True)

df_trajectories['DateTime'] = df_trajectories['Date_str'] + ' ' + df_trajectories['Time_str']
df_trajectories['DateTime'] = pd.to_datetime(df_trajectories['DateTime'])

df_trajectories = df_trajectories.drop(columns=['Date_str', 'Time_str', '0', 'Date'])

df_trajectories

Unnamed: 0,Latitude,Longitude,Altitude,DateTime
0,39.921712,116.472343,13,2007-08-04 03:30:32
1,39.921705,116.472343,13,2007-08-04 03:30:33
2,39.921695,116.472345,13,2007-08-04 03:30:34
3,39.921683,116.472342,13,2007-08-04 03:30:35
4,39.921672,116.472342,13,2007-08-04 03:30:36
...,...,...,...,...
935523,39.136261,117.218261,-59,2009-03-21 05:34:49
935524,39.136256,117.218276,-59,2009-03-21 05:34:50
935525,39.136256,117.218291,-59,2009-03-21 05:34:51
935526,39.136256,117.218303,-59,2009-03-21 05:34:52


In [2]:
df_labels = pd.read_csv('Data/010/labels.txt', delim_whitespace=True, skiprows=1, header=None)
df_labels['StartTime'] = df_labels[0] + ' ' + df_labels[1]
df_labels['EndTime'] = df_labels[2] + ' ' + df_labels[3]
df_labels['StartTime'] = pd.to_datetime(df_labels['StartTime'])
df_labels['EndTime'] = pd.to_datetime(df_labels['EndTime'])
df_labels = df_labels.drop(columns=[0, 1, 2, 3])
df_labels = df_labels.rename(columns={4: "TransportMode"})

df_labels

Unnamed: 0,TransportMode,StartTime,EndTime
0,bus,2007-06-26 11:32:29,2007-06-26 11:40:29
1,train,2008-03-28 14:52:54,2008-03-28 15:59:59
2,train,2008-03-28 16:00:00,2008-03-28 22:02:00
3,train,2008-03-29 01:27:50,2008-03-29 15:59:59
4,train,2008-03-29 16:00:00,2008-03-30 15:59:59
...,...,...,...
429,taxi,2008-12-07 10:30:54,2008-12-07 10:34:14
430,train,2008-12-07 10:59:29,2008-12-07 11:29:48
431,bus,2008-12-07 11:43:12,2008-12-07 12:23:26
432,walk,2008-12-07 12:23:34,2008-12-07 12:25:07


In [3]:
from tqdm import tqdm

trip_end_row_idxs = list()

for idx in tqdm(range(len(df_labels.index))):
    
    mask = (df_trajectories['DateTime'] >= df_labels.loc[idx, 'StartTime']) & (df_trajectories['DateTime'] <= df_labels.loc[idx, 'EndTime'])
        
    df_trip = df_trajectories[mask]
    trip_indices = df_trip.index
    
    if (len(trip_indices) > 1):
        df_trajectories.loc[trip_indices, 'TransportMode'] = df_labels.loc[idx, 'TransportMode']
        
        if (not df_trip.iloc[0].equals(df_trajectories.iloc[0])):
            if len(trip_end_row_idxs) == 0:
                df_trajectories.drop(list(range(0, trip_indices[0])), inplace=True)
            else:
                df_trajectories.drop(list(range(trip_end_row_idxs[-1] + 1, trip_indices[0])), inplace=True)
    
        trip_end_row_idxs.append(trip_indices[-1])
        
df_trajectories.drop(list(range(trip_end_row_idxs[-1] + 1, df_trajectories.index[-1] + 1)), inplace=True)

100%|██████████| 434/434 [00:28<00:00, 15.20it/s]


In [4]:
df_trajectories

Unnamed: 0,Latitude,Longitude,Altitude,DateTime,TransportMode
180180,39.894178,116.318200,-777,2008-03-28 14:54:40,train
180181,39.894505,116.321132,-777,2008-03-28 14:55:14,train
180182,39.894953,116.326452,-777,2008-03-28 14:56:13,train
180183,39.894600,116.332542,-777,2008-03-28 14:57:12,train
180184,39.889622,116.337040,-777,2008-03-28 14:58:11,train
...,...,...,...,...,...
719969,39.991644,116.326376,184,2008-12-07 12:37:24,taxi
719970,39.991641,116.326325,180,2008-12-07 12:37:25,taxi
719971,39.991641,116.326289,177,2008-12-07 12:37:26,taxi
719972,39.991643,116.326273,174,2008-12-07 12:37:27,taxi


In [5]:
import math

def haversine_distance(lat1, lon1, lat2, lon2):
    # Convert coordinates to radians
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    r = 6371 # radius of the Earth in kilometers
    distance = r * c
    return distance * 1000

In [6]:
df_trajectories_sub = df_trajectories.iloc[1:]

df_trajectories.drop(trip_end_row_idxs[-1], inplace=True)

df_trajectories['LatEnd'] = df_trajectories_sub['Latitude'].tolist()
df_trajectories['LongEnd'] = df_trajectories_sub['Longitude'].tolist()
df_trajectories['AltEnd'] = df_trajectories_sub['Altitude'].tolist()
df_trajectories['TimeEnd'] = df_trajectories_sub['DateTime'].tolist()

df_trajectories.drop(trip_end_row_idxs[:-1], inplace=True)

df_trajectories

Unnamed: 0,Latitude,Longitude,Altitude,DateTime,TransportMode,LatEnd,LongEnd,AltEnd,TimeEnd
180180,39.894178,116.318200,-777,2008-03-28 14:54:40,train,39.894505,116.321132,-777,2008-03-28 14:55:14
180181,39.894505,116.321132,-777,2008-03-28 14:55:14,train,39.894953,116.326452,-777,2008-03-28 14:56:13
180182,39.894953,116.326452,-777,2008-03-28 14:56:13,train,39.894600,116.332542,-777,2008-03-28 14:57:12
180183,39.894600,116.332542,-777,2008-03-28 14:57:12,train,39.889622,116.337040,-777,2008-03-28 14:58:11
180184,39.889622,116.337040,-777,2008-03-28 14:58:11,train,39.882090,116.338353,-777,2008-03-28 14:59:10
...,...,...,...,...,...,...,...,...,...
719968,39.991633,116.326523,187,2008-12-07 12:37:22,taxi,39.991644,116.326376,184,2008-12-07 12:37:24
719969,39.991644,116.326376,184,2008-12-07 12:37:24,taxi,39.991641,116.326325,180,2008-12-07 12:37:25
719970,39.991641,116.326325,180,2008-12-07 12:37:25,taxi,39.991641,116.326289,177,2008-12-07 12:37:26
719971,39.991641,116.326289,177,2008-12-07 12:37:26,taxi,39.991643,116.326273,174,2008-12-07 12:37:27


In [7]:
df_trajectories['Altitude_avg'] = df_trajectories.apply(lambda row: float(row[2] + row[7]) / 2.0, axis=1)

In [8]:
df_trajectories['Distance(m)'] = df_trajectories.apply(lambda row: haversine_distance(row[0], row[1], row[5], row[6]), axis=1)

df_trajectories

Unnamed: 0,Latitude,Longitude,Altitude,DateTime,TransportMode,LatEnd,LongEnd,AltEnd,TimeEnd,Altitude_avg,Distance(m)
180180,39.894178,116.318200,-777,2008-03-28 14:54:40,train,39.894505,116.321132,-777,2008-03-28 14:55:14,-777.0,252.763508
180181,39.894505,116.321132,-777,2008-03-28 14:55:14,train,39.894953,116.326452,-777,2008-03-28 14:56:13,-777.0,456.582510
180182,39.894953,116.326452,-777,2008-03-28 14:56:13,train,39.894600,116.332542,-777,2008-03-28 14:57:12,-777.0,521.026904
180183,39.894600,116.332542,-777,2008-03-28 14:57:12,train,39.889622,116.337040,-777,2008-03-28 14:58:11,-777.0,673.538587
180184,39.889622,116.337040,-777,2008-03-28 14:58:11,train,39.882090,116.338353,-777,2008-03-28 14:59:10,-777.0,844.979544
...,...,...,...,...,...,...,...,...,...,...,...
719968,39.991633,116.326523,187,2008-12-07 12:37:22,taxi,39.991644,116.326376,184,2008-12-07 12:37:24,185.5,12.582622
719969,39.991644,116.326376,184,2008-12-07 12:37:24,taxi,39.991641,116.326325,180,2008-12-07 12:37:25,182.0,4.357512
719970,39.991641,116.326325,180,2008-12-07 12:37:25,taxi,39.991641,116.326289,177,2008-12-07 12:37:26,178.5,3.066865
719971,39.991641,116.326289,177,2008-12-07 12:37:26,taxi,39.991643,116.326273,174,2008-12-07 12:37:27,175.5,1.381074


In [9]:
df_trajectories['Duration(s)'] = df_trajectories.apply(lambda row: row[8] - row[3], axis=1)
df_trajectories

Unnamed: 0,Latitude,Longitude,Altitude,DateTime,TransportMode,LatEnd,LongEnd,AltEnd,TimeEnd,Altitude_avg,Distance(m),Duration(s)
180180,39.894178,116.318200,-777,2008-03-28 14:54:40,train,39.894505,116.321132,-777,2008-03-28 14:55:14,-777.0,252.763508,0 days 00:00:34
180181,39.894505,116.321132,-777,2008-03-28 14:55:14,train,39.894953,116.326452,-777,2008-03-28 14:56:13,-777.0,456.582510,0 days 00:00:59
180182,39.894953,116.326452,-777,2008-03-28 14:56:13,train,39.894600,116.332542,-777,2008-03-28 14:57:12,-777.0,521.026904,0 days 00:00:59
180183,39.894600,116.332542,-777,2008-03-28 14:57:12,train,39.889622,116.337040,-777,2008-03-28 14:58:11,-777.0,673.538587,0 days 00:00:59
180184,39.889622,116.337040,-777,2008-03-28 14:58:11,train,39.882090,116.338353,-777,2008-03-28 14:59:10,-777.0,844.979544,0 days 00:00:59
...,...,...,...,...,...,...,...,...,...,...,...,...
719968,39.991633,116.326523,187,2008-12-07 12:37:22,taxi,39.991644,116.326376,184,2008-12-07 12:37:24,185.5,12.582622,0 days 00:00:02
719969,39.991644,116.326376,184,2008-12-07 12:37:24,taxi,39.991641,116.326325,180,2008-12-07 12:37:25,182.0,4.357512,0 days 00:00:01
719970,39.991641,116.326325,180,2008-12-07 12:37:25,taxi,39.991641,116.326289,177,2008-12-07 12:37:26,178.5,3.066865,0 days 00:00:01
719971,39.991641,116.326289,177,2008-12-07 12:37:26,taxi,39.991643,116.326273,174,2008-12-07 12:37:27,175.5,1.381074,0 days 00:00:01


In [10]:
df_trajectories = df_trajectories[df_trajectories['Altitude'] != -777]

In [11]:
df_trajectories["Duration(s)"] = df_trajectories["Duration(s)"].apply(lambda x: x.total_seconds())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trajectories["Duration(s)"] = df_trajectories["Duration(s)"].apply(lambda x: x.total_seconds())


In [12]:
df_trajectories["Velocity(m/s)"] = df_trajectories["Distance(m)"] / df_trajectories["Duration(s)"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trajectories["Velocity(m/s)"] = df_trajectories["Distance(m)"] / df_trajectories["Duration(s)"]


In [13]:
df_trajectories["Altitude_avg(m)"] = df_trajectories["Altitude_avg"] / 3.2808

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trajectories["Altitude_avg(m)"] = df_trajectories["Altitude_avg"] / 3.2808


In [14]:
# drop rows where Duration(s) is 0
df_trajectories = df_trajectories[df_trajectories['Duration(s)'] != 0]

In [15]:
df_trajectories

Unnamed: 0,Latitude,Longitude,Altitude,DateTime,TransportMode,LatEnd,LongEnd,AltEnd,TimeEnd,Altitude_avg,Distance(m),Duration(s),Velocity(m/s),Altitude_avg(m)
183878,39.476913,75.989937,4101,2008-04-02 06:10:11,walk,39.476905,75.989933,4101,2008-04-02 06:10:12,4101.0,0.953511,1.0,0.953511,1250.000000
183879,39.476905,75.989933,4101,2008-04-02 06:10:12,walk,39.476890,75.989917,4101,2008-04-02 06:10:13,4101.0,2.160517,1.0,2.160517,1250.000000
183880,39.476890,75.989917,4101,2008-04-02 06:10:13,walk,39.476878,75.989903,4101,2008-04-02 06:10:14,4101.0,1.795641,1.0,1.795641,1250.000000
183881,39.476878,75.989903,4101,2008-04-02 06:10:14,walk,39.476848,75.989903,4101,2008-04-02 06:10:15,4101.0,3.335848,1.0,3.335848,1250.000000
183882,39.476848,75.989903,4101,2008-04-02 06:10:15,walk,39.476828,75.989897,4101,2008-04-02 06:10:17,4101.0,2.282745,2.0,1.141373,1250.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
719968,39.991633,116.326523,187,2008-12-07 12:37:22,taxi,39.991644,116.326376,184,2008-12-07 12:37:24,185.5,12.582622,2.0,6.291311,56.541088
719969,39.991644,116.326376,184,2008-12-07 12:37:24,taxi,39.991641,116.326325,180,2008-12-07 12:37:25,182.0,4.357512,1.0,4.357512,55.474275
719970,39.991641,116.326325,180,2008-12-07 12:37:25,taxi,39.991641,116.326289,177,2008-12-07 12:37:26,178.5,3.066865,1.0,3.066865,54.407462
719971,39.991641,116.326289,177,2008-12-07 12:37:26,taxi,39.991643,116.326273,174,2008-12-07 12:37:27,175.5,1.381074,1.0,1.381074,53.493050


In [16]:
train = df_trajectories.loc[:, ["Distance(m)", "Duration(s)", "Velocity(m/s)", "Altitude_avg(m)", "TransportMode"]]
train.to_csv("final_traj_10.csv")