In [1]:
import datetime
import pandas as pd
from tqdm import tqdm

In [2]:
df0 = pd.read_feather("data/trip-updates.feather")
df0['datetime'] = df0['time'].apply(lambda x: datetime.datetime.fromtimestamp(x))

In [3]:
# Slice a smaller dataframe for testing
df = df0.iloc[:1_000_000].copy()

# Are there any duplicate rows?
print('Number of duplicate rows: ', df.duplicated().sum())

# Drop duplicates
df.drop_duplicates(inplace=True)

# Are there any duplicate rows?
print('Remaining number of duplicate rows: ', df.duplicated().sum())
df.shape

# Drop if 'stop_sequence' is 1
df = df[df['stop_sequence'] != 1]

# Drop if len(trip_id) > 20
df = df[df['trip_id'].str.len() <= 20]

# Create a new column 'group_id' to group rows by 'stop_sequence'
df['group_id'] = (
    (df['stop_sequence'].shift(1) - df['stop_sequence'] != -1) | 
    (df['trip_id'].shift(1) != df['trip_id'])
).cumsum().astype(int)

df.shape


row_counts = df.groupby(['group_id']).apply(len)
# row_counts = row_counts.sort_values(ascending=False)
# row_counts = row_counts[row_counts > 1]
row_counts.shape

# get the index of each group
group_idx = df.groupby(['group_id']).apply(lambda x: x.index)
# group_idx = group_idx.loc[row_counts.index]
group_idx.shape

Number of duplicate rows:  6981
Remaining number of duplicate rows:  0


(45904,)

In [4]:
df['uncertainty'].value_counts()

9      79464
16     58619
1      39929
3      34322
15     33092
       ...  
335        8
143        5
103        3
90         2
63         1
Name: uncertainty, Length: 107, dtype: int64

In [5]:
df.loc[15:30]

Unnamed: 0,trip_id,start_time,start_date,route_id,stop_sequence,delay,time,uncertainty,stop_id,datetime,group_id
15,0#785-17,16:39:00,20230506,115,27,-104,1683385880,21,72850,2023-05-06 17:11:20,1
16,0#785-17,16:39:00,20230506,115,28,-104,1683385996,16,82542,2023-05-06 17:13:16,1
18,0#715-26,17:00:00,20230506,87,2,-260,1683385012,9,75684,2023-05-06 16:56:52,2
19,0#715-26,17:00:00,20230506,87,3,-256,1683385000,5,73070,2023-05-06 16:56:40,2
20,0#715-26,17:00:00,20230506,87,4,-252,1683385086,5,73091,2023-05-06 16:58:06,2
21,0#715-26,17:00:00,20230506,87,5,-308,1683385118,5,73092,2023-05-06 16:58:38,2
22,0#715-26,17:00:00,20230506,87,6,-382,1683385206,5,73094,2023-05-06 17:00:06,2
23,0#715-26,17:00:00,20230506,87,7,-468,1683385262,5,73095,2023-05-06 17:01:02,2
24,0#715-26,17:00:00,20230506,87,8,-646,1683385514,5,73058,2023-05-06 17:05:14,2
25,0#715-26,17:00:00,20230506,87,9,-642,1683385550,5,73060,2023-05-06 17:05:50,2


In [6]:
def get_edge_times(df: pd.DataFrame) -> pd.DataFrame:
    """
    Get the time it took to travel between each stop.
    """

    # assert df['datetime'].is_monotonic_increasing

    # Add previous stop
    df['prev_stop_id']  = df['stop_id'].shift(1)

    # Add time it took between stops
    df['time_interval'] = df['datetime'].diff().dt.total_seconds()

    # Drop rows with missing time interval
    df.dropna(subset=['time_interval'], inplace=True)
    df['time_interval'] = df['time_interval'].astype(int)

    # trip_id
    df['trip_id'] = df['trip_id']

    return df[['datetime', 'prev_stop_id', 'stop_id', 'time_interval', 'trip_id', 'group_id']]

# get_edge_times(df.loc[group_idx.iat[0]].sort_values(by=['stop_sequence'])).head()

In [7]:
df['time_interval'] = df['datetime'].diff().dt.total_seconds()
df['prev_stop_id']  = df['stop_id'].shift(1)
# df.dropna(subset=['time_interval'], inplace=True)
# df['time_interval'] = df['time_interval'].astype(int)

# Drop first row of each group
df.drop([val.min() for val in group_idx.values], inplace=True)
df['time_interval'] = df['time_interval'].astype(int)
df.head()

Unnamed: 0,trip_id,start_time,start_date,route_id,stop_sequence,delay,time,uncertainty,stop_id,datetime,group_id,time_interval,prev_stop_id
1,0#785-17,16:39:00,20230506,115,13,36,1683384964,21,20215,2023-05-06 16:56:04,1,72,20323
2,0#785-17,16:39:00,20230506,115,14,56,1683385076,21,79756,2023-05-06 16:57:56,1,112,20215
3,0#785-17,16:39:00,20230506,115,15,90,1683385168,21,72993,2023-05-06 16:59:28,1,92,79756
4,0#785-17,16:39:00,20230506,115,16,56,1683385222,21,20295,2023-05-06 17:00:22,1,54,72993
5,0#785-17,16:39:00,20230506,115,17,24,1683385290,21,82709,2023-05-06 17:01:30,1,68,20295


In [8]:
df[df['time_interval'] < 0]

Unnamed: 0,trip_id,start_time,start_date,route_id,stop_sequence,delay,time,uncertainty,stop_id,datetime,group_id,time_interval,prev_stop_id
19,0#715-26,17:00:00,20230506,087,3,-256,1683385000,5,73070,2023-05-06 16:56:40,2,-12,75684
34,0#715-26,17:00:00,20230506,087,18,-640,1683385932,5,75695,2023-05-06 17:12:12,2,-38,75685
1819,0#415-25,16:54:00,20230506,01,18,0,0,0,78399,1970-01-01 01:00:00,75,-1683389114,76448
1982,0#1149-3,16:12:00,20230506,223,20,914,1683385208,9,70175,2023-05-06 17:00:08,85,-62,70173
1992,0#1149-3,16:12:00,20230506,223,30,652,1683385960,9,73571,2023-05-06 17:12:40,85,-70,73570
...,...,...,...,...,...,...,...,...,...,...,...,...,...
993617,0#1860-7,09:06:00,20230508,446,3,1834,1683531146,2,70057,2023-05-08 09:32:26,45669,-330,70055
993620,0#1860-7,09:06:00,20230508,446,6,1856,1683531344,2,72162,2023-05-08 09:35:44,45669,-286,70061
993641,0#1860-7,09:06:00,20230508,446,27,1836,1683532906,2,74239,2023-05-08 10:01:46,45669,-352,74237
995451,0#409-12,09:21:00,20230508,01,18,0,0,0,78399,1970-01-01 01:00:00,45757,-1683535210,76448


In [9]:
df.loc[1980:2000]

Unnamed: 0,trip_id,start_time,start_date,route_id,stop_sequence,delay,time,uncertainty,stop_id,datetime,group_id,time_interval,prev_stop_id
1981,0#1149-3,16:12:00,20230506,223,19,914,1683385270,1,70173,2023-05-06 17:01:10,85,168,76599
1982,0#1149-3,16:12:00,20230506,223,20,914,1683385208,9,70175,2023-05-06 17:00:08,85,-62,70173
1983,0#1149-3,16:12:00,20230506,223,21,928,1683385434,1,70009,2023-05-06 17:03:54,85,226,70175
1984,0#1149-3,16:12:00,20230506,223,22,910,1683385494,1,70179,2023-05-06 17:04:54,85,60,70009
1985,0#1149-3,16:12:00,20230506,223,23,846,1683385516,1,70181,2023-05-06 17:05:16,85,22,70179
1986,0#1149-3,16:12:00,20230506,223,24,668,1683385660,1,70225,2023-05-06 17:07:40,85,144,70181
1987,0#1149-3,16:12:00,20230506,223,25,652,1683385734,1,70226,2023-05-06 17:08:54,85,74,70225
1988,0#1149-3,16:12:00,20230506,223,26,662,1683385826,1,70227,2023-05-06 17:10:26,85,92,70226
1989,0#1149-3,16:12:00,20230506,223,27,660,1683385868,1,70228,2023-05-06 17:11:08,85,42,70227
1990,0#1149-3,16:12:00,20230506,223,28,668,1683385958,1,70229,2023-05-06 17:12:38,85,90,70228


In [10]:
# Drop rows with negative time interval
df = df[df['time_interval'] > 0]

# Drop rows with time interval > 1 hour
df[df['time_interval'] > 3600]
df = df[df['time_interval'] <= 3600]

df.describe()['time_interval'] / 60

count    13557.500000
mean         1.215847
std          0.958067
min          0.033333
25%          0.666667
50%          1.000000
75%          1.500000
max         50.600000
Name: time_interval, dtype: float64

In [11]:
edge_df = pd.read_parquet('data/edge_df.parquet')
edge_df = edge_df.astype(str)
edge_df['node_set'] = edge_df.apply(lambda x: tuple(sorted(list(x))), axis=1)
edge_map = {x['node_set']: idx for idx, x in edge_df.iterrows()}
edge_df.head()

Unnamed: 0,node1,node2,node_set
0,213,ROME6558,"(00213, ROME6558)"
1,213,75141,"(00213, 75141)"
2,213,70650,"(00213, 70650)"
3,213,78475,"(00213, 78475)"
4,213,70651,"(00213, 70651)"


In [12]:
stops = pd.read_csv('data/rome_static_gtfs_test/stops.txt')
stops['stop_id'] = stops['stop_id'].astype(str)

In [13]:
# Drop stops that are not in the stops.txt file
D = df[df['stop_id'].isin(stops.index) & (df['prev_stop_id'].isin(stops.index))].copy()

In [14]:
D = D[['datetime', 'prev_stop_id', 'stop_id', 'time_interval']]
D

Unnamed: 0,datetime,prev_stop_id,stop_id,time_interval


In [15]:
stop_groups = D.groupby(['prev_stop_id', 'stop_id']).apply(len)
stop_groups = pd.DataFrame(stop_groups).reset_index()

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  stop_groups = D.groupby(['prev_stop_id', 'stop_id']).apply(len)


ValueError: cannot insert stop_id, already exists

In [None]:
import geopy.distance

int(geopy.distance.geodesic(stops.loc[stop_groups.iloc[0]['prev_stop_id']][['stop_lat', 'stop_lon']].values,
                        stops.loc[stop_groups.iloc[0]['stop_id']][['stop_lat', 'stop_lon']].values).m)

def get_distance(x):
    return int(geopy.distance.geodesic(stops.loc[x['prev_stop_id']][['stop_lat', 'stop_lon']].values,
                                       stops.loc[x['stop_id']][['stop_lat', 'stop_lon']].values).m)

stop_groups['distance'] = stop_groups.apply(get_distance, axis=1)

In [None]:
# Make prev_stop_id, stop_id index
stop_groups.set_index(['prev_stop_id', 'stop_id'], inplace=True)

In [None]:
tqdm.pandas()
D['distance'] = D.progress_apply(lambda x: stop_groups.loc[(x['prev_stop_id'], x['stop_id'])]['distance'], axis=1)
D['hour'] = D['datetime'].dt.hour

100%|██████████| 810488/810488 [02:36<00:00, 5194.37it/s]


In [None]:
# Ols from statsmodels
from statsmodels.regression.linear_model import OLS
import statsmodels.api as sm

X = D[['distance', 'hour']]
X = sm.add_constant(X)
y = D['time_interval']

model = OLS(y, X, hasconst=True)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,time_interval,R-squared:,0.339
Model:,OLS,Adj. R-squared:,0.339
Method:,Least Squares,F-statistic:,207700.0
Date:,"Sun, 27 Aug 2023",Prob (F-statistic):,0.0
Time:,16:10:24,Log-Likelihood:,-4266400.0
No. Observations:,810488,AIC:,8533000.0
Df Residuals:,810485,BIC:,8533000.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,23.9971,0.126,190.455,0.000,23.750,24.244
distance,0.1209,0.000,635.913,0.000,0.121,0.121
hour,0.7587,0.008,99.570,0.000,0.744,0.774

0,1,2,3
Omnibus:,868392.667,Durbin-Watson:,1.581
Prob(Omnibus):,0.0,Jarque-Bera (JB):,337407726.88
Skew:,4.835,Prob(JB):,0.0
Kurtosis:,102.487,Cond. No.,1030.0


In [None]:
# cluster stops
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10, random_state=0).fit(stops[['stop_lat', 'stop_lon']])
stops['cluster'] = kmeans.labels_
stop_centers = stops.groupby(['cluster']).mean()[['stop_lat', 'stop_lon']]
stop_centers

  stop_centers = stops.groupby(['cluster']).mean()[['stop_lat', 'stop_lon']]


Unnamed: 0_level_0,stop_lat,stop_lon
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,41.759507,12.335181
1,41.986198,12.478545
2,41.833326,12.449082
3,41.938547,12.547795
4,41.793156,12.506544
5,42.014584,12.345919
6,41.913217,12.39635
7,41.866556,12.57265
8,41.895181,12.486714
9,41.886209,12.660659


In [None]:
# Calculate the distance between each stop and the cluster center
def distance_from_centers(row, stop_centers, i):
    """Distance from the cluster center i"""
    return int(geopy.distance.geodesic([row['stop_lat'], row['stop_lon']], [stop_centers.iloc[i]['stop_lat'], stop_centers.iloc[i]['stop_lon']]).m)

stop_groups.reset_index(inplace=True)
for i in tqdm(range(10)):
    stops[f'dist_{i}'] = stops.apply(lambda x: distance_from_centers(x, stop_centers, i), axis=1)

100%|██████████| 10/10 [00:37<00:00,  3.73s/it]


In [None]:
# stop_groups.set_index(['prev_stop_id', 'stop_id'], inplace=True)
# Match the distance to each row
# for i in tqdm(range(10)):
#     D[f'dist_{i}'] = D['stop_id'].apply(lambda x: stops.loc[x, f'dist_{i}'])

D_temp = D['stop_id'].progress_apply(lambda x: stops.loc[x, [f'dist_{i}' for i in range(10)]])

  0%|          | 0/810488 [00:00<?, ?it/s]

100%|██████████| 810488/810488 [12:20<00:00, 1094.41it/s] 


In [None]:
D = pd.concat([D, D_temp], axis=1)

In [None]:
# Ols from statsmodels
from statsmodels.regression.linear_model import OLS
import statsmodels.api as sm

X = D[['distance', 'hour', 'dist_0', 'dist_1', 'dist_2', 'dist_3', 'dist_4', 'dist_5', 'dist_6', 'dist_7', 'dist_8', 'dist_9']]
X = sm.add_constant(X)
y = D['time_interval']

model = OLS(y, X, hasconst=True)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,time_interval,R-squared:,0.403
Model:,OLS,Adj. R-squared:,0.403
Method:,Least Squares,F-statistic:,45620.0
Date:,"Sun, 27 Aug 2023",Prob (F-statistic):,0.0
Time:,16:23:46,Log-Likelihood:,-4224900.0
No. Observations:,810488,AIC:,8450000.0
Df Residuals:,810475,BIC:,8450000.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-39.7204,1.761,-22.558,0.000,-43.172,-36.269
distance,0.1246,0.000,683.579,0.000,0.124,0.125
hour,0.7621,0.007,104.836,0.000,0.748,0.776
dist_0,0.0004,4.23e-05,8.499,0.000,0.000,0.000
dist_1,0.0007,7.42e-05,10.050,0.000,0.001,0.001
dist_2,0.0016,6.01e-05,26.747,0.000,0.001,0.002
dist_3,0.0021,5.4e-05,38.157,0.000,0.002,0.002
dist_4,0.0015,5.88e-05,25.813,0.000,0.001,0.002
dist_5,0.0004,6.98e-05,5.029,0.000,0.000,0.000

0,1,2,3
Omnibus:,924156.691,Durbin-Watson:,1.747
Prob(Omnibus):,0.0,Jarque-Bera (JB):,498620691.46
Skew:,5.326,Prob(JB):,0.0
Kurtosis:,124.044,Cond. No.,1540000.0


In [None]:
# Quantize datetime of Data dD
D['quantized_datetime'] = D['datetime'].apply(lambda x: x.replace(minute=0, second=0))
D['quantized_datetime'].groupby(D['quantized_datetime']).apply(len)

quantized_datetime
2023-05-06 01:00:00        6
2023-05-06 02:00:00        2
2023-05-06 03:00:00        1
2023-05-06 04:00:00        1
2023-05-06 16:00:00     2651
2023-05-06 17:00:00    27826
2023-05-06 18:00:00    60358
2023-05-06 19:00:00    30938
2023-05-06 20:00:00    28878
2023-05-06 21:00:00    25248
2023-05-06 22:00:00    21296
2023-05-06 23:00:00     9221
2023-05-07 00:00:00    14313
2023-05-07 01:00:00    47917
2023-05-07 02:00:00     6855
2023-05-07 03:00:00     4248
2023-05-07 04:00:00     1339
2023-05-07 05:00:00     2152
2023-05-07 06:00:00     5773
2023-05-07 07:00:00    16322
2023-05-07 08:00:00    18330
2023-05-07 09:00:00    19948
2023-05-07 10:00:00    20952
2023-05-07 11:00:00    20304
2023-05-07 12:00:00    19007
2023-05-07 13:00:00    20158
2023-05-07 14:00:00    20198
2023-05-07 15:00:00    20673
2023-05-07 16:00:00    21184
2023-05-07 17:00:00    22225
2023-05-07 18:00:00    22403
2023-05-07 19:00:00    21297
2023-05-07 20:00:00    20121
2023-05-07 21:00:00    1