In [2]:
import pandas as pd
import numpy as np
agency = pd.read_csv('../datasets/asana/gtfs_data/agency.txt')
routes = pd.read_csv('../datasets/asana/gtfs_data/routes_utf8.txt', sep="\t")
stop_times = pd.read_csv('../datasets/asana/gtfs_data/stop_times.txt', sep="\t")
stops = pd.read_csv('../datasets/asana/gtfs_data/stops.txt', sep="\t")
trips = pd.read_csv('../datasets/asana/gtfs_data/trips.txt', sep="\t")



In [3]:
modeling_structure_df = pd.DataFrame()

route_travel_time = pd.DataFrame()
arrival_time = pd.DataFrame()
delay_prediction = pd.DataFrame()
# most descriptive columns from the gtfs format for modeling that i currently identified are:
#  from stop_times : stop_id, arrival_time, departure_time
#  from stops : stop_id, stop_lat, stop_lon
#  from trips : shape_id, route_id (To relate the info, basically a join table)

# The features that i imagine i could extract to get more info are:

# average_stop_times (Would describe an average of time from stop to stop per route)
# a more defined and specific attributes below:

#avg_seconds_between_stops
#std_seconds_between_stops

# hour_of_day (Pretty descriptive name)
# stop_quantity (A sum of the bus stops)
# stop_density_500m (This measures how conglomerated a zone is in the 500m radious)


# gps_data = pd.read_csv('../datasets/asana/gps_data.csv') # thing is so big i need to put attributes here because checking takes long - id,deviceid,devicetime,latitude,longitude,speed,route_direction,route_id,route_guid,device_guid

# Fordward here im going to explain the structure of how this dataset is going to be analized

# I'm going to add steps to it: 
# 1st Step is going to ananlyze just structural, coordinates, stop density, etc. So i can see for example if using a delay in time target (Which will probably be the most used target in this notebook), i can see how much the data is giving in comparison 
# ------- (Here on step 1, will be using normal train_test_split and will be trained with diferent models and compared with CV)
# to 2nd Step which will only use context data such as speeds, times, etc. || i will search for most significant/descriptive attributes for each, and look into compare important stadistic values such as R^2 and others
# ------- (Here on step 2, will be using time_series_split with lags, so the model also finds relationship in between the difference in time)
# Finally on step 3, depending on which attributes tend to be the most meaningful and tune the best models to find a in between in overfitting and underfitting 


# Now onto other other targets its going to be the effective_speeds
# Using the GTFS-RT, finding the actual time taken to complete a route, we can find the delays in the actual schedulet time, which will be useful to create a "isDelayed" target using a specific treshold
# Clustering could be applied using the times, the route and locations and stop densities to find congestion zones



In [4]:
# First row im going to compute is the trip length in KM with sets of longitudes using the Haversine formula |||| reference : https://www.youtube.com/watch?v=cD6pUr_5RtE

In [5]:
import math

def distance_between(lat1, lon1, lat2, lon2):
    earthRadius = 6371
    dlat = degreesToRadians(lat2-lat1)
    dlon = degreesToRadians(lon2-lon1) 
    rad = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(degreesToRadians(lat1)) * math.cos(degreesToRadians(lat2)) * math.sin(dlon/2) * math.sin(dlon/2)
    rad = 2 * math.atan2(math.sqrt(rad), math.sqrt(1-rad))
    distance = earthRadius * rad
    return distance
    
def degreesToRadians(deg):
    return deg * (math.pi / 180) 

In [6]:
#stops.head()


In [7]:
#trips.head()

In [8]:
stop_times.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence
0,10406,07:20:44,07:20:44,5d11962d-6412-4c7c-984d-f4ee5eb63859,41
1,27104,20:14:37,20:14:57,5d11962d-6412-4c7c-984d-f4ee5eb63859,40
2,25687,17:45:24,17:45:43,5d11962d-6412-4c7c-984d-f4ee5eb63859,41
3,25541,10:42:59,10:42:59,5d11962d-6412-4c7c-984d-f4ee5eb63859,42
4,22444,13:40:10,13:41:00,5d11962d-6412-4c7c-984d-f4ee5eb63859,35


In [9]:
#routes.head()

In [10]:
sorted_stop_times = stop_times.sort_values(by=['trip_id', 'stop_sequence'], ascending=[True, True])


In [11]:
sorted_route_stops_coords = sorted_stop_times.merge(stops[["stop_id", "stop_lat", "stop_lon"]], on="stop_id", how="left")


In [12]:
sorted_route_stops_coords["prev_lat"] = sorted_route_stops_coords["stop_lat"].shift(1)
sorted_route_stops_coords["prev_lon"] = sorted_route_stops_coords["stop_lon"].shift(1)


In [13]:
sorted_route_stops_coords["segment_distance"] = sorted_route_stops_coords.apply(
    lambda row: distance_between(
        row["prev_lat"],
        row["prev_lon"],
        row["stop_lat"],
        row["stop_lon"]
    ) if pd.notnull(row["prev_lat"]) else 0,
    axis=1
)


In [14]:
route_distance_df = (
    sorted_route_stops_coords.groupby("trip_id")["segment_distance"]
      .sum()
      .reset_index(name="total_distance_km")
)


In [15]:
modeling_structure_df = trips

In [16]:

modeling_structure_df = modeling_structure_df.drop(columns=['service_id','vehicle_id', 'direction_id'])

In [17]:
modeling_structure_df = modeling_structure_df.merge(route_distance_df, on="trip_id", how="left")


In [18]:
stop_count = (
    stop_times.groupby("trip_id")['stop_sequence'].count()
    .reset_index(name="total_stops")
)

In [19]:
modeling_structure_df = modeling_structure_df.merge(stop_count, on="trip_id", how="left")


In [20]:
def calcEffectiveSpeed(start_time_s, end_time_s, total_distance_km):
    if end_time_s < start_time_s:  #this means it cycled to next day (which i find improbable, but still)
        end_time_s+=86400
    duration = end_time_s - start_time_s

    if duration <= 0:
        return np.nan #invalid data
    
    return total_distance_km * 3600/duration
    

In [21]:
modeling_structure_df["start_sec"] = pd.to_timedelta(modeling_structure_df["start_time"]).dt.total_seconds()
modeling_structure_df["end_sec"]   = pd.to_timedelta(modeling_structure_df["end_time"]).dt.total_seconds()
modeling_structure_df = modeling_structure_df.drop(columns=['start_time', 'end_time']) 

modeling_structure_df["effective_scheduled_speed_kmh"] = modeling_structure_df.apply(
    lambda row: calcEffectiveSpeed(
        row["start_sec"],
        row["end_sec"],
        row["total_distance_km"],
    ),
    axis=1
)


In [22]:
modeling_structure_df

Unnamed: 0,route_id,trip_id,total_distance_km,total_stops,start_sec,end_sec,effective_scheduled_speed_kmh
0,d626b854-27aa-41d4-8625-ebafa73d8f21,23809,37.516956,36,38521.0,44990.0,20.878195
1,d626b854-27aa-41d4-8625-ebafa73d8f21,23705,37.516956,36,28733.0,33312.0,29.495751
2,d626b854-27aa-41d4-8625-ebafa73d8f21,23547,21.245568,39,70904.0,75385.0,17.068522
3,d626b854-27aa-41d4-8625-ebafa73d8f21,23497,21.077345,36,62522.0,70901.0,9.055787
4,d626b854-27aa-41d4-8625-ebafa73d8f21,23455,21.353914,40,57835.0,62517.0,16.419071
...,...,...,...,...,...,...,...
19764,c453217f-9f9d-49ce-8b32-14a6b7013691,26011,38.988103,44,53273.0,58278.0,28.043391
19765,c453217f-9f9d-49ce-8b32-14a6b7013691,25978,20.239318,38,48387.0,53259.0,14.955161
19766,c453217f-9f9d-49ce-8b32-14a6b7013691,25951,25.623639,44,43346.0,48379.0,18.328055
19767,c453217f-9f9d-49ce-8b32-14a6b7013691,25902,36.667677,39,36459.0,43292.0,19.318548


In [23]:
routes_per_stop_df = (stop_times.groupby("stop_id")["trip_id"].count().reset_index(name="routes_per_stop"))
routes_per_stop_df["routes_per_stop"] = routes_per_stop_df["routes_per_stop"] - 1 #exclude current route 


merged = stop_times.merge(routes_per_stop_df, on="stop_id", how="left")

In [24]:
overlap_score = (
    merged.groupby('trip_id')["routes_per_stop"]
    .mean()
    .reset_index(name="overlap_score")
)

In [25]:
modeling_structure_df = modeling_structure_df.merge(overlap_score, on="trip_id", how="left")

In [26]:
modeling_structure_df["stop_density"] = (modeling_structure_df["total_stops"]-1) / modeling_structure_df["total_distance_km"] #substracting 1 to stops to measure the actual segments between stops

In [27]:
modeling_structure_df = modeling_structure_df.drop(columns=["total_stops", "start_sec", "end_sec"])

In [28]:
modeling_structure_df 

Unnamed: 0,route_id,trip_id,total_distance_km,effective_scheduled_speed_kmh,overlap_score,stop_density
0,d626b854-27aa-41d4-8625-ebafa73d8f21,23809,37.516956,20.878195,4220.777778,0.932911
1,d626b854-27aa-41d4-8625-ebafa73d8f21,23705,37.516956,29.495751,4220.777778,0.932911
2,d626b854-27aa-41d4-8625-ebafa73d8f21,23547,21.245568,17.068522,3834.871795,1.788608
3,d626b854-27aa-41d4-8625-ebafa73d8f21,23497,21.077345,9.055787,4220.777778,1.660551
4,d626b854-27aa-41d4-8625-ebafa73d8f21,23455,21.353914,16.419071,3873.925000,1.826363
...,...,...,...,...,...,...
19764,c453217f-9f9d-49ce-8b32-14a6b7013691,26011,38.988103,28.043391,4809.590909,1.102901
19765,c453217f-9f9d-49ce-8b32-14a6b7013691,25978,20.239318,14.955161,5183.394737,1.828125
19766,c453217f-9f9d-49ce-8b32-14a6b7013691,25951,25.623639,18.328055,4809.590909,1.678138
19767,c453217f-9f9d-49ce-8b32-14a6b7013691,25902,36.667677,19.318548,5237.820513,1.036335


In [29]:
modeling_structure_df[["total_distance_km", "stop_density", "overlap_score", "effective_scheduled_speed_kmh"]].corr() # stop density and total distance km are very correlated, but will keep it because 
                                                                                                                      #i will probably just use gradient boosting and it already handles it


Unnamed: 0,total_distance_km,stop_density,overlap_score,effective_scheduled_speed_kmh
total_distance_km,1.0,-0.93441,-0.062129,0.700862
stop_density,-0.93441,1.0,0.141396,-0.632685
overlap_score,-0.062129,0.141396,1.0,-0.216111
effective_scheduled_speed_kmh,0.700862,-0.632685,-0.216111,1.0


In [30]:
modeling_structure_df = modeling_structure_df.drop(columns=["route_id", "trip_id"])

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, KFold

X = modeling_structure_df.drop(columns=["effective_scheduled_speed_kmh"])
y = modeling_structure_df["effective_scheduled_speed_kmh"]


In [32]:
linearRegression_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("model", LinearRegression())
])

In [33]:
xgb_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("model", XGBRegressor(
    n_estimators=200,
    learning_rate=0.3,
    max_depth=6,
    ))
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)
# First training with just structural attributes

xgb_scores = cross_val_score(xgb_pipeline, X_train, y_train, cv=5)
xgb_scores_mae = cross_val_score(xgb_pipeline, X_train, y_train, cv=5,scoring="neg_mean_absolute_error")
xgb_scores_rmae = cross_val_score(xgb_pipeline, X_train, y_train, cv=5,scoring='neg_mean_squared_error')
LR_scores = cross_val_score(linearRegression_pipeline, X_train, y_train, cv=5)
LR_scores_mae = cross_val_score(linearRegression_pipeline, X_train, y_train, cv=5,scoring="neg_mean_absolute_error")
LR_scores_rmae = cross_val_score(linearRegression_pipeline, X_train, y_train, cv=5,scoring='neg_mean_squared_error')

In [34]:
print(f"xgb R^2: {xgb_scores.mean()}, MAE: {xgb_scores_mae.mean()} RMAE: {xgb_scores_rmae.mean()}")
print(f"LR R^2: {LR_scores.mean()}, MAE: {LR_scores_mae.mean()}RMAE: {LR_scores_rmae.mean()}")


xgb R^2: 0.5653110207382841, MAE: -2.7998759329279586 RMAE: -15.255130119359759
LR R^2: 0.5309609713201695, MAE: -3.038137485136354RMAE: -16.466986493722928


In [35]:
# now feature engineering with time data GTFS-RT data: 
# Columns : id,deviceid,devicetime,latitude,longitude,speed,route_direction,route_id,route_guid,device_guid

In [36]:
segment_level_data = pd.read_csv('../datasets/asana/segment_level_data.csv')

In [38]:
segment_level_data

Unnamed: 0,date,deviceid,direction,segment,start_point,end_point,start_time,run_time_in_seconds,dwell_time_in_seconds,arrival_time,departure_time,trip_id,device_guid,start_guid,end_guid
0,2024-08-01,1038,2,39,2037,2038,2024-08-01 17:38:47,279,0,2024-08-01 17:43:26,2024-08-01 17:43:26,2306,04326bab-ee84-491d-8b3c-db5be265e756,3d570ad2-db12-47d1-b15e-a33dfa403c56,9c39bef4-78ba-41cc-af86-4e9cec3e06dd
1,2024-08-01,1038,2,6,2006,2004,2024-08-01 16:44:26,52,0,2024-08-01 16:45:18,2024-08-01 16:45:18,2306,04326bab-ee84-491d-8b3c-db5be265e756,88b7ca3b-3501-4efe-b33f-701453ffa277,28764497-a568-434f-83dd-932a3c91e199
2,2024-09-11,780,2,5,2004,2006,2024-09-11 16:16:14,37,0,2024-09-11 16:16:51,2024-09-11 16:16:51,19707,ad9f7b2b-5776-4de9-9b73-e28b5fe585fa,28764497-a568-434f-83dd-932a3c91e199,88b7ca3b-3501-4efe-b33f-701453ffa277
3,2024-09-11,789,2,5,2004,2006,2024-09-11 18:22:42,54,0,2024-09-11 18:23:36,2024-09-11 18:23:36,19767,998bdc88-e13b-41fc-951e-7dd31ae1179b,28764497-a568-434f-83dd-932a3c91e199,88b7ca3b-3501-4efe-b33f-701453ffa277
4,2024-09-11,791,2,5,2006,2004,2024-09-11 18:41:49,49,0,2024-09-11 18:42:38,2024-09-11 18:42:38,19777,1c2bf601-811b-4623-a976-bdc87dff3b3b,88b7ca3b-3501-4efe-b33f-701453ffa277,28764497-a568-434f-83dd-932a3c91e199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785971,2024-08-24,1119,1,5,1004,10021,2024-08-24 10:31:26,105,26,2024-08-24 10:33:11,2024-08-24 10:33:37,16230,10c1fd94-9082-46ac-89b7-ef458e0677b3,731aaea7-faff-4e78-b247-ae0f180e0052,6315f075-d165-40b9-b97c-aa60a49512d0
785972,2024-08-24,1119,2,42,2041,2042,2024-08-24 12:52:40,41,30,2024-08-24 12:53:21,2024-08-24 12:53:51,16298,10c1fd94-9082-46ac-89b7-ef458e0677b3,0dfeaf23-1068-4a93-9666-5cc804d389a1,e6719284-431b-4c10-bb30-4f2e99de2f98
785973,2024-08-24,1119,2,43,2042,238,2024-08-24 12:53:51,71,18,2024-08-24 12:55:02,2024-08-24 12:55:20,16298,10c1fd94-9082-46ac-89b7-ef458e0677b3,e6719284-431b-4c10-bb30-4f2e99de2f98,623fcc14-2417-4d31-84b9-ec08483a12ea
785974,2024-08-24,1119,2,37,2035,2036,2024-08-24 15:40:40,39,20,2024-08-24 15:41:19,2024-08-24 15:41:39,16404,10c1fd94-9082-46ac-89b7-ef458e0677b3,4cc233c3-db14-4e65-b96b-91271c045f99,ce8054ef-6b41-439f-b763-a770a69cf95d
