In [193]:
import pandas as pd
import numpy as np
agency = pd.read_csv('../datasets/asana/gtfs_data/agency.txt')
routes = pd.read_csv('../datasets/asana/gtfs_data/routes_utf8.txt', sep="\t")
stop_times = pd.read_csv('../datasets/asana/gtfs_data/stop_times.txt', sep="\t")
stops = pd.read_csv('../datasets/asana/gtfs_data/stops.txt', sep="\t")
trips = pd.read_csv('../datasets/asana/gtfs_data/trips.txt', sep="\t")



In [194]:
modeling_structure_df = pd.DataFrame()

route_travel_time = pd.DataFrame()
arrival_time = pd.DataFrame()
delay_prediction = pd.DataFrame()
# most descriptive columns from the gtfs format for modeling that i currently identified are:
#  from stop_times : stop_id, arrival_time, departure_time
#  from stops : stop_id, stop_lat, stop_lon
#  from trips : shape_id, route_id (To relate the info, basically a join table)

# The features that i imagine i could extract to get more info are:

# average_stop_times (Would describe an average of time from stop to stop per route)
# a more defined and specific attributes below:

#avg_seconds_between_stops
#std_seconds_between_stops

# hour_of_day (Pretty descriptive name)
# stop_quantity (A sum of the bus stops)
# stop_density_500m (This measures how conglomerated a zone is in the 500m radious)


# gps_data = pd.read_csv('../datasets/asana/gps_data.csv') # thing is so big i need to put attributes here because checking takes long - id,deviceid,devicetime,latitude,longitude,speed,route_direction,route_id,route_guid,device_guid

# Fordward here im going to explain the structure of how this dataset is going to be analized

# I'm going to add steps to it: 
# 1st Step is going to ananlyze just structural, coordinates, stop density, etc. So i can see for example if using a delay in time target (Which will probably be the most used target in this notebook), i can see how much the data is giving in comparison 
# ------- (Here on step 1, will be using normal train_test_split and will be trained with diferent models and compared with CV)
# to 2nd Step which will only use context data such as speeds, times, etc. || i will search for most significant/descriptive attributes for each, and look into compare important stadistic values such as R^2 and others
# ------- (Here on step 2, will be using time_series_split with lags, so the model also finds relationship in between the difference in time)
# Finally on step 3, depending on which attributes tend to be the most meaningful and tune the best models to find a in between in overfitting and underfitting 


# Now onto other other targets its going to be the effective_speeds
# Using the GTFS-RT, finding the actual time taken to complete a route, we can find the delays in the actual schedulet time, which will be useful to create a "isDelayed" target using a specific treshold
# Clustering could be applied using the times, the route and locations and stop densities to find congestion zones



In [195]:
# First row im going to compute is the trip length in KM with sets of longitudes using the Haversine formula |||| reference : https://www.youtube.com/watch?v=cD6pUr_5RtE

In [196]:
import math

def distance_between(lat1, lon1, lat2, lon2):
    earthRadius = 6371
    dlat = degreesToRadians(lat2-lat1)
    dlon = degreesToRadians(lon2-lon1) 
    rad = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(degreesToRadians(lat1)) * math.cos(degreesToRadians(lat2)) * math.sin(dlon/2) * math.sin(dlon/2)
    rad = 2 * math.atan2(math.sqrt(rad), math.sqrt(1-rad))
    distance = earthRadius * rad
    return distance
    
def degreesToRadians(deg):
    return deg * (math.pi / 180) 

In [197]:
#stops.head()


In [198]:
#trips.head()

In [199]:
stop_times.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence
0,10406,07:20:44,07:20:44,5d11962d-6412-4c7c-984d-f4ee5eb63859,41
1,27104,20:14:37,20:14:57,5d11962d-6412-4c7c-984d-f4ee5eb63859,40
2,25687,17:45:24,17:45:43,5d11962d-6412-4c7c-984d-f4ee5eb63859,41
3,25541,10:42:59,10:42:59,5d11962d-6412-4c7c-984d-f4ee5eb63859,42
4,22444,13:40:10,13:41:00,5d11962d-6412-4c7c-984d-f4ee5eb63859,35


In [200]:
#routes.head()

In [201]:
sorted_stop_times = stop_times.sort_values(by=['trip_id', 'stop_sequence'], ascending=[True, True])


In [202]:
sorted_route_stops_coords = sorted_stop_times.merge(stops[["stop_id", "stop_lat", "stop_lon"]], on="stop_id", how="left")


In [203]:
sorted_route_stops_coords["prev_lat"] = sorted_route_stops_coords["stop_lat"].shift(1)
sorted_route_stops_coords["prev_lon"] = sorted_route_stops_coords["stop_lon"].shift(1)


In [204]:
sorted_route_stops_coords["segment_distance"] = sorted_route_stops_coords.apply(
    lambda row: distance_between(
        row["prev_lat"],
        row["prev_lon"],
        row["stop_lat"],
        row["stop_lon"]
    ) if pd.notnull(row["prev_lat"]) else 0,
    axis=1
)


In [205]:
route_distance_df = (
    sorted_route_stops_coords.groupby("trip_id")["segment_distance"]
      .sum()
      .reset_index(name="total_distance_km")
)


In [206]:
modeling_structure_df = trips

In [207]:

modeling_structure_df = modeling_structure_df.drop(columns=['service_id','vehicle_id', 'direction_id'])

In [208]:
modeling_structure_df = modeling_structure_df.merge(route_distance_df, on="trip_id", how="left")


In [209]:
stop_count = (
    stop_times.groupby("trip_id")['stop_sequence'].count()
    .reset_index(name="total_stops")
)

In [210]:
modeling_structure_df = modeling_structure_df.merge(stop_count, on="trip_id", how="left")


In [211]:
def calcEffectiveSpeed(start_time_s, end_time_s, total_distance_km):
    if end_time_s < start_time_s:  #this means it cycled to next day (which i find improbable, but still)
        end_time_s+=86400
    duration = end_time_s - start_time_s

    if duration <= 0:
        return np.nan #invalid data
    
    return total_distance_km * 3600/duration
    

In [212]:
modeling_structure_df["start_sec"] = pd.to_timedelta(modeling_structure_df["start_time"]).dt.total_seconds()
modeling_structure_df["end_sec"]   = pd.to_timedelta(modeling_structure_df["end_time"]).dt.total_seconds()
modeling_structure_df = modeling_structure_df.drop(columns=['start_time', 'end_time']) 

modeling_structure_df["effective_scheduled_speed_kmh"] = modeling_structure_df.apply(
    lambda row: calcEffectiveSpeed(
        row["start_sec"],
        row["end_sec"],
        row["total_distance_km"],
    ),
    axis=1
)


In [213]:
modeling_structure_df

Unnamed: 0,route_id,trip_id,total_distance_km,total_stops,start_sec,end_sec,effective_scheduled_speed_kmh
0,d626b854-27aa-41d4-8625-ebafa73d8f21,23809,37.516956,36,38521.0,44990.0,20.878195
1,d626b854-27aa-41d4-8625-ebafa73d8f21,23705,37.516956,36,28733.0,33312.0,29.495751
2,d626b854-27aa-41d4-8625-ebafa73d8f21,23547,21.245568,39,70904.0,75385.0,17.068522
3,d626b854-27aa-41d4-8625-ebafa73d8f21,23497,21.077345,36,62522.0,70901.0,9.055787
4,d626b854-27aa-41d4-8625-ebafa73d8f21,23455,21.353914,40,57835.0,62517.0,16.419071
...,...,...,...,...,...,...,...
19764,c453217f-9f9d-49ce-8b32-14a6b7013691,26011,38.988103,44,53273.0,58278.0,28.043391
19765,c453217f-9f9d-49ce-8b32-14a6b7013691,25978,20.239318,38,48387.0,53259.0,14.955161
19766,c453217f-9f9d-49ce-8b32-14a6b7013691,25951,25.623639,44,43346.0,48379.0,18.328055
19767,c453217f-9f9d-49ce-8b32-14a6b7013691,25902,36.667677,39,36459.0,43292.0,19.318548


In [227]:
routes_per_stop_df = (stop_times.groupby("stop_id")["trip_id"].count().reset_index(name="routes_per_stop"))
routes_per_stop_df["routes_per_stop"] = routes_per_stop_df["routes_per_stop"] - 1 #exclude current route 


merged = stop_times.merge(routes_per_stop_df, on="stop_id", how="left")

In [228]:
overlap_score = (
    merged.groupby('trip_id')["routes_per_stop"]
    .mean()
    .reset_index(name="overlap_score")
)

In [229]:
modeling_structure_df = modeling_structure_df.merge(overlap_score, on="stop_id", how="left")

Unnamed: 0,trip_id,overlap_score
0,1,5125.707317
1,6,5067.142857
2,7,4071.685714
3,10,5470.828571
4,11,4950.046512
...,...,...
19764,29509,5129.860465
19765,29511,5017.658537
19766,29514,5041.250000
19767,29515,5175.523810
