In [217]:
import pandas as pd

agency = pd.read_csv('../datasets/asana/gtfs_data/agency.txt')
routes = pd.read_csv('../datasets/asana/gtfs_data/routes_utf8.txt', sep="\t")
stop_times = pd.read_csv('../datasets/asana/gtfs_data/stop_times.txt', sep="\t")
stops = pd.read_csv('../datasets/asana/gtfs_data/stops.txt', sep="\t")
trips = pd.read_csv('../datasets/asana/gtfs_data/trips.txt', sep="\t")



In [218]:
modeling_structure_df = pd.DataFrame()

route_travel_time = pd.DataFrame()
arrival_time = pd.DataFrame()
delay_prediction = pd.DataFrame()
# most descriptive columns from the gtfs format for modeling that i currently identified are:
#  from stop_times : stop_id, arrival_time, departure_time
#  from stops : stop_id, stop_lat, stop_lon
#  from trips : shape_id, route_id (To relate the info, basically a join table)

# The features that i imagine i could extract to get more info are:

# average_stop_times (Would describe an average of time from stop to stop per route)
# a more defined and specific attributes below:

#avg_seconds_between_stops
#std_seconds_between_stops

# hour_of_day (Pretty descriptive name)
# stop_quantity (A sum of the bus stops)
# stop_density_500m (This measures how conglomerated a zone is in the 500m radious)


# gps_data = pd.read_csv('../datasets/asana/gps_data.csv') # thing is so big i need to put attributes here because checking takes long - id,deviceid,devicetime,latitude,longitude,speed,route_direction,route_id,route_guid,device_guid

# Fordward here im going to explain the structure of how this dataset is going to be analized

# I'm going to add steps to it: 
# 1st Step is going to ananlyze just structural, coordinates, stop density, etc. So i can see for example if using a delay in time target (Which will probably be the most used target in this notebook), i can see how much the data is giving in comparison 
# ------- (Here on step 1, will be using normal train_test_split and will be trained with diferent models and compared with CV)
# to 2nd Step which will only use context data such as speeds, times, etc. || i will search for most significant/descriptive attributes for each, and look into compare important stadistic values such as R^2 and others
# ------- (Here on step 2, will be using time_series_split with lags, so the model also finds relationship in between the difference in time)
# Finally on step 3, depending on which attributes tend to be the most meaningful and tune the best models to find a in between in overfitting and underfitting 


# Now onto other other targets its going to be the effective_speeds
# Using the GTFS-RT, finding the actual time taken to complete a route, we can find the delays in the actual schedulet time, which will be useful to create a "isDelayed" target using a specific treshold
# Clustering could be applied using the times, the route and locations and stop densities to find congestion zones



In [219]:
# First row im going to compute is the trip length in KM with sets of longitudes using the Haversine formula |||| reference : https://www.youtube.com/watch?v=cD6pUr_5RtE

In [220]:
import math

def distance_between(lat1, lon1, lat2, lon2):
    earthRadius = 6371
    dlat = degreesToRadians(lat2-lat1)
    dlon = degreesToRadians(lon2-lon1) 
    rad = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(degreesToRadians(lat1)) * math.cos(degreesToRadians(lat2)) * math.sin(dlon/2) * math.sin(dlon/2)
    rad = 2 * math.atan2(math.sqrt(rad), math.sqrt(1-rad))
    distance = earthRadius * rad
    return distance
    
def degreesToRadians(deg):
    return deg * (math.pi / 180) 

In [221]:
#stops.head()


In [222]:
#trips.head()

In [223]:
#stop_times.head()

In [224]:
#routes.head()

In [225]:
sorted_stop_times = stop_times.sort_values(by=['trip_id', 'stop_sequence'], ascending=[True, True])


In [226]:
sorted_route_stops_coords = sorted_stop_times.merge(stops[["stop_id", "stop_lat", "stop_lon"]], on="stop_id", how="left")


In [227]:
sorted_route_stops_coords["prev_lat"] = sorted_route_stops_coords["stop_lat"].shift(1)
sorted_route_stops_coords["prev_lon"] = sorted_route_stops_coords["stop_lon"].shift(1)


In [228]:
sorted_route_stops_coords["segment_distance"] = sorted_route_stops_coords.apply(
    lambda row: distance_between(
        row["prev_lat"],
        row["prev_lon"],
        row["stop_lat"],
        row["stop_lon"]
    ) if pd.notnull(row["prev_lat"]) else 0,
    axis=1
)


In [229]:
route_distance_df = (
    sorted_route_stops_coords.groupby("trip_id")["segment_distance"]
      .sum()
      .reset_index(name="total_distance_km")
)


In [230]:
modeling_structure_df = trips

In [231]:

modeling_structure_df = modeling_structure_df.drop(columns=['service_id','vehicle_id', 'direction_id'])

In [232]:
modeling_structure_df.merge(route_distance_df, on="trip_id", how="left")

Unnamed: 0,route_id,trip_id,start_time,end_time,total_distance_km
0,d626b854-27aa-41d4-8625-ebafa73d8f21,23809,10:42:01,12:29:50,37.516956
1,d626b854-27aa-41d4-8625-ebafa73d8f21,23705,07:58:53,09:15:12,37.516956
2,d626b854-27aa-41d4-8625-ebafa73d8f21,23547,19:41:44,20:56:25,21.245568
3,d626b854-27aa-41d4-8625-ebafa73d8f21,23497,17:22:02,19:41:41,21.077345
4,d626b854-27aa-41d4-8625-ebafa73d8f21,23455,16:03:55,17:21:57,21.353914
...,...,...,...,...,...
19764,c453217f-9f9d-49ce-8b32-14a6b7013691,26011,14:47:53,16:11:18,38.988103
19765,c453217f-9f9d-49ce-8b32-14a6b7013691,25978,13:26:27,14:47:39,20.239318
19766,c453217f-9f9d-49ce-8b32-14a6b7013691,25951,12:02:26,13:26:19,25.623639
19767,c453217f-9f9d-49ce-8b32-14a6b7013691,25902,10:07:39,12:01:32,36.667677
