## Step 1: Import required libraries and packages

In [1]:
# import required libraries
import datetime
import os
import time

import numpy as np
import pandas as pd
from geopy.distance import distance
pd.set_option('mode.chained_assignment', None)

# set directories
# Built with os.path.join so the notebook also runs on non-Windows systems.
# The empty last component keeps the trailing separator that the f-string
# file paths used when reading the input files rely on.
resource_directory = os.path.join("00_Ntbk_Resources", "00_DataCollectionProcessingCleaning", "PMPML_BusRoutes_July2019", "")
result_directory = os.path.join("00_Ntbk_Resources", "01_DataAnalysis", "00_ProcessedData", "PMPML_BusRoutes_July2019", "")

## Step 2: Data Collection

source: http://opendata.punecorporation.org/Citizen/CitizenDatasets/Index

tags: PMPML

title: PMPML Bus Routes - July 2019

file type: .zip

size: 8.81 MB

## Step 3: Data Processing and Cleaning

In [2]:
# load the trip/service calendar (one row per service_id with its weekday flags and validity dates)
calendar_path = f"{resource_directory}calendar.txt"
df_TripCalendar = pd.read_csv(calendar_path)
df_TripCalendar

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
0,1,1,1,1,1,1,1,1,20190802,20191031
1,2,1,1,1,1,1,1,0,20190802,20191031
2,3,0,0,0,0,0,1,0,20190802,20191031
3,4,1,1,1,0,1,1,0,20190802,20191031
4,5,0,0,0,0,0,1,1,20190802,20191031
5,6,1,1,1,1,1,0,0,20190802,20191031
6,7,0,0,0,0,0,0,1,20190802,20191031
7,8,0,0,0,1,0,0,0,20190802,20191031


In [3]:
# load the bus stops and discard the columns that are not used in this notebook
unused_stop_columns = [
    "zone_id",
    "stop_url",
    "stop_desc",
    "stop_code",
    "location_type",
    "parent_station",
    "stop_timezone",
    "wheelchair_boarding",
]
df_BusStops = pd.read_csv(f"{resource_directory}stops.txt").drop(columns=unused_stop_columns)
df_BusStops

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon
0,32769,Forest Office Road,18.51983,73.63158
1,32770,Kusalkar Putala,18.52889,73.82708
2,32771,Tower Line,18.67007,73.78613
3,32772,Nehrunagar Depot,18.62930,73.81942
4,32773,Jagtap Chowk,18.49153,73.90019
...,...,...,...,...
5619,32759,Stage Kramank 6,18.42101,73.85842
5620,32760,NCL Colony,18.54002,73.81678
5621,32761,Samarth Vidyalay,18.73266,73.66366
5622,32762,Aswee Trading Company,18.43444,73.86860


In [4]:
# show every bus stop row that has at least one missing value (an empty result means nothing is missing)
stop_has_missing_value = df_BusStops.isna().any(axis=1)
df_BusStops[stop_has_missing_value]

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon


In [5]:
# load the route shapes (ordered lat/lon points that trace each shape_id)
shapes_path = f"{resource_directory}shapes.txt"
df_BusRouteShapes = pd.read_csv(shapes_path)
df_BusRouteShapes

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
0,4476,18.57660,73.68828,1,
1,4476,18.57696,73.68822,2,
2,4476,18.57726,73.68800,3,
3,4476,18.57750,73.68781,4,
4,4476,18.57767,73.68779,5,
...,...,...,...,...,...
210303,3316,18.61872,73.85017,229,
210304,3316,18.62052,73.85071,230,
210305,3316,18.62176,73.85106,231,
210306,3316,18.62235,73.85118,232,


In [6]:
# calculate the distance for each route and add it to separate column of dataframe

# Previous point of the same shape for every row (NaN on the first point of a shape).
# Points are ordered by shape_pt_sequence before shifting so the result does not depend
# on the row order of shapes.txt, then put back into the dataframe's own row order.
# Assumes the dataframe index is unique (true for the default index from read_csv).
df_temp = (
    df_BusRouteShapes
    .sort_values("shape_pt_sequence", kind="stable")
    .groupby("shape_id")[["shape_pt_lat", "shape_pt_lon"]]
    .shift(1)
    .loc[df_BusRouteShapes.index]
)

# Distance in km from the previous point; the first point of every shape gets 0.
list_dist_traveled = [
    distance([lat, lon], [prev_lat, prev_lon]).km if not np.isnan(prev_lat) else 0
    for lat, lon, prev_lat, prev_lon in zip(
        df_BusRouteShapes["shape_pt_lat"],
        df_BusRouteShapes["shape_pt_lon"],
        df_temp["shape_pt_lat"],
        df_temp["shape_pt_lon"],
    )
]

df_BusRouteShapes["shape_dist_traveled"] = list_dist_traveled
# "sum" is passed by name: handing the builtin sum to transform is deprecated in recent pandas.
df_BusRouteShapes["trip_distance"] = df_BusRouteShapes.groupby("shape_id")["shape_dist_traveled"].transform("sum")
df_BusRouteShapes

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled,trip_distance
0,4476,18.57660,73.68828,1,0.000000,24.580331
1,4476,18.57696,73.68822,2,0.040348,24.580331
2,4476,18.57726,73.68800,3,0.040521,24.580331
3,4476,18.57750,73.68781,4,0.033285,24.580331
4,4476,18.57767,73.68779,5,0.018935,24.580331
...,...,...,...,...,...,...
210303,3316,18.61872,73.85017,229,0.111507,22.772528
210304,3316,18.62052,73.85071,230,0.207227,22.772528
210305,3316,18.62176,73.85106,231,0.142135,22.772528
210306,3316,18.62235,73.85118,232,0.066522,22.772528


In [7]:
# load the bus trips, drop the unused columns and attach each trip's route length via its shape_id
unused_trip_columns = [
    "trip_short_name",
    "block_id",
    "wheelchair_accessible",
    "bikes_allowed",
    "duty",
    "duty_sequence_number",
    "run_sequence_number",
]
df_BusTrips = pd.read_csv(f"{resource_directory}trips.txt").drop(columns=unused_trip_columns)

# one row per shape_id (index) holding that shape's trip_distance
shape_trip_distance = df_BusRouteShapes.groupby("shape_id")[["trip_distance"]].max()
df_BusTrips = df_BusTrips.join(shape_trip_distance, on="shape_id")
df_BusTrips

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,shape_id,trip_distance
0,42,1,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,Katraj To Bhakti Shakti,0,4810,30.111607
1,42,1,NORMAL_42_Bhakti Shakti To Katraj_Down-2010_0,Bhakti Shakti To Katraj,0,4811,30.025115
2,366,1,NORMAL_366_Pune Station To Bhakti Shakti (Via ...,Pune Station To Bhakti Shakti (Via Yerwada),0,4794,22.519401
3,42,1,NORMAL_42_Katraj To Bhakti Shakti_Up-0655_0,Katraj To Bhakti Shakti,0,4810,30.111607
4,366,1,NORMAL_366_Bhakti Shakti To Pune Station (Via ...,Bhakti Shakti To Pune Station (Via Yerwada),0,4795,21.296043
...,...,...,...,...,...,...,...
21799,209_96,1,NORMAL_209_Saswad Bus Stand To Katraj_Down-1705_0,Saswad Bus Stand To Katraj,1,5245,25.970120
21800,209_96,1,NORMAL_209_Katraj To Saswad Bus Stand_Up-1845_0,Katraj To Saswad Bus Stand,1,5244,25.965253
21801,209_96,1,NORMAL_209_Saswad Bus Stand To Katraj_Down-2000_0,Saswad Bus Stand To Katraj,1,5245,25.970120
21802,209_96,1,NORMAL_209_Katraj To Saswad Bus Stand_Up-2130_0,Katraj To Saswad Bus Stand,1,5244,25.965253


In [8]:
# load the stop times (arrival/departure at every stop of every trip) and drop the unused columns
unused_stop_time_columns = ["stop_headsign", "pickup_type", "drop_off_type", "shape_dist_traveled", "timepoint"]
df_BusStopTimes = pd.read_csv(f"{resource_directory}stop_times.txt").drop(columns=unused_stop_time_columns)
df_BusStopTimes

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence
0,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,07:40:00,07:40:00,38794,1
1,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,07:40:14,07:40:33,37062,2
2,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,07:41:36,07:41:53,35142,3
3,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,07:42:24,07:42:49,38796,4
4,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,07:44:07,07:44:29,38797,5
...,...,...,...,...,...
751847,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,23:18:14,23:18:27,38584,35
751848,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,23:18:56,23:19:09,33232,36
751849,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,23:19:36,23:19:53,38586,37
751850,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,23:19:54,23:20:04,38588,38


In [9]:
# this dataframe is large and therefore prone to data-entry errors, so it needs to be verified;
# start by listing every row with a missing value (an empty result means nothing is missing)
stop_time_has_missing_value = df_BusStopTimes.isnull().any(axis=1)
df_BusStopTimes[stop_time_has_missing_value]

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence


To verify the information in the dataframe, we run a series of filters that check for incorrect entries.\
Incorrect entries are then replaced with the average of the logically correct values, taking care that entries which are already correct are left unchanged.\
1) Check that arrival_time and departure_time are entered correctly, i.e. in 24-hour format.\
2) Check the duration between arrival and departure time at each stop; it should be realistic.\
3) Check the duration between arrival times at consecutive stops; it should not exceed 2.5 to 3 hours, considering traffic conditions.

In [10]:
# check that arrival_time and departure_time are within the 24-hour format
# (both columns are still strings at this point, so this is a string comparison)
arrival_past_midnight = df_BusStopTimes["arrival_time"] > '23:59:59'
departure_past_midnight = df_BusStopTimes["departure_time"] > '23:59:59'
df_temp1 = df_BusStopTimes[arrival_past_midnight | departure_past_midnight]
df_temp1["trip_id"].unique()

array(['NORMAL_42_Bhakti Shakti To Katraj_Down-2310_0',
       'NORMAL_42_Bhakti Shakti To Katraj_Down-2330_0',
       'NORMAL_42_Bhakti Shakti To Katraj_Down-2225_0',
       'NORMAL_42_Katraj To Bhakti Shakti_Up-2320_0',
       'NORMAL_42_Bhakti Shakti To Katraj_Down-2250_0',
       'NORMAL_322_Ma Na Pa River Side To Akurdi Railway Station_Up-2325_0',
       'NORMAL_149_Shewalwadi To Bhakti Shakti_Up-2215_0',
       'NORMAL_119_Alandi To Ma Na Pa_Down-2315_0',
       'NORMAL_119_Alandi To Ma Na Pa_Down-2350_0',
       'NORMAL_208_Hinjawadi Maan Phase 3 To Hadapsar Gadital_Down-2350_0',
       'NORMAL_358_Rajgurunagar To Bhosari_Down-2330_0',
       'NORMAL_291_Hadapsar Gadital To Katraj_Down-2315_0',
       'NORMAL_358_Rajgurunagar To Bhosari_Down-2315_0',
       'NORMAL_358_Rajgurunagar To Bhosari_Down-2350_0',
       'NORMAL_149_Shewalwadi To Bhakti Shakti_Up-2230_0',
       'NORMAL_149_Shewalwadi To Bhakti Shakti_Up-2240_0',
       'NORMAL_149_Shewalwadi To Bhakti Shakti_Up-2250_0'

In [11]:
# some entries have an arrival or departure time greater than 23:59:59
# they belong to trips that start late at night and end early the next morning,
# so a time beyond 23:59:59 is simply an early-morning time on the next day
# let's rewrite the "24:" / "25:" hour prefix of such entries as "00:" / "01:"

# trips whose start time (the 4 digits taken from the end of the trip_id) is 20:40 or later
late_start = df_BusStopTimes["trip_id"].str[-6:-2] >= str(2040)

index_bad1 = df_BusStopTimes[late_start & (df_BusStopTimes["arrival_time"].str.startswith("24:") | df_BusStopTimes["arrival_time"].str.startswith("25:"))].index
index_bad2 = df_BusStopTimes[late_start & (df_BusStopTimes["departure_time"].str.startswith("24:") | df_BusStopTimes["departure_time"].str.startswith("25:"))].index

# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal string by
# default, in which case "^24:" would never match and the times would stay unchanged.
# .loc is used because index_bad1 / index_bad2 hold index labels, not positions.
df_BusStopTimes.loc[index_bad1, "arrival_time"] = (
    df_BusStopTimes.loc[index_bad1, "arrival_time"]
    .str.replace('^24:', '00:', regex=True)
    .str.replace('^25:', '01:', regex=True)
)
df_BusStopTimes.loc[index_bad2, "departure_time"] = (
    df_BusStopTimes.loc[index_bad2, "departure_time"]
    .str.replace('^24:', '00:', regex=True)
    .str.replace('^25:', '01:', regex=True)
)

In [12]:
# for most of the trips from Swargate to Pune Station the arrival and departure times are wrong, which results in a longer trip duration
# so let's filter out those bad trip ids and replace their arrival and departure times with average values taken from the good trip ids

# df_temp1 comes from the earlier "24hr format" check cell (rows with a time beyond 23:59:59);
# keep only its rows that belong to this route/direction
df_temp1 = df_temp1[df_temp1["trip_id"].str.contains("NORMAL_5_Swargate To Pune Station_Up")]

# trip ids of this route that have at least one out-of-range time
list_BadTripIds = df_temp1["trip_id"].unique().tolist()

# every stop time of this route/direction, good and bad trips alike
df_temp2 = df_BusStopTimes[df_BusStopTimes["trip_id"].str.contains("NORMAL_5_Swargate To Pune Station_Up")]
df_temp2.reset_index(inplace=True, drop =True)

# only the trips that are not flagged as bad
df_temp3 = df_temp2[~ df_temp2["trip_id"].isin(list_BadTripIds)]
df_temp3.reset_index(inplace=True, drop =True)

# time spent at each stop (departure - arrival)
df_temp3["deparr_time_diff"] = pd.to_timedelta(df_temp3["departure_time"]) - pd.to_timedelta(df_temp3["arrival_time"])
# time between the arrival at a stop and the arrival at the previous stop of the same trip (NaT at the first stop)
df_temp3["arr_time_diff"] = df_temp3.groupby("trip_id")["arrival_time"].transform(lambda x : pd.to_timedelta(x) - pd.to_timedelta(x.shift()))
# average of both durations per stop_sequence across the good trips (sum divided by the number of rows in the group)
df_temp3["avg_deparr_time_diff"] = df_temp3.groupby("stop_sequence")["deparr_time_diff"].transform(lambda x : pd.to_timedelta(x).sum()/ len(x))
df_temp3["avg_arr_time_diff"] = df_temp3.groupby("stop_sequence")["arr_time_diff"].transform(lambda x : pd.to_timedelta(x).sum()/ len(x))
# running total within each trip, i.e. the average elapsed time up to each stop
df_temp3["avg_arr_time_diff"] = df_temp3.groupby("trip_id")["avg_arr_time_diff"].transform(lambda x : x.cumsum())

# read the averaged values off one reference trip and keep them as text:
# characters 7:15 of the timedelta's string form (presumably the "HH:MM:SS" part of "0 days HH:MM:SS" - TODO confirm)
# NOTE(review): assumes the "Up-1010_0" trip is among the good trips
list_avg_deparr_time_diff = df_temp3[df_temp3["trip_id"] == "NORMAL_5_Swargate To Pune Station_Up-1010_0"]["avg_deparr_time_diff"].astype(str).str[7:15].tolist()
list_avg_arr_time_diff = df_temp3[df_temp3["trip_id"] == "NORMAL_5_Swargate To Pune Station_Up-1010_0"]["avg_arr_time_diff"].astype(str).str[7:15].tolist()

In [13]:
# this function replaces the wrong arrival and departure times of a trip using the average durations found in the correct trips
def changeTimings(df, avg_arr_time_diff=None, avg_deparr_time_diff=None):
    """Return a copy of one trip's stop times with rebuilt arrival/departure strings.

    Parameters
    ----------
    df : DataFrame
        Rows of a single trip, in stop order, with string columns
        "arrival_time" and "departure_time".
    avg_arr_time_diff : sequence of str, optional
        One time offset per stop (e.g. "00:05:00"). Defaults to the module-level
        list_avg_arr_time_diff.
    avg_deparr_time_diff : sequence of str, optional
        One arrival-to-departure duration per stop. Defaults to the module-level
        list_avg_deparr_time_diff.

    Returns
    -------
    DataFrame
        Copy of df where arrival_time is rewritten from the third row onwards and
        departure_time is rewritten on every row. The input is not modified.

    Raises
    ------
    ValueError
        If the number of rows does not match the length of the duration lists.
    """
    if avg_arr_time_diff is None:
        avg_arr_time_diff = list_avg_arr_time_diff
    if avg_deparr_time_diff is None:
        avg_deparr_time_diff = list_avg_deparr_time_diff

    arr_offsets = pd.to_timedelta(pd.Series(list(avg_arr_time_diff)))
    dwell_times = pd.to_timedelta(pd.Series(list(avg_deparr_time_diff)))
    if not (len(df) == len(arr_offsets) == len(dwell_times)):
        raise ValueError(
            f"trip has {len(df)} stops but {len(arr_offsets)} arrival offsets "
            f"and {len(dwell_times)} stop durations were given"
        )

    # Work on a copy so the caller's frame (or a groupby chunk) is never mutated.
    df = df.copy()

    # NOTE(review): the offsets are added to the arrival time of the second row, as in
    # the original implementation - confirm that the second stop is the intended anchor.
    anchor = pd.to_timedelta(df["arrival_time"].iloc[1])
    # Characters 7:15 of e.g. "0 days 10:06:00" are the "HH:MM:SS" part.
    list_arrival_time = (anchor + arr_offsets).astype(str).str[7:15].tolist()
    # Positional write instead of chained assignment, which is silently lost under copy-on-write.
    df.iloc[2:, df.columns.get_loc("arrival_time")] = list_arrival_time[2:]

    arrivals = pd.to_timedelta(df["arrival_time"].reset_index(drop=True))
    df["departure_time"] = (arrivals + dwell_times).astype(str).str[7:15].tolist()

    return df

In [14]:
# rebuild the timings of every bad trip and write the two time columns back
# Each trip is processed on its own and the results are matched back by row index.
# (groupby(...).apply(...) is avoided here: on recent pandas it adds trip_id as an extra
# index level, so the assignment no longer lines up with the original rows.)
is_bad_trip = df_BusStopTimes["trip_id"].isin(list_BadTripIds)
if is_bad_trip.any():
    df_fixed = pd.concat(
        [changeTimings(df_trip) for _, df_trip in df_BusStopTimes[is_bad_trip].groupby("trip_id", sort=False)]
    )
    time_columns = ["arrival_time", "departure_time"]
    df_BusStopTimes.loc[df_fixed.index, time_columns] = df_fixed[time_columns]
df_BusStopTimes

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence
0,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,07:40:00,07:40:00,38794,1
1,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,07:40:14,07:40:33,37062,2
2,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,07:41:36,07:41:53,35142,3
3,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,07:42:24,07:42:49,38796,4
4,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,07:44:07,07:44:29,38797,5
...,...,...,...,...,...
751847,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,23:18:14,23:18:27,38584,35
751848,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,23:18:56,23:19:09,33232,36
751849,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,23:19:36,23:19:53,38586,37
751850,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,23:19:54,23:20:04,38588,38


In [15]:
# still, some entries need to be corrected individually
def fixTripTimings(df_all, trip_id, arr_diff_overrides, deparr_diff_overrides=None):
    """Rebuild the arrival/departure strings of one trip from its first arrival time.

    The arrival-to-arrival gaps and the arrival-to-departure durations are measured
    from the existing data, selected rows are overridden with hand-picked durations,
    and the times are then recomputed as first arrival + running total of the gaps.

    Parameters
    ----------
    df_all : DataFrame
        Stop times with string columns "trip_id", "arrival_time", "departure_time".
    trip_id : str
        Trip whose rows are rebuilt.
    arr_diff_overrides : dict
        Row position within the trip -> gap to the previous arrival (e.g. "00:01:15").
        Position 0 should be given as "00:00:00" because the first stop has no
        previous arrival.
    deparr_diff_overrides : dict, optional
        Row position within the trip -> arrival-to-departure duration.

    Returns
    -------
    DataFrame
        Copy of the trip's rows (original index) with corrected time strings.

    Raises
    ------
    KeyError
        If df_all has no rows for trip_id.
    """
    df_trip = df_all[df_all["trip_id"] == trip_id].copy()
    if df_trip.empty:
        raise KeyError(f"no stop times found for trip_id {trip_id!r}")

    arrival = pd.to_timedelta(df_trip["arrival_time"])
    deparr_time_diff = pd.to_timedelta(df_trip["departure_time"]) - arrival
    arr_time_diff = arrival - arrival.shift()

    for position, duration in arr_diff_overrides.items():
        arr_time_diff.iloc[position] = pd.to_timedelta(duration)
    for position, duration in (deparr_diff_overrides or {}).items():
        deparr_time_diff.iloc[position] = pd.to_timedelta(duration)

    # Characters 7:15 of e.g. "0 days 14:51:15" are the "HH:MM:SS" part.
    df_trip["arrival_time"] = (arrival.iloc[0] + arr_time_diff.cumsum()).astype(str).str[7:15].tolist()
    df_trip["departure_time"] = (pd.to_timedelta(df_trip["arrival_time"]) + deparr_time_diff).astype(str).str[7:15].tolist()
    return df_trip


# (trip_id, arrival-gap overrides, arrival-to-departure overrides), keyed by row position in the trip.
# The first trip used to be corrected twice with identical settings; the second pass
# recomputed the same values, so it is applied only once here.
trip_fixes = [
    ("NORMAL_181C_Na Ta Wadi To Kondhwa Bk_Up-1450_0", {0: "00:00:00", 1: "00:01:15"}, None),
    ("NORMAL_256_Someshwarwadi To PMC_Down-1945_0", {0: "00:00:00", 1: "00:01:15", 2: "00:01:15"}, {1: "00:00:18"}),
]

for fix_trip_id, fix_arr_diff, fix_deparr_diff in trip_fixes:
    df_temp1 = fixTripTimings(df_BusStopTimes, fix_trip_id, fix_arr_diff, fix_deparr_diff)
    df_BusStopTimes.loc[df_temp1.index, ["arrival_time", "departure_time"]] = df_temp1[["arrival_time", "departure_time"]]
    print(df_BusStopTimes[df_BusStopTimes["trip_id"] == fix_trip_id])

                                               trip_id arrival_time  \
448428  NORMAL_181C_Na Ta Wadi To Kondhwa Bk_Up-1450_0     14:50:00   
448429  NORMAL_181C_Na Ta Wadi To Kondhwa Bk_Up-1450_0     14:51:15   
448430  NORMAL_181C_Na Ta Wadi To Kondhwa Bk_Up-1450_0     14:54:30   
448431  NORMAL_181C_Na Ta Wadi To Kondhwa Bk_Up-1450_0     14:55:43   
448432  NORMAL_181C_Na Ta Wadi To Kondhwa Bk_Up-1450_0     14:57:30   
448433  NORMAL_181C_Na Ta Wadi To Kondhwa Bk_Up-1450_0     14:59:14   
448434  NORMAL_181C_Na Ta Wadi To Kondhwa Bk_Up-1450_0     15:00:32   
448435  NORMAL_181C_Na Ta Wadi To Kondhwa Bk_Up-1450_0     15:02:49   
448436  NORMAL_181C_Na Ta Wadi To Kondhwa Bk_Up-1450_0     15:05:00   
448437  NORMAL_181C_Na Ta Wadi To Kondhwa Bk_Up-1450_0     15:07:28   
448438  NORMAL_181C_Na Ta Wadi To Kondhwa Bk_Up-1450_0     15:09:56   
448439  NORMAL_181C_Na Ta Wadi To Kondhwa Bk_Up-1450_0     15:11:11   
448440  NORMAL_181C_Na Ta Wadi To Kondhwa Bk_Up-1450_0     15:11:57   
448441

In [16]:
# verify once again before proceeding to the next filter: no time should be beyond 23:59:59 any more
arrival_out_of_range = df_BusStopTimes["arrival_time"] > '23:59:59'
departure_out_of_range = df_BusStopTimes["departure_time"] > '23:59:59'
df_temp1 = df_BusStopTimes[arrival_out_of_range | departure_out_of_range]
df_temp1

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence


In [17]:
# arrival and departure values are now within range, so both columns can be converted to datetime
# NOTE(review): the strings hold a time of day only; the date shown in the output is added by
# the conversion (presumably the day the cell was run - confirm), so only time differences are meaningful
for time_column in ("arrival_time", "departure_time"):
    df_BusStopTimes[time_column] = df_BusStopTimes[time_column].astype('datetime64[ns]')
df_BusStopTimes

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence
0,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:40:00,2020-07-31 07:40:00,38794,1
1,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:40:14,2020-07-31 07:40:33,37062,2
2,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:41:36,2020-07-31 07:41:53,35142,3
3,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:42:24,2020-07-31 07:42:49,38796,4
4,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:44:07,2020-07-31 07:44:29,38797,5
...,...,...,...,...,...
751847,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:18:14,2020-07-31 23:18:27,38584,35
751848,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:18:56,2020-07-31 23:19:09,33232,36
751849,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:19:36,2020-07-31 23:19:53,38586,37
751850,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:19:54,2020-07-31 23:20:04,38588,38


In [18]:
# this function moves the entries of a trip to the next day once the time goes backwards
def changeDate(df):
    """Return a copy of one trip's stop times with post-midnight rows moved to the next day.

    For "arrival_time" and "departure_time" separately: the first row whose value is
    earlier than the value of the row before it, and every row after it, is shifted
    forward by one day. Columns that never go backwards are left unchanged.

    Parameters
    ----------
    df : DataFrame
        Rows of a single trip, in stop order, with datetime columns
        "arrival_time" and "departure_time". Any index is accepted.

    Returns
    -------
    DataFrame
        Adjusted copy; the input is not modified.
    """
    # Work on a copy so the caller's frame (or a groupby chunk) is never mutated.
    df = df.copy()
    one_day = datetime.timedelta(days=1)

    for column in ("arrival_time", "departure_time"):
        times = df[column]
        # True where the time is earlier than on the previous row (first row is never flagged).
        goes_backwards = times.diff() < pd.Timedelta(0)
        # True from the first backwards step to the end of the trip; works by position,
        # so it does not depend on the index being consecutive integers.
        on_next_day = goes_backwards.cumsum() > 0
        df[column] = times.mask(on_next_day, times + one_day)

    return df

In [19]:
# move the post-midnight stops of every trip to the next day
# Trips are processed one by one and stitched back together in the original row order.
# (groupby(...).apply(...) is avoided here: on recent pandas it adds trip_id as an extra
# index level, which makes the later groupby("trip_id") calls ambiguous.)
# Assumes the dataframe index is unique (true for the default index from read_csv).
original_row_order = df_BusStopTimes.index
df_BusStopTimes = pd.concat(
    [changeDate(df_trip) for _, df_trip in df_BusStopTimes.groupby("trip_id", sort=False)]
).loc[original_row_order]
df_BusStopTimes

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence
0,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:40:00,2020-07-31 07:40:00,38794,1
1,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:40:14,2020-07-31 07:40:33,37062,2
2,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:41:36,2020-07-31 07:41:53,35142,3
3,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:42:24,2020-07-31 07:42:49,38796,4
4,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:44:07,2020-07-31 07:44:29,38797,5
...,...,...,...,...,...
751847,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:18:14,2020-07-31 23:18:27,38584,35
751848,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:18:56,2020-07-31 23:19:09,33232,36
751849,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:19:36,2020-07-31 23:19:53,38586,37
751850,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:19:54,2020-07-31 23:20:04,38588,38


In [20]:
# add a column with the time from the arrival at a stop to the arrival at the next stop of the same trip
# (empty for the last stop of a trip), and a column with the time between arrival and departure at each stop
next_arrival = df_BusStopTimes.groupby("trip_id")["arrival_time"].shift(-1)
df_BusStopTimes["stp2stp_arrival_time"] = next_arrival - df_BusStopTimes["arrival_time"]
df_BusStopTimes["deparr_time_diff"] = df_BusStopTimes["departure_time"] - df_BusStopTimes["arrival_time"]
df_BusStopTimes

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stp2stp_arrival_time,deparr_time_diff
0,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:40:00,2020-07-31 07:40:00,38794,1,00:00:14,00:00:00
1,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:40:14,2020-07-31 07:40:33,37062,2,00:01:22,00:00:19
2,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:41:36,2020-07-31 07:41:53,35142,3,00:00:48,00:00:17
3,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:42:24,2020-07-31 07:42:49,38796,4,00:01:43,00:00:25
4,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:44:07,2020-07-31 07:44:29,38797,5,00:01:21,00:00:22
...,...,...,...,...,...,...,...
751847,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:18:14,2020-07-31 23:18:27,38584,35,00:00:42,00:00:13
751848,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:18:56,2020-07-31 23:19:09,33232,36,00:00:40,00:00:13
751849,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:19:36,2020-07-31 23:19:53,38586,37,00:00:18,00:00:17
751850,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:19:54,2020-07-31 23:20:04,38588,38,00:01:31,00:00:10


In [21]:
# take a look at the maximum and 75% values of the stop to stop arrival duration as well as departure and arrival time difference
# (the max row is what exposes the unrealistic durations handled in the next cell)
df_BusStopTimes.describe()

Unnamed: 0,stop_id,stop_sequence,stp2stp_arrival_time,deparr_time_diff
count,751852.0,751852.0,730048,751852
mean,33090.180385,20.695383,0 days 00:01:38.374975,0 days 00:00:24.387662
std,11749.454833,14.596249,0 days 00:02:44.696241,0 days 00:01:15.318114
min,386.0,1.0,0 days 00:00:00,0 days 00:00:00
25%,33443.0,9.0,0 days 00:00:53,0 days 00:00:12
50%,38773.0,18.0,0 days 00:01:17,0 days 00:00:20
75%,39276.0,29.0,0 days 00:01:54,0 days 00:00:30
max,40516.0,87.0,0 days 06:18:15,0 days 06:17:18


In [22]:
# there are entries where the time between arrival and departure is greater than 6hrs, which is not possible
# the 75% value is 30 seconds, so to be on the safe side any stop duration above 3 minutes is treated
# as wrong and replaced with the 75% value
# the same technique is applied to the stop to stop arrival time, with a limit of 30 minutes

MAX_DEPARR_TIME_DIFF = datetime.timedelta(minutes = 3)
MAX_STP2STP_ARRIVAL_TIME = datetime.timedelta(minutes = 30)

# 75% values of both columns, taken before anything is replaced
# (same value as the "75%" row of describe(), without summarising the whole dataframe twice)
typical_deparr_time_diff = df_BusStopTimes["deparr_time_diff"].quantile(0.75)
typical_stp2stp_arrival_time = df_BusStopTimes["stp2stp_arrival_time"].quantile(0.75)

df_BusStopTimes.loc[df_BusStopTimes["deparr_time_diff"] > MAX_DEPARR_TIME_DIFF, "deparr_time_diff"] = typical_deparr_time_diff
df_BusStopTimes.loc[df_BusStopTimes["stp2stp_arrival_time"] > MAX_STP2STP_ARRIVAL_TIME, "stp2stp_arrival_time"] = typical_stp2stp_arrival_time
df_BusStopTimes

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stp2stp_arrival_time,deparr_time_diff
0,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:40:00,2020-07-31 07:40:00,38794,1,00:00:14,00:00:00
1,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:40:14,2020-07-31 07:40:33,37062,2,00:01:22,00:00:19
2,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:41:36,2020-07-31 07:41:53,35142,3,00:00:48,00:00:17
3,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:42:24,2020-07-31 07:42:49,38796,4,00:01:43,00:00:25
4,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:44:07,2020-07-31 07:44:29,38797,5,00:01:21,00:00:22
...,...,...,...,...,...,...,...
751847,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:18:14,2020-07-31 23:18:27,38584,35,00:00:42,00:00:13
751848,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:18:56,2020-07-31 23:19:09,33232,36,00:00:40,00:00:13
751849,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:19:36,2020-07-31 23:19:53,38586,37,00:00:18,00:00:17
751850,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:19:54,2020-07-31 23:20:04,38588,38,00:01:31,00:00:10


In [23]:
# the arrival and departure times now have to be updated according to the modified durations
# first step: turn the per-stop gaps into the time elapsed since the first stop of the trip
def _elapsedSinceFirstStop(gaps):
    """Running total of the gaps, moved down one row so the first stop gets zero."""
    running_total = gaps.cumsum()
    return running_total.shift().fillna(datetime.timedelta(minutes = 0))

df_BusStopTimes["stp2stp_arrival_time"] = df_BusStopTimes.groupby("trip_id")["stp2stp_arrival_time"].transform(_elapsedSinceFirstStop)
df_BusStopTimes

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stp2stp_arrival_time,deparr_time_diff
0,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:40:00,2020-07-31 07:40:00,38794,1,00:00:00,00:00:00
1,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:40:14,2020-07-31 07:40:33,37062,2,00:00:14,00:00:19
2,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:41:36,2020-07-31 07:41:53,35142,3,00:01:36,00:00:17
3,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:42:24,2020-07-31 07:42:49,38796,4,00:02:24,00:00:25
4,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:44:07,2020-07-31 07:44:29,38797,5,00:04:07,00:00:22
...,...,...,...,...,...,...,...
751847,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:18:14,2020-07-31 23:18:27,38584,35,00:38:14,00:00:13
751848,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:18:56,2020-07-31 23:19:09,33232,36,00:38:56,00:00:13
751849,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:19:36,2020-07-31 23:19:53,38586,37,00:39:36,00:00:17
751850,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:19:54,2020-07-31 23:20:04,38588,38,00:39:54,00:00:10


In [24]:
# this function rebuilds the arrival times of a trip from its first arrival and the elapsed-time column
def updateArrivalTime(df):
    """Return a copy of one trip's rows with arrival_time = first arrival + stp2stp_arrival_time.

    Parameters
    ----------
    df : DataFrame
        Rows of a single trip, in stop order, with a datetime column "arrival_time"
        and a timedelta column "stp2stp_arrival_time" holding the time elapsed since
        the first stop.

    Returns
    -------
    DataFrame
        Adjusted copy; the input is not modified. An empty frame is returned as is.
    """
    if df.empty:
        return df

    # Work on a copy so the caller's frame (or a groupby chunk) is never mutated.
    df = df.copy()
    # Looked up by column name rather than by position, so it keeps working
    # when the column order changes.
    first_arrival = df["arrival_time"].iloc[0]
    df["arrival_time"] = first_arrival + df["stp2stp_arrival_time"]
    return df

In [25]:
# rebuild the arrival times trip by trip with updateArrivalTime
# group_keys=False keeps the original flat row index: since pandas 2.0 the default
# (group_keys=True) also applies to like-indexed results of apply and would add trip_id
# as an extra index level next to the trip_id column, which makes the later
# groupby("trip_id") calls ambiguous
df_BusStopTimes = df_BusStopTimes.groupby("trip_id", group_keys=False).apply(updateArrivalTime)
df_BusStopTimes

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stp2stp_arrival_time,deparr_time_diff
0,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:40:00,2020-07-31 07:40:00,38794,1,00:00:00,00:00:00
1,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:40:14,2020-07-31 07:40:33,37062,2,00:00:14,00:00:19
2,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:41:36,2020-07-31 07:41:53,35142,3,00:01:36,00:00:17
3,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:42:24,2020-07-31 07:42:49,38796,4,00:02:24,00:00:25
4,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:44:07,2020-07-31 07:44:29,38797,5,00:04:07,00:00:22
...,...,...,...,...,...,...,...
751847,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:18:14,2020-07-31 23:18:27,38584,35,00:38:14,00:00:13
751848,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:18:56,2020-07-31 23:19:09,33232,36,00:38:56,00:00:13
751849,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:19:36,2020-07-31 23:19:53,38586,37,00:39:36,00:00:17
751850,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:19:54,2020-07-31 23:20:04,38588,38,00:39:54,00:00:10


In [26]:
# similarly, update departure time as well:
# departure = (rebuilt) arrival + departure/arrival difference recorded for the stop
departure_offset = df_BusStopTimes["deparr_time_diff"]
df_BusStopTimes.loc[:,"departure_time"] = df_BusStopTimes["arrival_time"].add(departure_offset)
df_BusStopTimes

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stp2stp_arrival_time,deparr_time_diff
0,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:40:00,2020-07-31 07:40:00,38794,1,00:00:00,00:00:00
1,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:40:14,2020-07-31 07:40:33,37062,2,00:00:14,00:00:19
2,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:41:36,2020-07-31 07:41:53,35142,3,00:01:36,00:00:17
3,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:42:24,2020-07-31 07:42:49,38796,4,00:02:24,00:00:25
4,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:44:07,2020-07-31 07:44:29,38797,5,00:04:07,00:00:22
...,...,...,...,...,...,...,...
751847,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:18:14,2020-07-31 23:18:27,38584,35,00:38:14,00:00:13
751848,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:18:56,2020-07-31 23:19:09,33232,36,00:38:56,00:00:13
751849,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:19:36,2020-07-31 23:19:53,38586,37,00:39:36,00:00:17
751850,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:19:54,2020-07-31 23:20:04,38588,38,00:39:54,00:00:10


In [27]:
# first and last arrival of every trip, repeated on each row of that trip
arrivals_by_trip = df_BusStopTimes.groupby("trip_id")["arrival_time"]
first_arrival = arrivals_by_trip.transform("first")
last_arrival = arrivals_by_trip.transform("last")

df_BusStopTimes["trip_bgn_time"] = first_arrival
df_BusStopTimes["trip_end_time"] = last_arrival
# trip duration = last arrival - first arrival
df_BusStopTimes["trip_duration"] = df_BusStopTimes["trip_end_time"].sub(df_BusStopTimes["trip_bgn_time"])

# shortest trips on top, to spot suspicious trips
df_BusStopTimes.sort_values(by = "trip_duration", ascending = True)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stp2stp_arrival_time,deparr_time_diff,trip_bgn_time,trip_end_time,trip_duration
387741,NORMAL_107_Gharkul Vasahat Warje To Pimplegura...,2020-07-31 05:30:00,2020-07-31 05:30:00,35441,38,00:00:00,00:00:00,2020-07-31 05:30:00,2020-07-31 05:30:00,00:00:00
224277,NORMAL_328_Nigdi To Rupeenagar_Up-0700_0,2020-07-31 07:02:02,2020-07-31 07:02:02,39333,6,00:02:02,00:00:00,2020-07-31 07:00:00,2020-07-31 07:02:02,00:02:02
224276,NORMAL_328_Nigdi To Rupeenagar_Up-0700_0,2020-07-31 07:01:01,2020-07-31 07:01:14,39590,5,00:01:01,00:00:13,2020-07-31 07:00:00,2020-07-31 07:02:02,00:02:02
224275,NORMAL_328_Nigdi To Rupeenagar_Up-0700_0,2020-07-31 07:00:39,2020-07-31 07:00:49,39334,4,00:00:39,00:00:10,2020-07-31 07:00:00,2020-07-31 07:02:02,00:02:02
224274,NORMAL_328_Nigdi To Rupeenagar_Up-0700_0,2020-07-31 07:00:09,2020-07-31 07:00:17,34755,3,00:00:09,00:00:08,2020-07-31 07:00:00,2020-07-31 07:02:02,00:02:02
...,...,...,...,...,...,...,...,...,...,...
259430,NORMAL_376_Bhakti Shakti To Katraj (Vai YCM)_D...,2020-07-31 17:46:25,2020-07-31 17:46:48,39435,27,00:56:25,00:00:23,2020-07-31 16:50:00,2020-07-31 19:44:30,02:54:30
259431,NORMAL_376_Bhakti Shakti To Katraj (Vai YCM)_D...,2020-07-31 17:47:52,2020-07-31 17:48:05,33220,28,00:57:52,00:00:13,2020-07-31 16:50:00,2020-07-31 19:44:30,02:54:30
259432,NORMAL_376_Bhakti Shakti To Katraj (Vai YCM)_D...,2020-07-31 17:50:02,2020-07-31 17:50:18,31508,29,01:00:02,00:00:16,2020-07-31 16:50:00,2020-07-31 19:44:30,02:54:30
259434,NORMAL_376_Bhakti Shakti To Katraj (Vai YCM)_D...,2020-07-31 17:57:47,2020-07-31 17:57:57,36091,31,01:07:47,00:00:10,2020-07-31 16:50:00,2020-07-31 19:44:30,02:54:30


In [28]:
# let's drop this trip_id as it has only one stop_sequence
# a boolean mask removes every row that belongs to the trip (not just the first match)
# and simply keeps the frame as it is when the trip is already gone, so the cell can be
# re-run without raising an IndexError
single_stop_trip_id = "NORMAL_107_Gharkul Vasahat Warje To Pimplegurav_Down-0530_0"
df_BusStopTimes = df_BusStopTimes[df_BusStopTimes["trip_id"] != single_stop_trip_id].reset_index(drop=True)
df_BusStopTimes

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stp2stp_arrival_time,deparr_time_diff,trip_bgn_time,trip_end_time,trip_duration
0,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:40:00,2020-07-31 07:40:00,38794,1,00:00:00,00:00:00,2020-07-31 07:40:00,2020-07-31 08:55:09,01:15:09
1,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:40:14,2020-07-31 07:40:33,37062,2,00:00:14,00:00:19,2020-07-31 07:40:00,2020-07-31 08:55:09,01:15:09
2,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:41:36,2020-07-31 07:41:53,35142,3,00:01:36,00:00:17,2020-07-31 07:40:00,2020-07-31 08:55:09,01:15:09
3,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:42:24,2020-07-31 07:42:49,38796,4,00:02:24,00:00:25,2020-07-31 07:40:00,2020-07-31 08:55:09,01:15:09
4,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:44:07,2020-07-31 07:44:29,38797,5,00:04:07,00:00:22,2020-07-31 07:40:00,2020-07-31 08:55:09,01:15:09
...,...,...,...,...,...,...,...,...,...,...
751846,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:18:14,2020-07-31 23:18:27,38584,35,00:38:14,00:00:13,2020-07-31 22:40:00,2020-07-31 23:21:25,00:41:25
751847,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:18:56,2020-07-31 23:19:09,33232,36,00:38:56,00:00:13,2020-07-31 22:40:00,2020-07-31 23:21:25,00:41:25
751848,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:19:36,2020-07-31 23:19:53,38586,37,00:39:36,00:00:17,2020-07-31 22:40:00,2020-07-31 23:21:25,00:41:25
751849,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:19:54,2020-07-31 23:20:04,38588,38,00:39:54,00:00:10,2020-07-31 22:40:00,2020-07-31 23:21:25,00:41:25


In [29]:
# let's verify the dataframe once again (check max and 75% values)
# stop to stop duration: arrival at the next stop of the same trip minus arrival at this stop
# (the last stop of a trip has no next stop, so it gets a missing value)
df_BusStopTimes["stp2stp_arrival_time"] = (
    df_BusStopTimes
    .groupby("trip_id")["arrival_time"]
    .transform(lambda arrivals : arrivals.shift(-1).sub(arrivals))
)
# time between arrival at and departure from the same stop
df_BusStopTimes["deparr_time_diff"] = df_BusStopTimes["departure_time"].sub(df_BusStopTimes["arrival_time"])
df_BusStopTimes.describe()

Unnamed: 0,stop_id,stop_sequence,stp2stp_arrival_time,deparr_time_diff,trip_duration
count,751851.0,751851.0,730048,751851,751851
mean,33090.177259,20.69536,0 days 00:01:35.521301,0 days 00:00:22.797243,0 days 01:02:18.654693
std,11749.462334,14.596245,0 days 00:01:20.296582,0 days 00:00:17.612509,0 days 00:27:01.217420
min,386.0,1.0,0 days 00:00:00,0 days 00:00:00,0 days 00:02:02
25%,33443.0,9.0,0 days 00:00:53,0 days 00:00:12,0 days 00:42:05
50%,38773.0,18.0,0 days 00:01:17,0 days 00:00:20,0 days 00:57:43
75%,39276.0,29.0,0 days 00:01:54,0 days 00:00:30,0 days 01:20:35
max,40516.0,87.0,0 days 00:29:59,0 days 00:03:00,0 days 02:54:30


In [30]:
# the duration/helper columns were only needed for the repair above, remove them again
helper_columns = [
    "stp2stp_arrival_time",
    "deparr_time_diff",
    "trip_bgn_time",
    "trip_end_time",
    "trip_duration",
]
df_BusStopTimes.drop(columns = helper_columns, inplace = True)
df_BusStopTimes

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence
0,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:40:00,2020-07-31 07:40:00,38794,1
1,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:40:14,2020-07-31 07:40:33,37062,2
2,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:41:36,2020-07-31 07:41:53,35142,3
3,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:42:24,2020-07-31 07:42:49,38796,4
4,NORMAL_42_Katraj To Bhakti Shakti_Up-0740_0,2020-07-31 07:44:07,2020-07-31 07:44:29,38797,5
...,...,...,...,...,...
751846,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:18:14,2020-07-31 23:18:27,38584,35
751847,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:18:56,2020-07-31 23:19:09,33232,36
751848,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:19:36,2020-07-31 23:19:53,38586,37
751849,NORMAL_209_Saswad Bus Stand To Katraj_Down-2240_0,2020-07-31 23:19:54,2020-07-31 23:20:04,38588,38


In [31]:
# now that we've repaired the dataframes we should save them to the result folder for further analysis
# output file name -> dataframe, written in this order without the row index
frames_to_save = {
    "calendar.txt": df_TripCalendar,
    "stops.txt": df_BusStops,
    "shapes.txt": df_BusRouteShapes,
    "trips.txt": df_BusTrips,
    "stop_times.txt": df_BusStopTimes,
}
for file_name, frame in frames_to_save.items():
    frame.to_csv(f"{result_directory}{file_name}", index=False)