In [1]:
!pip install geopy



In [2]:
'''To import the required packages.'''
import pandas as pd
import numpy as np
import networkx as nx
import collections
import matplotlib.pyplot as plt
import math
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Settings


In [3]:
'''To display all output results of a Jupyter cell.'''
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
'''To ensure that the output results of extensive output results are not truncated.'''
#pd.options.display.max_rows = 4000

'To ensure that the output results of extensive output results are not truncated.'

# **Belgian railway system**

# Import of the Belgian railway datasets

In [5]:
'''To register the GitHub link with the Belgian data as a variable.'''
datalink = "https://raw.githubusercontent.com/polkuleuven/Thesis_Train/main/gtfs_train_Belgium_1503/"

'To register the GitHub link with the Belgian data as a variable.'

In [6]:
'''Import all the GTFS data'''

#To import the agency dataset that contains limited information about Belgian NMBS/SNCB railway agency
agency = pd.read_csv(datalink + "agency.txt", sep=",")
#To import the stops dataset that contains information about the ids, the names and the geographical coordinates of the Belgian railway stations.
stops = pd.read_csv(datalink + "stops.txt", sep=",")
#To import the translations dataset that provides the French-, Dutch-, German- and English-language translations of the Belgian railway stations.
translations = pd.read_csv(datalink + "translations.txt", sep=",")
#To import the transfers dataset that gives the minimum transfer time to switch routes at each Belgian railway station.
transfers = pd.read_csv(datalink + "transfers.txt", sep=",")
#To import the routes dataset that provides the id, the name and the type of vehicle used for all Belgian railway routes.
routes = pd.read_csv(datalink + "routes.txt", sep=",")
#To import the trips dataset that gives for all routes an overview of the trips and the headsigns of these trips belonging to the Belgian railway route.
#The service_id is an indication of all the dates this trip is valid (consultable in the calendar_dates dataset).
trips = pd.read_csv(datalink + "trips.txt", sep=",")
#To import the stop_times dataset that gives for all trips an overview of the ids of the stations served and the sequence in which these stations are served. 
#In addition, for all the trips the arrival and departure times at the stations served are given.
stop_times = pd.read_csv(datalink + "stop_times.txt", sep=",")
#To import the calendar dataset that gives the first and last date of all data observations.
calendar = pd.read_csv(datalink + "calendar.txt", sep=",")
#To import the calendar_dates dataset that gives for each service_id all the exact dates when that service_id is valid.
calendar_dates = pd.read_csv(datalink + "calendar_dates.txt", sep=",")
#???
stop_time_overrides = pd.read_csv(datalink + "stop_time_overrides.txt", sep=",")

'Import all the GTFS data'

# Cleaning of the Belgian railway data

''' To clean the stops df.  (1) ''' 
#####To eliminate the stop_ids in the stops dataset that contain an underscore or that start with a character 'S'. 
stops_cleaned = stops[(~stops['stop_id'].str.contains('_')) & (~stops['stop_id'].str.contains('S'))]

#####To modify the object datatype of the stop_id column to the numpy int64 datatype
stops_cleaned.loc[:,'stop_id'] = stops_cleaned.loc[:,'stop_id'].astype(np.int64)

##### To remove the accents from the stop_name and to change to uppercase
stops_cleaned.loc[:,'stop_name'] = stops_cleaned.loc[:,'stop_name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
stops_cleaned.loc[:,'stop_name'] = stops_cleaned.loc[:,'stop_name'].str.upper()

''' To clean the stops df.  (2) ''' 
##### To initialize the Nominatim API to get the location from the input string 
geolocator = Nominatim(user_agent="application")
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=0.2)

##### To get the location with the geolocator.reverse() function and to extract the country from the location instance
country_list = []
for index, row in stops_cleaned.iterrows():
    latitude = row['stop_lat']
    longitude = row['stop_lon']
    # To assign the latitude and longitude into a geolocator.reverse() method
    location = reverse((latitude, longitude), language='en', exactly_one=True)
    # To get the country from the given list and parsed into a dictionary with raw function()
    address = location.raw['address']
    country = address.get('country', '')
    country_list.append(country)

##### To add the values of country_list as a new attribute country 
stops_cleaned.loc[:,'country'] = country_list
stops_cleaned

##### To calculate the total number of Belgian stations in the stops_cleaned dataset
belgian_stops_Belgium = stops_cleaned[stops_cleaned['country'] == 'Belgium']
belgian_stops_Belgium_series = stops_cleaned.loc[stops_cleaned['country'] == 'Belgium', 'stop_name']

stops_cleaned.to_csv(r'/Users/pol/Desktop/CSV_export/stops_cleaned_Belgium.csv', index = False, header=True, encoding='utf-8-sig')

belgian_stops_Belgium_series.to_csv(r'/Users/pol/Desktop/CSV_export/belgian_stops_Belgium_series.csv', index = False, header=True, encoding='utf-8-sig')

In [7]:
'''Imports the cleaned version of the stops with their country'''
stops_cleaned = pd.read_csv("https://raw.githubusercontent.com/polkuleuven/Thesis_Train/main/stops_cleaned/stops_cleaned_Belgium.csv", sep=",")
belgian_stops_Belgium_series = pd.read_csv("https://raw.githubusercontent.com/polkuleuven/Thesis_Train/main/country_stops_series/belgian_stops_Belgium_series.csv", sep=",")['stop_name']

'import the cleaned version of the stops with their country'

In [8]:
'''To clean the trips df'''
#To merge a selection of the trips dataset and a selection of the routes dataset on route_id
trip_route_short_name = pd.merge(trips[['route_id','service_id','trip_id', 'trip_headsign']], routes[['route_id', 'route_short_name', 'route_long_name']], on='route_id')

#To select the trips that belong to the routes that have a route_short_name that begins with an 'S' or is equal to 'IC', 'L' or 'P.'''
allowed_route_type = {'IC', 'L', 'P'}
filtered_trips = trip_route_short_name[(trip_route_short_name['route_short_name'].isin(allowed_route_type)) | (trip_route_short_name['route_short_name'].str.startswith('S'))]
filtered_trips = filtered_trips.drop(columns=['route_short_name'])

# To remove the accents from the route_long_name and to change to uppercase
filtered_trips['route_long_name'] = filtered_trips['route_long_name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
filtered_trips['route_long_name'] = filtered_trips['route_long_name'].str.upper()

# To remove the accents from the trip_headsign and to change to uppercase
filtered_trips['trip_headsign'] = filtered_trips['trip_headsign'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
filtered_trips['trip_headsign'] = filtered_trips['trip_headsign'].str.upper()
filtered_trips

' To clean the trips df'

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,route_long_name
4845,115,14,88____:007::8885704:8885001:4:523:20210418,TOURNAI,TOURNAI -- MOUSCRON
4846,115,14,88____:007::8885704:8885001:4:623:20210418,TOURNAI,TOURNAI -- MOUSCRON
4847,115,14,88____:007::8885704:8885001:4:723:20210418,TOURNAI,TOURNAI -- MOUSCRON
4848,115,14,88____:007::8885704:8885001:4:823:20210418,TOURNAI,TOURNAI -- MOUSCRON
4849,115,14,88____:007::8885704:8885001:4:923:20210418,TOURNAI,TOURNAI -- MOUSCRON
...,...,...,...,...,...
30831,734,25,88____:007::8821105:8812005:22:1323:20210418,BRUXELLES-NORD,DEN HAAG HS (NL) -- BRUXELLES-NORD
30832,734,25,88____:007::8812005:8400131:23:1618:20210418,DEN HAAG HS (NL),DEN HAAG HS (NL) -- BRUXELLES-NORD
30833,734,25,84____:007::8400131:8400280:3:1720:20210418,DEN HAAG HS (NL),DEN HAAG HS (NL) -- BRUXELLES-NORD
30834,734,25,84____:007::8400280:8821105:4:1600:20210418,BRUXELLES-NORD,DEN HAAG HS (NL) -- BRUXELLES-NORD


In [51]:
'''Filters the dats from the selected begin to the end date'''
#here we used 4 months
begin_date = 20210314
end_date = 20210713
filtered_calendar_dates = calendar_dates.copy()
filtered_calendar_dates = filtered_calendar_dates.drop(filtered_calendar_dates[(filtered_calendar_dates['date'] > end_date) |(filtered_calendar_dates['date'] < begin_date)].index)
filtered_calendar_dates

'filter the dats from the selected begin to the end date'

Unnamed: 0,service_id,date,exception_type
0,1,20210314,1
1,2,20210315,1
2,2,20210316,1
3,2,20210317,1
4,2,20210318,1
...,...,...,...
487564,0,20210709,1
487565,0,20210710,1
487566,0,20210711,1
487567,0,20210712,1


# Exploratory data analysis with the Belgian railway data

In [10]:
'''To calculate the number of unique route_ids before removing the routes with a route_short_name that does not begin with an S and is not 'IC', 'L', or 'P'.'''
initial_set_routes = {r for r in routes['route_id']}
len(initial_set_routes)

"To calculate the number of unique route_ids before removing the routes with a route_short_name that does not begin with an S and is not 'IC', 'L', or 'P'."

734

In [11]:
'''To calculate the number of unique route_ids after removing the routes with a route_short_name that does not begin with an S and is not 'IC', 'L', or 'P'.'''
set_routes = {r for r in filtered_trips['route_id']}
len(set_routes)

"To calculate the number of unique route_ids after removing the routes with a route_short_name that does not begin with an S and is not 'IC', 'L', or 'P'."

589

In [12]:
'''To calculate the total number of stations in the stops_cleaned dataset'''
set_stations = {s for s in stops_cleaned['stop_id']}
len(set_stations)

'To calculate the total number of stations in the stops_cleaned dataset'

608

In [13]:
'''To calculate the total number of Belgian stations in the stops_cleaned dataset'''
len(belgian_stops_Belgium_series)

'To calculate the total number of Belgian stations in the stops_cleaned dataset'

563

# **Preparation for the L-space representation of the Belgian railway system**


In [14]:
'''To merge a selection of the stops_cleaned dataset with a selection of the stop_times dataset'''
stops_cleaned_stop_times_merge = pd.merge(stop_times[['trip_id','arrival_time', 'departure_time','stop_id','stop_sequence']], stops_cleaned[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']], on='stop_id')
stops_cleaned_stop_times_merge

'To merge a selection of the stops_cleaned dataset with a selection of the stop_times dataset'

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_name,stop_lat,stop_lon
0,88____:049::8892338:8892205:6:1925:20210314,17:35:00,17:35:00,8892338,1,LA PANNE,51.07740,2.601966
1,88____:049::8892338:8892205:6:720:20210314,05:30:00,05:30:00,8892338,1,LA PANNE,51.07740,2.601966
2,88____:049::8892338:8892205:6:820:20210314,06:30:00,06:30:00,8892338,1,LA PANNE,51.07740,2.601966
3,88____:049::8892338:8892205:6:920:20210314,07:30:00,07:30:00,8892338,1,LA PANNE,51.07740,2.601966
4,88____:049::8892338:8892205:6:1020:20210314,08:30:00,08:30:00,8892338,1,LA PANNE,51.07740,2.601966
...,...,...,...,...,...,...,...,...
460669,87____:005::8727100:8814308:2:1434:20210424,12:31:00,12:31:00,8727100,1,PARIS NORD (FR),48.86667,2.333333
460670,87____:005::8727100:8814308:2:1432:20211211,12:31:00,12:31:00,8727100,1,PARIS NORD (FR),48.86667,2.333333
460671,88____:005::8814001:8727100:3:1756:20211210,17:56:00,17:56:00,8727100,3,PARIS NORD (FR),48.86667,2.333333
460672,87____:005::8727100:8814308:2:2111:20210430,18:49:00,18:49:00,8727100,1,PARIS NORD (FR),48.86667,2.333333


In [15]:
#CHANGED
'''To merge a selection of the stops_cleaned_stop_times_merge dataset with the filtered_trips dataset.'''
stops_cleaned_stop_times_trips_merge = pd.merge(filtered_trips, stops_cleaned_stop_times_merge, on='trip_id')
stops_cleaned_stop_times_trips_merge = stops_cleaned_stop_times_trips_merge.sort_values(by=['route_id', 'trip_id', 'stop_sequence'])
stops_cleaned_stop_times_trips_merge

'To merge a selection of the stops_cleaned_stop_times_merge dataset with the filtered_trips dataset.'

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,route_long_name,arrival_time,departure_time,stop_id,stop_sequence,stop_name,stop_lat,stop_lon
100,115,14,88____:007::8885001:8885704:4:1052:20210418,MOUSCRON,TOURNAI -- MOUSCRON,10:36:00,10:36:00,8885001,1,TOURNAI,50.61313,3.396940
101,115,14,88____:007::8885001:8885704:4:1052:20210418,MOUSCRON,TOURNAI -- MOUSCRON,10:40:00,10:40:00,8885068,2,FROYENNES,50.62989,3.354835
102,115,14,88____:007::8885001:8885704:4:1052:20210418,MOUSCRON,TOURNAI -- MOUSCRON,10:47:00,10:47:00,8885753,3,HERSEAUX,50.71390,3.245961
103,115,14,88____:007::8885001:8885704:4:1052:20210418,MOUSCRON,TOURNAI -- MOUSCRON,10:52:00,10:52:00,8885704,4,MOUSCRON,50.74100,3.228449
260,115,42,88____:007::8885001:8885704:4:1052:20210530,MOUSCRON,TOURNAI -- MOUSCRON,10:36:00,10:36:00,8885001,1,TOURNAI,50.61313,3.396940
...,...,...,...,...,...,...,...,...,...,...,...,...
429814,734,25,88____:007::8821105:8812005:22:1723:20210418,BRUXELLES-NORD,DEN HAAG HS (NL) -- BRUXELLES-NORD,17:05:00,17:05:00,8811254,21,KORTENBERG,50.89307,4.543300
429813,734,25,88____:007::8821105:8812005:22:1723:20210418,BRUXELLES-NORD,DEN HAAG HS (NL) -- BRUXELLES-NORD,17:07:00,17:07:00,8811247,22,NOSSEGEM,50.88331,4.506110
429818,734,25,88____:007::8821105:8812005:22:1723:20210418,BRUXELLES-NORD,DEN HAAG HS (NL) -- BRUXELLES-NORD,17:10:00,17:12:00,8819406,23,BRUSSELS AIRPORT-ZAVENTEM,50.89646,4.482072
429812,734,25,88____:007::8821105:8812005:22:1723:20210418,BRUXELLES-NORD,DEN HAAG HS (NL) -- BRUXELLES-NORD,17:20:00,17:20:00,8811007,24,SCHAERBEEK,50.87851,4.378640


In [231]:
#NEW
'''Creates the a dataframe with the departure time form the first stop sequence and with the one from last stop sequence for each trip_id'''
departure_time_first = stops_cleaned_stop_times_trips_merge.reset_index().loc[stops_cleaned_stop_times_trips_merge.reset_index().groupby(['trip_id'])['stop_sequence'].idxmin()][['route_id', 'trip_id', 'departure_time']].copy()
departure_time_first = departure_time_first.rename(columns = {'departure_time': 'departure_time_first'})
departure_time_last = stops_cleaned_stop_times_trips_merge.reset_index().loc[stops_cleaned_stop_times_trips_merge.reset_index().groupby(['trip_id'])['stop_sequence'].idxmax()][['route_id', 'trip_id', 'departure_time']].copy()
departure_time_last = departure_time_last.rename(columns = {'departure_time': 'departure_time_last'})
departure_times = departure_time_first.merge(departure_time_last[['trip_id', 'departure_time_last']], on='trip_id')
departure_times

'Create the a dataframe with the departure time of the first stop sequence and another one with the last stop sequence for each trip_id'

Unnamed: 0,route_id,trip_id,departure_time_first,departure_time_last
0,469,80____:046::8015345:8849023:2:1011:20210328,10:04:00,10:11:00
1,469,80____:046::8015345:8849023:2:1011:20210402,10:04:00,10:11:00
2,469,80____:046::8015345:8849023:2:1011:20210405,10:04:00,10:11:00
3,469,80____:046::8015345:8849023:2:1011:20210411,10:04:00,10:11:00
4,469,80____:046::8015345:8849023:2:1011:20210416,10:04:00,10:11:00
...,...,...,...,...
25720,474,88____:L73::8843208:8841004:5:834:20211210,08:15:00,08:36:00
25721,474,88____:L73::8843208:8841400:13:1709:20210402,16:15:00,17:09:00
25722,474,88____:L73::8843208:8841400:13:1809:20210402,17:15:00,18:09:00
25723,474,88____:L73::8843208:8841400:13:809:20210402,07:15:00,08:09:00


In [16]:
'''To create a route_sequence dataset that gives for each trip_id that belongs to a route the sequence of stations served'''
route_sequence = stops_cleaned_stop_times_trips_merge.groupby(['route_id','route_long_name','trip_headsign','trip_id','stop_sequence'], as_index=False)[['stop_name', 'stop_lat', 'stop_lon']].last()
route_sequence

'To create a route_sequence dataset that gives for each trip_id that belongs to a route the sequence of stations served'

Unnamed: 0,route_id,route_long_name,trip_headsign,trip_id,stop_sequence,stop_name,stop_lat,stop_lon
0,115,TOURNAI -- MOUSCRON,MOUSCRON,88____:007::8885001:8885704:4:1052:20210418,1,TOURNAI,50.61313,3.396940
1,115,TOURNAI -- MOUSCRON,MOUSCRON,88____:007::8885001:8885704:4:1052:20210418,2,FROYENNES,50.62989,3.354835
2,115,TOURNAI -- MOUSCRON,MOUSCRON,88____:007::8885001:8885704:4:1052:20210418,3,HERSEAUX,50.71390,3.245961
3,115,TOURNAI -- MOUSCRON,MOUSCRON,88____:007::8885001:8885704:4:1052:20210418,4,MOUSCRON,50.74100,3.228449
4,115,TOURNAI -- MOUSCRON,MOUSCRON,88____:007::8885001:8885704:4:1052:20210530,1,TOURNAI,50.61313,3.396940
...,...,...,...,...,...,...,...,...
429816,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,DEN HAAG HS (NL),88____:007::8812005:8400131:23:1618:20210418,19,ANVERS-BERCHEM,51.19923,4.432219
429817,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,DEN HAAG HS (NL),88____:007::8812005:8400131:23:1618:20210418,20,ANVERS-CENTRAL,51.21720,4.421098
429818,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,DEN HAAG HS (NL),88____:007::8812005:8400131:23:1618:20210418,21,ANVERS-LUCHTBAL,51.24413,4.425033
429819,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,DEN HAAG HS (NL),88____:007::8812005:8400131:23:1618:20210418,22,NOORDERKEMPEN (BRECHT),51.35683,4.632200


In [17]:
''' To groupby the trip_id and to order the stop_sequence in an ascending order
Otherwise, different hash values could correspond to a same stop_sequence (since the stop_sequences of some
routes are initially in descending order while other stop_sequences are in ascending order)'''

trip_stop_sequence_sorted = stops_cleaned_stop_times_merge.groupby(['trip_id'], as_index=False).apply(lambda x: x.sort_values('stop_sequence'))
trip_stop_sequence_sorted

Unnamed: 0,Unnamed: 1,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_name,stop_lat,stop_lon
0,83181,80____:046::8015345:8849023:2:1011:20210328,10:04:00,10:04:00,8015345,1,AACHEN HBF (DE),50.77083,6.105277
0,82814,80____:046::8015345:8849023:2:1011:20210328,10:11:00,10:11:00,8849023,2,HERGENRATH-FRONTIERE,50.71896,6.041269
1,83179,80____:046::8015345:8849023:2:1011:20210402,10:04:00,10:04:00,8015345,1,AACHEN HBF (DE),50.77083,6.105277
1,82810,80____:046::8015345:8849023:2:1011:20210402,10:11:00,10:11:00,8849023,2,HERGENRATH-FRONTIERE,50.71896,6.041269
2,83186,80____:046::8015345:8849023:2:1011:20210405,10:04:00,10:04:00,8015345,1,AACHEN HBF (DE),50.77083,6.105277
...,...,...,...,...,...,...,...,...,...
30835,212160,88____:L73::8843208:8841400:13:909:20210402,08:54:00,08:54:00,8841467,9,FEXHE-LE-HAUT-CLOCHER,50.66405,5.398450
30835,211763,88____:L73::8843208:8841400:13:909:20210402,08:57:00,08:57:00,8841459,10,MOMALLE,50.66991,5.367600
30835,211366,88____:L73::8843208:8841400:13:909:20210402,09:01:00,09:01:00,8841442,11,REMICOURT,50.67861,5.321410
30835,210969,88____:L73::8843208:8841400:13:909:20210402,09:05:00,09:05:00,8841434,12,BLERET,50.68507,5.286398


In [18]:
'''To put the stop_names per trip_id in a list'''
trip_stop_sequence = trip_stop_sequence_sorted.groupby('trip_id')['stop_name'].apply(lambda group_series: group_series.tolist()).reset_index()
trip_stop_sequence.rename(columns={'stop_name':'stop_sequence'}, inplace=True)
trip_stop_sequence

Unnamed: 0,trip_id,stop_sequence
0,80____:046::8015345:8849023:2:1011:20210328,"[AACHEN HBF (DE), HERGENRATH-FRONTIERE]"
1,80____:046::8015345:8849023:2:1011:20210402,"[AACHEN HBF (DE), HERGENRATH-FRONTIERE]"
2,80____:046::8015345:8849023:2:1011:20210405,"[AACHEN HBF (DE), HERGENRATH-FRONTIERE]"
3,80____:046::8015345:8849023:2:1011:20210411,"[AACHEN HBF (DE), HERGENRATH-FRONTIERE]"
4,80____:046::8015345:8849023:2:1011:20210416,"[AACHEN HBF (DE), HERGENRATH-FRONTIERE]"
...,...,...
30831,88____:L73::8843208:8841004:5:834:20211210,"[FLEMALLE-HAUTE, SERAING, OUGREE, Y.RENORY, LI..."
30832,88____:L73::8843208:8841400:13:1709:20210402,"[FLEMALLE-HAUTE, SERAING, OUGREE, Y.RENORY, LI..."
30833,88____:L73::8843208:8841400:13:1809:20210402,"[FLEMALLE-HAUTE, SERAING, OUGREE, Y.RENORY, LI..."
30834,88____:L73::8843208:8841400:13:809:20210402,"[FLEMALLE-HAUTE, SERAING, OUGREE, Y.RENORY, LI..."


In [19]:
'''To calculate the hash value for the stop sequence of each trip_id'''

#To copy the filtered_trips dataset
trips_hash = trip_stop_sequence.copy()

#calculates the hash of the stop sequence in both order (ascending and descending)
trips_hash['hash'] = trips_hash['stop_sequence'].apply(lambda x: hash(tuple(x)))
trips_hash['hash_inverse'] = trips_hash['stop_sequence'].apply(lambda x: hash(tuple(x[::-1])))

'To calculate the hash value for the stop sequence of each trip_id'

In [20]:
# To add the stop_sequence of stations to the filtered_trips dataset by joining on trip_id
trips_hash_stop_sequence = pd.merge(filtered_trips, trips_hash, on='trip_id', how='left')

# To put the columns in a more logical order
trips_hash_stop_sequence = trips_hash_stop_sequence[['route_id', 'route_long_name','service_id','trip_headsign','trip_id','hash', 'hash_inverse','stop_sequence']]
trips_hash_stop_sequence

Unnamed: 0,route_id,route_long_name,service_id,trip_headsign,trip_id,hash,hash_inverse,stop_sequence
0,115,TOURNAI -- MOUSCRON,14,TOURNAI,88____:007::8885704:8885001:4:523:20210418,-8715344402839177113,7308961383809034169,"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]"
1,115,TOURNAI -- MOUSCRON,14,TOURNAI,88____:007::8885704:8885001:4:623:20210418,-8715344402839177113,7308961383809034169,"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]"
2,115,TOURNAI -- MOUSCRON,14,TOURNAI,88____:007::8885704:8885001:4:723:20210418,-8715344402839177113,7308961383809034169,"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]"
3,115,TOURNAI -- MOUSCRON,14,TOURNAI,88____:007::8885704:8885001:4:823:20210418,-8715344402839177113,7308961383809034169,"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]"
4,115,TOURNAI -- MOUSCRON,14,TOURNAI,88____:007::8885704:8885001:4:923:20210418,-8715344402839177113,7308961383809034169,"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]"
...,...,...,...,...,...,...,...,...
25720,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,25,BRUXELLES-NORD,88____:007::8821105:8812005:22:1323:20210418,3901824248109027820,8572366340337062507,"[NOORDERKEMPEN (BRECHT), ANVERS-LUCHTBAL, ANVE..."
25721,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,25,DEN HAAG HS (NL),88____:007::8812005:8400131:23:1618:20210418,340386202383150578,-6422827118264685072,"[BRUXELLES-NORD, SCHAERBEEK, BRUSSELS AIRPORT-..."
25722,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,25,DEN HAAG HS (NL),84____:007::8400131:8400280:3:1720:20210418,-1170946074405319360,2506171550043311866,"[BREDA (NL), ROTTERDAM CS (NL), DEN HAAG HS (NL)]"
25723,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,25,BRUXELLES-NORD,84____:007::8400280:8821105:4:1600:20210418,-9198331999539632466,-7009744345638251698,"[DEN HAAG HS (NL), ROTTERDAM CS (NL), BREDA (N..."


In [233]:
'''Merges the trips_hash_stop_sequence with the departure_times'''
trips_hash_stop_sequence_departure = trips_hash_stop_sequence.merge(departure_times[['trip_id','departure_time_first','departure_time_last']], on='trip_id')
trips_hash_stop_sequence_departure

'Merges the trips_hash_stop_sequence with the departure_times'

Unnamed: 0,route_id,route_long_name,service_id,trip_headsign,trip_id,hash,hash_inverse,stop_sequence,departure_time_first,departure_time_last
0,115,TOURNAI -- MOUSCRON,14,TOURNAI,88____:007::8885704:8885001:4:523:20210418,-8715344402839177113,7308961383809034169,"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",05:07:00,05:23:00
1,115,TOURNAI -- MOUSCRON,14,TOURNAI,88____:007::8885704:8885001:4:623:20210418,-8715344402839177113,7308961383809034169,"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",06:07:00,06:23:00
2,115,TOURNAI -- MOUSCRON,14,TOURNAI,88____:007::8885704:8885001:4:723:20210418,-8715344402839177113,7308961383809034169,"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",07:07:00,07:23:00
3,115,TOURNAI -- MOUSCRON,14,TOURNAI,88____:007::8885704:8885001:4:823:20210418,-8715344402839177113,7308961383809034169,"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",08:07:00,08:23:00
4,115,TOURNAI -- MOUSCRON,14,TOURNAI,88____:007::8885704:8885001:4:923:20210418,-8715344402839177113,7308961383809034169,"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",09:07:00,09:23:00
...,...,...,...,...,...,...,...,...,...,...
25720,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,25,BRUXELLES-NORD,88____:007::8821105:8812005:22:1323:20210418,3901824248109027820,8572366340337062507,"[NOORDERKEMPEN (BRECHT), ANVERS-LUCHTBAL, ANVE...",12:01:00,13:23:00
25721,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,25,DEN HAAG HS (NL),88____:007::8812005:8400131:23:1618:20210418,340386202383150578,-6422827118264685072,"[BRUXELLES-NORD, SCHAERBEEK, BRUSSELS AIRPORT-...",14:37:00,16:26:00
25722,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,25,DEN HAAG HS (NL),84____:007::8400131:8400280:3:1720:20210418,-1170946074405319360,2506171550043311866,"[BREDA (NL), ROTTERDAM CS (NL), DEN HAAG HS (NL)]",16:26:00,17:20:00
25723,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,25,BRUXELLES-NORD,84____:007::8400280:8821105:4:1600:20210418,-9198331999539632466,-7009744345638251698,"[DEN HAAG HS (NL), ROTTERDAM CS (NL), BREDA (N...",14:40:00,16:01:00


In [234]:
'''To count the number of dates for each service_id'''
service_id_df = filtered_calendar_dates.groupby(['service_id'])[['service_id']].count().rename(columns={'service_id':'count_service_id'}).reset_index()
service_id_df

Unnamed: 0,service_id,count_service_id
0,0,122
1,1,1
2,2,5
3,3,3
4,4,35
...,...,...
4262,6231,70
4263,6232,92
4264,6233,77
4265,6234,86


In [235]:
'''Regroups the days per service id in a set'''
service_id_dates = filtered_calendar_dates.groupby('service_id')['date'].apply(lambda group_series: set(group_series.tolist())).reset_index()
service_id_dates.rename(columns={'date':'dates'}, inplace=True)
service_id_dates = service_id_dates.merge(service_id_df, on='service_id', how='left')
service_id_dates

Unnamed: 0,service_id,dates,count_service_id
0,0,"{20210701, 20210702, 20210703, 20210704, 20210...",122
1,1,{20210314},1
2,2,"{20210315, 20210316, 20210317, 20210318, 20210...",5
3,3,"{20210320, 20210321, 20210314}",3
4,4,"{20210315, 20210316, 20210317, 20210318, 20210...",35
...,...,...,...
4262,6231,"{20210701, 20210702, 20210705, 20210706, 20210...",70
4263,6232,"{20210701, 20210702, 20210703, 20210704, 20210...",92
4264,6233,"{20210701, 20210702, 20210703, 20210705, 20210...",77
4265,6234,"{20210701, 20210702, 20210704, 20210705, 20210...",86


In [254]:
#New
'''To put the different trip_ids in a list after joining and add the departure_time first and last'''
common_columns = ['route_id','route_long_name','hash', 'hash_inverse', 'service_id']
route_hash_freq = trips_hash_stop_sequence_departure.groupby(common_columns)['trip_id'].apply(lambda group_series: group_series.tolist()).reset_index()
route_hash_freq_dep_first = trips_hash_stop_sequence_departure.groupby(common_columns)['departure_time_first'].apply(lambda group_series: group_series.tolist()).reset_index()
route_hash_freq_dep_last = trips_hash_stop_sequence_departure.groupby(common_columns)['departure_time_last'].apply(lambda group_series: group_series.tolist()).reset_index()
route_hash_freq = route_hash_freq.merge(route_hash_freq_dep_first, on= common_columns)
route_hash_freq = route_hash_freq.merge(route_hash_freq_dep_last, on= common_columns)
route_hash_freq

'To put the different trip_ids in a list after joining and add the departure_time first and last'

Unnamed: 0,route_id,route_long_name,hash,hash_inverse,service_id,trip_id,departure_time_first,departure_time_last
0,115,TOURNAI -- MOUSCRON,-8715344402839177113,7308961383809034169,14,"[88____:007::8885704:8885001:4:523:20210418, 8...","[05:07:00, 06:07:00, 07:07:00, 08:07:00, 09:07...","[05:23:00, 06:23:00, 07:23:00, 08:23:00, 09:23..."
1,115,TOURNAI -- MOUSCRON,-8715344402839177113,7308961383809034169,25,[88____:007::8885704:8885001:4:23:20210418],[00:07:00],[00:23:00]
2,115,TOURNAI -- MOUSCRON,-8715344402839177113,7308961383809034169,31,"[88____:007::8885704:8885001:4:1723:20210417, ...","[17:07:00, 18:07:00, 19:07:00, 20:07:00, 21:07...","[17:23:00, 18:23:00, 19:23:00, 20:23:00, 21:23..."
3,115,TOURNAI -- MOUSCRON,-8715344402839177113,7308961383809034169,42,"[88____:007::8885704:8885001:4:523:20210530, 8...","[05:07:00, 06:07:00, 07:07:00, 08:07:00, 09:07...","[05:23:00, 06:23:00, 07:23:00, 08:23:00, 09:23..."
4,115,TOURNAI -- MOUSCRON,-8715344402839177113,7308961383809034169,93,[88____:007::8885704:8885001:4:23:20210530],[00:07:00],[00:23:00]
...,...,...,...,...,...,...,...,...
5994,733,DEN HAAG HS (NL) -- BRUXELLES-MIDI,8401325233569951713,4034954373303033338,1566,"[88____:007::8821105:8814001:22:1716:20211211,...","[16:01:00, 20:01:00]","[17:16:00, 21:16:00]"
5995,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,-9198331999539632466,-7009744345638251698,25,"[84____:007::8400280:8821105:4:1200:20210418, ...","[10:40:00, 14:40:00]","[12:01:00, 16:01:00]"
5996,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,-1170946074405319360,2506171550043311866,25,"[84____:007::8400131:8400280:3:1320:20210418, ...","[12:26:00, 16:26:00]","[13:20:00, 17:20:00]"
5997,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,340386202383150578,-6422827118264685072,25,"[88____:007::8812005:8400131:23:1218:20210418,...","[10:37:00, 14:37:00]","[12:26:00, 16:26:00]"


In [255]:
'''To add the sequence of stops to the route_hash_freq dataset'''
route_hash_freq = pd.merge(route_hash_freq, trips_hash_stop_sequence[['route_id','hash', 'hash_inverse', 'service_id','stop_sequence']], on=['route_id', 'hash', 'hash_inverse', 'service_id'], how='left')
route_hash_freq = route_hash_freq.drop_duplicates( subset = ['route_id', 'hash', 'service_id'], keep = 'first')

route_hash_freq

'To add the sequence of stops to the route_hash_freq dataset'

Unnamed: 0,route_id,route_long_name,hash,hash_inverse,service_id,trip_id,departure_time_first,departure_time_last,stop_sequence
0,115,TOURNAI -- MOUSCRON,-8715344402839177113,7308961383809034169,14,"[88____:007::8885704:8885001:4:523:20210418, 8...","[05:07:00, 06:07:00, 07:07:00, 08:07:00, 09:07...","[05:23:00, 06:23:00, 07:23:00, 08:23:00, 09:23...","[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]"
12,115,TOURNAI -- MOUSCRON,-8715344402839177113,7308961383809034169,25,[88____:007::8885704:8885001:4:23:20210418],[00:07:00],[00:23:00],"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]"
13,115,TOURNAI -- MOUSCRON,-8715344402839177113,7308961383809034169,31,"[88____:007::8885704:8885001:4:1723:20210417, ...","[17:07:00, 18:07:00, 19:07:00, 20:07:00, 21:07...","[17:23:00, 18:23:00, 19:23:00, 20:23:00, 21:23...","[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]"
20,115,TOURNAI -- MOUSCRON,-8715344402839177113,7308961383809034169,42,"[88____:007::8885704:8885001:4:523:20210530, 8...","[05:07:00, 06:07:00, 07:07:00, 08:07:00, 09:07...","[05:23:00, 06:23:00, 07:23:00, 08:23:00, 09:23...","[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]"
32,115,TOURNAI -- MOUSCRON,-8715344402839177113,7308961383809034169,93,[88____:007::8885704:8885001:4:23:20210530],[00:07:00],[00:23:00],"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]"
...,...,...,...,...,...,...,...,...,...
25715,733,DEN HAAG HS (NL) -- BRUXELLES-MIDI,8401325233569951713,4034954373303033338,1566,"[88____:007::8821105:8814001:22:1716:20211211,...","[16:01:00, 20:01:00]","[17:16:00, 21:16:00]","[NOORDERKEMPEN (BRECHT), ANVERS-LUCHTBAL, ANVE..."
25717,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,-9198331999539632466,-7009744345638251698,25,"[84____:007::8400280:8821105:4:1200:20210418, ...","[10:40:00, 14:40:00]","[12:01:00, 16:01:00]","[DEN HAAG HS (NL), ROTTERDAM CS (NL), BREDA (N..."
25719,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,-1170946074405319360,2506171550043311866,25,"[84____:007::8400131:8400280:3:1320:20210418, ...","[12:26:00, 16:26:00]","[13:20:00, 17:20:00]","[BREDA (NL), ROTTERDAM CS (NL), DEN HAAG HS (NL)]"
25721,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,340386202383150578,-6422827118264685072,25,"[88____:007::8812005:8400131:23:1218:20210418,...","[10:37:00, 14:37:00]","[12:26:00, 16:26:00]","[BRUXELLES-NORD, SCHAERBEEK, BRUSSELS AIRPORT-..."


In [256]:
'''To calculate the number of trip ids in the list of trip_ids and to add it as a new column'''
number_trip_ids = []
for list_trip_ids in route_hash_freq['trip_id']:
    count = len(list_trip_ids)
    number_trip_ids.append(count)
route_hash_freq['number_trip_ids'] = number_trip_ids

route_hash_freq

'To calculate the number of trip ids in the list of trip_ids and to add it as a new column'

Unnamed: 0,route_id,route_long_name,hash,hash_inverse,service_id,trip_id,departure_time_first,departure_time_last,stop_sequence,number_trip_ids
0,115,TOURNAI -- MOUSCRON,-8715344402839177113,7308961383809034169,14,"[88____:007::8885704:8885001:4:523:20210418, 8...","[05:07:00, 06:07:00, 07:07:00, 08:07:00, 09:07...","[05:23:00, 06:23:00, 07:23:00, 08:23:00, 09:23...","[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",12
12,115,TOURNAI -- MOUSCRON,-8715344402839177113,7308961383809034169,25,[88____:007::8885704:8885001:4:23:20210418],[00:07:00],[00:23:00],"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",1
13,115,TOURNAI -- MOUSCRON,-8715344402839177113,7308961383809034169,31,"[88____:007::8885704:8885001:4:1723:20210417, ...","[17:07:00, 18:07:00, 19:07:00, 20:07:00, 21:07...","[17:23:00, 18:23:00, 19:23:00, 20:23:00, 21:23...","[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",7
20,115,TOURNAI -- MOUSCRON,-8715344402839177113,7308961383809034169,42,"[88____:007::8885704:8885001:4:523:20210530, 8...","[05:07:00, 06:07:00, 07:07:00, 08:07:00, 09:07...","[05:23:00, 06:23:00, 07:23:00, 08:23:00, 09:23...","[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",12
32,115,TOURNAI -- MOUSCRON,-8715344402839177113,7308961383809034169,93,[88____:007::8885704:8885001:4:23:20210530],[00:07:00],[00:23:00],"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",1
...,...,...,...,...,...,...,...,...,...,...
25715,733,DEN HAAG HS (NL) -- BRUXELLES-MIDI,8401325233569951713,4034954373303033338,1566,"[88____:007::8821105:8814001:22:1716:20211211,...","[16:01:00, 20:01:00]","[17:16:00, 21:16:00]","[NOORDERKEMPEN (BRECHT), ANVERS-LUCHTBAL, ANVE...",2
25717,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,-9198331999539632466,-7009744345638251698,25,"[84____:007::8400280:8821105:4:1200:20210418, ...","[10:40:00, 14:40:00]","[12:01:00, 16:01:00]","[DEN HAAG HS (NL), ROTTERDAM CS (NL), BREDA (N...",2
25719,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,-1170946074405319360,2506171550043311866,25,"[84____:007::8400131:8400280:3:1320:20210418, ...","[12:26:00, 16:26:00]","[13:20:00, 17:20:00]","[BREDA (NL), ROTTERDAM CS (NL), DEN HAAG HS (NL)]",2
25721,734,DEN HAAG HS (NL) -- BRUXELLES-NORD,340386202383150578,-6422827118264685072,25,"[88____:007::8812005:8400131:23:1218:20210418,...","[10:37:00, 14:37:00]","[12:26:00, 16:26:00]","[BRUXELLES-NORD, SCHAERBEEK, BRUSSELS AIRPORT-...",2


In [257]:
#CHANGED
'''To merge the route_hash_freq df with the service_id_dates to get the sets of corresponding dates'''
route_hash_service_freq = pd.merge(route_hash_freq, service_id_dates, on='service_id', how='inner')
route_hash_service_freq

'To merge the route_hash_freq df with the service_id_dates to get the sets of corresponding dates'

Unnamed: 0,route_id,route_long_name,hash,hash_inverse,service_id,trip_id,departure_time_first,departure_time_last,stop_sequence,number_trip_ids,dates,count_service_id
0,115,TOURNAI -- MOUSCRON,-8715344402839177113,7308961383809034169,14,"[88____:007::8885704:8885001:4:523:20210418, 8...","[05:07:00, 06:07:00, 07:07:00, 08:07:00, 09:07...","[05:23:00, 06:23:00, 07:23:00, 08:23:00, 09:23...","[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",12,"{20210417, 20210418}",2
1,115,TOURNAI -- MOUSCRON,7308961383809034169,-8715344402839177113,14,"[88____:007::8885001:8885704:4:552:20210418, 8...","[05:36:00, 06:36:00, 07:36:00, 08:36:00, 09:36...","[05:52:00, 06:52:00, 07:52:00, 08:52:00, 09:52...","[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]",12,"{20210417, 20210418}",2
2,121,LUTTRE -- CHARLEROI-SUD,-1358542462748784293,-1132779382835063756,14,"[88____:007::8872009:8871308:5:455:20210418, 8...","[04:36:00, 05:36:00, 06:36:00, 07:36:00, 08:36...","[04:55:00, 05:55:00, 06:55:00, 07:55:00, 08:55...","[CHARLEROI-SUD, MARCHIENNE-AU-PONT, ROUX, COUR...",20,"{20210417, 20210418}",2
3,121,LUTTRE -- CHARLEROI-SUD,-1132779382835063756,-1358542462748784293,14,"[88____:007::8871308:8872009:5:522:20210418, 8...","[05:05:00, 06:05:00, 07:05:00, 08:05:00, 09:05...","[05:22:00, 06:22:00, 07:22:00, 08:22:00, 09:22...","[LUTTRE, COURCELLES-MOTTE, ROUX, MARCHIENNE-AU...",19,"{20210417, 20210418}",2
4,125,OPWIJK -- LOKEREN,2592523831447460144,-1532388794183579375,14,"[88____:007::8893401:8894201:3:845:20210418, 8...","[08:31:00, 09:31:00, 10:31:00, 11:31:00, 12:31...","[08:45:00, 09:45:00, 10:45:00, 11:45:00, 12:45...","[TERMONDE, ZELE, LOKEREN]",15,"{20210417, 20210418}",2
...,...,...,...,...,...,...,...,...,...,...,...,...
5811,733,DEN HAAG HS (NL) -- BRUXELLES-MIDI,900702404119183476,-8903452509635242662,1553,[88____:007::8814001:8400131:23:818:20211211],[06:44:00],[08:26:00],"[BRUXELLES-MIDI, BRUXELLES-CHAPELLE, BRUXELLES...",1,"{20210703, 20210320, 20210321, 20210704, 20210...",35
5812,726,AMSTERDAM CS (NL) -- BRUXELLES-MIDI,900702404119183476,-8903452509635242662,1558,[88____:007::8814001:8400131:23:918:20211211],[07:44:00],[09:26:00],"[BRUXELLES-MIDI, BRUXELLES-CHAPELLE, BRUXELLES...",1,"{20210701, 20210702, 20210703, 20210704, 20210...",119
5813,726,AMSTERDAM CS (NL) -- BRUXELLES-MIDI,6091822811892309655,-8364218555468505001,1558,[84____:007::8400131:8400058:4:1038:20211211],[09:26:00],[10:38:00],"[BREDA (NL), ROTTERDAM CS (NL), SCHIPHOL (NL),...",1,"{20210701, 20210702, 20210703, 20210704, 20210...",119
5814,726,AMSTERDAM CS (NL) -- BRUXELLES-MIDI,900702404119183476,-8903452509635242662,1559,[88____:007::8814001:8400131:23:1118:20211211],[09:44:00],[11:26:00],"[BRUXELLES-MIDI, BRUXELLES-CHAPELLE, BRUXELLES...",1,"{20210703, 20210320, 20210321, 20210704, 20210...",33


In [258]:
#CHANGED => maybe delete
'''Groups the service_id together for each route_id and hash combination'''
route_hash_service_freq_copy = route_hash_service_freq.copy()
route_hash_service_freq_copy = route_hash_service_freq_copy.drop(['dates', 'count_service_id'], axis = 1)
for index, combi_route_id_hash in route_hash_service_freq_copy.groupby(['route_id','hash'], as_index = False)['service_id'].last().iterrows():
    set_service_id = set(route_hash_service_freq_copy.loc[(route_hash_service_freq_copy['route_id'] == combi_route_id_hash['route_id']) & (route_hash_service_freq_copy['hash'] == combi_route_id_hash['hash'])]['service_id'])
    route_hash_service_freq_copy.loc[(route_hash_service_freq_copy['route_id'] == combi_route_id_hash['route_id']) & (route_hash_service_freq_copy['hash'] == combi_route_id_hash['hash']),['service_id']] = set_service_id
route_hash_service_freq_copy

'Groups the service_id together for each route_id and hash combination'

Unnamed: 0,route_id,route_long_name,hash,hash_inverse,service_id,trip_id,departure_time_first,departure_time_last,stop_sequence,number_trip_ids
0,115,TOURNAI -- MOUSCRON,-8715344402839177113,7308961383809034169,"{42, 14, 25, 122, 93, 31}","[88____:007::8885704:8885001:4:523:20210418, 8...","[05:07:00, 06:07:00, 07:07:00, 08:07:00, 09:07...","[05:23:00, 06:23:00, 07:23:00, 08:23:00, 09:23...","[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",12
1,115,TOURNAI -- MOUSCRON,7308961383809034169,-8715344402839177113,"{42, 14, 25, 122, 93, 31}","[88____:007::8885001:8885704:4:552:20210418, 8...","[05:36:00, 06:36:00, 07:36:00, 08:36:00, 09:36...","[05:52:00, 06:52:00, 07:52:00, 08:52:00, 09:52...","[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]",12
2,121,LUTTRE -- CHARLEROI-SUD,-1358542462748784293,-1132779382835063756,"{154, 14}","[88____:007::8872009:8871308:5:455:20210418, 8...","[04:36:00, 05:36:00, 06:36:00, 07:36:00, 08:36...","[04:55:00, 05:55:00, 06:55:00, 07:55:00, 08:55...","[CHARLEROI-SUD, MARCHIENNE-AU-PONT, ROUX, COUR...",20
3,121,LUTTRE -- CHARLEROI-SUD,-1132779382835063756,-1358542462748784293,"{154, 14}","[88____:007::8871308:8872009:5:522:20210418, 8...","[05:05:00, 06:05:00, 07:05:00, 08:05:00, 09:05...","[05:22:00, 06:22:00, 07:22:00, 08:22:00, 09:22...","[LUTTRE, COURCELLES-MOTTE, ROUX, MARCHIENNE-AU...",19
4,125,OPWIJK -- LOKEREN,2592523831447460144,-1532388794183579375,"{28, 14}","[88____:007::8893401:8894201:3:845:20210418, 8...","[08:31:00, 09:31:00, 10:31:00, 11:31:00, 12:31...","[08:45:00, 09:45:00, 10:45:00, 11:45:00, 12:45...","[TERMONDE, ZELE, LOKEREN]",15
...,...,...,...,...,...,...,...,...,...,...
5811,733,DEN HAAG HS (NL) -- BRUXELLES-MIDI,900702404119183476,-8903452509635242662,"{33, 1565, 361, 268, 1553, 212, 1563, 29}",[88____:007::8814001:8400131:23:818:20211211],[06:44:00],[08:26:00],"[BRUXELLES-MIDI, BRUXELLES-CHAPELLE, BRUXELLES...",1
5812,726,AMSTERDAM CS (NL) -- BRUXELLES-MIDI,900702404119183476,-8903452509635242662,"{33, 1565, 361, 268, 1553, 212, 341, 1558, 155...",[88____:007::8814001:8400131:23:918:20211211],[07:44:00],[09:26:00],"[BRUXELLES-MIDI, BRUXELLES-CHAPELLE, BRUXELLES...",1
5813,726,AMSTERDAM CS (NL) -- BRUXELLES-MIDI,6091822811892309655,-8364218555468505001,"{1565, 361, 268, 1553, 212, 341, 1558, 1559, 2...",[84____:007::8400131:8400058:4:1038:20211211],[09:26:00],[10:38:00],"[BREDA (NL), ROTTERDAM CS (NL), SCHIPHOL (NL),...",1
5814,726,AMSTERDAM CS (NL) -- BRUXELLES-MIDI,900702404119183476,-8903452509635242662,"{33, 1565, 361, 268, 1553, 212, 341, 1558, 155...",[88____:007::8814001:8400131:23:1118:20211211],[09:44:00],[11:26:00],"[BRUXELLES-MIDI, BRUXELLES-CHAPELLE, BRUXELLES...",1


In [259]:
'''Get the distinct stop sequences for creating the possible roads combinations later on'''
distinct_stop_sequences = route_hash_service_freq_copy.drop_duplicates(subset = ["route_id", 'hash'])[['route_id','hash','stop_sequence', 'service_id']]
distinct_stop_sequences

'Get the distinct stop sequences for creating the possible roads combinations later on'

Unnamed: 0,route_id,hash,stop_sequence,service_id
0,115,-8715344402839177113,"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]","{42, 14, 25, 122, 93, 31}"
1,115,7308961383809034169,"[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]","{42, 14, 25, 122, 93, 31}"
2,121,-1358542462748784293,"[CHARLEROI-SUD, MARCHIENNE-AU-PONT, ROUX, COUR...","{154, 14}"
3,121,-1132779382835063756,"[LUTTRE, COURCELLES-MOTTE, ROUX, MARCHIENNE-AU...","{154, 14}"
4,125,2592523831447460144,"[TERMONDE, ZELE, LOKEREN]","{28, 14}"
...,...,...,...,...
5760,708,286669048106850340,"[ARLON, VIVILLE, STOCKEM, HABAY, MARBEHAN, NEU...",{1545}
5764,717,-5470904147367018403,"[POPERINGE, YPRES, COMINES, WERVIK, MENIN, WEV...",{1549}
5771,726,737182940829732028,"[ROTTERDAM CS (NL), DEN HAAG HS (NL), SCHIPHOL...",{1563}
5776,733,759975243806517983,"[ROTTERDAM CS (NL), DEN HAAG HS (NL)]","{1553, 1563, 1565}"


## Functions for the route creation

In [517]:
'''Some functions to better factorise the functions in the coming cells'''

def select_stop_sequences(stop_sequences_df, route_id):
    '''retruns the stop sequences with the selected route_id'''
    return stop_sequences_df[stop_sequences_df['route_id'] == route_id].copy()

#NEW
def take_leftovers_list_c_from_intersection_AAndB(list_a, list_b, list_c):
    '''take the indexes of the intersection of list a with list b and retain the elments of list c with that index'''
    ind_dict = dict((k,i) for i,k in enumerate(list_a))
    return [list_c[ind_dict[x]] for x in (set(list_a).intersection(list_b))]

#NEW => to delete?
def frequency_hash_sequence(route_hash_service_freq, route_id, hash_sequence):
    '''returns the frequency of the constructed route'''
    #filter the df on the route_id
    route_hash_service_freq_route_id = route_hash_service_freq[route_hash_service_freq['route_id'] == route_id].copy()[['hash','dates','departure_time_first','departure_time_last']]
    #set a default key in order to do the cross-join
    route_hash_service_freq_route_id['key'] = 0
    for index_hash, hash_value in enumerate(hash_sequence):
        route_hash_service_freq_route_id_hash = route_hash_service_freq_route_id[route_hash_service_freq_route_id['hash'] == hash_value].copy()
        if index_hash == 0:
            sequence_creation = route_hash_service_freq_route_id_hash
        else:
            sequence_creation = sequence_creation.merge(route_hash_service_freq_route_id_hash, on='key')
            #get only the rows that have a common day and common time
            sequence_creation['dates'] = [set(a).intersection(b) for a, b in zip(sequence_creation['dates_x'], sequence_creation['dates_y'])]
            #drop the rows with no common date
            sequence_creation = sequence_creation[sequence_creation['dates'].map(lambda d: len(d)) > 0]
            #take only the remaining departure_time_last
            sequence_creation['departure_time_last'] = [take_leftovers_list_c_from_intersection_AAndB(a, b, c) for a, b, c in zip(sequence_creation['departure_time_first'], sequence_creation['departure_time_last_x'], sequence_creation['departure_time_last_y'])]
            #drops the rows with no common time
            sequence_creation = sequence_creation[sequence_creation['departure_time_last'].map(lambda d: len(d)) > 0]            
            sequence_creation = sequence_creation.drop(['departure_time_last_x', 'departure_time_last_y', 'dates_x', 'dates_y'], axis=1)

        #delete this row because is not used for the next iteration
        sequence_creation = sequence_creation.drop(['departure_time_first', 'hash'], axis=1)
    if not sequence_creation.empty:
        sequence_creation['number_dates'] = sequence_creation['dates'].str.len()
        sequence_creation['number_times'] = sequence_creation['departure_time_last'].str.len()
        sequence_creation = sequence_creation.drop(['key', 'dates', 'departure_time_last'], axis=1)
        #calculate the total frequency
        return (sequence_creation['number_dates'] * sequence_creation['number_times']).sum()
    else:
        return 0
    

'Some functions to better factorise the functions in the coming cells'

In [501]:
frequency_hash_sequence(route_hash_service_freq.copy(), 733, [900702404119183476, -4488299857125108449, 759975243806517983])

hello---
['08:26:00', '12:26:00', '16:26:00']
hello---
['16:26:00']
hello---
['12:26:00']
hello---
['08:26:00']
hello---
['16:26:00']
hello---
['08:26:00']
hello---
['12:26:00']
hello---
['16:26:00']
hello---
['08:26:00']
hello---
['13:02:00']
hello---
['09:02:00']
hello---
['17:02:00']
hello---
['09:02:00']
hello---
['13:02:00']
hello---
['17:02:00']
hello---
['09:02:00']


71

In [557]:
route_creation_frequency_single[route_creation_frequency_single['route_id'] == 115]

Unnamed: 0,route_id,hash,stop_sequence,service_id,frequency
0,115,[-8715344402839177113],"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]","{42, 14, 25, 122, 93, 31}",64
1,115,[7308961383809034169],"[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]","{42, 14, 25, 122, 93, 31}",64


In [560]:
stop_sequences_df = route_hash_service_freq #[route_hash_service_freq['route_id'] == 115]
index_of_extendable, index_of_begin_sequences, index_of_complete_sequences = get_extention_indexes_new(stop_sequences_df)
route_creation = possible_sequences_construction_new(stop_sequences_df, index_of_extendable, index_of_begin_sequences, index_of_complete_sequences)
route_creation = add_full_sequences_new(stop_sequences_df, route_creation, index_of_complete_sequences)
route_creation = add_unused_sequences_new(stop_sequences_df, route_creation)

route_creation

     route_id                 hash                             stop_sequence  \
0         115 -8715344402839177113  [MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]   
1         115  7308961383809034169  [TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]   
36        115 -8715344402839177113  [MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]   
37        115  7308961383809034169  [TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]   
109       115 -8715344402839177113  [MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]   
110       115  7308961383809034169  [TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]   
148       115 -8715344402839177113  [MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]   
149       115  7308961383809034169  [TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]   
183       115 -8715344402839177113  [MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]   
184       115  7308961383809034169  [TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]   
200       115 -8715344402839177113  [MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]   
201       115  7308961383809034169  [TOU

      route_id                 hash  \
11         243 -7148625589826827818   
12         243 -2716123538852776006   
163        243  4276939365648555575   
474        243 -7150316855204782899   
477        243  7195041732240843477   
910        243 -2875955484167781122   
1018       243 -5135827936711626168   
1438       243 -7150316855204782899   
1520       243  2328365511239321847   
1636       243 -7150316855204782899   
1704       243  2328365511239321847   
2447       243  7195041732240843477   
2616       243 -5135827936711626168   
3001       243 -1343108316726836048   
3235       243  1825679146225433049   
3247       243 -7150316855204782899   
3248       243  7195041732240843477   
3278       243 -5135827936711626168   
3772       243 -5135827936711626168   
3773       243  2328365511239321847   
3785       243 -7150316855204782899   
3792       243 -7150316855204782899   
3843       243 -5135827936711626168   
3859       243  1883510672781335899   
3860       243  241781429

      route_id                 hash  \
35         517 -1478523252589975382   
79         517 -1478523252589975382   
80         517  4083424369975548313   
143        517 -1478523252589975382   
144        517  4083424369975548313   
379        517 -1478523252589975382   
380        517  4083424369975548313   
436        517 -1478523252589975382   
579        517  1761474427411238948   
1200       517  4083424369975548313   
1585       517 -1478523252589975382   
1586       517  4083424369975548313   
3326       517  1761474427411238948   
5466       517  4083424369975548313   
5511       517  5856645604067534896   
5520       517  4083424369975548313   
5521       517  4083424369975548313   
5522       517  4083424369975548313   

                                  stop_sequence  \
35    [LIBRAMONT, BERTRIX, FLORENVILLE, VIRTON]   
79    [LIBRAMONT, BERTRIX, FLORENVILLE, VIRTON]   
80    [VIRTON, FLORENVILLE, BERTRIX, LIBRAMONT]   
143   [LIBRAMONT, BERTRIX, FLORENVILLE, VIRTON]   
144

      route_id                 hash  \
55         242  4059692498839559876   
473        242  7076205979970266383   
1017       242  4059692498839559876   
2031       242 -4248384411814824933   
3346       242  7076205979970266383   
3379       242  4059692498839559876   
3380       242  5908755923806116860   
3758       242 -7675070388608888177   
3759       242 -7675070388608888177   
3760       242  4059692498839559876   
3761       242 -7675070388608888177   
3762       242  4059692498839559876   
3763       242 -7675070388608888177   
3765       242 -7675070388608888177   
3766       242 -7675070388608888177   
3767       242 -7675070388608888177   
3768       242 -2255249650968979007   
3769       242 -2255249650968979007   
3770       242  4059692498839559876   
3771       242  4059692498839559876   
3780       242  4059692498839559876   
3781       242  7076205979970266383   
3784       242  7076205979970266383   

                                          stop_sequence  \
55  

5554                     [17:43:00, 19:43:00, 21:43:00]  
      route_id                 hash  \
82         520  1445668724565158456   
83         520  8841783036238445570   
146        520  8841783036238445570   
735        520  1445668724565158456   
785        520  1445668724565158456   
1463       520  1445668724565158456   
1772       520  1445668724565158456   
1773       520  8841783036238445570   
1792       520  1445668724565158456   
2201       520  1445668724565158456   
2762       520  1445668724565158456   
2918       520  1445668724565158456   
2919       520  8841783036238445570   
4525       520  1445668724565158456   
4526       520  8841783036238445570   
4539       520  1445668724565158456   
4598       520  8841783036238445570   
5528       520  1445668724565158456   
5533       520  8841783036238445570   
5549       520  8841783036238445570   
5555       520  8841783036238445570   

                                          stop_sequence  \
82    [LIBRAMONT, BERTRI

      route_id                 hash  \
132        384  7283122547148435007   
857        384  2670605744160045325   
2964       384 -3528387876644013233   
2966       384  2670605744160045325   

                                          stop_sequence                 dates  \
132   [SAINT-GHISLAIN, QUAREGNON, JEMAPPES, MONS, NI...            {20210417}   
857   [TOURNAI, ANTOING, MAUBRAY, CALLENELLE, PERUWE...  {20210328, 20210327}   
2964  [LIERS, MILMORT, HERSTAL, LIEGE-SAINT-LAMBERT,...            {20210327}   
2966  [TOURNAI, ANTOING, MAUBRAY, CALLENELLE, PERUWE...            {20210327}   

                                    departure_time_last  
132                                          [09:45:00]  
857                                [16:45:00, 17:45:00]  
2964  [09:31:00, 10:31:00, 11:31:00, 12:31:00, 13:31...  
2966  [10:45:00, 11:45:00, 12:45:00, 13:45:00, 14:45...  
     route_id                 hash  \
135       391 -1390942414262072276   
136       391 -10004269403048379

      route_id                 hash  \
173        326 -6750927614857218961   
3279       326 -6978736182281429644   
3280       326 -6750927614857218961   
3774       326 -6978736182281429644   
3775       326 -6750927614857218961   

                                          stop_sequence  \
173   [BRUXELLES-MIDI, BRUXELLES-CHAPELLE, BRUXELLES...   
3279  [SCHAERBEEK, BRUXELLES-NORD, BRUXELLES-CONGRES...   
3280  [BRUXELLES-MIDI, BRUXELLES-CHAPELLE, BRUXELLES...   
3774  [SCHAERBEEK, BRUXELLES-NORD, BRUXELLES-CONGRES...   
3775  [BRUXELLES-MIDI, BRUXELLES-CHAPELLE, BRUXELLES...   

                                                  dates departure_time_last  
173                                {20210529, 20210530}          [05:55:00]  
3279  {20210529, 20210530, 20210321, 20210327, 20210...          [22:21:00]  
3280  {20210529, 20210530, 20210321, 20210327, 20210...          [06:55:00]  
3774  {20210529, 20210530, 20210314, 20210321, 20210...          [23:52:00]  
3775  {20210529, 202

4698  [08:51:00, 09:51:00, 10:51:00, 13:51:00, 14:51...  
      route_id                 hash  \
242        372 -8387043959675013325   
243        372 -6080752863069817388   
244        372  1719328663977354485   
245        372  4451535014980073537   
523        372 -8387043959675013325   
524        372 -6080752863069817388   
525        372  1719328663977354485   
526        372  4451535014980073537   
1656       372 -5473781485371436671   
1657       372  3281612140154028252   
1658       372  4948517410590740151   
1659       372  5022361716855961007   
2297       372 -6080752863069817388   
2638       372 -6080752863069817388   
3122       372  1719328663977354485   
3164       372  6341906606078114547   
3238       372  6092025540634190733   
3858       372  4451535014980073537   
4396       372 -6080752863069817388   
4737       372 -8387043959675013325   
4864       372 -6080752863069817388   
4867       372  1719328663977354485   
4871       372 -6080752863069817388   
4872  

2734                                         [07:27:00]  
      route_id                 hash  \
3027       186 -9111977460614303161   

                                    stop_sequence  \
3027  [BLANKENBERGE, BRUGES-SAINT-PIERRE, BRUGES]   

                                                  dates departure_time_last  
3027  {20210701, 20210702, 20210703, 20210704, 20210...          [22:26:00]  
      route_id                 hash  \
294        211 -6664271691313289715   
295        211  2315590778465703681   
1059       211  2315590778465703681   
1270       211  2315590778465703681   

                                          stop_sequence  \
294   [ANVERS-CENTRAL, ANVERS-BERCHEM, ANVERS-SUD, Z...   
295   [GAND-SAINT-PIERRE, GENTBRUGGE, GAND-DAMPOORT,...   
1059  [GAND-SAINT-PIERRE, GENTBRUGGE, GAND-DAMPOORT,...   
1270  [GAND-SAINT-PIERRE, GENTBRUGGE, GAND-DAMPOORT,...   

                                                  dates  \
294                                {20210320, 202

4567  {20210701, 20210702, 20210703, 20210706, 20210...            [01:00:00]  
      route_id                 hash  \
322        417 -4932554118009358877   
4889       417  5097434673052559346   

                                          stop_sequence                 dates  \
322   [OTTIGNIES, CEROUX-MOUSTY, COURT-SAINT-ETIENNE...  {20210320, 20210321}   
4889  [TAMINES, AISEAU, FARCIENNES, LE CAMPINAIRE, C...  {20210328, 20210530}   

                                    departure_time_last  
322            [08:01:00, 10:01:00, 12:01:00, 14:01:00]  
4889  [08:52:00, 10:52:00, 12:52:00, 14:52:00, 16:52...  
      route_id                 hash  \
338        498  3638471393402577837   
427        498    70385153017668285   
727        498  2661483625860810638   
1184       498 -5915631635308354685   
1186       498    70385153017668285   
1767       498  2661483625860810638   
1789       498  7088116581807816291   
2501       498  1695345922806741328   
2502       498  70881165818078162

5517                                         [19:55:00]  
      route_id                 hash  \
387        664  7756980657873966633   
443        664  5506090888823098952   
886        664 -5902038809981084636   
4076       664  7756980657873966633   
5737       664  7756980657873966633   

                                          stop_sequence  \
387   [GAND-SAINT-PIERRE, DE PINTE, EKE-NAZARETH, GA...   
443   [EEKLO, WAARSCHOOT, SLEIDINGE, EVERGEM, WONDEL...   
886   [EEKLO, WAARSCHOOT, SLEIDINGE, EVERGEM, WONDEL...   
4076  [GAND-SAINT-PIERRE, DE PINTE, EKE-NAZARETH, GA...   
5737  [GAND-SAINT-PIERRE, DE PINTE, EKE-NAZARETH, GA...   

                                                  dates departure_time_last  
387                                {20210424, 20210425}          [22:27:00]  
443                                          {20210314}          [21:36:00]  
886                                {20210328, 20210327}          [22:27:00]  
4076  {20210529, 20210626, 20210508, 202

      route_id                 hash  \
438        531 -7286257291072569518   
439        531 -1801329991852335875   
949        531  8253333483432718156   
2663       531 -7286257291072569518   
4721       531 -1801329991852335875   

                                          stop_sequence  \
438   [LOKEREN, SINAAI, BELSELE, SAINT-NICOLAS, NIEU...   
439   [ANVERS-CENTRAL, ANVERS-BERCHEM, ANVERS-SUD, Z...   
949   [MELSELE, BEVEREN, NIEUWKERKEN-WAAS, SAINT-NIC...   
2663  [LOKEREN, SINAAI, BELSELE, SAINT-NICOLAS, NIEU...   
4721  [ANVERS-CENTRAL, ANVERS-BERCHEM, ANVERS-SUD, Z...   

                                                  dates  \
438                                          {20210314}   
439                                          {20210314}   
949                                {20210523, 20210524}   
2663                                         {20210314}   
4721  {20210403, 20210404, 20210314, 20210320, 20210...   

                                    departure_time_last

     route_id                 hash  \
493       289 -5734841131219186381   

                                         stop_sequence  \
493  [BRUXELLES-NORD, BRUXELLES-CONGRES, BRUXELLES-...   

                                                 dates departure_time_last  
493  {20210701, 20210702, 20210705, 20210706, 20210...          [24:30:00]  
      route_id                 hash  \
494        290  -692947577573549985   
495        290  7267626483043736005   
1547       290 -7007145753817247723   
1843       290 -7328805606577428848   
1844       290   469305911742773617   
3712       290 -7007145753817247723   
3727       290 -7007145753817247723   
3891       290 -6567444460946742377   
4389       290 -6567444460946742377   
4393       290 -6567444460946742377   
4394       290  7267626483043736005   
4395       290  7267626483043736005   

                                          stop_sequence  \
494   [BRUGES, OOSTKAMP, BEERNEM, MARIA-AALTER, AALT...   
495   [BRUSSELS AIRPORT-ZA

2132                                         {20210508}          [23:30:00]  
      route_id                 hash  \
529        392 -1103428246735778235   
3701       392 -5573211191298000631   
4942       392 -5573211191298000631   

                                          stop_sequence  \
529   [CHARLEROI-SUD, MARCHIENNE-AU-PONT, FORCHIES, ...   
3701  [MONS, NIMY, OBOURG, HAVRE, THIEU, BRACQUEGNIE...   
4942  [MONS, NIMY, OBOURG, HAVRE, THIEU, BRACQUEGNIE...   

                                                  dates departure_time_last  
529   {20210701, 20210702, 20210705, 20210706, 20210...          [05:21:00]  
3701  {20210315, 20210316, 20210317, 20210318, 20210...          [23:19:00]  
4942  {20210701, 20210702, 20210705, 20210706, 20210...          [23:17:00]  
      route_id                 hash  \
530        401 -7706471906757626645   
531        401  2749481481042993289   
4955       401 -7706471906757626645   
4956       401  2749481481042993289   
4957       401 -77064

563  {20210701, 20210702, 20210705, 20210706, 20210...  [06:23:00, 07:23:00]  
     route_id                 hash  \
564       496 -2792675665818819656   

                                         stop_sequence  \
564  [GAND-SAINT-PIERRE, MERELBEKE, MELLE, KWATRECH...   

                                                 dates departure_time_last  
564  {20210701, 20210702, 20210705, 20210706, 20210...          [05:29:00]  
      route_id                 hash  \
565        497 -6262885092226317167   
566        497 -3060952569695407696   
1183       497 -6262885092226317167   
1335       497 -6262885092226317167   
2649       497 -3060952569695407696   
3907       497  2640872082649394657   
5401       497  7755435164769936652   

                                          stop_sequence  \
565   [BRUGES, OOSTKAMP, BEERNEM, MARIA-AALTER, AALT...   
566   [MALINES, KAPELLE-OP-DEN-BOS, LONDERZEEL, MALD...   
1183  [BRUGES, OOSTKAMP, BEERNEM, MARIA-AALTER, AALT...   
1335  [BRUGES, OOSTKAMP,

3930                                         [06:12:00]  
     route_id                 hash  \
603       577  5510691155063958306   

                                         stop_sequence  \
603  [NOORDERKEMPEN (BRECHT), ANVERS-LUCHTBAL, ANVE...   

                                                 dates departure_time_last  
603  {20210701, 20210702, 20210705, 20210706, 20210...          [07:51:00]  
     route_id                 hash  \
604       580  2014086824462885914   

                                         stop_sequence  \
604  [LIERRE, KESSEL, NIJLEN, BOUWEL, WOLFSTEE, HER...   

                                                 dates   departure_time_last  
604  {20210701, 20210702, 20210705, 20210706, 20210...  [06:53:00, 07:53:00]  
      route_id                 hash  \
605        581 -8180143146111207349   
606        581 -1523163781393359900   
4272       581 -8180143146111207349   
4273       581 -1523163781393359900   

                                          stop

4752                                [12:46:00]  
      route_id                 hash  \
640        674   759477633652129621   
641        674  7316748874184494770   
1256       674  8421876872232231798   
1383       674  8421876872232231798   
3220       674 -4652674864162047935   
3221       674  6030685233485451679   

                                          stop_sequence  \
640   [GAND-SAINT-PIERRE, DENDERLEEUW, IDDERGEM, OKE...   
641   [GRAMMONT, SCHENDELBEKE, IDEGEM, ZANDBERGEN, A...   
1256  [GRAMMONT, SCHENDELBEKE, IDEGEM, ZANDBERGEN, A...   
1383  [GRAMMONT, SCHENDELBEKE, IDEGEM, ZANDBERGEN, A...   
3220  [GRAMMONT, LIERDE, ZOTTEGEM, BALEGEM-ZUID, BAL...   
3221  [GAND-SAINT-PIERRE, MERELBEKE, MELLE, GONTRODE...   

                                                  dates   departure_time_last  
640   {20210701, 20210702, 20210705, 20210706, 20210...  [17:43:00, 18:46:00]  
641   {20210701, 20210702, 20210705, 20210706, 20210...  [07:13:00, 08:10:00]  
1256  {20210401, 202104

     route_id                 hash  \
699       379 -1541100530165919701   
700       379    79041498109709026   

                                         stop_sequence                 dates  \
699  [NAMUR, MARCHE-LES-DAMES, NAMECHE, SCLAIGNEAUX...  {20210403, 20210404}   
700  [HERSTAL, LIEGE-SAINT-LAMBERT, LIEGE-CARRE, LI...  {20210403, 20210404}   

    departure_time_last  
699          [07:35:00]  
700          [23:28:00]  
      route_id                 hash  \
701        385 -1866056280358760445   
702        385  8535903351610990334   
1760       385  8535903351610990334   
1780       385  8535903351610990334   

                                          stop_sequence                 dates  \
701   [MOUSCRON, HERSEAUX, FROYENNES, TOURNAI, ANTOI...  {20210403, 20210404}   
702   [HERSTAL, LIEGE-SAINT-LAMBERT, LIEGE-CARRE, LI...  {20210403, 20210404}   
1760  [HERSTAL, LIEGE-SAINT-LAMBERT, LIEGE-CARRE, LI...            {20210403}   
1780  [HERSTAL, LIEGE-SAINT-LAMBERT, LIEGE-CAR

      route_id                 hash  \
765        352 -3797960825527799830   
920        352 -6016354482995755432   
3481       352 -5040078956426861262   
3482       352  5316736968006793368   

                                          stop_sequence  \
765   [VILVORDE, HAREN, BORDET, EVERE, MEISER, BRUXE...   
920   [HAL, HUIZINGEN, BEERSEL, MOENSBERG, SAINT-JOB...   
3481  [HAL, HUIZINGEN, BEERSEL, MOENSBERG, SAINT-JOB...   
3482  [MALINES, WEERDE, EPPEGEM, VILVORDE, HAREN, BO...   

                                                  dates  \
765                                {20210410, 20210411}   
920                                {20210522, 20210523}   
3481  {20210406, 20210407, 20210408, 20210409, 20210...   
3482  {20210406, 20210407, 20210408, 20210409, 20210...   

                                    departure_time_last  
765   [09:15:00, 10:15:00, 11:15:00, 12:15:00, 13:15...  
920   [08:23:00, 09:23:00, 10:23:00, 11:23:00, 12:23...  
3481  [06:23:00, 07:23:00, 10:23:00, 1

      route_id                 hash  \
922        406   729818255339550965   
923        406  6611278159959142801   
1502       406   729818255339550965   
2107       406   729818255339550965   
2108       406  6611278159959142801   
2647       406   729818255339550965   
4996       406   729818255339550965   
4997       406   729818255339550965   
4998       406  6611278159959142801   
4999       406   729818255339550965   
5000       406  6611278159959142801   

                                          stop_sequence  \
922   [BRAINE-L'ALLEUD, WATERLOO, DE HOEK, RHODE-SAI...   
923   [LOUVAIN, HERENT, VELTEM, ERPS-KWERPS, KORTENB...   
1502  [BRAINE-L'ALLEUD, WATERLOO, DE HOEK, RHODE-SAI...   
2107  [BRAINE-L'ALLEUD, WATERLOO, DE HOEK, RHODE-SAI...   
2108  [LOUVAIN, HERENT, VELTEM, ERPS-KWERPS, KORTENB...   
2647  [BRAINE-L'ALLEUD, WATERLOO, DE HOEK, RHODE-SAI...   
4996  [BRAINE-L'ALLEUD, WATERLOO, DE HOEK, RHODE-SAI...   
4997  [BRAINE-L'ALLEUD, WATERLOO, DE HOEK, RHODE-SAI...   


3449                                         [01:16:00]  
      route_id                 hash  \
1071       247 -3623093996996615824   
1278       247 -3623093996996615824   
3370       247  1860214891460483536   
3513       247  8874147621072114305   
3537       247  8874147621072114305   
3543       247  8874147621072114305   
3651       247  3617934638352651837   
3694       247  3617934638352651837   
3883       247 -6000714408855093777   
3887       247 -3402725969907686330   
3890       247 -3402725969907686330   
3894       247  3617934638352651837   
3908       247  3617934638352651837   
3909       247  3617934638352651837   
3910       247  3617934638352651837   
3912       247  3617934638352651837   
3913       247  3617934638352651837   
3914       247  3617934638352651837   
3915       247  8874147621072114305   
3916       247  8874147621072114305   
3918       247  8874147621072114305   
3919       247  8874147621072114305   
3920       247  8874147621072114305   
3921  

4615                                         [12:06:00]  
      route_id                 hash  \
1106       322 -7286257291072569518   
1107       322 -1801329991852335875   
1291       322 -7286257291072569518   
1292       322 -1801329991852335875   
2829       322 -7286257291072569518   
4616       322 -7286257291072569518   

                                          stop_sequence  \
1106  [LOKEREN, SINAAI, BELSELE, SAINT-NICOLAS, NIEU...   
1107  [ANVERS-CENTRAL, ANVERS-BERCHEM, ANVERS-SUD, Z...   
1291  [LOKEREN, SINAAI, BELSELE, SAINT-NICOLAS, NIEU...   
1292  [ANVERS-CENTRAL, ANVERS-BERCHEM, ANVERS-SUD, Z...   
2829  [LOKEREN, SINAAI, BELSELE, SAINT-NICOLAS, NIEU...   
4616  [LOKEREN, SINAAI, BELSELE, SAINT-NICOLAS, NIEU...   

                                                  dates  \
1106  {20210401, 20210402, 20210315, 20210316, 20210...   
1107  {20210401, 20210402, 20210315, 20210316, 20210...   
1291  {20210701, 20210702, 20210705, 20210706, 20210...   
1292  {20210701, 2

1318  {20210701, 20210702, 20210705, 20210706, 20210...          [19:03:00]  
      route_id                 hash  \
1144       432 -2290536969035540576   
1145       432  7252804817160899390   
1319       432 -2290536969035540576   
1320       432  7252804817160899390   
3239       432 -2290536969035540576   
3240       432  7252804817160899390   
3429       432 -2290536969035540576   
4773       432  7252804817160899390   
4776       432  7252804817160899390   

                                          stop_sequence  \
1144  [LA LOUVIERE-SUD, LA LOUVIERE- CENTRE, FAMILLE...   
1145  [BRAINE-LE-COMTE, ECAUSSINNES, MARCHE-LEZ-ECAU...   
1319  [LA LOUVIERE-SUD, LA LOUVIERE- CENTRE, FAMILLE...   
1320  [BRAINE-LE-COMTE, ECAUSSINNES, MARCHE-LEZ-ECAU...   
3239  [LA LOUVIERE-SUD, LA LOUVIERE- CENTRE, FAMILLE...   
3240  [BRAINE-LE-COMTE, ECAUSSINNES, MARCHE-LEZ-ECAU...   
3429  [LA LOUVIERE-SUD, LA LOUVIERE- CENTRE, FAMILLE...   
4773  [BRAINE-LE-COMTE, ECAUSSINNES, MARCHE-LEZ-ECAU...   


4439                               [12:29:00, 17:29:00]  
      route_id                 hash  \
1199       515  8510478760374635243   
4221       515  8020541696657389527   

                                          stop_sequence  \
1199  [BERTRIX, LIBRAMONT, NEUFCHATEAU, MARBEHAN, HA...   
4221  [LIBRAMONT, NEUFCHATEAU, MARBEHAN, HABAY, STOC...   

                                                  dates departure_time_last  
1199  {20210401, 20210402, 20210315, 20210316, 20210...          [07:48:00]  
4221  {20210314, 20210703, 20210320, 20210321, 20210...          [07:55:00]  
      route_id                 hash  \
1208       560  4335531464781846185   
1209       560  4945299133244709015   
3196       560  4945299133244709015   
5035       560  4335531464781846185   
5036       560  4945299133244709015   

                                          stop_sequence  \
1208  [ZEEBRUGGE-DORP, LISSEWEGE, BRUGES-SAINT-PIERR...   
1209  [BRUGES, BRUGES-SAINT-PIERRE, LISSEWEGE, ZEEBR...   


4190  {20210701, 20210702, 20210705, 20210706, 20210...  [07:23:00, 08:31:00]  
      route_id                 hash  \
1242       645 -7780857357282629291   
1243       645  4961196333448603422   
1372       645 -7780857357282629291   
1373       645  4961196333448603422   

                                          stop_sequence  \
1242  [SCHAERBEEK, BRUXELLES-NORD, BRUXELLES-CONGRES...   
1243  [BINCHE, LEVAL, LA LOUVIERE-SUD, LA LOUVIERE- ...   
1372  [SCHAERBEEK, BRUXELLES-NORD, BRUXELLES-CONGRES...   
1373  [BINCHE, LEVAL, LA LOUVIERE-SUD, LA LOUVIERE- ...   

                                                  dates departure_time_last  
1242  {20210401, 20210402, 20210315, 20210316, 20210...          [17:56:00]  
1243  {20210401, 20210402, 20210315, 20210316, 20210...          [08:23:00]  
1372  {20210701, 20210702, 20210705, 20210706, 20210...          [17:56:00]  
1373  {20210701, 20210702, 20210705, 20210706, 20210...          [08:23:00]  
      route_id                 hash  \

      route_id                 hash  \
1416       626 -2305193505387945937   
1557       626 -2305193505387945937   
1910       626 -2305193505387945937   
2245       626 -2305193505387945937   
4514       626 -2305193505387945937   
5721       626 -2305193505387945937   

                                        stop_sequence  \
1416  [DINANT, YVOIR, GODINNE, LUSTIN, JAMBES, NAMUR]   
1557  [DINANT, YVOIR, GODINNE, LUSTIN, JAMBES, NAMUR]   
1910  [DINANT, YVOIR, GODINNE, LUSTIN, JAMBES, NAMUR]   
2245  [DINANT, YVOIR, GODINNE, LUSTIN, JAMBES, NAMUR]   
4514  [DINANT, YVOIR, GODINNE, LUSTIN, JAMBES, NAMUR]   
5721  [DINANT, YVOIR, GODINNE, LUSTIN, JAMBES, NAMUR]   

                                                  dates   departure_time_last  
1416  {20210401, 20210315, 20210316, 20210317, 20210...  [05:35:00, 22:08:00]  
1557                                         {20210402}  [05:35:00, 22:08:00]  
1910  {20210406, 20210407, 20210408, 20210409, 20210...  [05:37:00, 22:10:00]  
2245  

      route_id                 hash  \
1874       442 -3148461516341206667   
1875       442  2627423209949497269   

                                          stop_sequence  \
1874  [FLEURUS, LODELINSART, CHARLEROI-OUEST, CHARLE...   
1875  [JAMBES, NAMUR, RONET, FLAWINNE, FLOREFFE, FRA...   

                                                  dates  \
1874  {20210406, 20210407, 20210408, 20210409, 20210...   
1875  {20210406, 20210407, 20210408, 20210409, 20210...   

                                    departure_time_last  
1874                               [07:18:00, 08:18:00]  
1875  [18:07:00, 19:07:00, 20:07:00, 21:07:00, 22:07...  
      route_id                 hash  \
1876       443 -3148461516341206667   
1877       443  2627423209949497269   
3052       443  4375977577959409201   
5120       443  3515659196469645103   

                                          stop_sequence  \
1876  [FLEURUS, LODELINSART, CHARLEROI-OUEST, CHARLE...   
1877  [JAMBES, NAMUR, RONET, FLAWINNE,

2027  {20210318, 20210415}          [16:01:00]  
      route_id                 hash  \
2032       254  8950460413564848175   
2051       254   712157407827530822   
4064       254   712157407827530822   

                                          stop_sequence                 dates  \
2032  [BRAINE-L'ALLEUD, WATERLOO, DE HOEK, RHODE-SAI...  {20210328, 20210329}   
2051  [BRUXELLES-NORD, BRUXELLES-CONGRES, BRUXELLES-...            {20210328}   
4064  [BRUXELLES-NORD, BRUXELLES-CONGRES, BRUXELLES-...  {20210329, 20210531}   

     departure_time_last  
2032          [00:41:00]  
2051          [23:42:00]  
4064          [00:42:00]  
      route_id                 hash  \
2035       162 -6633254760460470495   
2036       162  5465036412654406693   

                                          stop_sequence       dates  \
2035  [TOURNAI, ANTOING, MAUBRAY, CALLENELLE, PERUWE...  {20210328}   
2036  [TAMINES, AISEAU, FARCIENNES, LE CAMPINAIRE, C...  {20210328}   

                             

2462  {20210705, 20210706, 20210707, 20210708, 20210...          [09:09:00]  
      route_id                 hash  \
2465       301  6731895357411274545   
2529       301  6731895357411274545   
2543       301 -9156168843335207179   
2544       301  6731895357411274545   

                                          stop_sequence  \
2465  [LOUVAIN, WEZEMAAL, AARSCHOT, LANGDORP, TESTEL...   
2529  [LOUVAIN, WEZEMAAL, AARSCHOT, LANGDORP, TESTEL...   
2543  [HASSELT, SCHULEN, DIEST, ZICHEM, TESTELT, LAN...   
2544  [LOUVAIN, WEZEMAAL, AARSCHOT, LANGDORP, TESTEL...   

                                                  dates  \
2465  {20210705, 20210706, 20210707, 20210708, 20210...   
2529  {20210315, 20210316, 20210317, 20210318, 20210...   
2543  {20210315, 20210316, 20210317, 20210318, 20210...   
2544  {20210315, 20210316, 20210317, 20210318, 20210...   

                                    departure_time_last  
2465                                         [07:03:00]  
2529              

2932  {20210412, 20210413, 20210414, 20210415, 20210...          [07:08:00]  
      route_id                hash                      stop_sequence  \
3012       238 -825236123809923080  [NEERPELT, OVERPELT, LOMMEL, MOL]   

           dates departure_time_last  
3012  {20210516}          [20:03:00]  
      route_id                 hash  \
3016       690 -6474997686714604748   

                                          stop_sequence       dates  \
3016  [HERENTALS, OLEN, GEEL, MOL, BALEN, BOURG-LEOP...  {20210516}   

     departure_time_last  
3016          [22:05:00]  
      route_id                 hash  \
3017       691 -8041852355316761340   
5751       691  6408324240033621551   

                                          stop_sequence  \
3017  [ESSEN, WILDERT, KALMTHOUT, KIJKUIT, HEIDE, KA...   
5751  [ESSEN, WILDERT, KALMTHOUT, KIJKUIT, HEIDE, KA...   

                                                  dates departure_time_last  
3017                                         {2

3225  {20210315, 20210316, 20210317, 20210318, 20210...          [08:40:00]  
      route_id                 hash  \
3226       699 -4519169726510306106   

                                          stop_sequence  \
3226  [NAMUR, MARCHE-LES-DAMES, NAMECHE, SCLAIGNEAUX...   

                                                  dates departure_time_last  
3226  {20210315, 20210316, 20210317, 20210318, 20210...          [17:28:00]  
      route_id                 hash  \
3227       715 -1259991509826176158   

                                          stop_sequence  \
3227  [MONS, NIMY, OBOURG, HAVRE, THIEU, BRACQUEGNIE...   

                                                  dates departure_time_last  
3227  {20210315, 20210316, 20210317, 20210318, 20210...          [16:56:00]  
      route_id                 hash  \
3228       720 -6643052053965815338   

                                          stop_sequence  \
3228  [ZOTTEGEM, HILLEGEM, HERZELE, TERHAGEN, BURST,...   

                

4292  {20210315, 20210316, 20210317, 20210318, 20210...          [07:40:00]  
      route_id                 hash  \
4296       661 -7349000070474917115   

                                          stop_sequence  \
4296  [BLANKENBERGE, BRUGES-SAINT-PIERRE, BRUGES, ZE...   

                                                  dates   departure_time_last  
4296  {20210315, 20210316, 20210317, 20210318, 20210...  [08:22:00, 17:23:00]  
      route_id                 hash                 stop_sequence  \
4302       675 -6116864292679239510  [ZOTTEGEM, LIERDE, GRAMMONT]   
4753       675  7752717367554616876  [GRAMMONT, LIERDE, ZOTTEGEM]   

                                                  dates departure_time_last  
4302  {20210315, 20210316, 20210317, 20210318, 20210...          [16:27:00]  
4753  {20210630, 20210505, 20210602, 20210317, 20210...          [12:41:00]  
      route_id                 hash  \
4469       649 -1947425326507634380   
5028       649 -1947425326507634380   
5730 

Unnamed: 0,route_id,hash,stop_sequence,frequency
0,115,[7308961383809034169],"[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]",24.0
1,115,[-8715344402839177113],"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",24.0
2,115,[7308961383809034169],"[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]",7.0
3,115,[-8715344402839177113],"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",7.0
4,115,[7308961383809034169],"[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]",1.0
...,...,...,...,...
4573,733,"[900702404119183476, -1170946074405319360]","[BRUXELLES-MIDI, BRUXELLES-CHAPELLE, BRUXELLES...",5.0
4574,733,"[8382849686158626730, 8401325233569951713]","[ROTTERDAM CS (NL), BREDA (NL), NOORDERKEMPEN ...",3.0
4575,733,"[301682476470293667, 8382849686158626730, 8401...","[DEN HAAG HS (NL), ROTTERDAM CS (NL), BREDA (N...",72.0
4576,734,"[340386202383150578, -1170946074405319360]","[BRUXELLES-NORD, SCHAERBEEK, BRUSSELS AIRPORT-...",2.0


In [485]:
'''Finds the routes that can be either extended from behind or from after and those which are complete sequences'''
#NEW
def get_extention_indexes_new(stop_sequences_df):
    '''returns the tree indexes: index_of_extendable, index_of_begin_sequences, index_of_complete_sequences'''
    #intiate the dictionnaries, that will be used to retrieve different rows later on
    index_of_extendable = {}
    index_of_begin_sequences = {}
    index_of_complete_sequences = {}
    for route_id in stop_sequences_df['route_id'].unique():
        #select the route with the route_id selected by the loop iteration
        route_sequences_route_id = select_stop_sequences(stop_sequences_df, route_id)
        for index_trip, trip in route_sequences_route_id.iterrows():
            #checks the extentions possible for the trip that can follow after its last stop
            possible_extentions_after = route_sequences_route_id[route_sequences_route_id['stop_sequence'].apply(lambda x: any(item for item in [trip['stop_sequence'][-1]] if (item == x[0]) and not(set(x[1:]) & set(trip['stop_sequence']))))].copy()
            #checks that those extentions have a common date as the trip
            possible_extentions_after = possible_extentions_after[possible_extentions_after['dates'].apply(lambda x: any(item for item in trip['dates'] if item in x))].copy()   
            if not possible_extentions_after.empty:
                #checks that those extentions have a matching time schedule as the trip
                possible_extentions_after = possible_extentions_after[possible_extentions_after['departure_time_first'].apply(lambda x: any(item for item in trip['departure_time_last'] if item in x))].copy()
            #checks the extentions possible for the trip that can follow before its first stop
            possible_extentions_behind = route_sequences_route_id[route_sequences_route_id['stop_sequence'].apply(lambda x: any(item for item in [trip['stop_sequence'][0]] if (item == x[-1]) and not(set(x[:-1]) & set(trip['stop_sequence']))))].copy()        
            #checks that those extentions have a common date as the trip
            possible_extentions_behind = possible_extentions_behind[possible_extentions_behind['dates'].apply(lambda x: any(item for item in trip['dates'] if item in x))].copy()
            if not possible_extentions_behind.empty:    
                #checks that those extentions have a matching time schedule as the trip
                possible_extentions_behind = possible_extentions_behind[possible_extentions_behind['departure_time_last'].apply(lambda x: any(item for item in trip['departure_time_first'] if item in x))].copy()
            #put all the sequences that can be extended either from the beginning either from the end together
            possible_extentions = possible_extentions_after.append(possible_extentions_behind, ignore_index = True)
            if not possible_extentions.empty:
                if route_id not in index_of_extendable:
                    index_of_extendable[route_id] = []
                index_of_extendable[route_id].append(index_trip)
                if possible_extentions_behind.empty:
                    if route_id not in index_of_begin_sequences:
                        index_of_begin_sequences[route_id] = []
                    index_of_begin_sequences[route_id].append(index_trip)
            elif possible_extentions.empty:
                if route_id not in index_of_complete_sequences:
                    index_of_complete_sequences[route_id] = []
                index_of_complete_sequences[route_id].append(index_trip)
                
    return index_of_extendable, index_of_begin_sequences, index_of_complete_sequences

'Finds the routes that can be either extended from behind or from after and those which are complete sequences'

In [552]:
'''Creates all the sequences of routes possible to reconstruct the real route'''
#NEW
def possible_sequences_construction_new(stop_sequences_df, index_of_extendable, index_of_begin_sequences, index_of_complete_sequences):
    '''returns the first part of the route_creation, two others need to be added'''
    import copy
    #create an empty df for the process of route creation
    route_creation  = pd.DataFrame()
    for route_id in index_of_extendable:
        #checks if some parts are begin sequences, if not, then we can't build routes with multiple sequences
        if route_id in index_of_begin_sequences:
            #create a copy of the df with only the route considered in the loop iteration
            routes_with_route_id = select_stop_sequences(stop_sequences_df, route_id)
            #set default frequency to NaN
            routes_with_route_id['frequency'] = np.nan
            #create a df where only the routes that have an end stop as their first element of the sequence
            route_creation_route_id = routes_with_route_id.loc[index_of_begin_sequences[route_id]][['route_id', 'hash', 'stop_sequence', 'dates', 'departure_time_last','frequency']]
            #create a df with the exentable sequences for that route_id
            route_creation_extensions_route_id = routes_with_route_id.loc[index_of_extendable[route_id]][['route_id', 'hash', 'stop_sequence', 'dates', 'departure_time_first', 'departure_time_last','frequency']]    
            #make the hash column as a column of lists
            route_creation_route_id['hash'] = route_creation_route_id['hash'].apply(lambda x: [x])
            route_creation_route_id = route_creation_route_id.reset_index(drop=True)
            #to stop the while loop when all the routes are complete in the df for the route_id of the loop iteration
            complete_routes = 0
            #print(route_creation_route_id)
            while complete_routes < len(route_creation_route_id.index):
                #use a deepcopy to not impact the iterrows of the main loop
                route_creation_deep_copy = copy.deepcopy(route_creation_route_id)
                for index_original, route_part in route_creation_deep_copy.iterrows():
                    #create a dataframe of the possible extentions for each route_part
                    #select an extention only if the extention is the next part of the route and also that no other station are repeated in the sequence if this extention is added(otherwise it might cause an infinite loop)
                    possible_extentions = route_creation_extensions_route_id[route_creation_extensions_route_id['stop_sequence'].apply(lambda x: any(item for item in [route_part['stop_sequence'][-1]] if (item == x[0]) and not(set(x[1:]) & set(route_part['stop_sequence']))))].copy()
                    #checks that those extentions have a common date as the trip
                    possible_extentions = possible_extentions[possible_extentions['dates'].apply(lambda x: any(item for item in route_part['dates'] if item in x))].copy()   
                    if not possible_extentions.empty:
                        #checks that those extentions have a matching time schedule as the trip
                        possible_extentions = possible_extentions[possible_extentions['departure_time_first'].apply(lambda x: any(item for item in route_part['departure_time_last'] if item in x))].copy()
                    #checks whether any extention fullfilling the criterias has been found
                    #print(possible_extentions)
                    if not possible_extentions.empty:
                        #if so, extend it with every single possibilities
                        for index_extention, possible_extention in possible_extentions.iterrows():
                            #must create a deepcopy, otherwise the orignal hash list will change as well (mutable)
                            updated_hash = copy.deepcopy(route_part['hash'])
                            updated_hash.append(possible_extention['hash'])
                            updated_route_sequence = route_part['stop_sequence'] + possible_extention['stop_sequence'][1:]
                            common_dates = possible_extention['dates'] & route_part['dates']
                            #print(list(possible_extentions['departure_time_first'])[0], list(route_part['departure_time_last']), list(possible_extentions['departure_time_last'])[0])
                            new_departure_time_last = take_leftovers_list_c_from_intersection_AAndB(list(possible_extentions['departure_time_first'])[0], list(route_part['departure_time_last']), list(possible_extentions['departure_time_last'])[0])
                            #print(new_departure_time_last)
                            new_frequency = len(new_departure_time_last) * len(common_dates)
                            route_creation_route_id.loc[max(route_creation_route_id.index)+1] = [route_id, updated_hash, updated_route_sequence, common_dates, new_departure_time_last, new_frequency]
                        #then delete the route with the index (see loop here above)
                        route_creation_route_id = route_creation_route_id.drop(index = index_original)            
                    #the route can't be extended anymore
                    else:
                        complete_routes += 1
            #adds all the possible routes created with the trips of the route_id of the main loop
            route_creation = route_creation.append(route_creation_route_id, ignore_index = True)
    if 'departure_time_last' in route_creation.columns:
        route_creation = route_creation.drop(['dates', 'departure_time_last'], axis=1)
    route_creation = route_creation.reindex(columns=['route_id','hash','stop_sequence', 'frequency'])
    return route_creation

'Creates all the sequences of routes possible to reconstruct the real route'

In [555]:
'''Adds the full sequences to the route_creation dataframe'''
#NEW
def add_full_sequences_new(stop_sequences_df, route_creation, index_of_complete_sequences):
    '''returns the second part of the route_creation, one other needs to be added'''
    for route_id in index_of_complete_sequences:
        #findes all the complete sequences for that route_id
        copy_complete_sequences_df = stop_sequences_df.loc[index_of_complete_sequences[route_id]][['route_id','hash','stop_sequence', 'dates', 'departure_time_last']].copy()
        print(copy_complete_sequences_df)
        copy_complete_sequences_df['number_dates'] = copy_complete_sequences_df['dates'].apply(lambda x: len(x))
        copy_complete_sequences_df['number_times'] = copy_complete_sequences_df['departure_time_last'].apply(lambda x: len(x))
        copy_complete_sequences_df['frequency'] = copy_complete_sequences_df['number_dates']* copy_complete_sequences_df['number_times'] 
        copy_complete_sequences_df = copy_complete_sequences_df.drop(['dates', 'departure_time_last', 'number_dates', 'number_times'], axis=1)
        copy_complete_sequences_df['hash'] = copy_complete_sequences_df['hash'].apply(lambda x: [x])
        #adds each of them in the route_creation dataframe
        for index_complete_sequence, complete_sequence in copy_complete_sequences_df.iterrows():
            route_creation = route_creation.append(complete_sequence, ignore_index = True)
    route_creation = route_creation.sort_values(by=['route_id'], ignore_index = True)
    return route_creation 

'Adds the full sequences to the route_creation dataframe'

In [558]:
'''Adds the sequences that were not yet added in the route_creation dataframe'''
#NEW
def add_unused_sequences_new(stop_sequences_df, route_creation):
    '''returns the third part of the route_creation'''
    for route_id in stop_sequences_df['route_id'].unique():
        if route_id in route_creation['route_id'].unique():
            #get a set of the hashes that were employed to create the routes for that route_id
            used_sequences_hash = set(route_creation[route_creation['route_id'] == route_id].apply(lambda x: pd.Series(x['hash']),axis=1).stack().reset_index(level=1, drop=True))
            #get a tuple of all the route sequences for that route_id
            used_sequences = tuple(route_creation[route_creation['route_id'] == route_id]['stop_sequence'])
            copy_sequences_route_id = select_stop_sequences(stop_sequences_df, route_id)[['route_id','hash','stop_sequence', 'dates', 'departure_time_last']]
            copy_sequences_route_id['number_dates'] = copy_sequences_route_id['dates'].apply(lambda x: len(x))
            copy_sequences_route_id['number_times'] = copy_sequences_route_id['departure_time_last'].apply(lambda x: len(x))
            copy_sequences_route_id['frequency'] = copy_sequences_route_id['number_dates']* copy_sequences_route_id['number_times'] 
            copy_sequences_route_id = copy_sequences_route_id.drop(['dates', 'departure_time_last', 'number_dates', 'number_times'], axis=1)
            copy_sequences_route_id['hash'] = copy_sequences_route_id['hash'].apply(lambda x: [x]) 
            #adds the hashes that were not employed in any route creations for that route_id
            for index_trip, trip in copy_sequences_route_id.iterrows():
                #first element of the list because there is always only one element
                if trip['hash'][0] not in used_sequences_hash:
                    #checks that the sequence is not a sublist of any existing sequences
                    is_subsequence = False
                    for sequence in used_sequences:
                        if set(trip['stop_sequence']).issubset(sequence):
                            is_subsequence = True
                    if not is_subsequence:
                        route_creation = route_creation.append(trip, ignore_index = True)
    return route_creation

'Adds the sequences that were not yet added in the route_creation dataframe'

In [30]:
'''Finds the routes that can be either extended from behind or from after and those which are complete sequences'''

def get_extention_indexes(stop_sequences_df):
    '''returns the tree indexes: index_of_extendable, index_of_begin_sequences, index_of_complete_sequences'''
    #intiate the dictionnaries, that will be used to retrieve different rows later on
    index_of_extendable = {}
    index_of_begin_sequences = {}
    index_of_complete_sequences = {}
    for route_id in stop_sequences_df['route_id'].unique():
        #select the route with the route_id selected by the loop iteration
        route_sequences_route_id = select_stop_sequences(stop_sequences_df, route_id)
        for index_trip, trip in route_sequences_route_id.iterrows():
            #checks the extentions possible for the trip that can follow after its last stop
            possible_extentions_after = route_sequences_route_id[route_sequences_route_id['stop_sequence'].apply(lambda x: any(item for item in [trip['stop_sequence'][-1]] if (item == x[0]) and not(set(x[1:]) & set(trip['stop_sequence']))))].copy()
            #checks that those extentions have a common service_id as the trip
            possible_extentions_after = possible_extentions_after[possible_extentions_after['service_id'].apply(lambda x: any(item for item in trip['service_id'] if item in x))].copy()
            #checks the extentions possible for the trip that can follow before its first stop
            possible_extentions_behind = route_sequences_route_id[route_sequences_route_id['stop_sequence'].apply(lambda x: any(item for item in [trip['stop_sequence'][0]] if (item == x[-1]) and not(set(x[:-1]) & set(trip['stop_sequence']))))].copy()        
            #checks that those extentions have a common service_id as the trip
            possible_extentions_behind = possible_extentions_behind[possible_extentions_behind['service_id'].apply(lambda x: any(item for item in trip['service_id'] if item in x))].copy()
            #put all the sequences that can be extended either from the beginning either from the end together
            possible_extentions = possible_extentions_after.append(possible_extentions_behind, ignore_index = True)
            if not possible_extentions.empty:
                if route_id not in index_of_extendable:
                    index_of_extendable[route_id] = []
                index_of_extendable[route_id].append(index_trip)
                if possible_extentions_behind.empty:
                    if route_id not in index_of_begin_sequences:
                        index_of_begin_sequences[route_id] = []
                    index_of_begin_sequences[route_id].append(index_trip)
            elif possible_extentions.empty:
                if route_id not in index_of_complete_sequences:
                    index_of_complete_sequences[route_id] = []
                index_of_complete_sequences[route_id].append(index_trip)
                
    return index_of_extendable, index_of_begin_sequences, index_of_complete_sequences

'Finds the routes that can be either extended from behind or from after and those which are complete sequences'

In [31]:
'''Creates all the sequences of routes possible to reconstruct the real route'''

def possible_sequences_construction(stop_sequences_df, index_of_extendable, index_of_begin_sequences, index_of_complete_sequences):
    '''returns the first part of the route_creation, two others need to be added'''
    import copy
    #create an empty df for the process of route creation
    route_creation  = pd.DataFrame()
    for route_id in index_of_extendable:
        #checks if some parts are begin sequences, if not, then we can't build routes with multiple sequences
        if route_id in index_of_begin_sequences:
            #create a copy of the df with only the route considered in the loop iteration
            routes_with_route_id = select_stop_sequences(stop_sequences_df, route_id)
            #create a df where only the routes that have an end stop as their first element of the sequence
            route_creation_route_id = routes_with_route_id.loc[index_of_begin_sequences[route_id]][['route_id', 'hash', 'stop_sequence', 'service_id']]
            #create a df with the exentable sequences for that route_id
            route_creation_extensions_route_id = routes_with_route_id.loc[index_of_extendable[route_id]][['route_id', 'hash', 'stop_sequence','service_id']]    
            #make the hash column as a column of lists
            route_creation_route_id['hash'] = route_creation_route_id['hash'].apply(lambda x: [x])
            route_creation_route_id = route_creation_route_id.reset_index(drop=True)
            #to stop the while loop when all the routes are complete in the df for the route_id of the loop iteration
            complete_routes = 0
            while complete_routes < len(route_creation_route_id.index):
                #use a deepcopy to not impact the iterrows of the main loop
                route_creation_deep_copy = copy.deepcopy(route_creation_route_id)
                for index_original, route_part in route_creation_deep_copy.iterrows():
                    #create a dataframe of the possible extentions for each route_part
                    #select an extention only if the extention is the next part of the route and also that no other station are repeated in the sequence if this extention is added(otherwise it might cause an infinite loop)
                    possible_extentions = route_creation_extensions_route_id[route_creation_extensions_route_id['stop_sequence'].apply(lambda x: any(item for item in [route_part['stop_sequence'][-1]] if (item == x[0]) and not(set(x[1:]) & set(route_part['stop_sequence']))))].copy()
                    #take only those extentions that have a common service_id with the route_part
                    possible_extentions = possible_extentions[possible_extentions['service_id'].apply(lambda x: any(item for item in route_part['service_id'] if item in x))].copy()                
                    #checks whether any extention fullfilling the criterias has been found
                    if not (possible_extentions.empty):
                        #if so, extend it with every single possibilities
                        for index_extention, possible_extention in possible_extentions.iterrows():
                            #must create a deepcopy, otherwise the orignal hash list will change as well (mutable)
                            updated_hash = copy.deepcopy(route_part['hash'])
                            updated_hash.append(possible_extention['hash'])
                            updated_route_sequence = route_part['stop_sequence'] + possible_extention['stop_sequence'][1:]
                            common_service_id = possible_extention['service_id'] & route_part['service_id']
                            route_creation_route_id.loc[max(route_creation_route_id.index)+1] = [route_id, updated_hash, updated_route_sequence, common_service_id]
                        #then delete the route with the index (see loop here above)
                        route_creation_route_id = route_creation_route_id.drop(index = index_original)            
                    #the route can't be extended anymore
                    else:
                        complete_routes += 1
            #adds all the possible routes created with the trips of the route_id of the main loop
            route_creation = route_creation.append(route_creation_route_id, ignore_index = True)
    return route_creation

'Creates all the sequences of routes possible to reconstruct the real route'

In [32]:
'''Adds the full sequences to the route_creation dataframe'''

def add_full_sequences(stop_sequences_df, route_creation, index_of_complete_sequences):
    '''returns the second part of the route_creation, one other needs to be added'''
    for route_id in index_of_complete_sequences:
        #findes all the complete sequences for that route_id
        copy_complete_sequences_df = stop_sequences_df.loc[index_of_complete_sequences[route_id]][['route_id','hash','stop_sequence', 'service_id']].copy()
        copy_complete_sequences_df['hash'] = copy_complete_sequences_df['hash'].apply(lambda x: [x])
        #adds each of them in the route_creation dataframe
        for index_complete_sequence, complete_sequence in copy_complete_sequences_df.iterrows():
            route_creation = route_creation.append(complete_sequence, ignore_index = True)
    route_creation = route_creation.sort_values(by=['route_id'], ignore_index = True)
    return route_creation 

'Adds the full sequences to the route_creation dataframe'

In [33]:
'''Adds the sequences that were not yet added in the route_creation dataframe'''

def add_unused_sequences(stop_sequences_df, route_creation):
    '''returns the third part of the route_creation'''
    for route_id in stop_sequences_df['route_id'].unique():
        if route_id in route_creation['route_id'].unique():
            #get a set of the hashes that were employed to create the routes for that route_id
            used_sequences_hash = set(route_creation[route_creation['route_id'] == route_id].apply(lambda x: pd.Series(x['hash']),axis=1).stack().reset_index(level=1, drop=True))
            #get a tuple of all the route sequences for that route_id
            used_sequences = tuple(route_creation[route_creation['route_id'] == route_id]['stop_sequence'])
            copy_sequences_route_id = select_stop_sequences(stop_sequences_df, route_id)[['route_id','hash','stop_sequence', 'service_id']]
            copy_sequences_route_id['hash'] = copy_sequences_route_id['hash'].apply(lambda x: [x]) 
            #adds the hashes that were not employed in any route creations for that route_id
            for index_trip, trip in copy_sequences_route_id.iterrows():
                #first element of the list because there is always only one element
                if trip['hash'][0] not in used_sequences_hash:
                    #checks that the sequence is not a sublist of any existing sequences
                    is_subsequence = False
                    for sequence in used_sequences:
                        if set(trip['stop_sequence']).issubset(sequence):
                            is_subsequence = True
                    if not is_subsequence:
                        route_creation = route_creation.append(trip, ignore_index = True)
    return route_creation

'Adds the sequences that were not yet added in the route_creation dataframe'

In [34]:
'''Calculates the frequency of the constructed routes just made in the route_creation dataframe'''
    
def calculate_frequenty_new_sequences(number_of_trips_per_hash, service_id_count_dates, route_creation):
    '''calculates the frequencies of route_construction_third'''
    #put the default value of the frequency to 0
    route_creation['frequency'] = 0
    for index_sequence, sequence in route_creation[['route_id','hash','service_id']].iterrows():
        #initialize the varibles
        sequence_frequency = 0
        set_common_service_id = sequence['service_id']
        if set_common_service_id:
            #select the number_of_trips_per_hash only for the considered route_id
            number_of_trips_per_hash_route_id = number_of_trips_per_hash[number_of_trips_per_hash['route_id'] == sequence['route_id']]
            #only select the trips with the hash value contained in the sequence and with the same route_id
            containing_hash = number_of_trips_per_hash_route_id[number_of_trips_per_hash_route_id['hash'].apply(lambda x: any(item for item in sequence['hash'] if x == item))]
            #loop over each service_id that were common during the trip
            for service_id in set_common_service_id:
                service_id_number_days = service_id_count_dates[service_id_count_dates['service_id'] == service_id].iloc[0]['count_service_id']
                #adds the minimum number of trips per day multiplied by the number of days in the service_id
                sequence_frequency += containing_hash[containing_hash['service_id'] == service_id]['number_trip_ids'].min() * service_id_number_days
            #adds the frequency in of the new route sequence
            route_creation.loc[index_sequence, 'frequency'] = sequence_frequency
    return route_creation

'Calculates the frequency of the constructed routes just made in the route_creation dataframe'

In [176]:
'''Creates a column in the df that calculates the travel time between the first and last stop (waiting time included)'''
from datetime import datetime
from datetime import timedelta
FMT = '%H:%M:%S'
day_in_seconds = timedelta(days=1).total_seconds()

def give_begin_end_time(route_creation_frequency_single, trips_hash, stops_cleaned_stop_times_trips_merge):
    #create a copy to not change the input DataFrame
    route_creation_frequency_single = route_creation_frequency_single.copy()
    #makes a column with the a representative begin time and end time of the route
    route_creation_frequency_single['travel_time'] = np.nan
    for index_sequence, sequence in route_creation_frequency_single.iterrows():
        constructed_route = pd.DataFrame()
        for index_hash, hash_value in enumerate(sequence['hash']):
            index_plus_one = index_hash + 1
            #take all the trips with that hash
            next_representative_trips = trips_hash[(trips_hash['hash'] == hash_value)].copy()['trip_id']
            #take all the stop sequences and their time that belongs 
            full_times = stops_cleaned_stop_times_trips_merge[stops_cleaned_stop_times_trips_merge['trip_id'].isin(next_representative_trips)].copy()
            #select) only the last stop sequences of full_times for each trip_id
            new_index_max_per_trip_id = full_times.reset_index().groupby(['route_id', 'trip_id'])['stop_sequence'].idxmax()
            max_per_trip_id = full_times.reset_index().loc[new_index_max_per_trip_id]
            #select only the first stop sequences of full_times for each trip_id            
            new_index_min_per_trip_id = full_times.reset_index().groupby(['route_id', 'trip_id'])['stop_sequence'].idxmin()            
            min_per_trip_id = full_times.reset_index().loc[new_index_min_per_trip_id]
            #merge max_per_trip_id and min_per_trip_id
            merged = min_per_trip_id[['trip_id', 'service_id', 'departure_time']].merge(max_per_trip_id[['trip_id', 'arrival_time', 'departure_time']], on='trip_id')
            #take all the stop sequences except the first one, and the last one if it is not the last sequence of the route
            if index_hash == len(sequence['hash']) - 1:
                rest_per_trip_id = full_times.reset_index().drop(pd.concat([new_index_min_per_trip_id,new_index_max_per_trip_id]))
            else:
                rest_per_trip_id = full_times.reset_index().drop(new_index_min_per_trip_id)            
            if not rest_per_trip_id.empty:
                #transform 24:00:00 into 00:00:00
                rest_per_trip_id['departure_time'] = rest_per_trip_id['departure_time'].apply(lambda x: str(int(x[:2])-24) + x[2:] if int(x[:2]) >= 24 else x)
                rest_per_trip_id['arrival_time'] = rest_per_trip_id['arrival_time'].apply(lambda x: str(int(x[:2])-24) + x[2:] if int(x[:2]) >=  24 else x)
                #calculate the waiting_time
                rest_per_trip_id['waiting_time'] = rest_per_trip_id[['arrival_time','departure_time']].apply(lambda x: int((datetime.strptime(x['departure_time'], FMT) - datetime.strptime(x['arrival_time'], FMT)).total_seconds()/60), axis=1)
                #if one day as past, take it into consideration
                rest_per_trip_id['waiting_time'] = rest_per_trip_id['waiting_time'].apply(lambda x: day_in_seconds/60 + x if x < 0 else x)
                #calculate the total waiting_time
                rest_per_trip_id_grouped = rest_per_trip_id.groupby(['trip_id'], as_index=False)['waiting_time'].sum()
                merged_waiting_time = merged.merge(rest_per_trip_id_grouped, on='trip_id')
            #in case there are only two stops in for the hash
            else:
                merged_waiting_time = merged.copy()
                merged_waiting_time['waiting_time'] = 0
            #rename the columns     
            merged_waiting_time = merged_waiting_time.rename(columns = {'trip_id': 'trip_id_' + str(index_plus_one),'departure_time_x':'departure_time_'+ str(index_plus_one), 'arrival_time':'arrival_time_'+ str(index_plus_one),
                                          'departure_time_y':'departure_time_'+ str(index_plus_one + 1), 'waiting_time': 'waiting_time_' + str(index_plus_one)})
            if index_hash == 0:
                constructed_route = merged_waiting_time
            elif index_hash > 0:
                constructed_route = constructed_route.merge(merged_waiting_time, how='inner', left_on=['departure_time_' + str(index_plus_one), 'service_id'], right_on=['departure_time_' + str(index_plus_one), 'service_id'])
        #make a list of all the columns of waiting_times
        list_column_waiting_time = []
        for i in range(1, index_plus_one + 1):
            list_column_waiting_time.append('waiting_time_' + str(i))
        #sum all the waiting times together for each route itinerary
        constructed_route['waiting_time'] = constructed_route[list_column_waiting_time].astype(int).sum(1)
        
        #sometimes it is impossible to find trips that follow each other
        if not constructed_route.empty:
            #when the loop is finished, take the last arrival time, that will be used to calculate the travel time
            time_constructed_route = constructed_route[['departure_time_1', 'arrival_time_' + str(index_plus_one), 'waiting_time']]
            time_constructed_route = time_constructed_route.rename(columns = {'departure_time_1':'departure_time', 'arrival_time_' + str(index_plus_one):'arrival_time'})
            #transform 24:00:00 into 00:00:00
            time_constructed_route['departure_time'] = time_constructed_route['departure_time'].apply(lambda x: str(int(x[:2])-24) + x[2:] if int(x[:2]) >= 24 else x)
            time_constructed_route['arrival_time'] = time_constructed_route['arrival_time'].apply(lambda x: str(int(x[:2])-24) + x[2:] if int(x[:2]) >=  24 else x)
            #calculate the travel time
            time_constructed_route['time_diff_sec'] = time_constructed_route[['arrival_time','departure_time']].apply(lambda x: int((datetime.strptime(x['arrival_time'], FMT) - datetime.strptime(x['departure_time'], FMT)).total_seconds()/60), axis=1)
            #if one day as past, take it into consideration
            time_constructed_route['time_diff_sec'] = time_constructed_route['time_diff_sec'].apply(lambda x: day_in_seconds/60 + x if x < 0 else x)
            #take the first most frequent one
            mode_travel_time = time_constructed_route['time_diff_sec'].mode().mean()
            mode_waiting_time = time_constructed_route['waiting_time'].mode().mean() 
            #Add this to the first dataframe
            route_creation_frequency_single.loc[index_sequence,'travel_time'] = mode_travel_time
            route_creation_frequency_single.loc[index_sequence,'waiting_time'] = mode_waiting_time
        #if there is no trips that follow each other with the hash from the array
        else:
            route_creation_frequency_single = route_creation_frequency_single.drop(index_sequence)
            
    return route_creation_frequency_single

'Creates a column in the df that calculates the travel time between the first and last stop (waiting time included)'

In [177]:
def calculate_hash_route_creation(route_creation): 
    '''calculates the hash and the hash inverse of the route_creation'''
    #copy the route_creation dataFrame
    route_creation_hash = route_creation.copy()
    #calculate the hash and the hash inverse using the lists in stop_sequence
    route_creation_hash['hash'] = route_creation_hash['stop_sequence'].apply(lambda x: hash(tuple(x)))
    route_creation_hash['hash_inverse'] = route_creation_hash['stop_sequence'].apply(lambda x: hash(tuple(x[::-1])))
    return route_creation_hash

In [178]:
'''Regroup the routes that are the same (even though they are in the opposite direction)'''

def regroup_same_stop_sequences(route_creation_hash):
    '''regroups the stop_sequences that are the same'''
    
    route_creation_max_hash = route_creation_hash.copy()
    route_creation_max_hash['max_hash'] = route_creation_max_hash[['hash', 'hash_inverse']].max(axis=1)
    #create a df that sums the frequence of the trips going from opposite directions
    route_creation_max_hash_freq = route_creation_max_hash.groupby(['route_id','max_hash'], as_index = False)[['frequency']].sum()
    #renames the max_hash column into hash so it the dataframe can be merged with route_hash_without_freq
    route_creation_max_hash_freq = route_creation_max_hash_freq.rename(columns = {'max_hash':'hash'})
    #drops the column freq_sequence_route because the one that is of interest is in route_creation_max_hash_freq
    route_hash_without_freq = route_creation_hash.copy().drop(['frequency'], axis = 1)
    route_hash_without_freq = route_hash_without_freq.drop_duplicates(subset=['route_id', 'hash'])
    route_hash_freq_combined_first_merge = pd.merge(route_creation_max_hash_freq, route_hash_without_freq, on=['route_id', 'hash'], how='left')
    #selects the part of the dataset that doesn't have NaN (because for the NaN, their hash_value that was max was the one in hash_inverse and it didn't exist in the other df), so we can concatenate it with the part that had NaN later
    route_hash_freq_first_part = route_hash_freq_combined_first_merge[pd.notnull(route_hash_freq_combined_first_merge['stop_sequence'])]
    #selects one part the part of the dataset that does have NaN, so we can concatenate it with the part that has no NaN later on.
    #but first, we will need to fill those NaN values (done in the code lines behind this one)
    route_hash_freq_second_part = route_hash_freq_combined_first_merge[pd.isnull(route_hash_freq_combined_first_merge['stop_sequence'])][['route_id', 'hash', 'frequency']]
    #renames the hash column into hash_inverse so it the dataframe can be merged with route_hash_without_freq (because it didn't work with 'hash' on the first merge)
    route_hash_freq_second_part = route_hash_freq_second_part.rename(columns = {'hash':'hash_inverse'})
    route_hash_freq_second_part = pd.merge(route_hash_freq_second_part, route_hash_without_freq, on=['route_id', 'hash_inverse'], how='left')
    #the hash that is of interest in the final df will be hash and not hash_inverse
    route_hash_freq_combined_not_sorted = pd.concat([route_hash_freq_first_part, route_hash_freq_second_part])
    route_hash_freq_combined = route_hash_freq_combined_not_sorted.sort_values(by = ['route_id'])
    route_hash_freq_combined = route_hash_freq_combined.reset_index(drop = True)
    return route_hash_freq_combined

'Regroup the routes that are the same (even though they are in the opposite direction)'

In [179]:
'''Deletes the routes that do not represent 10% of the total route frequency and creates new route, if some of them are different'''

def apply_treshold_route_creation(route_hash_freq_combined): 
    #calculates the total frequency per route_id
    frequency_each_route = route_hash_freq_combined.groupby(['route_id'], as_index = False)['frequency'].sum()
    frequency_treshold = frequency_each_route.copy()
    #calculates the treshold (here 10%)
    frequency_treshold['frequency'] = frequency_treshold['frequency']/10
    frequency_treshold.rename(columns = {'frequency':'frequency_treshold'}, inplace = True)
    route_hash_freq_treshold = route_hash_freq_combined.merge(frequency_treshold, on='route_id', how = 'left')
    #find the sequences that are not more than 10% of the route frequency and delete them
    index_names = route_hash_freq_treshold[route_hash_freq_treshold['frequency'] < route_hash_freq_treshold['frequency_treshold']].index
    route_hash_freq_treshold.drop(index_names, inplace = True)
    #drop the routes with the same hash as others
    route_hash_freq_treshold['max_hash'] = route_hash_freq_treshold[['hash', 'hash_inverse']].max(axis=1)
    route_hash_freq_treshold = route_hash_freq_treshold.drop_duplicates(subset='max_hash')
    route_hash_freq_treshold  = route_hash_freq_treshold.drop(['hash_inverse', 'max_hash'], axis = 1)
    #selects the sequences that are not the first most frequent per route_id
    sequences_max_freq = route_hash_freq_treshold.groupby(['route_id'],as_index = False)['frequency'].max()
    sequences_max_freq.rename(columns = {'frequency':'max_frequency'}, inplace = True)
    sequences_max_freq_merged = route_hash_freq_treshold.merge(sequences_max_freq, on='route_id', how='left')
    sequences_max_freq_index = sequences_max_freq_merged[sequences_max_freq_merged['frequency'] == sequences_max_freq_merged['max_frequency']].drop_duplicates(subset='route_id').index
    sequences_non_max_freq_index = sequences_max_freq_merged[~sequences_max_freq_merged.index.isin(sequences_max_freq_index)].index
    #those selected sequences get a new route_id that starts from routes['route_id'].max() + 1 and increments by one for each new route
    route_id_creation =  route_hash_freq_combined['route_id'].max() + 1
    new_route_id_column = list(range(route_id_creation, route_id_creation + len(sequences_non_max_freq_index)))    
    sequences_max_freq_merged.loc[sequences_non_max_freq_index, 'route_id'] = new_route_id_column
    sequences_max_freq_merged = sequences_max_freq_merged.sort_values(by=['route_id'],ignore_index=True)
    #keep only the column route_id and stop_sequence
    final_routes = sequences_max_freq_merged.drop(sequences_max_freq_merged[sequences_max_freq_merged['frequency'] == 0].index)
    final_routes = final_routes.drop(columns=['hash', 'frequency', 'frequency_treshold', 'max_frequency', 'service_id'])
    return final_routes

'Deletes the routes that do not represent 10% of the total route frequency and creates new route, if some of them are different'

In [180]:
''' To keep only the routes that have at least one belgian station in their route_sequence'''

def keep_belgian_routes(final_routes):
    non_belgian_routes = set()
    for index_route, route in final_routes.iterrows():
        is_in_Belgium = False
        for stop in route['stop_sequence']:
            if stop in set(belgian_stops_Belgium_series):
                is_in_Belgium = True
                break
        if not is_in_Belgium:
            route_id = route['route_id']
            non_belgian_routes.add(route_id)
    belgian_routes = final_routes.loc[~final_routes['route_id'].isin(non_belgian_routes)]
    
    return belgian_routes

' To keep only the routes that have at least one belgian station in their route_sequence'

In [181]:
'''Makes a set that can be used for building the edges of the graph using Networkx package'''

def create_df_for_Networkx(final_routes):
    '''return df_for_edges a df that can be used to build a Networkx L-space graph'''
    #takes the list stop sequence and make it a new column for each stop
    stop_sequence_values = final_routes.apply(lambda x: pd.Series(x['stop_sequence']),axis=1).stack().reset_index(level=1, drop=True)
    stop_sequence_values.name = 'stop_sequence'
    final_routes_stops = final_routes.drop('stop_sequence', axis=1).join(stop_sequence_values)
    final_routes_stops = final_routes_stops.reset_index(drop=True)
    #Creates a shifted instance of the df to use it for the final result
    final_routes_stops_shifted = final_routes_stops.shift()
    #Check if which of the rows are followed by a row with the same trip_id
    final_routes_stops_shifted['match'] = final_routes_stops_shifted['route_id'].eq(final_routes_stops['route_id'])
    #Drop the rows for which this condition is not satisfied
    final_routes_stops_shifted.drop(final_routes_stops_shifted[final_routes_stops_shifted['match'] == False].index, inplace = True)
    final_routes_stops_shifted.rename(columns=
      {"stop_sequence": "stop_name_1",
      "stop_name": "stop_name_1"}, inplace=True)
    #joins the df with its shifted version sothat each sequence of two stations is represented in the table as a row
    df_for_edges = final_routes_stops_shifted.join(final_routes_stops[['stop_sequence']], lsuffix='_caller', rsuffix='_other', how='left')
    df_for_edges.rename(columns=
      {"stop_sequence": "stop_name_2",
      "stop_name": "stop_name_2"}, inplace=True)

    df_for_edges['route_id'] = df_for_edges['route_id'].astype(np.int64)
    df_for_edges = df_for_edges.drop_duplicates()
    df_for_edges = df_for_edges[['route_id','stop_name_1', 'stop_name_2']]
    df_for_edges = df_for_edges.reset_index(drop=True)
    return df_for_edges

'Makes a set that can be used for building the edges of the graph using Networkx package'

# To apply the route creation function

In [182]:
#Sort the df to get always the right order of rows 
distinct_stop_sequences_sorted = distinct_stop_sequences.sort_values(by=['route_id', 'hash']).copy()
route_hash_service_freq_sorted = route_hash_service_freq.sort_values(by=['route_id','hash']).copy()

In [183]:
def full_route_creation(stop_sequences_df, number_of_trips_per_hash, service_id_count_dates, trips_hash, stops_cleaned_stop_times_trips_merge):
    '''return a df that can be used to make a Networkx L-space (with treshold applied of 10%)'''
    index_of_extendable, index_of_begin_sequences, index_of_complete_sequences = get_extention_indexes(stop_sequences_df)
    route_creation_first = possible_sequences_construction(stop_sequences_df, index_of_extendable, index_of_begin_sequences, index_of_complete_sequences)
    route_creation_second = add_full_sequences(stop_sequences_df, route_creation_first, index_of_complete_sequences)
    route_creation_third = add_unused_sequences(stop_sequences_df, route_creation_second)
    route_creation_frequency_single = calculate_frequenty_new_sequences(number_of_trips_per_hash, service_id_count_dates, route_creation_third)
    route_creation_frequency_single_travel_time = give_begin_end_time(route_creation_frequency_single, trips_hash, stops_cleaned_stop_times_trips_merge)
    route_creation_hash = calculate_hash_route_creation(route_creation_frequency_single_travel_time)
    route_hash_freq_combined = regroup_same_stop_sequences(route_creation_hash)
    final_routes = apply_treshold_route_creation(route_hash_freq_combined)
    belgian_routes = keep_belgian_routes(final_routes)
    df_for_edges = create_df_for_Networkx(belgian_routes)
    
    return route_creation_frequency_single, route_creation_frequency_single_travel_time ,belgian_routes, df_for_edges

In [184]:
route_creation_frequency_single, route_creation_frequency_single_travel_time, belgian_routes_Belgium, df_for_edges_Belgium = full_route_creation(distinct_stop_sequences_sorted, route_hash_service_freq_sorted, service_id_df, trips_hash, stops_cleaned_stop_times_trips_merge)
route_creation_frequency_single
belgian_routes_Belgium
df_for_edges_Belgium

Unnamed: 0,route_id,hash,stop_sequence,service_id,frequency
0,115,[-8715344402839177113],"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]","{42, 14, 25, 122, 93, 31}",64
1,115,[7308961383809034169],"[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]","{42, 14, 25, 122, 93, 31}",64
2,116,[-7343106737481693615],"[KNOKKE, DUINBERGEN, HEIST, BRUGES-SAINT-PIERR...","{33, 5, 172, 15, 16, 733}",342
3,116,[-8491097663875580788],"[BRUGES, BRUGES-SAINT-PIERRE, HEIST, DUINBERGE...","{33, 5, 15, 16, 212, 733}",316
4,117,[-7176509651129648319],"[SPA-GERONSTERE, SPA, FRANCHIMONT, THEUX, JUSL...","{26, 43, 28}",58
...,...,...,...,...,...
1525,733,"[7725302905947260432, -1170946074405319360]","[BRUXELLES-MIDI, BRUXELLES-CHAPELLE, BRUXELLES...",{25},1
1526,733,"[301682476470293667, 8382849686158626730, 8401...","[DEN HAAG HS (NL), ROTTERDAM CS (NL), BREDA (N...","{1555, 1566}",106
1527,733,"[900702404119183476, -4488299857125108449, 759...","[BRUXELLES-MIDI, BRUXELLES-CHAPELLE, BRUXELLES...","{1553, 1563, 1565}",71
1528,734,"[340386202383150578, -1170946074405319360]","[BRUXELLES-NORD, SCHAERBEEK, BRUSSELS AIRPORT-...",{25},2


Unnamed: 0,route_id,stop_sequence,travel_time,waiting_time
0,115,"[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]",16.0,0.0
1,116,"[KNOKKE, DUINBERGEN, HEIST, BRUGES-SAINT-PIERR...",20.0,0.0
2,117,"[VERVIERS-CENTRAL, PEPINSTER, PEPINSTER-CITE, ...",28.0,3.0
3,118,"[GAND-SAINT-PIERRE, DE PINTE, DEINZE, AARSELE,...",69.0,4.0
4,119,"[GRAMMONT, SCHENDELBEKE, IDEGEM, ZANDBERGEN, A...",27.0,1.0
...,...,...,...,...
544,794,"[LOUVAIN, HERENT, VELTEM, ERPS-KWERPS, KORTENB...",66.5,7.5
545,795,"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI, LEUZE...",143.0,24.0
546,796,"[BINCHE, LEVAL, LA LOUVIERE-SUD, LA LOUVIERE- ...",158.0,21.0
547,797,"[POPERINGE, YPRES, COMINES, WERVIK, MENIN, WEV...",186.5,26.5


Unnamed: 0,route_id,stop_name_1,stop_name_2
0,115,TOURNAI,FROYENNES
1,115,FROYENNES,HERSEAUX
2,115,HERSEAUX,MOUSCRON
3,116,KNOKKE,DUINBERGEN
4,116,DUINBERGEN,HEIST
...,...,...,...
10197,798,ERPS-KWERPS,KORTENBERG
10198,798,KORTENBERG,NOSSEGEM
10199,798,NOSSEGEM,BRUSSELS AIRPORT-ZAVENTEM
10200,798,BRUSSELS AIRPORT-ZAVENTEM,SCHAERBEEK


In [185]:
df_for_edges_Belgium.to_csv(r'/Users/pol/Desktop/CSV_export/df_for_edges_Belgium.csv', index = False, header=True, encoding='utf-8-sig')

In [186]:
belgian_routes_Belgium.to_csv(r'/Users/pol/Desktop/CSV_export/belgian_routes_Belgium.csv', index = False, header=True, encoding='utf-8-sig')

# test

In [190]:
stop_sequences_df, number_of_trips_per_hash, service_id_count_dates, trips_hash, stops_cleaned_stop_times_trips_merge = distinct_stop_sequences_sorted, route_hash_service_freq_sorted, service_id_df, trips_hash, stops_cleaned_stop_times_trips_merge
index_of_extendable, index_of_begin_sequences, index_of_complete_sequences = get_extention_indexes(stop_sequences_df)
route_creation_first = possible_sequences_construction(stop_sequences_df, index_of_extendable, index_of_begin_sequences, index_of_complete_sequences)
route_creation_second = add_full_sequences(stop_sequences_df, route_creation_first, index_of_complete_sequences)
route_creation_third = add_unused_sequences(stop_sequences_df, route_creation_second)
route_creation_frequency_single = calculate_frequenty_new_sequences(number_of_trips_per_hash, service_id_count_dates, route_creation_third)
route_creation_frequency_single_travel_time = give_begin_end_time(route_creation_frequency_single, trips_hash, stops_cleaned_stop_times_trips_merge)
route_creation_hash = calculate_hash_route_creation(route_creation_frequency_single_travel_time)
route_hash_freq_combined = regroup_same_stop_sequences(route_creation_hash)
final_routes = apply_treshold_route_creation(route_hash_freq_combined)
belgian_routes = keep_belgian_routes(final_routes)
df_for_edges = create_df_for_Networkx(belgian_routes)

In [197]:
route_creation_frequency_single_travel_time = give_begin_end_time(route_creation_frequency_single, trips_hash, stops_cleaned_stop_times_trips_merge)

[-8715344402839177113]
-8715344402839177113
      index  route_id  service_id  \
1        22       115          14   
2        21       115          14   
5       182       115          42   
6       181       115          42   
9        26       115          14   
..      ...       ...         ...   
222  413380       607         187   
225  413385       607        1454   
226  413384       607        1454   
229  421731       701          57   
230  421730       701          57   

                                         trip_id trip_headsign  \
1    88____:007::8885704:8885001:4:1023:20210418       TOURNAI   
2    88____:007::8885704:8885001:4:1023:20210418       TOURNAI   
5    88____:007::8885704:8885001:4:1023:20210530       TOURNAI   
6    88____:007::8885704:8885001:4:1023:20210530       TOURNAI   
9    88____:007::8885704:8885001:4:1123:20210418       TOURNAI   
..                                           ...           ...   
222   88____:044::8885704:8885001:4:504:20210402 

      index  route_id  service_id  \
1      1147       117          43   
2      1154       117          43   
3      1153       117          43   
4      1152       117          43   
5      1151       117          43   
..      ...       ...         ...   
154  348201       470        1099   
155  348200       470        1099   
156  348199       470        1099   
157  348198       470        1099   
158  348197       470        1099   

                                         trip_id   trip_headsign  \
1    88____:046::8844008:8844420:8:1022:20210411  SPA-GERONSTERE   
2    88____:046::8844008:8844420:8:1022:20210411  SPA-GERONSTERE   
3    88____:046::8844008:8844420:8:1022:20210411  SPA-GERONSTERE   
4    88____:046::8844008:8844420:8:1022:20210411  SPA-GERONSTERE   
5    88____:046::8844008:8844420:8:1022:20210411  SPA-GERONSTERE   
..                                           ...             ...   
154   88____:046::8844008:8844420:8:822:20211211  SPA-GERONSTERE   
155   88___

      index  route_id  service_id  \
1      2168       121          14   
2      2169       121          14   
3      2170       121          14   
6      2173       121          14   
7      2174       121          14   
..      ...       ...         ...   
232  418798       650        1517   
233  418799       650        1517   
236  418792       650         471   
237  418793       650         471   
238  418794       650         471   

                                         trip_id trip_headsign  \
1    88____:007::8872009:8871308:5:1055:20210418        LUTTRE   
2    88____:007::8872009:8871308:5:1055:20210418        LUTTRE   
3    88____:007::8872009:8871308:5:1055:20210418        LUTTRE   
6    88____:007::8872009:8871308:5:1155:20210418        LUTTRE   
7    88____:007::8872009:8871308:5:1155:20210418        LUTTRE   
..                                           ...           ...   
232   88____:044::8872009:8871308:5:721:20211210        LUTTRE   
233   88____:044::8872009:8

40  5.313890  
[-936373227176635266, -5253903085747018694]
-936373227176635266
      index  route_id  service_id                                     trip_id  \
1      4563       122          24  88____:007::8864006:8864501:7:654:20210502   
2      4569       122          24  88____:007::8864006:8864501:7:654:20210502   
3      4568       122          24  88____:007::8864006:8864501:7:654:20210502   
4      4567       122          24  88____:007::8864006:8864501:7:654:20210502   
5      4566       122          24  88____:007::8864006:8864501:7:654:20210502   
..      ...       ...         ...                                         ...   
205  417741       632         266  88____:044::8864006:8864501:7:547:20211210   
206  417740       632         266  88____:044::8864006:8864501:7:547:20211210   
207  417739       632         266  88____:044::8864006:8864501:7:547:20211210   
208  417738       632         266  88____:044::8864006:8864501:7:547:20211210   
209  417737       632         

     index  route_id  service_id                                      trip_id  \
1     4683       123         155  88____:007::8872009:8814209:7:1037:20210411   
2     4684       123         155  88____:007::8872009:8814209:7:1037:20210411   
3     4685       123         155  88____:007::8872009:8814209:7:1037:20210411   
4     4686       123         155  88____:007::8872009:8814209:7:1037:20210411   
5     4687       123         155  88____:007::8872009:8814209:7:1037:20210411   
..     ...       ...         ...                                          ...   
134   4676       123         155   88____:007::8872009:8814209:7:937:20210411   
135   4677       123         155   88____:007::8872009:8814209:7:937:20210411   
136   4678       123         155   88____:007::8872009:8814209:7:937:20210411   
137   4679       123         155   88____:007::8872009:8814209:7:937:20210411   
138   4680       123         155   88____:007::8872009:8814209:7:937:20210411   

    trip_headsign          

      index  route_id  service_id  \
1      5038       124          41   
2      5037       124          41   
4      5041       124          41   
5      5040       124          41   
7      5044       124          41   
..      ...       ...         ...   
191  158754       280          72   
193  215099       321         191   
194  215098       321         191   
196  215074       321         189   
197  215073       321         189   

                                         trip_id      trip_headsign  \
1    88____:007::8894201:8893401:3:1054:20210502           TERMONDE   
2    88____:007::8894201:8893401:3:1054:20210502           TERMONDE   
4    88____:007::8894201:8893401:3:1154:20210502           TERMONDE   
5    88____:007::8894201:8893401:3:1154:20210502           TERMONDE   
7    88____:007::8894201:8893401:3:1254:20210502           TERMONDE   
..                                           ...                ...   
191   88____:007::8894201:8893401:3:929:20210425  GAND-SAI

      index  route_id  service_id  \
1      5656       129          15   
2      5655       129          15   
3      5652       129          15   
4      5653       129          15   
7      5662       129          15   
..      ...       ...         ...   
622  383583       501          81   
625  383644       501        1298   
626  383643       501        1298   
627  383640       501        1298   
628  383641       501        1298   

                                         trip_id trip_headsign  \
1    88____:007::8841004:8841673:6:1056:20210321         LIERS   
2    88____:007::8841004:8841673:6:1056:20210321         LIERS   
3    88____:007::8841004:8841673:6:1056:20210321         LIERS   
4    88____:007::8841004:8841673:6:1056:20210321         LIERS   
7    88____:007::8841004:8841673:6:1256:20210321         LIERS   
..                                           ...           ...   
622   88____:046::8841004:8841673:6:910:20210328         LIERS   
625   88____:046::8841004:8

     index  route_id  service_id                                      trip_id  \
1     8500       131          56  88____:007::8821006:8821600:5:1025:20210523   
2     8499       131          56  88____:007::8821006:8821600:5:1025:20210523   
3     8498       131          56  88____:007::8821006:8821600:5:1025:20210523   
4     8497       131          56  88____:007::8821006:8821600:5:1025:20210523   
6     8505       131          42  88____:007::8821006:8821600:5:1025:20210530   
..     ...       ...         ...                                          ...   
174   8481       131          56   88____:007::8821006:8821600:5:925:20210523   
176   8489       131          42   88____:007::8821006:8821600:5:925:20210530   
177   8488       131          42   88____:007::8821006:8821600:5:925:20210530   
178   8487       131          42   88____:007::8821006:8821600:5:925:20210530   
179   8486       131          42   88____:007::8821006:8821600:5:925:20210530   

    trip_headsign          

      index  route_id  service_id  \
1      9386       133          71   
2      9385       133          71   
3      9378       133          71   
4      9379       133          71   
5      9380       133          71   
..      ...       ...         ...   
324  377060       491         714   
325  377064       491         714   
326  377063       491         714   
327  377062       491         714   
328  377061       491         714   

                                          trip_id trip_headsign  \
1    88____:046::8841004:8831005:11:1023:20210423       HASSELT   
2    88____:046::8841004:8831005:11:1023:20210423       HASSELT   
3    88____:046::8841004:8831005:11:1023:20210423       HASSELT   
4    88____:046::8841004:8831005:11:1023:20210423       HASSELT   
5    88____:046::8841004:8831005:11:1023:20210423       HASSELT   
..                                            ...           ...   
324   88____:L72::8841004:8831005:11:923:20211210       HASSELT   
325   88____:L72::8

      index  route_id  service_id  \
1      9972       137          40   
2      9971       137          40   
3      9970       137          40   
4      9968       137          40   
7      9978       137          40   
..      ...       ...         ...   
142  420353       678         824   
145  421790       702         734   
146  421789       702         734   
147  421788       702         734   
148  421786       702         734   

                                         trip_id      trip_headsign  \
1    88____:007::8896008:8892007:6:1049:20210416  GAND-SAINT-PIERRE   
2    88____:007::8896008:8892007:6:1049:20210416  GAND-SAINT-PIERRE   
3    88____:007::8896008:8892007:6:1049:20210416  GAND-SAINT-PIERRE   
4    88____:007::8896008:8892007:6:1049:20210416  GAND-SAINT-PIERRE   
7    88____:007::8896008:8892007:6:1149:20210416  GAND-SAINT-PIERRE   
..                                           ...                ...   
142   88____:044::8896008:8892007:6:820:20211210  GAND-SAI

[-5972308920044671357]
-5972308920044671357
     index  route_id  service_id  \
1    10516       141          28   
2    10517       141          28   
3    10518       141          28   
4    10519       141          28   
5    10520       141          28   
..     ...       ...         ...   
544  10491       141          14   
545  10509       141          14   
546  10510       141          14   
547  10511       141          14   
548  10512       141          14   

                                          trip_id trip_headsign  \
1    88____:007::8814001:8861200:22:1029:20210411      GEMBLOUX   
2    88____:007::8814001:8861200:22:1029:20210411      GEMBLOUX   
3    88____:007::8814001:8861200:22:1029:20210411      GEMBLOUX   
4    88____:007::8814001:8861200:22:1029:20210411      GEMBLOUX   
5    88____:007::8814001:8861200:22:1029:20210411      GEMBLOUX   
..                                            ...           ...   
544   88____:007::8814001:8861200:22:929:20210418     

Index: []
[-219529755453204261]
-219529755453204261
      index  route_id  service_id  \
1     12096       147         111   
2     12095       147         111   
5    334969       453        1135   
6    334968       453        1135   
9    335039       453        1135   
10   335038       453        1135   
13   335109       453        1135   
14   335108       453        1135   
17   335225       453        1135   
18   335224       453        1135   
21   335229       453        1130   
22   335228       453        1130   
25   335315       453        1135   
26   335314       453        1135   
29   334899       453        1135   
30   334898       453        1135   
33   339396       460        1157   
34   339395       460        1157   
37   339485       460        1157   
38   339484       460        1157   
41   339574       460        1157   
42   339573       460        1157   
45   339663       460        1157   
46   339662       460        1157   
49   339752       460  

     index  route_id  service_id                                      trip_id  \
1    12572       150          17   88____:007::8872009:8874609:7:705:20210328   
2    12573       150          17   88____:007::8872009:8874609:7:705:20210328   
3    12574       150          17   88____:007::8872009:8874609:7:705:20210328   
4    12575       150          17   88____:007::8872009:8874609:7:705:20210328   
5    12576       150          17   88____:007::8872009:8874609:7:705:20210328   
8   297503       390         102  88____:007::8872009:8874609:7:2306:20210327   
9   297504       390         102  88____:007::8872009:8874609:7:2306:20210327   
10  297505       390         102  88____:007::8872009:8874609:7:2306:20210327   
11  297506       390         102  88____:007::8872009:8874609:7:2306:20210327   
12  297507       390         102  88____:007::8872009:8874609:7:2306:20210327   
15  297510       390          37  88____:007::8872009:8874609:7:2306:20210410   
16  297511       390        

     index  route_id  service_id  \
1    13741       155         184   
2    13742       155         184   
3    13743       155         184   
4    13744       155         184   
5    13745       155         184   
..     ...       ...         ...   
764  13701       155         183   
765  13702       155         183   
766  13703       155         183   
767  13704       155         183   
768  13705       155         183   

                                          trip_id trip_headsign  \
1    88____:046::8812005:8833001:11:1010:20210627       LOUVAIN   
2    88____:046::8812005:8833001:11:1010:20210627       LOUVAIN   
3    88____:046::8812005:8833001:11:1010:20210627       LOUVAIN   
4    88____:046::8812005:8833001:11:1010:20210627       LOUVAIN   
5    88____:046::8812005:8833001:11:1010:20210627       LOUVAIN   
..                                            ...           ...   
764   88____:046::8812005:8833001:11:918:20210321       LOUVAIN   
765   88____:046::8812005:88330

      index  route_id  service_id  \
1     14931       156          28   
2     14932       156          28   
3     14933       156          28   
4     14934       156          28   
5     14935       156          28   
..      ...       ...         ...   
239  414652       609        6235   
240  414653       609        6235   
241  414654       609        6235   
242  414655       609        6235   
243  414656       609        6235   

                                         trip_id      trip_headsign  \
1    88____:046::8892601:8892007:7:1003:20210411  GAND-SAINT-PIERRE   
2    88____:046::8892601:8892007:7:1003:20210411  GAND-SAINT-PIERRE   
3    88____:046::8892601:8892007:7:1003:20210411  GAND-SAINT-PIERRE   
4    88____:046::8892601:8892007:7:1003:20210411  GAND-SAINT-PIERRE   
5    88____:046::8892601:8892007:7:1003:20210411  GAND-SAINT-PIERRE   
..                                           ...                ...   
239  88____:G70::8892601:8892007:7:2203:20210404          

[2895643963109709058]
2895643963109709058
     index  route_id  service_id  \
1    16439       159          15   
2    16450       159          15   
3    16451       159          15   
4    16452       159          15   
5    16453       159          15   
..     ...       ...         ...   
750  16420       159          15   
751  16421       159          15   
752  16422       159          15   
753  16427       159          15   
754  16428       159          15   

                                          trip_id trip_headsign  \
1    88____:007::8872009:8885704:27:1041:20210321      MOUSCRON   
2    88____:007::8872009:8885704:27:1041:20210321      MOUSCRON   
3    88____:007::8872009:8885704:27:1041:20210321      MOUSCRON   
4    88____:007::8872009:8885704:27:1041:20210321      MOUSCRON   
5    88____:007::8872009:8885704:27:1041:20210321      MOUSCRON   
..                                            ...           ...   
750   88____:007::8872009:8885704:27:941:20210321      M

     index  route_id  service_id  \
1    17294       160          28   
2    17293       160          28   
3    17292       160          28   
4    17291       160          28   
5    17290       160          28   
..     ...       ...         ...   
351  17277       160          28   
352  17278       160          28   
353  17279       160          28   
354  17280       160          28   
355  17281       160          28   

                                          trip_id    trip_headsign  \
1    88____:007::8885704:8882206:21:1026:20210411  LA LOUVIERE-SUD   
2    88____:007::8885704:8882206:21:1026:20210411  LA LOUVIERE-SUD   
3    88____:007::8885704:8882206:21:1026:20210411  LA LOUVIERE-SUD   
4    88____:007::8885704:8882206:21:1026:20210411  LA LOUVIERE-SUD   
5    88____:007::8885704:8882206:21:1026:20210411  LA LOUVIERE-SUD   
..                                            ...              ...   
351   88____:007::8885704:8882206:21:926:20210411  LA LOUVIERE-SUD   
352   8

[1228935386793170234]
1228935386793170234
    index  route_id  service_id                                      trip_id  \
1   19021       164         141  88____:046::8822004:8822111:3:1017:20210502   
4   19024       164         141  88____:046::8822004:8822111:3:1117:20210502   
7   19027       164         141  88____:046::8822004:8822111:3:1217:20210502   
10  19030       164         141  88____:046::8822004:8822111:3:1317:20210502   
13  19033       164         141  88____:046::8822004:8822111:3:1417:20210502   
16  19036       164         141  88____:046::8822004:8822111:3:1517:20210502   
19  19039       164         141  88____:046::8822004:8822111:3:1617:20210502   
22  19042       164         141  88____:046::8822004:8822111:3:1717:20210502   
25  19045       164         141  88____:046::8822004:8822111:3:1817:20210502   
28  19048       164         141  88____:046::8822004:8822111:3:1917:20210502   
31  19051       164         141  88____:046::8822004:8822111:3:2017:20210502  

       index  route_id  service_id  \
1      21550       167          58   
2      21539       167          58   
3      21549       167          58   
4      21548       167          58   
5      21547       167          58   
...      ...       ...         ...   
1528  412041       582         516   
1529  412036       582         516   
1530  412037       582         516   
1531  412038       582         516   
1532  412039       582         516   

                                           trip_id   trip_headsign  \
1     88____:007::8833001:8821006:13:1028:20210611  ANVERS-CENTRAL   
2     88____:007::8833001:8821006:13:1028:20210611  ANVERS-CENTRAL   
3     88____:007::8833001:8821006:13:1028:20210611  ANVERS-CENTRAL   
4     88____:007::8833001:8821006:13:1028:20210611  ANVERS-CENTRAL   
5     88____:007::8833001:8821006:13:1028:20210611  ANVERS-CENTRAL   
...                                            ...             ...   
1528   88____:044::8833001:8821006:13:620:20211210  A

[-7259601560496996261]
-7259601560496996261
     index  route_id  service_id  \
1    21926       168           6   
2    21925       168           6   
3    21932       168           6   
4    21933       168           6   
5    21934       168           6   
..     ...       ...         ...   
368  21901       168           6   
369  21909       168           6   
370  21902       168           6   
371  21908       168           6   
372  21907       168           6   

                                          trip_id      trip_headsign  \
1    88____:007::8821006:8821832:22:1044:20210516  HEIST-OP-DEN-BERG   
2    88____:007::8821006:8821832:22:1044:20210516  HEIST-OP-DEN-BERG   
3    88____:007::8821006:8821832:22:1044:20210516  HEIST-OP-DEN-BERG   
4    88____:007::8821006:8821832:22:1044:20210516  HEIST-OP-DEN-BERG   
5    88____:007::8821006:8821832:22:1044:20210516  HEIST-OP-DEN-BERG   
..                                            ...                ...   
368   88____:007::8

[3746525405376110914]
3746525405376110914
     index  route_id  service_id  \
1    23101       169          56   
2    23100       169          56   
3    23113       169          56   
4    23112       169          56   
5    23111       169          56   
..     ...       ...         ...   
402  23096       169          56   
403  23095       169          56   
404  23094       169          56   
405  23090       169          56   
406  23091       169          56   

                                          trip_id   trip_headsign  \
1    88____:007::8831005:8821006:24:1001:20210523  ANVERS-CENTRAL   
2    88____:007::8831005:8821006:24:1001:20210523  ANVERS-CENTRAL   
3    88____:007::8831005:8821006:24:1001:20210523  ANVERS-CENTRAL   
4    88____:007::8831005:8821006:24:1001:20210523  ANVERS-CENTRAL   
5    88____:007::8831005:8821006:24:1001:20210523  ANVERS-CENTRAL   
..                                            ...             ...   
402   88____:007::8831005:8821006:24:901:2

      index  route_id  service_id  \
1     21019       167         697   
2     21018       167         697   
3     21017       167         697   
4     21016       167         697   
5     21015       167         697   
..      ...       ...         ...   
521  210776       314        2097   
522  210775       314        2097   
523  210781       314        2097   
524  210782       314        2097   
525  210768       314        2097   

                                         trip_id   trip_headsign  \
1    88____:007::8833001:8821006:17:624:20210509  ANVERS-CENTRAL   
2    88____:007::8833001:8821006:17:624:20210509  ANVERS-CENTRAL   
3    88____:007::8833001:8821006:17:624:20210509  ANVERS-CENTRAL   
4    88____:007::8833001:8821006:17:624:20210509  ANVERS-CENTRAL   
5    88____:007::8833001:8821006:17:624:20210509  ANVERS-CENTRAL   
..                                           ...             ...   
521  88____:007::8833001:8821006:17:924:20211205  ANVERS-CENTRAL   
522  88____

       index  route_id  service_id  \
1      23814       169          53   
2      23829       169          53   
3      23828       169          53   
4      23822       169          53   
5      23823       169          53   
...      ...       ...         ...   
1973  208643       314          36   
1974  208644       314          36   
1975  208645       314          36   
1976  208646       314          36   
1977  208640       314          36   

                                           trip_id trip_headsign  \
1     88____:007::8821006:8831401:23:1104:20210328       HASSELT   
2     88____:007::8821006:8831401:23:1104:20210328       HASSELT   
3     88____:007::8821006:8831401:23:1104:20210328       HASSELT   
4     88____:007::8821006:8831401:23:1104:20210328       HASSELT   
5     88____:007::8821006:8831401:23:1104:20210328       HASSELT   
...                                            ...           ...   
1973   88____:007::8821006:8831401:23:908:20210423         DIEST   

[-2377796365951257741, -5254354434976796691]
-2377796365951257741
      index  route_id  service_id  \
1     26488       169          71   
2     26487       169          71   
3     26486       169          71   
4     26485       169          71   
5     26484       169          71   
..      ...       ...         ...   
294  320649       428         212   
295  320648       428         212   
296  320658       428         212   
297  320659       428         212   
298  320647       428         212   

                                          trip_id trip_headsign  \
1     88____:007::8821006:8832409:13:802:20210423       HASSELT   
2     88____:007::8821006:8832409:13:802:20210423       HASSELT   
3     88____:007::8821006:8832409:13:802:20210423       HASSELT   
4     88____:007::8821006:8832409:13:802:20210423       HASSELT   
5     88____:007::8821006:8832409:13:802:20210423       HASSELT   
..                                            ...           ...   
294  88____:007::882

-5641077122717147131
       index  route_id  service_id  \
1      26900       169         114   
2      26899       169         114   
3      26889       169         114   
4      26890       169         114   
5      26891       169         114   
...      ...       ...         ...   
1437  320700       429        1016   
1438  320701       429        1016   
1439  320702       429        1016   
1440  320703       429        1016   
1441  320704       429        1016   

                                           trip_id   trip_headsign  \
1     88____:007::8832409:8821006:13:1051:20210417  ANVERS-CENTRAL   
2     88____:007::8832409:8821006:13:1051:20210417  ANVERS-CENTRAL   
3     88____:007::8832409:8821006:13:1051:20210417  ANVERS-CENTRAL   
4     88____:007::8832409:8821006:13:1051:20210417  ANVERS-CENTRAL   
5     88____:007::8832409:8821006:13:1051:20210417  ANVERS-CENTRAL   
...                                            ...             ...   
1437   88____:007::8832409:88210

      index  route_id  service_id  \
1     27102       169        1662   
2     27103       169        1662   
3     27104       169        1662   
5     27098       169        6212   
6     27099       169        6212   
..      ...       ...         ...   
110   26843       169        1009   
111   26844       169        1009   
113  322640       435        1009   
114  322641       435        1009   
115  322642       435        1009   

                                         trip_id   trip_headsign  \
1    88____:007::8831005:8832243:4:1028:20210530  ANVERS-CENTRAL   
2    88____:007::8831005:8832243:4:1028:20210530  ANVERS-CENTRAL   
3    88____:007::8831005:8832243:4:1028:20210530  ANVERS-CENTRAL   
5    88____:007::8831005:8832243:4:1028:20210611  ANVERS-CENTRAL   
6    88____:007::8831005:8832243:4:1028:20210611  ANVERS-CENTRAL   
..                                           ...             ...   
110   88____:007::8831005:8832243:4:928:20210611  ANVERS-CENTRAL   
111   88___

[3015018855473690232, 3902732984968948064, -179716622490667639, -5641077122717147131]
3015018855473690232
      index  route_id  service_id  \
1     27102       169        1662   
2     27103       169        1662   
3     27104       169        1662   
5     27098       169        6212   
6     27099       169        6212   
..      ...       ...         ...   
110   26843       169        1009   
111   26844       169        1009   
113  322640       435        1009   
114  322641       435        1009   
115  322642       435        1009   

                                         trip_id   trip_headsign  \
1    88____:007::8831005:8832243:4:1028:20210530  ANVERS-CENTRAL   
2    88____:007::8831005:8832243:4:1028:20210530  ANVERS-CENTRAL   
3    88____:007::8831005:8832243:4:1028:20210530  ANVERS-CENTRAL   
5    88____:007::8831005:8832243:4:1028:20210611  ANVERS-CENTRAL   
6    88____:007::8831005:8832243:4:1028:20210611  ANVERS-CENTRAL   
..                                       

-179716622490667639


KeyboardInterrupt: 

In [188]:
print(belgian_routes_Belgium[belgian_routes_Belgium['route_id'] == 168].to_string())

    route_id                                                                                                                                                                                                                                                                                 stop_sequence  travel_time  waiting_time
48       168  [ANVERS-CENTRAL, ANVERS-BERCHEM, MORTSEL, MORTSEL-LIERSESTEENWEG, HOVE, KONTICH-LINT, DUFFEL, SINT-KATELIJNE-WAVER, MALINES-NEKKERSPOEL, MUIZEN, HEVER, BOORTMEERBEEK, HAACHT, WESPELAAR-TILDONK, HAMBOS, WIJGMAAL, LOUVAIN, WEZEMAAL, AARSCHOT, BEGIJNENDIJK, BOOISCHOT, HEIST-OP-DEN-BERG]         84.0          13.0


In [189]:
listlol = ['bonjour', 'hello', 'lol']
print(len(listlol))
for index, elem in enumerate(listlol):
    print(index)
    print(elem)

3
0
bonjour
1
hello
2
lol


In [196]:
'''Creates a column in the df that calculates the travel time between the first and last stop (waiting time included)'''
from datetime import datetime
from datetime import timedelta
FMT = '%H:%M:%S'
day_in_seconds = timedelta(days=1).total_seconds()

def give_begin_end_time(route_creation_frequency_single, trips_hash, stops_cleaned_stop_times_trips_merge):
    #create a copy to not change the input DataFrame
    route_creation_frequency_single = route_creation_frequency_single.copy()
    #makes a column with the a representative begin time and end time of the route
    route_creation_frequency_single['travel_time'] = np.nan
    for index_sequence, sequence in route_creation_frequency_single.iterrows():
        constructed_route = pd.DataFrame()
        print(sequence['hash'])
        for index_hash, hash_value in enumerate(sequence['hash']):
            print(hash_value)
            index_plus_one = index_hash + 1
            #take all the trips with that hash
            next_representative_trips = trips_hash[(trips_hash['hash'] == hash_value)].copy()['trip_id']
            #take all the stop sequences and their time that belongs 
            full_times = stops_cleaned_stop_times_trips_merge[stops_cleaned_stop_times_trips_merge['trip_id'].isin(next_representative_trips)].copy()
            #select) only the last stop sequences of full_times for each trip_id
            new_index_max_per_trip_id = full_times.reset_index().groupby(['route_id', 'trip_id'])['stop_sequence'].idxmax()
            max_per_trip_id = full_times.reset_index().loc[new_index_max_per_trip_id]
            #select only the first stop sequences of full_times for each trip_id            
            new_index_min_per_trip_id = full_times.reset_index().groupby(['route_id', 'trip_id'])['stop_sequence'].idxmin()            
            min_per_trip_id = full_times.reset_index().loc[new_index_min_per_trip_id]
            #merge max_per_trip_id and min_per_trip_id
            merged = min_per_trip_id[['trip_id', 'service_id', 'departure_time']].merge(max_per_trip_id[['trip_id', 'arrival_time', 'departure_time']], on='trip_id')
            #take all the stop sequences except the first one, and the last one if it is not the last part of the route
            if index_hash == len(sequence['hash']) - 1:
                rest_per_trip_id = full_times.reset_index().drop(pd.concat([new_index_min_per_trip_id,new_index_max_per_trip_id]))
            else:
                rest_per_trip_id = full_times.reset_index().drop(new_index_min_per_trip_id)
            print(rest_per_trip_id)
            if not rest_per_trip_id.empty:
                #transform 24:00:00 into 00:00:00
                rest_per_trip_id['departure_time'] = rest_per_trip_id['departure_time'].apply(lambda x: str(int(x[:2])-24) + x[2:] if int(x[:2]) >= 24 else x)
                rest_per_trip_id['arrival_time'] = rest_per_trip_id['arrival_time'].apply(lambda x: str(int(x[:2])-24) + x[2:] if int(x[:2]) >=  24 else x)
                #calculate the waiting_time
                rest_per_trip_id['waiting_time'] = rest_per_trip_id[['arrival_time','departure_time']].apply(lambda x: int((datetime.strptime(x['departure_time'], FMT) - datetime.strptime(x['arrival_time'], FMT)).total_seconds()/60), axis=1)
                #if one day as past, take it into consideration
                rest_per_trip_id['waiting_time'] = rest_per_trip_id['waiting_time'].apply(lambda x: day_in_seconds/60 + x if x < 0 else x)
                #calculate the total waiting_time
                rest_per_trip_id_grouped = rest_per_trip_id.groupby(['trip_id'], as_index=False)['waiting_time'].sum()
                merged_waiting_time = merged.merge(rest_per_trip_id_grouped, on='trip_id')
            #in case there are only two stops in for the hash
            else:
                merged_waiting_time = merged.copy()
                merged_waiting_time['waiting_time'] = 0
            #rename the columns     
            merged_waiting_time = merged_waiting_time.rename(columns = {'trip_id': 'trip_id_' + str(index_plus_one),'departure_time_x':'departure_time_'+ str(index_plus_one), 'arrival_time':'arrival_time_'+ str(index_plus_one),
                                          'departure_time_y':'departure_time_'+ str(index_plus_one + 1), 'waiting_time': 'waiting_time_' + str(index_plus_one)})
            if index_hash == 0:
                constructed_route = merged_waiting_time
            elif index_hash > 0:
                constructed_route = constructed_route.merge(merged_waiting_time, how='inner', left_on=['departure_time_' + str(index_plus_one), 'service_id'], right_on=['departure_time_' + str(index_plus_one), 'service_id'])
        #make a list of all the columns of waiting_times
        list_column_waiting_time = []
        for i in range(1, index_plus_one + 1):
            list_column_waiting_time.append('waiting_time_' + str(i))
        #sum all the waiting times together for each route itinerary
        constructed_route['waiting_time'] = constructed_route[list_column_waiting_time].astype(int).sum(1)
        
        #sometimes it is impossible to find trips that follow each other
        if not constructed_route.empty:
            #when the loop is finished, take the last arrival time, that will be used to calculate the travel time
            time_constructed_route = constructed_route[['departure_time_1', 'arrival_time_' + str(index_plus_one), 'waiting_time']]
            time_constructed_route = time_constructed_route.rename(columns = {'departure_time_1':'departure_time', 'arrival_time_' + str(index_plus_one):'arrival_time'})
            #transform 24:00:00 into 00:00:00
            time_constructed_route['departure_time'] = time_constructed_route['departure_time'].apply(lambda x: str(int(x[:2])-24) + x[2:] if int(x[:2]) >= 24 else x)
            time_constructed_route['arrival_time'] = time_constructed_route['arrival_time'].apply(lambda x: str(int(x[:2])-24) + x[2:] if int(x[:2]) >=  24 else x)
            #calculate the travel time
            time_constructed_route['time_diff_sec'] = time_constructed_route[['arrival_time','departure_time']].apply(lambda x: int((datetime.strptime(x['arrival_time'], FMT) - datetime.strptime(x['departure_time'], FMT)).total_seconds()/60), axis=1)
            #if one day as past, take it into consideration
            time_constructed_route['time_diff_sec'] = time_constructed_route['time_diff_sec'].apply(lambda x: day_in_seconds/60 + x if x < 0 else x)
            #take the first most frequent one
            mode_travel_time = time_constructed_route['time_diff_sec'].mode().mean()
            mode_waiting_time = time_constructed_route['waiting_time'].mode().mean() 
            #Add this to the first dataframe
            route_creation_frequency_single.loc[index_sequence,'travel_time'] = mode_travel_time
            route_creation_frequency_single.loc[index_sequence,'waiting_time'] = mode_waiting_time
        #if there is no trips that follow each other with the hash from the array
        else:
            route_creation_frequency_single = route_creation_frequency_single.drop(index_sequence)
            
    return route_creation_frequency_single

'Creates a column in the df that calculates the travel time between the first and last stop (waiting time included)'

In [324]:
route_creation_frequency_single.loc[1527]['hash']

[900702404119183476, -4488299857125108449, 759975243806517983]