In [None]:
!pip install geopy



In [None]:
'''To import the required packages.'''
import pandas as pd
import numpy as np
import networkx as nx
import collections
import matplotlib.pyplot as plt
import math
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [None]:
# NOTE(review): this plots the street network of Modena (Italy), while the rest of the
# notebook processes Belgian railway data -- looks like a leftover osmnx smoke test;
# confirm whether the cell is still needed. graph_from_place presumably needs network access.
import osmnx as ox
ox.plot_graph(ox.graph_from_place('Modena, Italy'))

# Settings


In [None]:
'''To display all output results of a Jupyter cell.'''
# With "all", IPython displays the value of every expression statement of a cell,
# not only the last one (presumably this also echoes the bare '''...''' strings
# that open the cells -- confirm).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
'''To ensure that the output results of extensive output results are not truncated.'''
# Currently disabled: uncomment the next line to raise the pandas row-display limit to 4000 rows.
#pd.options.display.max_rows = 4000

# **Belgian railway system**

# Import of the Belgian railway datasets

In [None]:
'''To register the GitHub link with the Belgian data as a variable.'''
# Base URL of the GTFS text files; the individual file names (e.g. "stops.txt")
# are appended to it when the datasets are read.
datalink = "https://raw.githubusercontent.com/polkuleuven/Thesis_Train/main/gtfs_train_Belgium_1503/"

In [None]:
'''Import all the GTFS data'''

# Every dataset below is a comma-separated GTFS text file stored under `datalink`
# (the comma is read_csv's default separator).

# agency: limited information about the Belgian NMBS/SNCB railway agency.
agency = pd.read_csv(f"{datalink}agency.txt")
# stops: ids, names and geographical coordinates of the railway stations.
stops = pd.read_csv(f"{datalink}stops.txt")
# translations: French, Dutch, German and English names of the railway stations.
translations = pd.read_csv(f"{datalink}translations.txt")
# transfers: minimum transfer time needed to switch routes at each station.
transfers = pd.read_csv(f"{datalink}transfers.txt")
# routes: id, name and vehicle type of every railway route.
routes = pd.read_csv(f"{datalink}routes.txt")
# trips: the trips (and their headsigns) that belong to each route.
# The service_id tells on which dates a trip runs (see calendar_dates).
trips = pd.read_csv(f"{datalink}trips.txt")
# stop_times: per trip, the stations served, the order in which they are served
# and the arrival and departure time at each of them.
stop_times = pd.read_csv(f"{datalink}stop_times.txt")
# calendar: first and last date of the data observations.
calendar = pd.read_csv(f"{datalink}calendar.txt")
# calendar_dates: per service_id, the exact dates on which the service is valid.
calendar_dates = pd.read_csv(f"{datalink}calendar_dates.txt")
# stop_time_overrides -- NOTE(review): the original author left the purpose of this
# table undocumented ("???"); confirm what it contains before relying on it.
stop_time_overrides = pd.read_csv(f"{datalink}stop_time_overrides.txt")

# Cleaning of the Belgian railway data

''' To clean the stops df.  (1) '''
# Keep only the stops whose stop_id contains neither an underscore nor the character 'S';
# the ids that remain are converted to integers below.
# .copy() makes stops_cleaned an independent frame, so the column assignments that
# follow do not write into a slice of `stops` (SettingWithCopyWarning).
stops_cleaned = stops[(~stops['stop_id'].str.contains('_')) & (~stops['stop_id'].str.contains('S'))].copy()

# Convert stop_id from object to int64. A plain column assignment is used because
# `df.loc[:, col] = values` writes into the existing column and, on recent pandas
# versions, leaves its object dtype unchanged.
stops_cleaned['stop_id'] = stops_cleaned['stop_id'].astype(np.int64)

# Strip the accents from stop_name and convert it to upper case.
stops_cleaned['stop_name'] = (
    stops_cleaned['stop_name']
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
    .str.upper()
)

''' To clean the stops df.  (2) '''
# Reverse-geocode every stop with Nominatim to find the country it lies in.
# NOTE(review): one request is sent per stop; confirm that a delay of 0.2 s between
# requests and the generic user agent comply with the usage policy of the service.
geolocator = Nominatim(user_agent="application")
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=0.2)

country_list = []
for index, row in stops_cleaned.iterrows():
    latitude = row['stop_lat']
    longitude = row['stop_lon']
    location = reverse((latitude, longitude), language='en', exactly_one=True)
    # The geocoder may return no result for a coordinate; record an empty country
    # in that case instead of failing on `None.raw`.
    address = location.raw.get('address', {}) if location is not None else {}
    country = address.get('country', '')
    country_list.append(country)

# Store the countries as a new column (same row order as the loop above).
stops_cleaned['country'] = country_list
stops_cleaned

# Belgian stations only: the full rows and the series of their names.
belgian_stops_Belgium = stops_cleaned[stops_cleaned['country'] == 'Belgium']
belgian_stops_Belgium_series = stops_cleaned.loc[stops_cleaned['country'] == 'Belgium', 'stop_name']

# NOTE(review): the export paths are absolute paths on the original author's machine;
# adapt them before running this section elsewhere.
stops_cleaned.to_csv(r'/Users/pol/Desktop/CSV_export/stops_cleaned_Belgium.csv', index = False, header=True, encoding='utf-8-sig')

belgian_stops_Belgium_series.to_csv(r'/Users/pol/Desktop/CSV_export/belgian_stops_Belgium_series.csv', index = False, header=True, encoding='utf-8-sig')

In [None]:
'''Imports the cleaned version of the stops with their country'''
# Both names are (re)bound to pre-computed CSV files hosted on GitHub.
stops_cleaned = pd.read_csv(
    "https://raw.githubusercontent.com/polkuleuven/Thesis_Train/main/stops_cleaned/stops_cleaned_Belgium.csv"
)
# Only the stop_name column of the second file is kept, as a Series.
belgian_stops_Belgium_series = pd.read_csv(
    "https://raw.githubusercontent.com/polkuleuven/Thesis_Train/main/country_stops_series/stops_Belgium_series.csv"
).loc[:, 'stop_name']

In [None]:
'''To clean the trips df'''
# Attach the short and long route names to every trip (inner join on route_id).
trip_route_short_name = trips[['route_id','service_id','trip_id', 'trip_headsign']].merge(
    routes[['route_id', 'route_short_name', 'route_long_name']],
    on='route_id',
)

# Keep the trips whose route_short_name is one of the allowed values below or starts
# with an 'S'; the route_short_name column is not needed afterwards.
allowed_route_type = {'IC', 'L', 'P', 'ICT', 'IZY'}
filtered_trips = trip_route_short_name[
    trip_route_short_name['route_short_name'].isin(allowed_route_type)
    | trip_route_short_name['route_short_name'].str.startswith('S')
].drop(columns=['route_short_name'])

# Strip the accents from route_long_name and trip_headsign and convert them to upper case.
for text_column in ('route_long_name', 'trip_headsign'):
    filtered_trips[text_column] = (
        filtered_trips[text_column]
        .str.normalize('NFKD')
        .str.encode('ascii', errors='ignore')
        .str.decode('utf-8')
        .str.upper()
    )
filtered_trips

In [None]:
'''Filters the dats from the selected begin to the end date'''
#here we used 4 months
begin_date = 20210314
end_date = 20210713
# Keep every row that is neither after end_date nor before begin_date
# (dates are compared as YYYYMMDD integers).
filtered_calendar_dates = calendar_dates[
    ~((calendar_dates['date'] > end_date) | (calendar_dates['date'] < begin_date))
].copy()
filtered_calendar_dates

# Exploratory data analysis with the Belgian railway data

In [None]:
'''To calculate the number of unique route_ids before removing the routes with a route_short_name that does not begin with an S and is not 'IC', 'L', or 'P'.'''
# Distinct route_ids of the unfiltered routes table.
initial_set_routes = set(routes['route_id'])
len(initial_set_routes)

In [None]:
'''To calculate the number of unique route_ids after removing the routes with a route_short_name that does not begin with an S and is not 'IC', 'L', or 'P'.'''
# Distinct route_ids that still have at least one trip after the route-type filter.
set_routes = set(filtered_trips['route_id'])
len(set_routes)

In [None]:
'''To calculate the total number of stations in the stops_cleaned dataset'''
# Distinct stop_ids of the cleaned stops table.
set_stations = set(stops_cleaned['stop_id'])
len(set_stations)

In [None]:
'''To calculate the total number of Belgian stations in the stops_cleaned dataset'''
# len() counts the entries of the series; duplicated names, if any, are counted as well.
len(belgian_stops_Belgium_series)

# **Preparation for the L-space representation of the Belgian railway system**


In [None]:
'''To merge a selection of the stops_cleaned dataset with a selection of the stop_times dataset'''
# Inner join on stop_id: every stop time gets the name and coordinates of its station;
# stop times whose stop_id is absent from stops_cleaned are dropped.
stops_cleaned_stop_times_merge = stop_times[
    ['trip_id','arrival_time', 'departure_time','stop_id','stop_sequence']
].merge(
    stops_cleaned[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']],
    on='stop_id',
)
stops_cleaned_stop_times_merge

In [None]:
'''To merge a selection of the stops_cleaned_stop_times_merge dataset with the filtered_trips dataset. And sort the values.'''
# Inner join on trip_id, then order the rows per route, per trip and by position in the trip.
stops_cleaned_stop_times_trips_merge = (
    filtered_trips
    .merge(stops_cleaned_stop_times_merge, on='trip_id')
    .sort_values(by=['route_id', 'trip_id', 'stop_sequence'])
)
stops_cleaned_stop_times_trips_merge

In [None]:
'''Creates a dataframe with the departure time form the first stop sequence and with the one from last stop sequence for each trip_id'''
# Renumber the rows 0..n-1 once, so that idxmin/idxmax return labels usable with .loc.
_trip_stop_rows = stops_cleaned_stop_times_trips_merge.reset_index()
_stop_sequence_per_trip = _trip_stop_rows.groupby(['trip_id'])['stop_sequence']

# Row with the smallest stop_sequence of every trip -> departure time at the first stop.
departure_time_first = _trip_stop_rows.loc[
    _stop_sequence_per_trip.idxmin(), ['route_id', 'trip_id', 'departure_time']
].rename(columns={'departure_time': 'departure_time_first'})

# Row with the largest stop_sequence of every trip -> departure time at the last stop.
departure_time_last = _trip_stop_rows.loc[
    _stop_sequence_per_trip.idxmax(), ['route_id', 'trip_id', 'departure_time']
].rename(columns={'departure_time': 'departure_time_last'})

# One row per trip with both departure times.
departure_times = departure_time_first.merge(
    departure_time_last[['trip_id', 'departure_time_last']], on='trip_id'
)
departure_times

In [None]:
''' To groupby the trip_id and to order the stop_sequence in an ascending order
Otherwise, different hash values could correspond to a same stop_sequence (since the stop_sequences of some
routes are initially in descending order while other stop_sequences are in ascending order)'''

# Each trip's rows are sorted by stop_sequence inside groupby(...).apply(...).
# NOTE(review): the input is stops_cleaned_stop_times_merge, i.e. the stop times before
# the join with filtered_trips, so the trips of every route type are processed here.
trip_stop_sequence_ascending = stops_cleaned_stop_times_merge.groupby(['trip_id'], as_index=False).apply(lambda x: x.sort_values('stop_sequence'))
trip_stop_sequence_ascending

In [None]:
'''To put the stop_names per trip_id in a list'''
# One row per trip_id; the stop names of the trip are collected, in row order, into a
# list that is stored in a column named stop_sequence.
trip_stop_sequence = (
    trip_stop_sequence_ascending
    .groupby('trip_id')['stop_name']
    .apply(lambda names: names.tolist())
    .reset_index()
    .rename(columns={'stop_name': 'stop_sequence'})
)
trip_stop_sequence

In [None]:
'''To calculate the hash value for the stop sequence of each trip_id
and also the hash value of the stop sequence in the opposite direction'''
import hashlib


def _stable_sequence_hash(stop_names):
    '''Return a signed 64-bit integer identifying an ordered sequence of stop names.

    The built-in hash() is salted per Python process for strings, so its values change
    on every kernel restart. This digest depends only on the names and their order,
    which makes the hash columns reproducible between runs.
    '''
    # The unit separator keeps ['AB', 'C'] and ['A', 'BC'] apart.
    encoded = '\x1f'.join(str(name) for name in stop_names).encode('utf-8')
    digest = hashlib.blake2b(encoded, digest_size=8).digest()
    return int.from_bytes(digest, byteorder='big', signed=True)


# Work on a copy of the per-trip stop sequences.
trips_hash = trip_stop_sequence.copy()

# Identifier of the stop sequence as travelled, and of the same sequence reversed.
trips_hash['hash'] = trips_hash['stop_sequence'].apply(_stable_sequence_hash)
trips_hash['hash_inverse'] = trips_hash['stop_sequence'].apply(lambda x: _stable_sequence_hash(x[::-1]))

In [None]:
''' To add the list of stop_sequence of stations to the trips_hash df by joining on trip_id'''
# Left join: every filtered trip is kept, including trips without a stop sequence in
# trips_hash. The column selection puts the columns in a more logical order.
trips_hash_stop_sequence = filtered_trips.merge(trips_hash, on='trip_id', how='left')[
    ['route_id', 'route_long_name','service_id','trip_headsign','trip_id','hash', 'hash_inverse','stop_sequence']
]
trips_hash_stop_sequence

In [None]:
'''Merges the trips_hash_stop_sequence with the departure_times'''
# Inner join on trip_id: adds the departure time at the first and at the last stop.
trips_hash_stop_sequence_departure = pd.merge(
    trips_hash_stop_sequence,
    departure_times[['trip_id','departure_time_first','departure_time_last']],
    on='trip_id',
)
trips_hash_stop_sequence_departure

In [None]:
'''To count the number of dates for each service_id'''
# Number of calendar rows per service_id, stored in a column named count_service_id.
service_id_df = (
    filtered_calendar_dates
    .groupby('service_id')
    .size()
    .reset_index(name='count_service_id')
)
service_id_df

In [None]:
'''Regroups the days per service id in a set and count them'''
# One row per service_id: the set of its dates (column dates) plus the number of
# calendar rows computed in service_id_df (column count_service_id).
service_id_dates = (
    filtered_calendar_dates
    .groupby('service_id')['date']
    .apply(lambda group_series: set(group_series.tolist()))
    .reset_index()
    .rename(columns={'date': 'dates'})
    .merge(service_id_df, on='service_id', how='left')
)
service_id_dates

In [None]:
'''To merge the trips_hash_stop_sequence df with the service_id_dates to get the sets of corresponding dates'''
# Inner join on service_id: stop-time rows whose service has no date inside the
# selected period are dropped.
stops_cleaned_stop_times_trips_merge_dates = stops_cleaned_stop_times_trips_merge.merge(
    service_id_dates, on='service_id', how='inner'
)
stops_cleaned_stop_times_trips_merge_dates

In [None]:
'''To put the different trip_ids in a list and add the departure_time first and last lists'''
# Trips sharing route, stop-sequence hashes and service are grouped together; for every
# group the trip ids and both departure times are collected into lists.
common_columns = ['route_id','route_long_name','hash', 'hash_inverse', 'service_id']
_trips_per_variant = trips_hash_stop_sequence_departure.groupby(common_columns)
route_hash_freq_dep_first = _trips_per_variant['departure_time_first'].apply(lambda values: values.tolist()).reset_index()
route_hash_freq_dep_last = _trips_per_variant['departure_time_last'].apply(lambda values: values.tolist()).reset_index()
route_hash_freq_dep = (
    _trips_per_variant['trip_id'].apply(lambda values: values.tolist()).reset_index()
    .merge(route_hash_freq_dep_first, on=common_columns)
    .merge(route_hash_freq_dep_last, on=common_columns)
)
route_hash_freq_dep

In [None]:
'''To add the sequence of stops to the route_hash_freq_dep dataset'''
# The left join yields one row per matching trip; only the first row of every
# (route_id, hash, service_id) combination is kept.
route_hash_freq_seq = (
    route_hash_freq_dep
    .merge(
        trips_hash_stop_sequence[['route_id','hash', 'hash_inverse', 'service_id','stop_sequence']],
        on=['route_id', 'hash', 'hash_inverse', 'service_id'],
        how='left',
    )
    .drop_duplicates(subset=['route_id', 'hash', 'service_id'], keep='first')
)

route_hash_freq_seq

In [None]:
'''To calculate the number of trip ids in the list of trip_ids and to add it as a new column'''
route_hash_freq = route_hash_freq_seq.copy()
# Length of every trip_id list, in row order.
number_trip_ids = [len(list_trip_ids) for list_trip_ids in route_hash_freq['trip_id']]
route_hash_freq['number_trip_ids'] = number_trip_ids

route_hash_freq

In [None]:
'''To merge the route_hash_freq df with the service_id_dates to get the sets of corresponding dates'''
# Inner join on service_id: adds the dates and count_service_id columns and drops the
# rows whose service has no date inside the selected period.
route_hash_service_freq = route_hash_freq.merge(service_id_dates, on='service_id', how='inner')
route_hash_service_freq

## Functions for the route creation

In [None]:
'''Some functions to better factorise the functions in the coming cells'''

def select_stop_sequences(stop_sequences_df, route_id):
    '''Return an independent copy of the rows of stop_sequences_df whose route_id equals route_id.'''
    belongs_to_route = stop_sequences_df['route_id'].eq(route_id)
    return stop_sequences_df.loc[belongs_to_route].copy()

def take_leftovers_list_c_from_intersection_AAndB(list_a, list_b, list_c):
    '''Return the elements of list_c located at the positions of the values list_a shares with list_b.

    list_a and list_c are parallel lists. Every distinct value of list_a that also
    occurs in list_b contributes one element of list_c; when a value occurs several
    times in list_a, the position of its last occurrence is used.

    The result follows the order in which the values first appear in list_a, so it is
    the same on every run (iterating over a set of strings is not).
    '''
    # value -> position of its last occurrence; keys keep first-appearance order
    last_position = {value: position for position, value in enumerate(list_a)}
    shared_with_b = set(list_b)
    return [list_c[position] for value, position in last_position.items() if value in shared_with_b]

def get_extentions (after_or_behind, route_sequences_route_id, trip):
    '''Return the rows of route_sequences_route_id that can extend trip on one side.

    after_or_behind: 'after' looks for sequences that start at the last stop of trip,
        'behind' for sequences that end at the first stop of trip.
    route_sequences_route_id: DataFrame with the columns stop_sequence, dates,
        departure_time_first and departure_time_last.
    trip: row (or any mapping) with stop_sequence, dates and the departure-time column
        of the side that is extended.

    A candidate is kept when
      * it touches trip at the junction stop and none of its other stops already
        occurs in trip,
      * it shares at least one date with trip, and
      * one of its departure times at the junction side equals one of the trip's.

    Returns a copy of the matching rows (an empty frame when nothing matches).
    Raises ValueError when after_or_behind is neither 'after' nor 'behind'.
    '''
    trip_stops = trip['stop_sequence']
    visited_stops = set(trip_stops)
    if after_or_behind == 'after':
        junction_stop = trip_stops[-1]

        def touches_trip(candidate_stops):
            return candidate_stops[0] == junction_stop and visited_stops.isdisjoint(candidate_stops[1:])

        candidate_time_column, trip_time_column = 'departure_time_first', 'departure_time_last'
    elif after_or_behind == 'behind':
        junction_stop = trip_stops[0]

        def touches_trip(candidate_stops):
            return candidate_stops[-1] == junction_stop and visited_stops.isdisjoint(candidate_stops[:-1])

        candidate_time_column, trip_time_column = 'departure_time_last', 'departure_time_first'
    else:
        raise ValueError("after_or_behind must be 'after' or 'behind', got {!r}".format(after_or_behind))

    # .astype(bool) keeps the mask boolean when the frame is empty
    # (apply() returns an empty object Series in that case).
    touches = route_sequences_route_id['stop_sequence'].apply(touches_trip).astype(bool)
    possible_extentions = route_sequences_route_id.loc[touches].copy()

    # keep the candidates that run on at least one date of the trip
    trip_dates = trip['dates']
    shares_date = possible_extentions['dates'].apply(
        lambda candidate_dates: any(day in candidate_dates for day in trip_dates)
    ).astype(bool)
    possible_extentions = possible_extentions.loc[shares_date].copy()

    if not possible_extentions.empty:
        # keep the candidates whose time schedule matches the one of the trip
        trip_times = trip[trip_time_column]
        matches_time = possible_extentions[candidate_time_column].apply(
            lambda candidate_times: any(moment in candidate_times for moment in trip_times)
        ).astype(bool)
        possible_extentions = possible_extentions.loc[matches_time].copy()
    return possible_extentions

def calculate_frequency (sequences_df):
    '''Return a copy of sequences_df with a frequency column and the hash wrapped in a list.

    sequences_df needs the columns dates, departure_time_last and hash.
    frequency = (number of dates) * (number of departure times in departure_time_last).
    The dates and departure_time_last columns are dropped from the result and every
    hash value h becomes the one-element list [h].

    The input frame is left untouched: all the work happens on a copy.
    '''
    result = sequences_df.copy()
    number_dates = result['dates'].apply(len)
    number_times = result['departure_time_last'].apply(len)
    result['frequency'] = number_dates * number_times
    result = result.drop(['dates', 'departure_time_last'], axis=1)
    result['hash'] = result['hash'].apply(lambda x: [x])
    return result
        
from datetime import datetime
from datetime import timedelta
# strptime pattern of the GTFS times
FMT = '%H:%M:%S'
# number of seconds in one day (a float, as returned by total_seconds())
day_in_seconds = timedelta(days=1).total_seconds()


def _wrap_past_midnight(time_string):
    '''Map a time of 24:00:00 or later onto the 00:00:00-23:59:59 clock (zero-padded hour).'''
    hours, separator, remainder = time_string.partition(':')
    if int(hours) < 24:
        return time_string
    return '{:02d}{}{}'.format(int(hours) % 24, separator, remainder)


def calculate_time_difference(time_df, later_time, earlier_time, column_name):
    '''Store the number of minutes between two time columns in time_df[column_name].

    time_df: DataFrame with the string columns arrival_time and departure_time.
        It is modified in place: both columns are rewritten so that hours of 24 or
        more wrap around midnight (e.g. '24:05:00' becomes '00:05:00').
    later_time, earlier_time: each 'arrival_time' or 'departure_time'.
    column_name: name of the column that receives the difference, in whole minutes.

    A negative difference means midnight was passed between the two times; one day
    is added in that case. Returns time_df (the same object that was passed in).
    '''
    for time_column in ('departure_time', 'arrival_time'):
        time_df[time_column] = time_df[time_column].apply(_wrap_past_midnight)

    minutes_per_day = int(day_in_seconds // 60)

    def minutes_between(later, earlier):
        elapsed = datetime.strptime(later, FMT) - datetime.strptime(earlier, FMT)
        minutes = int(elapsed.total_seconds() / 60)
        # if one day has passed, take it into consideration
        return minutes + minutes_per_day if minutes < 0 else minutes

    # restricting to these two columns keeps the KeyError for any other column name
    times = time_df[['arrival_time', 'departure_time']]
    time_df[column_name] = [
        minutes_between(later, earlier)
        for later, earlier in zip(times[later_time], times[earlier_time])
    ]
    return time_df


In [None]:
'''Finds the routes that can be either extended from behind or from after and those which are complete sequences'''

def get_extention_indexes(stop_sequences_df):
    '''Classify every stop sequence of every route by the way it can be extended.

    Each row of stop_sequences_df is compared with the other rows of the same route_id
    through get_extentions.

    Returns three dictionaries that map a route_id to a list of row labels of
    stop_sequences_df:
      * index_of_extendable: rows with at least one extention, after or behind;
      * index_of_begin_sequences: extendable rows without any extention behind them,
        i.e. rows that can only start a longer route;
      * index_of_complete_sequences: rows without any extention at all.
    A route_id only appears in a dictionary when at least one of its rows qualifies.
    '''
    index_of_extendable = {}
    index_of_begin_sequences = {}
    index_of_complete_sequences = {}
    for route_id in stop_sequences_df['route_id'].unique():
        # all the sequences of the route considered in this iteration
        route_sequences_route_id = select_stop_sequences(stop_sequences_df, route_id)
        for index_trip, trip in route_sequences_route_id.iterrows():
            # can another sequence follow after the last stop of the trip?
            has_extention_after = not get_extentions('after', route_sequences_route_id, trip).empty
            # can another sequence come before the first stop of the trip?
            has_extention_behind = not get_extentions('behind', route_sequences_route_id, trip).empty
            if has_extention_after or has_extention_behind:
                index_of_extendable.setdefault(route_id, []).append(index_trip)
                if not has_extention_behind:
                    index_of_begin_sequences.setdefault(route_id, []).append(index_trip)
            else:
                index_of_complete_sequences.setdefault(route_id, []).append(index_trip)

    return index_of_extendable, index_of_begin_sequences, index_of_complete_sequences

In [None]:
'''Creates all the sequences of routes possible to reconstruct the real route and calculates their frequency'''

def possible_sequences_construction(stop_sequences_df, index_of_extendable, index_of_begin_sequences, index_of_complete_sequences):
    '''Build every route that can be obtained by chaining extendable stop sequences.

    This is the first part of the route creation; the complete and the unused
    sequences still have to be added afterwards.

    stop_sequences_df: DataFrame with the columns route_id, hash, stop_sequence, dates,
        departure_time_first and departure_time_last.
    index_of_extendable / index_of_begin_sequences: dictionaries route_id -> row labels
        as returned by get_extention_indexes.
    index_of_complete_sequences: not used by this function; kept in the signature for
        the existing callers.

    For every route_id that has begin sequences, each begin sequence is extended with
    every sequence get_extentions accepts after its last stop, again and again, until
    no extention is left. A route that branches produces one result row per branch.

    Returns a DataFrame with the columns route_id, hash (list of the hashes of the
    chained sequences), stop_sequence (the chained stops) and frequency (number of
    common dates * number of matching departure times; NaN for a begin sequence that
    was never extended).
    '''
    output_columns = ['route_id', 'hash', 'stop_sequence', 'frequency']
    extension_columns = ['route_id', 'hash', 'stop_sequence', 'dates', 'departure_time_first', 'departure_time_last']
    finished_routes = []
    for route_id in index_of_extendable:
        # without a begin sequence no route with multiple sequences can be built
        if route_id not in index_of_begin_sequences:
            continue
        routes_with_route_id = select_stop_sequences(stop_sequences_df, route_id)
        # the sequences that may be chained for this route_id
        route_creation_extensions_route_id = routes_with_route_id.loc[index_of_extendable[route_id], extension_columns]
        # every route under construction starts as one begin sequence
        begin_sequences = routes_with_route_id.loc[index_of_begin_sequences[route_id]]
        routes_in_progress = [
            {
                'route_id': begin_sequence['route_id'],
                'hash': [begin_sequence['hash']],
                'stop_sequence': begin_sequence['stop_sequence'],
                'dates': begin_sequence['dates'],
                'departure_time_last': begin_sequence['departure_time_last'],
                'frequency': np.nan,
            }
            for _, begin_sequence in begin_sequences.iterrows()
        ]
        # Every pass extends all the routes in progress by one sequence. get_extentions
        # rejects an extention that repeats a station, so the routes cannot grow
        # forever and the loop ends.
        while routes_in_progress:
            extended_routes = []
            for route_part in routes_in_progress:
                possible_extentions = get_extentions('after', route_creation_extensions_route_id, route_part)
                if possible_extentions.empty:
                    # the route can't be extended anymore
                    finished_routes.append({column: route_part[column] for column in output_columns})
                    continue
                # one new route per possible extention
                for _, possible_extention in possible_extentions.iterrows():
                    common_dates = possible_extention['dates'] & route_part['dates']
                    # the departure times come from the extention that is being added
                    # (not from the first candidate of the list)
                    new_departure_time_last = take_leftovers_list_c_from_intersection_AAndB(
                        list(possible_extention['departure_time_first']),
                        list(route_part['departure_time_last']),
                        list(possible_extention['departure_time_last']),
                    )
                    extended_routes.append({
                        'route_id': route_id,
                        # new list objects: the lists of route_part stay untouched
                        'hash': route_part['hash'] + [possible_extention['hash']],
                        'stop_sequence': route_part['stop_sequence'] + possible_extention['stop_sequence'][1:],
                        'dates': common_dates,
                        'departure_time_last': new_departure_time_last,
                        'frequency': len(new_departure_time_last) * len(common_dates),
                    })
            routes_in_progress = extended_routes
    return pd.DataFrame(finished_routes, columns=output_columns)

In [None]:
'''Adds the full sequences to the route_creation dataframe'''

def add_full_sequences(stop_sequences_df, route_creation, index_of_complete_sequences):
    '''Add the complete sequences to route_creation (second part of the route creation).

    stop_sequences_df: DataFrame with the columns route_id, hash, stop_sequence, dates
        and departure_time_last.
    route_creation: DataFrame of the routes built so far.
    index_of_complete_sequences: dictionary route_id -> row labels of stop_sequences_df
        that cannot be extended.

    The frequency of every complete sequence is computed with calculate_frequency.
    Returns a new DataFrame with the rows of route_creation followed by the complete
    sequences, sorted by route_id and renumbered from 0.
    '''
    frames = [route_creation]
    for route_id in index_of_complete_sequences:
        # all the complete sequences of that route_id
        copy_complete_sequences_df = stop_sequences_df.loc[
            index_of_complete_sequences[route_id],
            ['route_id', 'hash', 'stop_sequence', 'dates', 'departure_time_last'],
        ].copy()
        frames.append(calculate_frequency(copy_complete_sequences_df))
    # DataFrame.append no longer exists in pandas 2; concatenate once instead.
    # Empty frames are left out so that they do not influence the resulting dtypes.
    non_empty_frames = [frame for frame in frames if not frame.empty]
    if non_empty_frames:
        route_creation = pd.concat(non_empty_frames, ignore_index=True)
    route_creation = route_creation.sort_values(by=['route_id'], ignore_index = True)
    return route_creation

In [None]:
'''Adds the sequences that were not yet added in the route_creation dataframe'''

def add_unused_sequences(stop_sequences_df, route_creation):
    '''Add the sequences that were not used by any created route (third part of the route creation).

    stop_sequences_df: DataFrame with the columns route_id, hash, stop_sequence, dates
        and departure_time_last.
    route_creation: DataFrame with the columns route_id, hash (list of hashes),
        stop_sequence and frequency.

    Only the route_ids already present in route_creation are considered. A sequence of
    such a route is added, with the frequency given by calculate_frequency, when
      * its hash is not part of any route created for that route_id, and
      * its stops are not all contained in one of those created routes.

    Returns route_creation followed by the added rows (renumbered from 0), or
    route_creation itself when nothing has to be added.
    '''
    route_ids_with_routes = route_creation['route_id'].unique()
    unused_sequences = []
    for route_id in stop_sequences_df['route_id'].unique():
        if route_id not in route_ids_with_routes:
            continue
        created_routes = route_creation[route_creation['route_id'] == route_id]
        # Hashes already used for that route_id. They are collected with plain Python
        # so that they stay integers: a detour through a padded DataFrame would turn
        # them into floats, and 64-bit hashes do not survive that conversion.
        used_sequences_hash = {
            hash_value for hash_list in created_routes['hash'] for hash_value in hash_list
        }
        # all the stop sequences of the routes created for that route_id
        used_sequences = tuple(created_routes['stop_sequence'])
        copy_sequences_route_id = select_stop_sequences(stop_sequences_df, route_id)[['route_id','hash','stop_sequence', 'dates', 'departure_time_last']]
        copy_sequences_route_id = calculate_frequency(copy_sequences_route_id)
        for index_trip, trip in copy_sequences_route_id.iterrows():
            # calculate_frequency wrapped the hash in a one-element list
            if trip['hash'][0] in used_sequences_hash:
                continue
            # skip the sequence when all its stops already belong to a created route
            trip_stops = set(trip['stop_sequence'])
            if any(trip_stops.issubset(sequence) for sequence in used_sequences):
                continue
            unused_sequences.append(trip)
    if unused_sequences:
        # DataFrame.append no longer exists in pandas 2; concatenate once instead
        route_creation = pd.concat([route_creation, pd.DataFrame(unused_sequences)], ignore_index=True)
    return route_creation

In [None]:
'''Creates a column in the df that calculates the travel time between the first and last stop (waiting time included)
and another column with the waiting time (calculated with a weighted average based on the frequency)'''
# NOTE: these imports and constants repeat, with the same values, the ones already
# defined next to calculate_time_difference earlier in this notebook.
from datetime import datetime
from datetime import timedelta
# strptime pattern for times written as HH:MM:SS
FMT = '%H:%M:%S'
# number of seconds in one day (a float, as returned by total_seconds())
day_in_seconds = timedelta(days=1).total_seconds()

def give_begin_end_time(route_creation_frequency_single, trips_hash_stop_sequence, stops_cleaned_stop_times_trips_merge_dates):
    '''Adds a weighted travel time and a weighted waiting time to every created route.

    Each row of route_creation_frequency_single holds in 'hash' the hashes of the trip patterns
    that are chained to form the route. For every row, the trips of consecutive hashes are linked
    together and the resulting itineraries are averaged, weighted by their number of common dates.

    Parameters:
        route_creation_frequency_single: DataFrame with at least the columns 'hash' (iterable of
            hash values) and 'route_id'.
        trips_hash_stop_sequence: DataFrame with at least the columns 'hash', 'route_id' and 'trip_id'.
        stops_cleaned_stop_times_trips_merge_dates: DataFrame with at least the columns 'trip_id',
            'route_id', 'stop_sequence', 'dates', 'departure_time' and 'arrival_time'.

    Returns:
        A copy of route_creation_frequency_single with the columns 'travel_time' and 'waiting_time'
        filled in. Rows for which no chain of trips could be built are dropped.

    NOTE(review): the unit of both columns depends on calculate_time_difference, which is defined
    elsewhere ('time_diff_min' suggests minutes) - confirm.
    '''
    #create a copy to not change the input DataFrame
    route_creation_frequency_single = route_creation_frequency_single.copy()
    #creates the travel_time column; the waiting_time column is created by the first .loc assignment below
    route_creation_frequency_single['travel_time'] = np.nan
    for index_sequence, sequence in route_creation_frequency_single.iterrows():
        #will hold one row per itinerary, i.e. per chain of trips that covers all the hashes of the route
        constructed_route = pd.DataFrame()
        for index_hash, hash_value in enumerate(sequence['hash']):
            #1-based position of the hash, used as suffix of the column names
            index_plus_one = index_hash + 1
            #take all the trip_ids with that hash and that route_id
            next_representative_trips = trips_hash_stop_sequence[(trips_hash_stop_sequence['hash'] == hash_value) & (trips_hash_stop_sequence['route_id'] == sequence['route_id'])].copy()['trip_id']
            #take all the stop rows (with their times) that belong to those trips
            full_times = stops_cleaned_stop_times_trips_merge_dates[stops_cleaned_stop_times_trips_merge_dates['trip_id'].isin(next_representative_trips)].copy()
            #select only the row with the highest stop_sequence (the last stop) of full_times for each trip_id
            new_index_max_per_trip_id = full_times.reset_index().groupby(['route_id', 'trip_id'])['stop_sequence'].idxmax()
            max_per_trip_id = full_times.reset_index().loc[new_index_max_per_trip_id]
            #select only the row with the lowest stop_sequence (the first stop) of full_times for each trip_id
            new_index_min_per_trip_id = full_times.reset_index().groupby(['route_id', 'trip_id'])['stop_sequence'].idxmin()
            min_per_trip_id = full_times.reset_index().loc[new_index_min_per_trip_id]
            #merge min_per_trip_id and max_per_trip_id: departure_time_x comes from the first stop,
            #arrival_time and departure_time_y come from the last stop
            merged = min_per_trip_id[['trip_id', 'dates', 'departure_time']].merge(max_per_trip_id[['trip_id', 'arrival_time', 'departure_time']], on='trip_id')
            #take all the stop rows except the first one; the last one is also left out when this is the last hash of the route
            if index_hash == len(sequence['hash']) - 1:
                rest_per_trip_id = full_times.reset_index().drop(pd.concat([new_index_min_per_trip_id,new_index_max_per_trip_id]))
            else:
                rest_per_trip_id = full_times.reset_index().drop(new_index_min_per_trip_id)
            if not rest_per_trip_id.empty:
                #presumably stores departure_time - arrival_time of every remaining stop in 'waiting_time' (helper defined elsewhere) - TODO confirm
                rest_per_trip_id = calculate_time_difference(rest_per_trip_id, 'departure_time', 'arrival_time', 'waiting_time')
                #calculate the total waiting_time per trip
                rest_per_trip_id_grouped = rest_per_trip_id.groupby(['trip_id'], as_index=False)['waiting_time'].sum()
                #default inner merge: only trips that have a waiting_time row are kept
                merged_waiting_time = merged.merge(rest_per_trip_id_grouped, on='trip_id')
            #in case no stop rows are left for the hash
            else:
                merged_waiting_time = merged.copy()
                merged_waiting_time['waiting_time'] = 0
            #rename the columns so that they carry the position of the hash; the departure at the last stop
            #gets the suffix of the next hash so it can be matched with the first departure of that next hash
            merged_waiting_time = merged_waiting_time.rename(columns = {'trip_id': 'trip_id_' + str(index_plus_one),'departure_time_x':'departure_time_'+ str(index_plus_one), 'arrival_time':'arrival_time_'+ str(index_plus_one),
                                          'departure_time_y':'departure_time_'+ str(index_plus_one + 1), 'waiting_time': 'waiting_time_' + str(index_plus_one)})
            if index_hash == 0:
                constructed_route = merged_waiting_time
            elif index_hash > 0:
                #link the trips: the departure at the last stop of the previous hash has to equal the first departure of this hash
                constructed_route = constructed_route.merge(merged_waiting_time, how='inner', on=['departure_time_' + str(index_plus_one)])
                #take the intersection of the dates => only get the common dates and retain those rows with common dates
                constructed_route['dates'] = [a & b for a,b in zip(constructed_route['dates_x'], constructed_route['dates_y'])]
                constructed_route = constructed_route[constructed_route['dates'].map(lambda d: len(d)) > 0]
                constructed_route = constructed_route.drop(['dates_x','dates_y'], axis=1)
        #make a list of all the columns of waiting_times
        #(index_plus_one still holds the value of the last iteration of the loop above, i.e. the number of hashes)
        list_column_waiting_time = []
        for i in range(1, index_plus_one + 1):
            list_column_waiting_time.append('waiting_time_' + str(i))
        #sum all the waiting times together for each route itinerary (the values are cast to int first)
        constructed_route['waiting_time'] = constructed_route[list_column_waiting_time].astype(int).sum(1)

        #sometimes it is impossible to find trips that follow each other
        if not constructed_route.empty:
            #when the loop is finished, take the first departure time and the last arrival time, that will be used to calculate the travel time
            time_constructed_route = constructed_route[['departure_time_1', 'arrival_time_' + str(index_plus_one), 'waiting_time', 'dates']]
            time_constructed_route = time_constructed_route.rename(columns = {'departure_time_1':'departure_time', 'arrival_time_' + str(index_plus_one):'arrival_time'})
            #presumably stores arrival_time - departure_time in 'time_diff_min' (helper defined elsewhere) - TODO confirm
            time_constructed_route = calculate_time_difference(time_constructed_route, 'arrival_time', 'departure_time', 'time_diff_min')
            #add here a new column count_dates that is the number of common dates of the itinerary
            time_constructed_route['count_dates'] = time_constructed_route['dates'].apply(lambda x: len(x))
            sum_count_dates = time_constructed_route['count_dates'].sum()
            #create the weighted sum: every itinerary weighs as much as its share of the common dates
            time_constructed_route['WS_travel_time'] = (time_constructed_route['time_diff_min'] * time_constructed_route['count_dates'])/sum_count_dates
            time_constructed_route['WS_waiting_time'] = (time_constructed_route['waiting_time'] * time_constructed_route['count_dates'])/sum_count_dates
            weighted_sum_tt = time_constructed_route['WS_travel_time'].sum()
            weighted_sum_wt = time_constructed_route['WS_waiting_time'].sum()
            #add this to the first dataframe
            route_creation_frequency_single.loc[index_sequence,'travel_time'] = weighted_sum_tt
            route_creation_frequency_single.loc[index_sequence,'waiting_time'] = weighted_sum_wt
        #if there are no trips that follow each other with the hashes from the array, the route is removed
        else:
            route_creation_frequency_single = route_creation_frequency_single.drop(index_sequence)

    return route_creation_frequency_single

In [None]:
def calculate_hash_route_creation(route_creation):
    '''Returns a copy of route_creation with a hash for both directions of every stop sequence.

    'hash' is the hash of the stops in the listed order, 'hash_inverse' is the hash of the same
    stops in the reversed order. The input DataFrame is left untouched.
    '''
    def hash_in_listed_order(stops):
        return hash(tuple(stops))

    def hash_in_reversed_order(stops):
        return hash(tuple(stops[::-1]))

    sequences = route_creation['stop_sequence']
    route_creation_hash = route_creation.copy()
    #an already existing 'hash' column is overwritten
    route_creation_hash['hash'] = sequences.apply(hash_in_listed_order)
    route_creation_hash['hash_inverse'] = sequences.apply(hash_in_reversed_order)
    return route_creation_hash

In [None]:
'''Regroup the routes that are the same (even though they are in the opposite direction)'''

def regroup_same_stop_sequences(route_creation_hash):
    '''Merges, per route_id, a stop sequence with its reversed counterpart and sums their frequency.

    Expects the columns 'route_id', 'hash', 'hash_inverse', 'frequency' and 'stop_sequence'.
    Returns one row per route_id and pair of opposite sequences, sorted by route_id with a fresh index.
    '''
    #a sequence and its reverse end up with the same max(hash, hash_inverse), which is used as grouping key
    direction_free = route_creation_hash.assign(max_hash=route_creation_hash[['hash', 'hash_inverse']].max(axis=1))
    summed_frequency = (
        direction_free
        .groupby(['route_id', 'max_hash'], as_index=False)[['frequency']].sum()
        .rename(columns={'max_hash': 'hash'})
    )
    #the descriptive columns of every sequence, without the per-direction frequency
    descriptions = (
        route_creation_hash
        .drop(columns=['frequency'])
        .drop_duplicates(subset=['route_id', 'hash'])
    )
    #first attempt: the grouping key is the 'hash' of an existing sequence
    by_hash = summed_frequency.merge(descriptions, on=['route_id', 'hash'], how='left')
    unmatched = by_hash['stop_sequence'].isna()
    found_by_hash = by_hash[~unmatched]
    #second attempt for the rest: the grouping key was a 'hash_inverse', so look it up in that column
    found_by_hash_inverse = (
        by_hash.loc[unmatched, ['route_id', 'hash', 'frequency']]
        .rename(columns={'hash': 'hash_inverse'})
        .merge(descriptions, on=['route_id', 'hash_inverse'], how='left')
    )
    return (
        pd.concat([found_by_hash, found_by_hash_inverse])
        .sort_values(by=['route_id'])
        .reset_index(drop=True)
    )

In [None]:
'''Deletes the routes that do not represent 10% of the total route frequency and creates new route, if some of them are different'''

def apply_treshold_route_creation(route_hash_freq_combined):
    '''Filters out the rare sequences and gives every remaining sequence its own route_id.

    Expects the columns 'route_id', 'hash', 'hash_inverse', 'frequency' and 'stop_sequence'.
    Returns the remaining rows sorted by route_id, without the 'hash', 'hash_inverse' and
    'frequency' columns.
    '''
    #the treshold of a route_id is one tenth of the summed frequency of that route_id
    cutoffs = route_hash_freq_combined.groupby(['route_id'], as_index=False)['frequency'].sum()
    cutoffs['frequency'] = cutoffs['frequency']/10
    cutoffs = cutoffs.rename(columns={'frequency': 'frequency_treshold'})
    candidates = route_hash_freq_combined.merge(cutoffs, on='route_id', how='left')
    #keep every sequence that is not below the treshold of its route_id
    below_treshold = candidates['frequency'] < candidates['frequency_treshold']
    candidates = candidates[~below_treshold]
    #a sequence that occurs more than once (in either direction, over all route_ids) is kept only the first time
    candidates = (
        candidates
        .assign(max_hash=candidates[['hash', 'hash_inverse']].max(axis=1))
        .drop_duplicates(subset='max_hash')
        .drop(['hash_inverse', 'max_hash'], axis=1)
    )
    #look up the highest frequency of every route_id
    highest = (
        candidates
        .groupby(['route_id'], as_index=False)['frequency'].max()
        .rename(columns={'frequency': 'max_frequency'})
    )
    ranked = candidates.merge(highest, on='route_id', how='left')
    #the first sequence that reaches the highest frequency keeps the route_id ...
    most_frequent = ranked[ranked['frequency'] == ranked['max_frequency']]
    keeps_route_id = most_frequent.drop_duplicates(subset='route_id').index
    #... all the others get a new route_id, counting up from the highest route_id of the input
    gets_new_route_id = ranked.index[~ranked.index.isin(keeps_route_id)]
    first_new_route_id = route_hash_freq_combined['route_id'].max() + 1
    ranked.loc[gets_new_route_id, 'route_id'] = list(range(first_new_route_id, first_new_route_id + len(gets_new_route_id)))
    ranked = ranked.sort_values(by=['route_id'], ignore_index=True)
    #sequences with a frequency of zero are removed, then the helper columns are dropped
    final_routes = ranked[ranked['frequency'] != 0]
    return final_routes.drop(columns=['hash', 'frequency', 'frequency_treshold', 'max_frequency'])

In [None]:
''' To keep only the routes that have at least one belgian station in their route_sequence'''

def keep_belgian_routes(final_routes, belgian_stops=None):
    '''Keeps only the routes that have at least one Belgian station in their stop_sequence.

    Parameters:
        final_routes: DataFrame with the columns 'route_id' and 'stop_sequence' (an iterable of stops).
        belgian_stops: optional iterable with the Belgian stops. When it is left out, the notebook
            variable belgian_stops_Belgium_series is used, as before.

    Returns:
        The rows of final_routes whose route_id was not rejected. A route_id is rejected as soon as
        one of its rows has a stop_sequence without any Belgian stop, so all rows that share that
        route_id are removed together.
    '''
    if belgian_stops is None:
        belgian_stops = belgian_stops_Belgium_series
    #build the lookup set only once instead of once per stop of every route
    belgian_stop_set = set(belgian_stops)
    non_belgian_routes = {
        route_id
        for route_id, stop_sequence in zip(final_routes['route_id'], final_routes['stop_sequence'])
        if belgian_stop_set.isdisjoint(stop_sequence)
    }
    belgian_routes = final_routes.loc[~final_routes['route_id'].isin(non_belgian_routes)]

    return belgian_routes

In [None]:
'''Calculates the distances of the trip, by taking the distance between each stop of the stop_sequence'''

def calculate_distance_from_lat_long(name_first, name_second, stop_df, earth_radius_km=6373.0):
    '''Returns the Haversine distance in kilometers between two stops.

    Parameters:
        name_first, name_second: values of 'stop_name' of the two stops.
        stop_df: DataFrame with the columns 'stop_name', 'stop_lon' and 'stop_lat' (in degrees).
            When a name occurs more than once, its first row is used.
        earth_radius_km: radius of the earth used in the formula (default 6373.0, as before).

    Raises:
        IndexError: when one of the two names does not occur in stop_df['stop_name'].
    '''
    def coordinates_in_radians(stop_name):
        #filter the stops only once per station
        matches = stop_df.loc[stop_df['stop_name'] == stop_name, ['stop_lon', 'stop_lat']]
        if matches.empty:
            #same exception type as the failing .iloc[0] lookup, but with a message that names the stop
            raise IndexError('stop name not found in stop_df: ' + repr(stop_name))
        first_match = matches.iloc[0]
        return math.radians(first_match['stop_lon']), math.radians(first_match['stop_lat'])

    lon_first, lat_first = coordinates_in_radians(name_first)
    lon_second, lat_second = coordinates_in_radians(name_second)
    # To calculate the change in coordinates
    dlon = lon_second - lon_first
    dlat = lat_second - lat_first
    # To use the Haversine formula to get the distance in kilometers between the two stops
    a = math.sin(dlat / 2)**2 + math.cos(lat_first) * math.cos(lat_second) * math.sin(dlon / 2)**2
    # Rounding errors can push a marginally outside [0, 1], which would make the square roots fail
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    # To calculate the distance
    distance = earth_radius_km * c
    return distance

def calculate_distance(stop_sequence, stop_df):
    '''Returns the length of a stop_sequence: the sum of the distances between every two consecutive stops.

    A stop_sequence with fewer than two stops gives 0.
    '''
    total_distance = 0
    #walk over the positions of the second stop of every pair of consecutive stops
    for position in range(1, len(stop_sequence)):
        previous_stop = stop_sequence[position - 1]
        current_stop = stop_sequence[position]
        total_distance += calculate_distance_from_lat_long(previous_stop, current_stop, stop_df)
    return total_distance

In [None]:
'''Makes a df that can be used for building the nodes and edges of the graph using Networkx package'''

def create_df_for_Networkx(final_routes):
    '''Returns a df with one row per pair of consecutive stops of a route.

    Expects the columns 'route_id' and 'stop_sequence' (a list of stops). The result has the columns
    'route_id', 'stop_name_1' and 'stop_name_2' and can be used to build a Networkx L-space graph.
    '''
    #unfold the stop_sequence lists: one row per stop, the index still refers to the row of the route
    unfolded_stops = final_routes.apply(lambda route: pd.Series(route['stop_sequence']), axis=1)
    unfolded_stops = unfolded_stops.stack().reset_index(level=1, drop=True).rename('stop_sequence')
    route_stops = (
        final_routes.drop('stop_sequence', axis=1)
        .join(unfolded_stops)
        .reset_index(drop=True)
    )
    #every row of the shifted df holds the stop of the row before it
    previous_stops = route_stops.shift()
    #only keep the rows where the previous stop has the same route_id as the current stop
    previous_stops['match'] = previous_stops['route_id'].eq(route_stops['route_id'])
    previous_stops = previous_stops[previous_stops['match']]
    previous_stops = previous_stops.rename(columns={"stop_sequence": "stop_name_1", "stop_name": "stop_name_1"})
    #put the current stop next to the previous stop, so that every row is a pair of consecutive stops
    stop_pairs = previous_stops.join(route_stops[['stop_sequence']], lsuffix='_caller', rsuffix='_other', how='left')
    stop_pairs = stop_pairs.rename(columns={"stop_sequence": "stop_name_2", "stop_name": "stop_name_2"})
    #the shift turned the route_id into a float, so it is cast back
    stop_pairs['route_id'] = stop_pairs['route_id'].astype(np.int64)
    #duplicates are removed on the complete rows, before the three columns of interest are selected
    df_for_edges = (
        stop_pairs.drop_duplicates()
        [['route_id', 'stop_name_1', 'stop_name_2']]
        .reset_index(drop=True)
    )
    return df_for_edges

# To apply the route creation function

In [None]:
#Order the rows by route_id and then by hash, so that the rows always come in the same order
route_hash_service_freq_sorted = (
    route_hash_service_freq
    .sort_values(by=['route_id', 'hash'])
    .copy()
)

In [None]:
'''Applies all the functions from 1 get_extention_indexes to 11 create_df_for_Networkx'''

def full_route_creation(stop_sequences_df, number_of_trips_per_hash, service_id_count_dates, trips_hash_stop_sequence, stops_cleaned_stop_times_trips_merge_dates, stops_cleaned):
    '''Runs the complete route creation, from get_extention_indexes up to create_df_for_Networkx.

    Parameters:
        stop_sequences_df: df with the stop sequences, handed to the sequence construction steps.
        number_of_trips_per_hash: not used by this function; kept so existing calls keep working.
        service_id_count_dates: not used by this function; kept so existing calls keep working.
        trips_hash_stop_sequence: handed to give_begin_end_time.
        stops_cleaned_stop_times_trips_merge_dates: handed to give_begin_end_time.
        stops_cleaned: stops df that is used to calculate the distance of every route.

    Returns:
        A tuple of four elements: the output of give_begin_end_time (returned twice, in the first
        and in the second position), the Belgian routes with their 'distance' column, and the df
        that can be used to make a Networkx L-space (with treshold applied of 10%).
    '''
    index_of_extendable, index_of_begin_sequences, index_of_complete_sequences = get_extention_indexes(stop_sequences_df)
    route_creation_first = possible_sequences_construction(stop_sequences_df, index_of_extendable, index_of_begin_sequences, index_of_complete_sequences)
    route_creation_second = add_full_sequences(stop_sequences_df, route_creation_first, index_of_complete_sequences)
    route_creation_third = add_unused_sequences(stop_sequences_df, route_creation_second)
    route_creation_frequency_single_travel_time = give_begin_end_time(route_creation_third, trips_hash_stop_sequence, stops_cleaned_stop_times_trips_merge_dates)
    route_creation_hash = calculate_hash_route_creation(route_creation_frequency_single_travel_time)
    route_hash_freq_combined = regroup_same_stop_sequences(route_creation_hash)
    final_routes = apply_treshold_route_creation(route_hash_freq_combined)
    belgian_routes = keep_belgian_routes(final_routes)
    #keep_belgian_routes returns a filtered selection of final_routes: the column is added with assign,
    #which builds a new df, instead of being written into that selection (SettingWithCopyWarning)
    belgian_routes = belgian_routes.assign(
        distance=belgian_routes['stop_sequence'].apply(lambda x: calculate_distance(x, stops_cleaned))
    )
    df_for_edges = create_df_for_Networkx(belgian_routes)

    #NOTE(review): the first two elements are the same object; the first one was possibly meant to be
    #another intermediate result. Left as it is because the calling cell unpacks four values.
    return route_creation_frequency_single_travel_time, route_creation_frequency_single_travel_time ,belgian_routes, df_for_edges

In [None]:
#Run the complete route creation on the Belgian data; the first two returned values get the same name
(
    route_creation_frequency_single_travel_time,
    route_creation_frequency_single_travel_time,
    belgian_routes_Belgium,
    df_for_edges_Belgium,
) = full_route_creation(
    stop_sequences_df=route_hash_service_freq,
    number_of_trips_per_hash=route_hash_service_freq_sorted,
    service_id_count_dates=service_id_df,
    trips_hash_stop_sequence=trips_hash_stop_sequence,
    stops_cleaned_stop_times_trips_merge_dates=stops_cleaned_stop_times_trips_merge_dates,
    stops_cleaned=stops_cleaned,
)
#Show the three results
route_creation_frequency_single_travel_time
belgian_routes_Belgium
df_for_edges_Belgium

In [None]:
#Export the edges of the Belgian network as a csv with a header row, without the index, encoded as UTF-8 with BOM
#NOTE(review): the target is an absolute path on one specific machine
df_for_edges_Belgium.to_csv(
    r'/Users/pol/Desktop/CSV_export/df_for_edges_Belgium.csv',
    index=False,
    header=True,
    encoding='utf-8-sig',
)

In [None]:
#Export the Belgian routes as a csv with a header row, without the index, encoded as UTF-8 with BOM
#NOTE(review): the target is an absolute path on one specific machine
belgian_routes_Belgium.to_csv(
    r'/Users/pol/Desktop/CSV_export/belgian_routes_Belgium.csv',
    index=False,
    header=True,
    encoding='utf-8-sig',
)

In [None]:
#Show, as one tuple, the six inputs that were handed to full_route_creation
(
    route_hash_service_freq,
    route_hash_service_freq_sorted,
    service_id_df,
    trips_hash_stop_sequence,
    stops_cleaned_stop_times_trips_merge_dates,
    stops_cleaned,
)