In [1]:
!pip install geopy



In [2]:
'''To import the required packages.'''
import pandas as pd
import numpy as np
import networkx as nx
import collections
import matplotlib.pyplot as plt
import math
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Settings

In [3]:
'''To display all output results of a Jupyter cell.'''
from IPython.core.interactiveshell import InteractiveShell
# "all" makes IPython echo every top-level expression of a cell, not only the last one.
# Side effect: the bare '''...''' description strings at the top of the cells are echoed as output too.
InteractiveShell.ast_node_interactivity = "all"

In [4]:
'''To ensure that the output results of extensive output results are not truncated.'''
# NOTE: the option below is commented out, so this cell currently changes nothing;
# uncomment it to let pandas display up to 4000 rows.
#pd.options.display.max_rows = 4000

'To ensure that the output results of extensive output results are not truncated.'

# Import of the Dutch railway datasets

In [5]:
'''To register the GitHub link with the Dutch data as a variable.'''
# Base URL of the raw GTFS files; it ends with a slash because file names are appended to it directly.
# NOTE(review): the recorded output of the import cell below is "HTTP Error 404: Not Found",
# so this branch/path may no longer exist — confirm the location of the data.
datalink = "https://raw.githubusercontent.com/polkuleuven/Thesis_Train/Pol/gtfs_train_Netherlands_1503/"

'To register the GitHub link with the Dutch data as a variable.'

In [6]:
'''Import all the GTFS data'''

# Every table is read from datalink + <file name>; a missing file raises an HTTPError.
#To import the agency dataset that contains limited information about the Dutch NS railway agency.
agency_Netherlands = pd.read_csv(datalink + "agency.txt", sep=",")
#To import the stops dataset that contains information about the ids, the names and the geographical coordinates of the Dutch NS railway stations.
stops_Netherlands = pd.read_csv(datalink + "stops.txt", sep=",")
#To import the feed_info dataset that contains limited information about the Dutch NS railway feed.
feed_info_Netherlands = pd.read_csv(datalink + "feed_info.txt", sep=",")
#To import the transfers dataset that gives the minimum transfer time to switch routes at each Dutch railway station.
transfers_not_cleaned_Netherlands = pd.read_csv(datalink + "transfers.txt", sep=",")
#To import the routes dataset that provides the id, the name and the type of vehicle used for all Dutch NS railway routes.
routes_Netherlands = pd.read_csv(datalink + "routes.txt", sep=",")
#To import the trips dataset that gives for all routes an overview of the trips and the headsigns of these trips belonging to the Dutch NS railway route.
#The service_id is an indication of all the dates this trip is valid (consultable in the calendar_dates dataset).
trips_Netherlands = pd.read_csv(datalink + "trips.txt", sep=",")
#To import the calendar_dates dataset that gives for each service_id all the exact dates when that service_id is valid.
calendar_dates_Netherlands = pd.read_csv(datalink + "calendar_dates.txt", sep=",")


'Import all the GTFS data'

HTTPError: HTTP Error 404: Not Found

In [None]:
'''Import stop_times that is is of multiple csv files'''
#To import the stop_times dataset that gives for all trips an overview of the ids of the stations served and the sequence in which these stations are served.
#In addition, for all the trips the arrival and departure times at the stations served are given.
#The dataset is split over the files stop_times-1.csv ... stop_times-18.csv.
stop_times_range = [*range(2, 19)]
#All parts are read first and concatenated once: calling pd.concat inside the loop would re-copy
#the rows accumulated so far at every iteration.
stop_times_parts_Netherlands = [
    pd.read_csv(datalink + "stop_times-" + str(index) + ".csv", sep=",")
    for index in [1, *stop_times_range]
]
#The original row labels of every part are kept (no ignore_index), so index labels repeat across parts.
stop_times_Netherlands = pd.concat(stop_times_parts_Netherlands)
stop_times_Netherlands

In [None]:
'''Import shapes that consists of multiple csv files'''
#To import the shapes dataset, which is split over the files shapes-1.csv ... shapes-3.csv.
shapes_range = [*range(2, 4)]
#All parts are read first and concatenated once instead of growing the df inside the loop.
shapes_parts_Netherlands = [
    pd.read_csv(datalink + "shapes-" + str(index) + ".csv", sep=",")
    for index in [1, *shapes_range]
]
#The original row labels of every part are kept (no ignore_index), so index labels repeat across parts.
shapes_Netherlands = pd.concat(shapes_parts_Netherlands)
shapes_Netherlands

# Cleaning of the Dutch railway data


In [None]:
# To define a function that removes the accents from a string
def remove_accents(text):
    '''Return text with its accents removed and every remaining non-ASCII character dropped.

    The string is decomposed (Unicode NFD) so that accents become separate combining
    characters, which are then discarded by the ASCII encoding step.

    Parameters
    ----------
    text : str, None or float NaN
        The string to clean. Missing values (None / NaN) are returned unchanged so the
        function can be applied to pandas columns that contain empty cells.

    Returns
    -------
    str
        The ASCII-only version of text (or the missing value itself).

    Raises
    ------
    TypeError
        If text is neither a string nor a missing value.
    '''
    import unicodedata
    # NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return text
    decomposed = unicodedata.normalize('NFD', text)
    return decomposed.encode('ascii', 'ignore').decode('ascii')

In [None]:
''' To clean the routes_Netherlands df.'''
#To keep the train routes (route_type 2)
#An explicit copy is taken so that the column below is written into an independent df and not
#into a slice of routes_Netherlands (which triggers pandas' SettingWithCopyWarning).
routes_cleaned_Netherlands = routes_Netherlands[routes_Netherlands['route_type'] == 2].copy()

# To remove the accents from the route_long_name and to change to uppercase
routes_cleaned_Netherlands['route_long_name'] = routes_cleaned_Netherlands['route_long_name'].apply(remove_accents).str.upper()
routes_cleaned_Netherlands

In [None]:
''' To clean the trips_Netherlands df.'''
# To remove the routes that are not train routes
no_route_id_train_routes = routes_Netherlands.loc[routes_Netherlands['route_type'] != 2, 'route_id']
# An explicit copy is taken so that the column below is written into an independent df and not
# into a slice of trips_Netherlands (which triggers pandas' SettingWithCopyWarning).
trips_cleaned_Netherlands = trips_Netherlands[~trips_Netherlands['route_id'].isin(no_route_id_train_routes)].copy()

# To remove the accents from the trip_headsign and to change to uppercase
trips_cleaned_Netherlands['trip_headsign'] = trips_cleaned_Netherlands['trip_headsign'].apply(remove_accents).str.upper()
trips_cleaned_Netherlands

In [None]:
''' To clean the stop_times_Netherlands df.'''
# To remove the stop_times trip_ids that are not trip_ids that belong to train routes and
# to change the data type of the stop_id column
no_trip_id_train_routes = trips_Netherlands.loc[trips_Netherlands['route_id'].isin(no_route_id_train_routes), 'trip_id']
# Explicit copies are taken below so that columns are written into independent dfs and not into
# slices of the imported dfs (which triggers pandas' SettingWithCopyWarning).
stop_times_cleaned_Netherlands = stop_times_Netherlands[~stop_times_Netherlands['trip_id'].isin(no_trip_id_train_routes)].copy()
stop_times_cleaned_Netherlands['stop_id'] = stop_times_cleaned_Netherlands['stop_id'].apply(str)

# To take a subset of the stops_Netherlands df and to remove the accents from the stop_name and to change the stop_name to uppercase
stop_id_name_Netherlands = stops_Netherlands[['stop_id', 'stop_name']].copy()
stop_id_name_Netherlands['stop_name'] = stop_id_name_Netherlands['stop_name'].apply(remove_accents).str.upper()

# To add the stop_name attribute of the stop_id_name_Netherlands df to the stop_times_cleaned_Netherlands df and
# to remove the stop_id attribute (a left join keeps every stop_time, also those without a matching stop)
stop_times_cleaned_Netherlands = pd.merge(stop_times_cleaned_Netherlands, stop_id_name_Netherlands, on='stop_id', how='left')
stop_times_cleaned_Netherlands = stop_times_cleaned_Netherlands.drop('stop_id', axis=1)
stop_times_cleaned_Netherlands

''' To clean the stops_Netherlands df.  (1) ''' 
##### To take all unique stop_names that appear in the stop_times df
unique_stop_names_stop_times_Netherlands = stop_times_cleaned_Netherlands['stop_name'].drop_duplicates()

##### To select all rows of the stops_Netherlands df that contain a stop_name that is in unique_stop_names_stop_times_Netherlands
# The names are normalised the same way as in stop_times_cleaned_Netherlands (accents removed, uppercase)
# so that the isin() comparison below matches.
stops_initial_Netherlands = stops_Netherlands.copy()
stops_initial_Netherlands['stop_name'] = stops_Netherlands['stop_name'].apply(remove_accents)
stops_initial_Netherlands['stop_name'] = stops_initial_Netherlands['stop_name'].str.upper()
stops_initial_Netherlands = stops_initial_Netherlands[stops_initial_Netherlands['stop_name'].isin(unique_stop_names_stop_times_Netherlands)]

##### To take from the stops_initial_Netherlands df all stop_ids that contain a 'stoparea:' to get the correct stop coordinates
stops_cleaned_Netherlands = stops_initial_Netherlands[stops_initial_Netherlands['stop_id'].str.contains('stoparea:')]
stops_cleaned_Netherlands = stops_cleaned_Netherlands.drop_duplicates()

##### To verify that there is an equal number of unique stop_names in the unique_stop_names_stop_times_Netherlands series and the stops_cleaned_Netherlands df
# NOTE: the two lengths are only displayed, not asserted; they have to be compared by eye.
stop_names_stops_cleaned_Netherlands = stops_cleaned_Netherlands[['stop_name']].drop_duplicates()
len(unique_stop_names_stop_times_Netherlands)
len(stop_names_stops_cleaned_Netherlands)

''' To clean the stops_Netherlands df.  (2) ''' 
###### To initialize the Nominatim API to get the location from the input string
# NOTE(review): Nominatim's usage policy asks for a user_agent that identifies the application;
# "application" is generic — consider replacing it.
geolocator = Nominatim(user_agent="application")
# The public Nominatim service allows at most one request per second, so wait 1 s between two calls.
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)

##### To get the location with the geolocator.reverse() function and to extract the country from the location instance
country_list = []
for latitude, longitude in zip(stops_cleaned_Netherlands['stop_lat'], stops_cleaned_Netherlands['stop_lon']):
    # To assign the latitude and longitude into a geolocator.reverse() method
    location = reverse((latitude, longitude), language='en', exactly_one=True)
    # reverse() gives None when nothing is found or when the RateLimiter gave up after its retries;
    # an empty country is recorded in that case instead of failing on location.raw
    address = location.raw.get('address', {}) if location is not None else {}
    country_list.append(address.get('country', ''))

##### To add the values of country_list as a new attribute country
# assign() builds a new df, so the column is never written into a slice of stops_initial_Netherlands
stops_cleaned_Netherlands = stops_cleaned_Netherlands.assign(country=country_list)
stops_cleaned_Netherlands

###### To select the Dutch stations in the stops_cleaned dataset
dutch_stops_Netherlands = stops_cleaned_Netherlands[stops_cleaned_Netherlands['country'] == 'Netherlands']
dutch_stops_Netherlands_series = stops_cleaned_Netherlands.loc[stops_cleaned_Netherlands['country'] == 'Netherlands', 'stop_name']

# NOTE: absolute, machine-specific export folder; change it here when the notebook runs on another machine
csv_export_dir = r'/Users/pol/Desktop/CSV_export'
stops_cleaned_Netherlands.to_csv(csv_export_dir + '/stops_cleaned_Netherlands.csv', index = False, header=True, encoding='utf-8-sig')

dutch_stops_Netherlands_series.to_csv(csv_export_dir + '/dutch_stops_Netherlands_series.csv', index = False, header=True, encoding='utf-8-sig')

In [None]:
'''import the cleaned version of the stops with their country'''
# Both names are rebound to previously exported CSV files, replacing the values computed in the geocoding step above.
stops_cleaned_Netherlands = pd.read_csv("https://raw.githubusercontent.com/polkuleuven/Thesis_Train/Pol/stops_cleaned/stops_cleaned_Netherlands.csv", sep=",")
# Only the stop_name column of the second file is kept, as a Series.
dutch_stops_Netherlands_series = pd.read_csv("https://raw.githubusercontent.com/polkuleuven/Thesis_Train/Pol/country_stops_series/dutch_stops_Netherlands_series.csv", sep=",")['stop_name']

# Exploratory data analysis with the Dutch railway data

In [None]:
'''To calculate the number of unique route_ids '''
# A set keeps every route_id once; its size is the number of distinct train routes
set_routes_Netherlands = set(routes_cleaned_Netherlands['route_id'])
len(set_routes_Netherlands)

In [None]:
'''To calculate the total number of stations in the stops_cleaned_Netherlands dataset'''
# A set keeps every stop_id once; its size is the number of distinct stations
set_stations_Netherlands = set(stops_cleaned_Netherlands['stop_id'])
len(set_stations_Netherlands)

In [None]:
'''To calculate the total number of Dutch stations in the stops_cleaned dataset'''
# dutch_stops_Netherlands_series holds the stop_name of every stop whose country is 'Netherlands'
len(dutch_stops_Netherlands_series)

# **Preparation for the L-space representation of the Dutch railway system**

In [None]:
'''To merge a selection of the trips dataset and a selection of the routes dataset on route_id'''
# Inner join: only trips whose route_id is present in the cleaned routes are kept
trips_routes_Netherlands = (
    trips_cleaned_Netherlands[['route_id','service_id','trip_id', 'trip_headsign']]
    .merge(
        routes_cleaned_Netherlands[['route_id', 'route_short_name', 'route_long_name']],
        on='route_id',
    )
)
trips_routes_Netherlands

In [None]:
'''To merge a selection of the stop_times_cleaned_Netherlands dataset with a selection of the stops_cleaned_Netherlands dataset'''
# Inner join on stop_name: the coordinates of the station are added to every stop_time
stop_times_stops_Netherlands = (
    stop_times_cleaned_Netherlands[['trip_id','arrival_time', 'departure_time','stop_name','stop_sequence']]
    .merge(
        stops_cleaned_Netherlands[['stop_name', 'stop_lat', 'stop_lon']],
        on='stop_name',
    )
)
stop_times_stops_Netherlands

In [None]:
'''To merge a selection of the stop_times_stops_Netherlands dataset with the trips_routes_Netherlands dataset.'''
# Inner join on trip_id: one row per (trip, served station) with the route information attached
trips_routes_stop_times_stops_Netherlands = trips_routes_Netherlands.merge(
    stop_times_stops_Netherlands,
    on='trip_id',
)
trips_routes_stop_times_stops_Netherlands

In [None]:
'''To create a route_sequence dataset that gives for each trip_id that belongs to a route the sequence of stations served'''
# Grouping on (route, headsign, trip, stop_sequence) and taking .last() leaves one row per stop of each trip
# (the last one when a (trip_id, stop_sequence) pair occurs more than once).
route_sequence_Netherlands = trips_routes_stop_times_stops_Netherlands.groupby(['route_id','route_long_name','trip_headsign','trip_id','stop_sequence'], as_index=False)[['stop_name', 'stop_lat', 'stop_lon']].last()
route_sequence_Netherlands

'''To calculate the hash and the hash_inverse values for the stop sequence of each trip_id'''

#To copy the trips_routes_Netherlands df
trips_hash_Netherlands = trips_routes_Netherlands.copy()

#For each trip_id, the stop sequence is the tuple of the stop_names served by that trip, ordered by stop_sequence.
#All tuples are built in one grouped pass instead of re-filtering the whole stop_times df for every trip_id.
#(A stable sort keeps the original row order for rows with an equal stop_sequence.)
stop_names_per_trip_Netherlands = (
    stop_times_cleaned_Netherlands
    .sort_values(by='stop_sequence', kind='stable')
    .groupby('trip_id')['stop_name']
    .agg(tuple)
)
stop_names_lookup_Netherlands = dict(zip(stop_names_per_trip_Netherlands.index, stop_names_per_trip_Netherlands))

#A trip_id without stop_times gets the empty tuple, exactly as an empty selection did before.
trip_stop_tuples_Netherlands = [stop_names_lookup_Netherlands.get(trip_Netherlands, ()) for trip_Netherlands in trips_hash_Netherlands['trip_id']]

#The hash value of the tuple of stop_names is placed in the hash column of the trip_id and
#the hash value of the reversed tuple in the hash_inverse column.
#The columns are created directly from the integer hashes: writing them into NaN (float) columns
#cannot represent 64-bit hash values exactly.
#NOTE: Python randomises string hashes per process, so these values differ between kernel sessions
#unless PYTHONHASHSEED is fixed.
trips_hash_Netherlands['hash'] = [hash(stop_tuple) for stop_tuple in trip_stop_tuples_Netherlands]
trips_hash_Netherlands['hash_inverse'] = [hash(stop_tuple[::-1]) for stop_tuple in trip_stop_tuples_Netherlands]

In [None]:
#trips_hash_Netherlands.to_csv(r'/Users/pol/Desktop/CSV_export/trips_hash_Netherlands.csv', index = False, header=True, encoding='utf-8-sig')

In [None]:
# NOTE: this rebinds datalink (set to the GTFS folder earlier in the notebook) to the repository root on the main branch,
# without a trailing slash; re-running the GTFS import cells after this cell would build wrong URLs.
datalink = "https://raw.githubusercontent.com/polkuleuven/Thesis_Train/main"
# Loads a previously exported trips_hash table, replacing the trips_hash_Netherlands df computed above.
trips_hash_Netherlands = pd.read_csv(datalink + "/hash_cleaning/trips_hash_Netherlands.csv", sep=",")
trips_hash_Netherlands

In [None]:
''' To groupby the trip_id and to order the stop_sequence in an ascending order (the stop_sequences of some
routes are initially in descending order while other stop_sequences are in ascending order) '''

# Every trip_id group is sorted on stop_sequence separately and the sorted groups are put back together.
# NOTE(review): with as_index=False + apply the result presumably carries a two-level index
# (group position, original row label) — confirm before relying on the index.
trips_stop_sequence_ascending_Netherlands = stop_times_stops_Netherlands.groupby(['trip_id'], as_index=False).apply(lambda x: x.sort_values('stop_sequence'))
trips_stop_sequence_ascending_Netherlands

In [None]:
''' To put the stop_names of a stop sequence of a trip_id in a list '''
# One row per trip_id; the list of stop_names (in the order of the sorted df) is stored in the column stop_sequence
trips_stop_sequence_Netherlands = (
    trips_stop_sequence_ascending_Netherlands
    .groupby('trip_id')['stop_name']
    .apply(lambda stop_names: stop_names.tolist())
    .reset_index()
    .rename(columns={'stop_name':'stop_sequence'})
)
trips_stop_sequence_Netherlands

In [None]:
''' To add the list of stop_sequence of stations to the trips_hash_Netherlands df by joining on trip_id'''
# To add the stop_sequence of stations to the trips_hash_Netherlands df by joining on trip_id
# (left join: trips without a stop sequence are kept, with a missing stop_sequence)
trips_hash_stop_sequence_Netherlands = pd.merge(trips_hash_Netherlands, trips_stop_sequence_Netherlands, on='trip_id', how='left')

# To put the columns in a more logical order (route_short_name is not selected and is therefore dropped here)
trips_hash_stop_sequence_Netherlands = trips_hash_stop_sequence_Netherlands[['route_id', 'route_long_name','service_id','trip_headsign','trip_id','hash', 'hash_inverse','stop_sequence']]
trips_hash_stop_sequence_Netherlands

In [None]:
''' To count the number of dates for each service_id '''
# The size of every service_id group is the number of calendar dates on which that service_id is valid
service_id_df_Netherlands = (
    calendar_dates_Netherlands
    .groupby(['service_id'])
    .size()
    .reset_index(name='count_service_id')
)
service_id_df_Netherlands

In [None]:
''' To regroup the days per service_id in a set '''
# One row per service_id with the set of its dates (column dates) and the number of dates (column count_service_id)
service_id_dates_Netherlands = (
    calendar_dates_Netherlands
    .groupby('service_id')['date']
    .apply(lambda dates: set(dates.tolist()))
    .reset_index()
    .rename(columns={'date':'dates'})
    .merge(service_id_df_Netherlands, on='service_id', how='left')
)
service_id_dates_Netherlands

In [None]:
''' To put the different trip_ids in a list after joining on (route_id, route_long_name, hash and service_id) '''
# One row per (route_id, route_long_name, hash, hash_inverse, service_id); the column trip_id holds the list of
# all trip_ids of that combination.
route_hash_freq_Netherlands = trips_hash_stop_sequence_Netherlands.groupby(['route_id','route_long_name','hash', 'hash_inverse', 'service_id'])['trip_id'].apply(lambda group_series: group_series.tolist()).reset_index()
route_hash_freq_Netherlands

In [None]:
''' To add the sequence of stops to the route_hash_freq dataset '''
# The left join matches every row of trips_hash_stop_sequence_Netherlands that has the same
# (route_id, hash, hash_inverse, service_id), so a grouped row can be repeated several times ...
route_hash_freq_Netherlands = pd.merge(route_hash_freq_Netherlands, trips_hash_stop_sequence_Netherlands[['route_id','hash', 'hash_inverse', 'service_id','stop_sequence']], on=['route_id', 'hash', 'hash_inverse', 'service_id'], how='left')
# ... and only the first row per (route_id, hash, service_id) is kept.
route_hash_freq_Netherlands = route_hash_freq_Netherlands.drop_duplicates( subset = ['route_id', 'hash', 'service_id'], keep = 'first')

route_hash_freq_Netherlands

In [None]:
''' To calculate the number of trip_ids in the list of trip_ids and to add it as a new column '''
# One count per row, in row order
number_trip_ids_Netherlands = [len(trip_ids_Netherlands) for trip_ids_Netherlands in route_hash_freq_Netherlands['trip_id']]
route_hash_freq_Netherlands['number_trip_ids'] = number_trip_ids_Netherlands

route_hash_freq_Netherlands

In [None]:
''' To merge the route_hash_freq_Netherlands df with the service_id_dates to get the sets of corresponding dates '''
# The left join adds the columns dates and count_service_id of the service_id to every row
route_hash_service_freq_Netherlands = pd.merge(route_hash_freq_Netherlands, service_id_dates_Netherlands, on='service_id', how='left')
# The copy is the df that the next cell modifies; route_hash_service_freq_Netherlands itself keeps one service_id per row
route_hash_service_freq_Netherlands_copy = route_hash_service_freq_Netherlands.copy()
route_hash_service_freq_Netherlands

In [None]:
'''Groups the service_id together for each route_id and hash combination'''
# The groupby(...).last() is only used to enumerate every (route_id, hash) combination once.
for index, combi_route_id_hash in route_hash_service_freq_Netherlands_copy.groupby(['route_id','hash'], as_index = False)['service_id'].last().iterrows():
    # All service_ids that occur for this (route_id, hash) combination
    set_service_id = set(route_hash_service_freq_Netherlands_copy.loc[(route_hash_service_freq_Netherlands_copy['route_id'] == combi_route_id_hash['route_id']) & (route_hash_service_freq_Netherlands_copy['hash'] == combi_route_id_hash['hash'])]['service_id'])
    # NOTE(review): the intent appears to be that every row of the combination gets the whole set as its service_id value
    # (later code intersects these values with &); how pandas broadcasts a set assigned through .loc depends on the
    # pandas version — verify the resulting cells really contain sets.
    route_hash_service_freq_Netherlands_copy.loc[(route_hash_service_freq_Netherlands_copy['route_id'] == combi_route_id_hash['route_id']) & (route_hash_service_freq_Netherlands_copy['hash'] == combi_route_id_hash['hash']),['service_id']] = set_service_id
route_hash_service_freq_Netherlands_copy

In [None]:
'''Get the distinct stop sequences for all routes to create the possible track combinations later on'''
# Keeps the first row of every (route_id, hash) combination and only the four columns used by the route creation functions
distinct_stop_sequences_Netherlands = route_hash_service_freq_Netherlands_copy.drop_duplicates(subset = ["route_id", 'hash'])[['route_id','hash','stop_sequence', 'service_id']]
distinct_stop_sequences_Netherlands

## Functions for the route creation

In [None]:
'''Some functions to better factorise the functions in the coming cells'''

def select_stop_sequences(stop_sequences_df, route_id):
    '''returns an independent copy of the rows of stop_sequences_df whose route_id equals the given route_id'''
    belongs_to_route = stop_sequences_df['route_id'].eq(route_id)
    return stop_sequences_df.loc[belongs_to_route].copy()

In [None]:
'''Finds the routes that can be either extended from behind or from after and those which are complete sequences'''

def get_extention_indexes(stop_sequences_df):
    '''returns the three indexes: index_of_extendable, index_of_begin_sequences, index_of_complete_sequences

    Two stop sequences of the same route can be chained when
      * the last stop of the first one is the first stop of the second one,
      * none of the other stops of the added sequence already occurs in the sequence that is extended, and
      * both have at least one service_id in common.

    Parameters
    ----------
    stop_sequences_df : DataFrame
        Needs the columns 'route_id', 'stop_sequence' (non-empty list of stop names) and
        'service_id' (collection of service_ids, e.g. a set).

    Returns
    -------
    tuple of three dicts, each mapping a route_id to a list of index labels of stop_sequences_df:
        index_of_extendable         : sequences that can be chained with at least one other sequence
        index_of_begin_sequences    : extendable sequences that no other sequence can precede
        index_of_complete_sequences : sequences that cannot be chained at all
    A route_id is only present in a dict when it has at least one such sequence.
    '''
    index_of_extendable = {}
    index_of_begin_sequences = {}
    index_of_complete_sequences = {}
    for route_id in stop_sequences_df['route_id'].unique():
        #select the sequences of the route_id selected by the loop iteration (read-only, so no copy is needed)
        route_sequences_route_id = stop_sequences_df[stop_sequences_df['route_id'] == route_id]
        candidates = list(zip(route_sequences_route_id['stop_sequence'], route_sequences_route_id['service_id']))
        for index_trip, (trip_stops, trip_services) in zip(route_sequences_route_id.index, candidates):
            visited_stops = set(trip_stops)
            can_extend_after = False
            can_extend_behind = False
            for other_stops, other_services in candidates:
                #the other sequence can follow after the last stop of the trip
                follows = other_stops[0] == trip_stops[-1] and visited_stops.isdisjoint(other_stops[1:])
                #the other sequence can come before the first stop of the trip
                precedes = other_stops[-1] == trip_stops[0] and visited_stops.isdisjoint(other_stops[:-1])
                if not (follows or precedes):
                    continue
                #the membership itself is tested (not the truthiness of the service_id), so a service_id of 0 counts too
                if not any(service_id in other_services for service_id in trip_services):
                    continue
                can_extend_after = can_extend_after or follows
                can_extend_behind = can_extend_behind or precedes
            if can_extend_after or can_extend_behind:
                index_of_extendable.setdefault(route_id, []).append(index_trip)
                #nothing can come before it: the sequence is the beginning of a route
                if not can_extend_behind:
                    index_of_begin_sequences.setdefault(route_id, []).append(index_trip)
            else:
                index_of_complete_sequences.setdefault(route_id, []).append(index_trip)

    return index_of_extendable, index_of_begin_sequences, index_of_complete_sequences

In [None]:
'''Creates all the sequences of routes possible to reconstruct the real route'''

def possible_sequences_construction(stop_sequences_df, index_of_extendable, index_of_begin_sequences, index_of_complete_sequences):
    '''returns the first part of the route_creation, two others need to be added

    For every route_id that has begin sequences, each begin sequence is extended step by step with every
    extendable sequence that can follow it, until no route can be extended anymore. A sequence can follow
    a route when its first stop is the last stop of the route, none of its other stops already occurs in the
    route and both share a service_id. When several sequences can follow, the route is branched into one
    route per possibility.

    Parameters
    ----------
    stop_sequences_df : DataFrame with the columns 'route_id', 'hash', 'stop_sequence' (list) and 'service_id' (set)
    index_of_extendable, index_of_begin_sequences : dicts route_id -> list of index labels of stop_sequences_df
    index_of_complete_sequences : not used here; kept so the signature matches the other route creation steps

    Returns
    -------
    DataFrame with the columns 'route_id', 'hash' (list of the hashes of the chained sequences),
    'stop_sequence' (the chained stops) and 'service_id' (the service_ids common to all chained parts).
    The df has these columns even when no route could be built.
    '''
    columns = ['route_id', 'hash', 'stop_sequence', 'service_id']
    constructed_routes = []
    for route_id in index_of_extendable:
        #checks if some parts are begin sequences, if not, then we can't build routes with multiple sequences
        if route_id not in index_of_begin_sequences:
            continue
        routes_with_route_id = stop_sequences_df[stop_sequences_df['route_id'] == route_id]
        begin_rows = routes_with_route_id.loc[index_of_begin_sequences[route_id], columns]
        extension_rows = routes_with_route_id.loc[index_of_extendable[route_id], columns]
        extensions = list(zip(extension_rows['hash'], extension_rows['stop_sequence'], extension_rows['service_id']))
        #every route is a (list of hashes, stop sequence, service_ids) triple; start with the begin sequences
        routes = [([part_hash], stops, services) for part_hash, stops, services in zip(begin_rows['hash'], begin_rows['stop_sequence'], begin_rows['service_id'])]
        #one pass tries to extend every current route once; passes are repeated until a pass extends nothing.
        #(Counting the finished routes across passes would count the same finished route again in every pass
        #and could end the loop while a long chain is still extendable.)
        extended_any = True
        while extended_any:
            extended_any = False
            finished_routes = []
            grown_routes = []
            for hashes, stops, services in routes:
                visited_stops = set(stops)
                #an extension must start at the last stop, must not revisit a stop (otherwise the extension
                #could loop forever) and must share a service_id with the route
                successors = [
                    (ext_hash, ext_stops, ext_services)
                    for ext_hash, ext_stops, ext_services in extensions
                    if ext_stops[0] == stops[-1]
                    and visited_stops.isdisjoint(ext_stops[1:])
                    and any(service_id in ext_services for service_id in services)
                ]
                if not successors:
                    #the route can't be extended anymore
                    finished_routes.append((hashes, stops, services))
                    continue
                extended_any = True
                #the route is replaced by one extended route per possible extension; new lists are built so
                #that the hashes and stops of the other branches are not modified
                for ext_hash, ext_stops, ext_services in successors:
                    grown_routes.append((hashes + [ext_hash], stops + ext_stops[1:], ext_services & services))
            routes = finished_routes + grown_routes
        #adds all the possible routes created with the sequences of the route_id of the main loop
        for hashes, stops, services in routes:
            constructed_routes.append({'route_id': route_id, 'hash': hashes, 'stop_sequence': stops, 'service_id': services})
    return pd.DataFrame(constructed_routes, columns=columns)

In [None]:
'''Adds the full sequences to the route_creation dataframe'''

def add_full_sequences(stop_sequences_df, route_creation, index_of_complete_sequences):
    '''returns the second part of the route_creation, one other needs to be added

    Adds every complete sequence (a sequence that cannot be chained with another one) to route_creation
    as a route of its own and sorts the result on route_id.

    Parameters
    ----------
    stop_sequences_df : DataFrame with the columns 'route_id', 'hash', 'stop_sequence' and 'service_id'
    route_creation : DataFrame with the routes built so far (may be empty); it is not modified
    index_of_complete_sequences : dict route_id -> list of index labels of stop_sequences_df

    Returns
    -------
    A new DataFrame with the rows of route_creation followed by the complete sequences (their hash wrapped
    in a one-element list), sorted on route_id with a fresh 0..n-1 index.
    '''
    columns = ['route_id', 'hash', 'stop_sequence', 'service_id']
    frames = [route_creation]
    for route_id in index_of_complete_sequences:
        #finds all the complete sequences for that route_id
        copy_complete_sequences_df = stop_sequences_df.loc[index_of_complete_sequences[route_id], columns].copy()
        #the hash column of route_creation holds lists of hashes
        copy_complete_sequences_df['hash'] = copy_complete_sequences_df['hash'].apply(lambda x: [x])
        frames.append(copy_complete_sequences_df)
    #all rows are added with one concat (DataFrame.append no longer exists in pandas 2);
    #empty frames are left out so they cannot influence the resulting dtypes
    non_empty_frames = [frame for frame in frames if not frame.empty]
    if non_empty_frames:
        route_creation = pd.concat(non_empty_frames, ignore_index=True)
    elif 'route_id' not in route_creation.columns:
        #nothing at all to return: give back an empty df with the expected columns
        return pd.DataFrame(columns=columns)
    route_creation = route_creation.sort_values(by=['route_id'], ignore_index = True)
    return route_creation

In [None]:
'''Adds the sequences that were not yet added in the route_creation dataframe'''

def add_unused_sequences(stop_sequences_df, route_creation):
    '''returns the third part of the route_creation

    For every route_id that already has at least one route in route_creation, adds the stop sequences of
    that route_id that were not used to build any of its routes and whose stops are not all contained in
    one of the routes that were already in route_creation.

    Parameters
    ----------
    stop_sequences_df : DataFrame with the columns 'route_id', 'hash', 'stop_sequence' and 'service_id'
    route_creation : DataFrame with the columns 'route_id', 'hash' (list of hashes), 'stop_sequence' and
        'service_id'; it is not modified

    Returns
    -------
    route_creation itself when nothing has to be added, otherwise a new DataFrame with the extra rows
    (their hash wrapped in a one-element list) at the end and a fresh 0..n-1 index.
    '''
    columns = ['route_id', 'hash', 'stop_sequence', 'service_id']
    #an empty route_creation without columns contains no route_id at all, so nothing can be added
    if 'route_id' not in route_creation.columns:
        return route_creation
    unused_sequences = []
    for route_id in stop_sequences_df['route_id'].unique():
        created_routes = route_creation[route_creation['route_id'] == route_id]
        # NOTE(review): a route_id without any route in route_creation is skipped completely, as before — confirm this is intended
        if created_routes.empty:
            continue
        #get a set of the hashes that were employed to create the routes for that route_id.
        #The lists are flattened in plain Python: going through pd.Series/stack turns the hashes into floats
        #when the lists have different lengths, and large hashes then no longer compare equal.
        used_sequences_hash = {used_hash for hashes in created_routes['hash'] for used_hash in hashes}
        #get a tuple of all the route sequences for that route_id
        used_sequences = tuple(created_routes['stop_sequence'])
        sequences_route_id = stop_sequences_df.loc[stop_sequences_df['route_id'] == route_id, columns]
        for sequence_hash, stop_sequence, service_id in zip(sequences_route_id['hash'].tolist(), sequences_route_id['stop_sequence'], sequences_route_id['service_id']):
            if sequence_hash in used_sequences_hash:
                continue
            #checks that the stops of the sequence are not all part of an existing sequence (the order is ignored)
            if any(set(stop_sequence).issubset(sequence) for sequence in used_sequences):
                continue
            unused_sequences.append({'route_id': route_id, 'hash': [sequence_hash], 'stop_sequence': stop_sequence, 'service_id': service_id})
    if not unused_sequences:
        return route_creation
    #all rows are added with one concat (DataFrame.append no longer exists in pandas 2)
    return pd.concat([route_creation, pd.DataFrame(unused_sequences, columns=columns)], ignore_index=True)

In [None]:
'''Calculates the frequency of the constructed routes just made in the route_creation dataframe'''
    
def calculate_frequenty_new_sequences(number_of_trips_per_hash, service_id_count_dates, route_creation):
    '''calculates the frequencies of the constructed routes

    For every route of route_creation and every service_id of that route, the smallest number of trips
    among the sequences the route is built from is multiplied by the number of dates of the service_id;
    the frequency of the route is the sum of these products.

    Parameters
    ----------
    number_of_trips_per_hash : DataFrame with the columns 'route_id', 'hash', 'service_id' and 'number_trip_ids'
    service_id_count_dates : DataFrame with the columns 'service_id' and 'count_service_id'
    route_creation : DataFrame with the columns 'route_id', 'hash' (list of hashes) and 'service_id'
        (collection of service_ids). It is modified in place: the column 'frequency' is added.

    Returns
    -------
    route_creation (the same object) with the column 'frequency'; routes without service_id keep 0.

    Raises
    ------
    IndexError
        If a service_id of a route does not occur in service_id_count_dates.
    '''
    #put the default value of the frequency to 0
    route_creation['frequency'] = 0
    for index_sequence, sequence in route_creation[['route_id','hash','service_id']].iterrows():
        set_common_service_id = sequence['service_id']
        if not set_common_service_id:
            continue
        #only select the trips of the considered route_id whose hash is one of the hashes of the route.
        #isin() compares the values themselves, so a hash that happens to be 0 is selected as well.
        same_route = number_of_trips_per_hash['route_id'] == sequence['route_id']
        part_of_route = number_of_trips_per_hash['hash'].isin(sequence['hash'])
        containing_hash = number_of_trips_per_hash[same_route & part_of_route]
        sequence_frequency = 0
        #loop over each service_id that is common to all the parts of the route
        for service_id in set_common_service_id:
            service_id_number_days = service_id_count_dates[service_id_count_dates['service_id'] == service_id].iloc[0]['count_service_id']
            #adds the minimum number of trips per day multiplied by the number of days in the service_id
            # NOTE(review): when no selected row has this service_id the minimum is NaN and the frequency becomes NaN
            sequence_frequency += containing_hash[containing_hash['service_id'] == service_id]['number_trip_ids'].min() * service_id_number_days
        #adds the frequency of the new route sequence
        route_creation.loc[index_sequence, 'frequency'] = sequence_frequency
    return route_creation

In [None]:
def calculate_hash_route_creation(route_creation): 
    '''calculates the hash and the hash inverse of the route_creation

    Parameters
    ----------
    route_creation : DataFrame with a column 'stop_sequence' that holds a list of stop names per row;
        it is not modified

    Returns
    -------
    A copy of route_creation in which 'hash' is the hash of the tuple of stops and 'hash_inverse' the hash
    of the reversed tuple, so a route and the same route in the opposite direction have swapped values.
    An existing 'hash' column is overwritten.

    Note: Python randomises string hashes per process, so the values are only comparable within one
    kernel session (unless PYTHONHASHSEED is fixed).
    '''
    #copy the route_creation dataFrame
    route_creation_hash = route_creation.copy()
    stop_tuples = [tuple(stop_sequence) for stop_sequence in route_creation_hash['stop_sequence']]
    #the columns are built from the integer hashes in one go; filling NaN (float) columns cell by cell
    #cannot represent 64-bit hash values exactly
    route_creation_hash['hash'] = [hash(stop_tuple) for stop_tuple in stop_tuples]
    route_creation_hash['hash_inverse'] = [hash(stop_tuple[::-1]) for stop_tuple in stop_tuples]
    return route_creation_hash

In [None]:
'''Regroup the routes that are the same (even though they are in the opposite direction)'''

def regroup_same_stop_sequences(route_creation_hash):
    '''Merges each stop_sequence with its reversed counterpart and sums their frequencies.

    Two rows of the same route_id are treated as the same sequence when max(hash, hash_inverse) is equal.
    The returned DataFrame has one row per (route_id, regrouped sequence), carries the summed 'frequency',
    no longer has the 'hash_inverse' column, and is sorted by route_id with a fresh index.
    '''
    merge_key = ['route_id', 'hash']

    #a direction-independent key: a sequence and its reverse share the same max(hash, hash_inverse)
    with_direction_free_key = route_creation_hash.assign(
        max_hash=route_creation_hash[['hash', 'hash_inverse']].max(axis=1)
    )
    #sum the frequency of both directions; the key is renamed to 'hash' so it can be merged back below
    summed_frequency = (
        with_direction_free_key
        .groupby(['route_id', 'max_hash'], as_index=False)[['frequency']]
        .sum()
        .rename(columns={'max_hash': 'hash'})
    )

    #the per-row frequency is dropped because the summed one replaces it
    sequence_lookup = (
        route_creation_hash
        .drop(columns=['frequency'])
        .drop_duplicates(subset=merge_key)
    )

    #first attempt: the direction-free key is the forward hash of an existing row
    matched_on_hash = (
        summed_frequency
        .merge(sequence_lookup, on=merge_key, how='left')
        .drop(columns=['hash_inverse'])
    )
    unmatched = matched_on_hash['stop_sequence'].isnull()
    forward_part = matched_on_hash[~unmatched]

    #second attempt for the rows left empty: their key was a hash_inverse, so look them up through that column
    backward_part = (
        matched_on_hash.loc[unmatched, ['route_id', 'hash', 'frequency']]
        .rename(columns={'hash': 'hash_inverse'})
        .merge(sequence_lookup, on=['route_id', 'hash_inverse'], how='left')
        .drop(columns=['hash_inverse'])
    )

    #put both parts back together, ordered by route_id
    return (
        pd.concat([forward_part, backward_part])
        .sort_values(by=['route_id'])
        .reset_index(drop=True)
    )

In [None]:
'''Deletes the routes that do not represent 10% of the total route frequency and creates new route, if some of them are different'''

def apply_treshold_route_creation(route_hash_freq_combined): 
    '''Drops the rare sequences of every route and gives each remaining sequence its own route_id.

    Parameters:
        route_hash_freq_combined: DataFrame with at least the columns 'route_id', 'hash', 'frequency',
        'stop_sequence' and 'service_id'.

    Returns:
        DataFrame sorted by route_id (fresh index) without the 'hash', 'frequency' and 'service_id' columns.
        Within each original route, the first most-frequent sequence keeps the route_id; every other kept
        sequence receives a new id, numbered upwards from the highest input route_id + 1 in row order.
    '''
    #total frequency of the route each row belongs to
    route_total_frequency = route_hash_freq_combined.groupby('route_id')['frequency'].transform('sum')
    #a sequence is deleted when it is strictly below 10% of the total frequency of its route
    below_treshold = route_hash_freq_combined['frequency'] < route_total_frequency / 10
    kept_sequences = route_hash_freq_combined.loc[~below_treshold].reset_index(drop=True)

    #flag the most frequent sequence(s) per route_id
    route_max_frequency = kept_sequences.groupby('route_id')['frequency'].transform('max')
    is_max = kept_sequences['frequency'].eq(route_max_frequency)
    #when several sequences tie for the maximum, only the first one may keep the route_id: two sequences that share
    #a route_id would be read as one continuous sequence by create_df_for_Networkx and yield a spurious edge
    rank_among_max = is_max.astype(int).groupby(kept_sequences['route_id']).cumsum()
    keeps_route_id = is_max & rank_among_max.eq(1)
    needs_new_route_id = ~keeps_route_id

    #only compute new ids when they are needed, so an empty input does not fail on max() returning NaN
    if needs_new_route_id.any():
        first_new_route_id = route_hash_freq_combined['route_id'].max() + 1
        number_of_new_routes = int(needs_new_route_id.sum())
        kept_sequences.loc[needs_new_route_id, 'route_id'] = list(range(first_new_route_id, first_new_route_id + number_of_new_routes))

    kept_sequences = kept_sequences.sort_values(by=['route_id'], ignore_index=True)
    #keep only the column route_id and stop_sequence (plus any extra column the input carried)
    final_routes = kept_sequences.drop(columns=['hash', 'frequency', 'service_id'])
    return final_routes

In [None]:
''' To keep only the routes that have at least one dutch station in their route_sequence'''

def keep_dutch_routes(final_routes, dutch_stops=None):
    '''Keeps only the routes whose stop_sequence contains at least one Dutch stop.

    Parameters:
        final_routes: DataFrame with the columns 'route_id' and 'stop_sequence' (an iterable of stops per row).
        dutch_stops: optional iterable with the Dutch stops; when omitted, the notebook-level
        dutch_stops_Netherlands_series is used.

    Returns:
        The rows of final_routes whose route_id does not belong to any row without a Dutch stop
        (original index preserved).
    '''
    if dutch_stops is None:
        dutch_stops = dutch_stops_Netherlands_series
    #build the lookup set once instead of once per stop of every route
    dutch_stop_set = set(dutch_stops)
    #collect the route_id of every row that has no Dutch stop at all
    non_dutch_routes = {
        route_id
        for route_id, stop_sequence in zip(final_routes['route_id'], final_routes['stop_sequence'])
        if not any(stop in dutch_stop_set for stop in stop_sequence)
    }
    dutch_routes = final_routes.loc[~final_routes['route_id'].isin(non_dutch_routes)]

    return dutch_routes

In [None]:
'''Makes a set that can be used for building the edges of the graph using Networkx package'''

def create_df_for_Networkx(final_routes):
    '''Returns df_for_edges, a df that can be used to build a Networkx L-space graph.

    Parameters:
        final_routes: DataFrame with the columns 'route_id' and 'stop_sequence' (a list of stops per row).

    Returns:
        DataFrame with the columns 'route_id' (int64), 'stop_name_1' and 'stop_name_2': one row for every
        distinct pair of consecutive stops of a route, in the order they appear in the stop_sequence.
    '''
    #one row per (route, stop), in travel order; explode does not pad short sequences, so the result does not
    #depend on stack() discarding NaN padding (the reimplemented DataFrame.stack keeps NaN values)
    final_routes_stops = (
        final_routes
        .explode('stop_sequence')
        #empty sequences and missing stops carry no edge information
        .dropna(subset=['stop_sequence'])
        .reset_index(drop=True)
    )
    #shifting by one row puts the previous stop next to the current one
    previous_stops = final_routes_stops.shift()
    #a pair is only an edge when the previous row belongs to the same route_id
    same_route = previous_stops['route_id'].eq(final_routes_stops['route_id'])
    df_for_edges = (
        previous_stops.loc[same_route]
        .rename(columns={'stop_sequence': 'stop_name_1'})
        #joins the shifted rows with the current stop so that each pair of consecutive stations is one row
        .join(final_routes_stops[['stop_sequence']].rename(columns={'stop_sequence': 'stop_name_2'}), how='left')
    )
    #shift() introduced NaN and therefore turned route_id into floats; restore the integer ids
    df_for_edges['route_id'] = df_for_edges['route_id'].astype(np.int64)
    df_for_edges = df_for_edges.drop_duplicates()
    df_for_edges = df_for_edges[['route_id','stop_name_1', 'stop_name_2']]
    df_for_edges = df_for_edges.reset_index(drop=True)
    return df_for_edges

In [None]:
def full_route_creation(stop_sequences_df, number_of_trips_per_hash, service_id_count_dates):
    '''Runs all route-creation steps in order and returns the tuple (dutch_routes, df_for_edges).

    dutch_routes is the output of keep_dutch_routes and df_for_edges is the output of create_df_for_Networkx
    applied to it, i.e. the df that can be used to make a Networkx L-space (with treshold applied of 10%).
    '''
    #indexes that drive the construction of the candidate sequences
    extendable_idx, begin_idx, complete_idx = get_extention_indexes(stop_sequences_df)

    #build the candidate sequences in three passes
    sequences = possible_sequences_construction(stop_sequences_df, extendable_idx, begin_idx, complete_idx)
    sequences = add_full_sequences(stop_sequences_df, sequences, complete_idx)
    sequences = add_unused_sequences(stop_sequences_df, sequences)

    #frequency, hashes, regrouping of opposite directions
    sequences_with_frequency = calculate_frequenty_new_sequences(number_of_trips_per_hash, service_id_count_dates, sequences)
    sequences_with_hash = calculate_hash_route_creation(sequences_with_frequency)
    regrouped_sequences = regroup_same_stop_sequences(sequences_with_hash)

    #treshold, Dutch filter and edge list
    dutch_routes = keep_dutch_routes(apply_treshold_route_creation(regrouped_sequences))
    return dutch_routes, create_df_for_Networkx(dutch_routes)

In [None]:
#runs the complete route creation on the Dutch inputs; number_of_trips_per_hash receives a copy of route_hash_service_freq_Netherlands, the two other frames are passed as they are
dutch_routes_Netherlands, df_for_edges_Netherlands = full_route_creation(distinct_stop_sequences_Netherlands, route_hash_service_freq_Netherlands.copy(), service_id_df_Netherlands)
#both frames are shown as cell output because ast_node_interactivity is set to "all" at the top of the notebook
dutch_routes_Netherlands
df_for_edges_Netherlands

In [None]:
#exports the edge list as CSV with a header row and without the index column; the utf-8-sig encoding writes a BOM at the start of the file
#NOTE(review): the target is an absolute path inside one user's home directory, so this cell presumably fails on any other machine - consider a configurable export directory
df_for_edges_Netherlands.to_csv(r'/Users/pol/Desktop/CSV_export/df_for_edges_Netherlands.csv', index = False, header=True, encoding='utf-8-sig')