# Import of packages


In [1]:
'''Import the required packages.'''
import pandas as pd
import numpy as np
import networkx as nx
import collections
import matplotlib.pyplot as plt
import math
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import os

# Settings

In [2]:
'''Display all output results of a Jupyter cell.'''
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
'''Ensure that the output results of extensive output results are not truncated.'''
#pd.options.display.max_rows = 4000

'Ensure that the output results of extensive output results are not truncated.'

In [4]:
'''Change the width of the Notebook to see the output on the screen'''
#from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:100% !important; }</style>"))

'Change the width of the Notebook to see the output on the screen'

# File locations

In [5]:
'''Register the GitHub link or the file relative location'''
#the Github link
#repository_loc = "https://raw.githubusercontent.com/polkuleuven/Thesis_Train/main"
#the local link
repository_loc = os.getcwd()

'Register the GitHub link or the file relative location'

In [None]:
'''Get the other folder locations'''
belgian_GTFS_loc = repository_loc + '/gtfs_train_Belgium_1503/'
dutch_GTFS_loc = repository_loc + '/gtfs_train_Netherlands_1503/'
swiss_GTFS_loc = repository_loc + '/gtfs_train_Switzerland_1503/'

stops_series_loc = repository_loc + '/country_stops_series/'
stops_cleaned_loc = repository_loc + '/stops_cleaned/'
df_for_edges_loc = repository_loc + '/df_for_edges/'
routes_loc = repository_loc + '/routes/'

# Import of the datasets

## Functions

In [7]:
'''Import all the DataFrames that are common for the three train networks'''

def common_imports(datalink):
    #To import the agency dataset that contains limited information about the railway agency.
    agency = pd.read_csv(datalink + "agency.txt", sep=",")
    #To import the calendar_dates dataset that gives for each service_id all the exact dates when that service_id is valid.
    calendar_dates = pd.read_csv(datalink + "calendar_dates.txt", sep=",")
    #To import the routes dataset that provides the id, the name and the type of vehicle used for all railway routes.
    routes = pd.read_csv(datalink + "routes.txt", sep=",")
    #To import the stops dataset that contains information about the ids, the names and the geographical coordinates of the railway stations.
    stops = pd.read_csv(datalink + "stops.txt", sep=",")
    #To import the transfers dataset that gives the minimum transfer time to switch routes at each railway station.
    transfers = pd.read_csv(datalink + "transfers.txt", sep=",")
    #To import the trips dataset that gives for all routes an overview of the trips and the headsigns of these trips belonging to the railway route.
    #The service_id is an indication of all the dates this trip is valid (consultable in the calendar_dates dataset).
    trips = pd.read_csv(datalink + "trips.txt", sep=",")
    return agency, calendar_dates, routes, stops, transfers, trips

'Import all the DataFrames that are common for the three train networks'

## Acutal imports

### Belgium

In [8]:
'''Apply common_import()'''
agency_Belgium, calendar_dates_Belgium, routes_Belgium, stops_Belgium, transfers_Belgium, trips_Belgium = common_imports(belgian_GTFS_loc)

'Apply common_import()'

In [9]:
'''Import other DataFrames'''
#To import the translations dataset that provides the French-, Dutch-, German- and English-language translations of the Belgian railway stations.
translations_Belgium = pd.read_csv(belgian_GTFS_loc + "translations.txt", sep=",")
#To import the stop_times dataset that gives for all trips an overview of the ids of the stations served and the sequence in which these stations are served. 
#In addition, for all the trips the arrival and departure times at the stations served are given.
stop_times_Belgium = pd.read_csv(belgian_GTFS_loc + "stop_times.txt", sep=",")
#To import the calendar dataset that gives the first and last date of all data observations.
calendar_Belgium = pd.read_csv(belgian_GTFS_loc + "calendar.txt", sep=",")
#To import the stop_time_overrides dataset 
stop_time_overrides_Belgium = pd.read_csv(belgian_GTFS_loc + "stop_time_overrides.txt", sep=",")

'Import other DataFrames'

### Netherlands

In [10]:
'''Apply common_import()'''
agency_Netherlands, calendar_dates_Netherlands, routes_Netherlands, stops_Netherlands, transfers_not_cleaned_Netherlands, trips_Netherlands = common_imports(dutch_GTFS_loc)

'Apply common_import()'

In [11]:
'''Import other DataFrames'''
#To import the feed_info dataset that contains limited information about the Dutch NS railway feed.
feed_info_Netherlands = pd.read_csv(dutch_GTFS_loc + "feed_info.txt", sep=",")
#To import the stop_times dataset that gives for all trips an overview of the ids of the stations served and the sequence in which these stations are served. 
#In addition, for all the trips the arrival and departure times at the stations served are given.
stop_times_range = [*range(2, 19)]
stop_times_Netherlands = pd.read_csv(dutch_GTFS_loc + "stop_times-1.csv", sep=",")
for index in stop_times_range:
    stop_times_Netherlands = pd.concat([stop_times_Netherlands, pd.read_csv(dutch_GTFS_loc + "stop_times-" + str(index)+ ".csv", sep=",")])

'Import other DataFrames'

### Switzerland

In [12]:
'''Apply common_import()'''
agency_Switzerland, calendar_dates_Switzerland, routes_Switzerland, stops_Switzerland, transfers_not_cleaned_Switzerland, trips_Switzerland = common_imports(swiss_GTFS_loc)

'Apply common_import()'

In [13]:
'''Import other DataFrames'''
#To import the feed_info dataset that contains limited information about the Swiss SBB railway feed.
feed_info_Switzerland = pd.read_csv(swiss_GTFS_loc + "feed_info.txt", sep=",")
#To import the stop_times dataset that gives for all trips an overview of the ids of the stations served and the sequence in which these stations are served. 
#In addition, for all the trips the arrival and departure times at the stations served are given.
stop_times_Switzerland = pd.read_csv(swiss_GTFS_loc + "stop_times.txt", sep=",")
#To import the calendar dataset that gives the first and last date of all data observations.
calendar_Switzerland = pd.read_csv(swiss_GTFS_loc + "calendar.txt", sep=",")

'Import other DataFrames'

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Cleaning of the railway data

## Functions 

In [14]:
'''Clean the calendar_dates DataFrame'''

def clean_calendar_dates(calendar_dates):
    #To filter the dates from the selected begin to the end date
    begin_date = 20210314
    end_date = 20210713
    calendar_dates_cleaned = calendar_dates.copy()
    calendar_dates_cleaned = calendar_dates_cleaned.drop(calendar_dates_cleaned[(calendar_dates_cleaned['date'] > end_date) | (calendar_dates_cleaned['date'] < begin_date)].index)
    return calendar_dates_cleaned

'Clean the calendar_dates DataFrame'

In [15]:
'''Add the country to the stops DataFrame and returns the country filtered DataFrame of stops and the serie of those stops'''

def country_information(stops, country_name, stops_cleaned_loc, stops_series_loc):
    #To initialize the Nominatim API to get the location from the input string 
    geolocator = Nominatim(user_agent="application")
    reverse = RateLimiter(geolocator.reverse, min_delay_seconds=0.2)

    #To get the location with the geolocator.reverse() function and to extract the country from the location instance
    country_list = []
    for index, row in stops.iterrows():
        latitude = row['stop_lat']
        longitude = row['stop_lon']
        # To assign the latitude and longitude into a geolocator.reverse() method
        location = reverse((latitude, longitude), language='en', exactly_one=True)
        # To get the country from the given list and parsed into a dictionary with raw function()
        address = location.raw['address']
        country = address.get('country', '')
        country_list.append(country)

    #To add the values of country_list as a new attribute country 
    stops.loc[:,'country'] = country_list

    #To calculate the total number of Belgian stations in the stops dataset
    country_stops = stops[stops['country'] == country_name]
    country_stops_series = stops.loc[stops['country'] == country_name, 'stop_name']
    
    stops.to_csv(f'{stops_cleaned_loc}stops_cleaned_{country_name}.csv')
    country_stops_series.to_csv(f'{stops_series_loc}stops_{country_name}_series.csv')

'Add the country to the stops DataFrame and returns the country filtered DataFrame of stops and the serie of those stops'

In [16]:
'''Remove the accents from a string'''

def remove_accents(text):
    import unicodedata
    try:
        text = unicode(text, 'utf-8')
    except NameError:
        pass
    text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode("utf-8")
    return str(text)

'Remove the accents from a string'

## Acutal cleaning

### Belgium

In [17]:
'''Clean the routes_Belgium df'''
allowed_route_type = {'IC', 'L', 'P', 'ICT', 'IZY'}
routes_cleaned_Belgium = routes_Belgium[(routes_Belgium['route_short_name'].isin(allowed_route_type)) | (routes_Belgium['route_short_name'].str.startswith('S'))]

'Clean the routes_Belgium df'

In [18]:
'''Apply clean_calendar_dates()'''
calendar_dates_cleaned_Belgium = clean_calendar_dates(calendar_dates_Belgium)

'Apply clean_calendar_dates()'

In [19]:
'''Clean the stops_Belgium df.''' 
#To eliminate the stop_ids in the stops dataset that contain an underscore or that start with a character 'S'. 
stops_cleaned_Belgium = stops_Belgium[(~stops_Belgium['stop_id'].str.contains('_')) & (~stops_Belgium['stop_id'].str.contains('S'))]

#To modify the object datatype of the stop_id column to the NumPy int64 datatype
stops_cleaned_Belgium.loc[:,'stop_id'] = stops_cleaned_Belgium.loc[:,'stop_id'].astype(np.int64)

#To remove the accents from the stop_name and to change to uppercase
stops_cleaned_Belgium.loc[:,'stop_name'] = stops_cleaned_Belgium.loc[:,'stop_name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
stops_cleaned_Belgium.loc[:,'stop_name'] = stops_cleaned_Belgium.loc[:,'stop_name'].str.upper()

'Clean the stops_Belgium df.'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [250]:
'''Apply country_information() and take the DataFrames from the files'''
country_name = 'Belgium'
#country_information(stops_cleaned_Belgium, country_name, stops_cleaned_loc, stops_series_loc)
stops_cleaned_Belgium = pd.read_csv(f"{stops_cleaned_loc}stops_cleaned_{country_name}.csv", sep=",")
stops_Belgium_series = pd.read_csv(f"{stops_series_loc}stops_{country_name}_series.csv", sep=",")['stop_name']

'Apply country_information() and take the DataFrames from the files'

### Netherlands

In [251]:
'''Clean the routes_Netherlands DataFrame'''
#To keep the train routes
routes_cleaned_Netherlands = routes_Netherlands[routes_Netherlands['route_type'] == 2]
routes_cleaned_Netherlands = routes_cleaned_Netherlands.astype(str)
routes_cleaned_Netherlands.describe(include=['object'])

#To change the route_id object datatype to a NumPy int64 datatype
routes_cleaned_Netherlands.loc[:,'route_id'] = routes_cleaned_Netherlands.loc[:,'route_id'].astype(np.int64)

'Clean the routes_Netherlands DataFrame'

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_color,route_text_color,route_url
count,145,145,145,145,145.0,145,145.0,145.0,145.0
unique,145,11,15,144,1.0,1,1.0,1.0,1.0
top,17803,IFF:NS,Sprinter,Nachtnettrein Utrecht Centraal <-> Rotterdam C...,,2,,,
freq,1,87,47,2,145.0,145,145.0,145.0,145.0


In [252]:
'''Apply clean_calendar_dates()'''
calendar_dates_cleaned_Netherlands = clean_calendar_dates(calendar_dates_Netherlands)

'Apply clean_calendar_dates()'

In [253]:
'''Clean the stops DataFrame'''
#To take from the stops_initial_Netherlands df all stop_ids that contain a 'stoparea:' to get the correct stop coordinates
stops_cleaned_Netherlands = stops_Netherlands[stops_Netherlands['stop_id'].str.contains('stoparea:')]

#To remove the accents from the accented characters and to convert the remaining characters to uppercase characters
stops_cleaned_Netherlands.loc[:,'stop_name'] = stops_cleaned_Netherlands.loc[:,'stop_name'].apply(remove_accents)
stops_cleaned_Netherlands.loc[:,'stop_name'] = stops_cleaned_Netherlands.loc[:,'stop_name'].str.upper()

'Clean the stops DataFrame'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [254]:
'''Apply country_information() and take the DataFrames from the files'''
country_name = 'Netherlands'
#country_information(stops_cleaned_Netherlands, country_name, stops_cleaned_loc, stops_series_loc)
stops_cleaned_Netherlands = pd.read_csv(f"{stops_cleaned_loc}stops_cleaned_{country_name}.csv", sep=",")
stops_Netherlands_series = pd.read_csv(f"{stops_series_loc}stops_{country_name}_series.csv", sep=",")['stop_name']

'Apply country_information() and take the DataFrames from the files'

In [255]:
'''Clean the stop_times df'''
stop_times_cleaned_Netherlands = stop_times_Netherlands.copy()
stop_times_cleaned_Netherlands.loc[:,'stop_id'] = stop_times_cleaned_Netherlands.stop_id.apply(str)
stop_times_cleaned_Netherlands = pd.merge(stop_times_cleaned_Netherlands, stops_Netherlands[['stop_id', 'stop_name']], on='stop_id')
stop_times_cleaned_Netherlands.loc[:,'stop_name'] = stop_times_cleaned_Netherlands.loc[:,'stop_name'].apply(remove_accents)
stop_times_cleaned_Netherlands.loc[:,'stop_name'] = stop_times_cleaned_Netherlands.loc[:,'stop_name'].str.upper()

'Clean the stop_times df'

KeyboardInterrupt: 

### Switzerland

In [None]:
'''Clean the routes_Switzerland DataFrame'''
#To keep the train routes
routes_cleaned_Switzerland = routes_Switzerland[routes_Switzerland['route_type'] == 2]

In [None]:
'''Apply clean_calendar_dates()'''
calendar_dates_cleaned_Switzerland = clean_calendar_dates(calendar_dates_Switzerland)

In [None]:
'''Clean the stop_times_Switzerland DataFrame'''
# To remove the superfluous characters of the stop_id (platform codes)
stop_times_cleaned_Switzerland = stop_times_Switzerland.copy()
stop_times_cleaned_Switzerland_column = stop_times_cleaned_Switzerland['stop_id'].str.split(':').str[0]
stop_times_cleaned_Switzerland.loc[:,'stop_id'] = stop_times_cleaned_Switzerland_column

# To make the stop_ids numerical 
stop_times_cleaned_Switzerland.loc[:,'stop_id'] = stop_times_cleaned_Switzerland.loc[:,'stop_id'].astype(np.int64)

In [None]:
'''Clean the stops_Switzerland DataFrame'''
#To remove the superfluous characters (platform codes)
stops_cleaned_Switzerland_column = stops_Switzerland['stop_id'].str.split(':').str[0]
stops_cleaned_Switzerland = stops_Switzerland.copy()
stops_cleaned_Switzerland.loc[:,'stop_id'] = stops_cleaned_Switzerland_column

#To make the stop_ids numerical and to remove the duplicate stop_ids
stops_cleaned_Switzerland = stops_cleaned_Switzerland[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']]
stops_cleaned_Switzerland.loc[:,'stop_id'] = stops_cleaned_Switzerland.loc[:,'stop_id'].astype(np.int64)
stops_cleaned_Switzerland = stops_cleaned_Switzerland.drop_duplicates()

#To remove the accents from the stop_name and to change to uppercase
stops_cleaned_Switzerland.loc[:,'stop_name'] = stops_cleaned_Switzerland.loc[:,'stop_name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
stops_cleaned_Switzerland.loc[:,'stop_name'] = stops_cleaned_Switzerland.loc[:,'stop_name'].str.upper()

In [256]:
'''Apply country_information() and take the DataFrames from the files'''
country_name = 'Switzerland'
#country_information(stops_cleaned_Switzerland, country_name, stops_cleaned_loc, stops_series_loc)
stops_cleaned_Switzerland = pd.read_csv(f"{stops_cleaned_loc}stops_cleaned_{country_name}.csv", sep=",")
stops_Switzerland_series = pd.read_csv(f"{stops_series_loc}stops_{country_name}_series.csv", sep=",")['stop_name']

'Apply country_information() and take the DataFrames from the files'

# Merge the DataFrames

## Functions

In [None]:
'''Merge the DataFrames'''

def merge_df(stop_times, stops, routes, trips, calendar_dates, on_stop):
    list_columns = ['stop_name', 'stop_lat', 'stop_lon', 'country']
    if on_stop == 'stop_id':
        list_columns.append('stop_id')
    #To merge the stop_times df with the stops df on stop_id
    stop_times_stops = pd.merge(stop_times, stops[list_columns], on= on_stop)

    #To merge the trips df with the routes df on route_id
    routes_trips = pd.merge(routes[['route_id']], trips, on='route_id')

    #To merge the stop_times_stops df with the trips_routes df on trip_id
    uncleaned_railway_system_information = pd.merge(routes_trips, stop_times_stops, on='trip_id')

    #To take only the service_ids present in both the routes_trips_stop_times_stops df and the calendar_dates df into account
    calendar_dates_unique = calendar_dates['service_id'].unique()
    railway_system_information = uncleaned_railway_system_information[(uncleaned_railway_system_information['service_id'].isin(calendar_dates_unique))]
    
    return railway_system_information

## Actual merging

### Belgium 

In [177]:
'''Select all required fields'''
agency_cleaned_Belgium = agency_Belgium[['agency_id', 'agency_name', 'agency_url', 'agency_timezone']]
routes_cleaned_Belgium = routes_cleaned_Belgium[['route_id', 'agency_id', 'route_short_name', 'route_long_name', 'route_type']]
trips_cleaned_Belgium = trips_Belgium[['trip_id', 'route_id', 'service_id', 'trip_headsign']]
calendar_dates_cleaned_Belgium = calendar_dates_cleaned_Belgium[['service_id', 'date']]
stops_cleaned_Belgium = stops_cleaned_Belgium[['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'country']]
stop_times_cleaned_Belgium = stop_times_Belgium[['trip_id', 'stop_id', 'arrival_time', 'departure_time', 'stop_sequence']]

'Select all required fields'

In [178]:
'''Apply merge_df()'''
railway_system_information_Belgium = merge_df(stop_times_cleaned_Belgium, stops_cleaned_Belgium, routes_cleaned_Belgium, trips_cleaned_Belgium, calendar_dates_cleaned_Belgium, 'stop_id')

'Apply merge_df()'

### Netherlands

In [179]:
'''Select all required fields'''
agency_cleaned_Netherlands = agency_Netherlands[['agency_id', 'agency_name', 'agency_url', 'agency_timezone']]
routes_cleaned_Netherlands = routes_cleaned_Netherlands[['route_id', 'agency_id', 'route_short_name', 'route_long_name', 'route_type']]
trips_cleaned_Netherlands = trips_Netherlands[['trip_id', 'route_id', 'service_id', 'trip_headsign']]
calendar_dates_cleaned_Netherlands = calendar_dates_cleaned_Netherlands[['service_id', 'date']]
stops_cleaned_Netherlands = stops_cleaned_Netherlands[['stop_name', 'stop_lat', 'stop_lon', 'country']]
stop_times_cleaned_Netherlands = stop_times_cleaned_Netherlands[['trip_id', 'stop_name', 'arrival_time', 'departure_time', 'stop_sequence']]

'Select all required fields'

In [180]:
'''Apply merge_df()'''
railway_system_information_Netherlands = merge_df(stop_times_cleaned_Netherlands, stops_cleaned_Netherlands, routes_cleaned_Netherlands, trips_cleaned_Netherlands, calendar_dates_cleaned_Netherlands, 'stop_name')

'Apply merge_df()'

### Switzerland

In [181]:
'''Select all required fields'''
agency_cleaned_Switzerland = agency_Switzerland[['agency_id', 'agency_name', 'agency_url', 'agency_timezone']]
routes_cleaned_Switzerland = routes_cleaned_Switzerland[['route_id', 'agency_id', 'route_short_name', 'route_long_name', 'route_type']]
trips_cleaned_Switzerland = trips_Switzerland[['trip_id', 'route_id', 'service_id', 'trip_headsign']]
calendar_dates_cleaned_Switzerland = calendar_dates_cleaned_Switzerland[['service_id', 'date']]
stops_cleaned_Switzerland = stops_cleaned_Switzerland[['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'country']]
stop_times_cleaned_Switzerland = stop_times_cleaned_Switzerland[['trip_id', 'stop_id', 'arrival_time', 'departure_time', 'stop_sequence']]

'Select all required fields'

In [182]:
'''Apply merge_df()'''
railway_system_information_Switzerland = merge_df(stop_times_cleaned_Switzerland, stops_cleaned_Switzerland, routes_cleaned_Switzerland, trips_cleaned_Switzerland, calendar_dates_cleaned_Switzerland, 'stop_id')

'Apply merge_df()'

# Preparation for the space-of-stops representation of the railway systems


In [183]:
railway_system_information_Belgium

Unnamed: 0,route_id,trip_id,service_id,trip_headsign,stop_id,arrival_time,departure_time,stop_sequence,stop_name,stop_lat,stop_lon,country
0,115,88____:007::8885704:8885001:4:523:20210418,14,Tournai,8885001,05:23:00,05:23:00,4,TOURNAI,50.61313,3.396940,Belgium
1,115,88____:007::8885704:8885001:4:523:20210418,14,Tournai,8885068,05:19:00,05:19:00,3,FROYENNES,50.62989,3.354835,Belgium
2,115,88____:007::8885704:8885001:4:523:20210418,14,Tournai,8885753,05:12:00,05:12:00,2,HERSEAUX,50.71390,3.245961,Belgium
3,115,88____:007::8885704:8885001:4:523:20210418,14,Tournai,8885704,05:07:00,05:07:00,1,MOUSCRON,50.74100,3.228449,Belgium
4,115,88____:007::8885704:8885001:4:623:20210418,14,Tournai,8885001,06:23:00,06:23:00,4,TOURNAI,50.61313,3.396940,Belgium
...,...,...,...,...,...,...,...,...,...,...,...,...
431487,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8811270,17:01:00,17:01:00,19,VELTEM,50.90052,4.633520,Belgium
431488,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8811288,16:59:00,16:59:00,18,HERENT,50.90353,4.672190,Belgium
431489,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8819406,17:10:00,17:12:00,23,BRUSSELS AIRPORT-ZAVENTEM,50.89646,4.482072,Belgium
431490,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8821063,16:11:00,16:11:00,5,ANVERS-LUCHTBAL,51.24413,4.425033,Belgium


# Preparation for the space-of-stops representation of the railway systems


## Functions

In [184]:
'''Create a DataFrame with the departure time form the first stop sequence and with the one from last stop sequence for each trip_id'''

def create_trip_departure_times(railway_system_information):
    departure_time_first = railway_system_information.reset_index().loc[railway_system_information.reset_index().groupby(['trip_id'])['stop_sequence'].idxmin()][['route_id', 'trip_id', 'departure_time']].copy()
    departure_time_first = departure_time_first.rename(columns = {'departure_time': 'departure_time_first'})
    departure_time_last = railway_system_information.reset_index().loc[railway_system_information.reset_index().groupby(['trip_id'])['stop_sequence'].idxmax()][['route_id', 'trip_id', 'departure_time']].copy()
    departure_time_last = departure_time_last.rename(columns = {'departure_time': 'departure_time_last'})
    trip_departure_times = departure_time_first.merge(departure_time_last[['trip_id', 'departure_time_last']], on='trip_id')
    return trip_departure_times

'Create a DataFrame with the departure time form the first stop sequence and with the one from last stop sequence for each trip_id'

In [185]:
'''Put the stop_names per trip_id in a list in the new trip_stops_sequence DataFrame and
Calculate the hash of the stop sequence in both order (ascending and descending)'''

def create_trip_stop_sequence(trip_departure_times):    
    #Put the stop_names per trip_id in a list in the new trip_stops_sequence DataFrame
    trip_stop_sequence = trip_departure_times.groupby('trip_id')['stop_name'].apply(lambda group_series: group_series.tolist()).reset_index()
    trip_stop_sequence.rename(columns={'stop_name':'stops_sequence'}, inplace=True)
    #Calculate the hash of the stop sequence in both order (ascending and descending)
    trip_stop_sequence['hash'] = trip_stop_sequence['stops_sequence'].apply(lambda x: hash(tuple(x)))
    trip_stop_sequence['hash_inverse'] = trip_stop_sequence['stops_sequence'].apply(lambda x: hash(tuple(x[::-1])))
    return trip_stop_sequence

'Put the stop_names per trip_id in a list in the new trip_stops_sequence DataFrame and\nCalculate the hash of the stop sequence in both order (ascending and descending)'

In [186]:
'''Regroup the days per service id in a set and count them'''

def create_service_id_dates(calendar_dates):
    service_id_dates = calendar_dates.groupby('service_id')['date'].apply(lambda group_series: set(group_series.tolist())).reset_index()
    service_id_dates.rename(columns={'date':'dates'}, inplace=True)
    service_id_dates['count_service_id'] = service_id_dates['dates'].apply(lambda x: len(x))
    return service_id_dates

'Regroup the days per service id in a set and count them'

In [187]:
'''Put the different trip_ids in a list and add the departure_time first and last lists'''

def create_routes_hash(trips_hash):
    common_columns = ['route_id','hash', 'hash_inverse', 'service_id']
    routes_hash = trips_hash.groupby(common_columns)['trip_id'].apply(lambda group_series: group_series.tolist()).reset_index()
    route_hash_dep_first = trips_hash.groupby(common_columns)['departure_time_first'].apply(lambda group_series: group_series.tolist()).reset_index()
    route_hash_dep_last = trips_hash.groupby(common_columns)['departure_time_last'].apply(lambda group_series: group_series.tolist()).reset_index()
    routes_hash = routes_hash.merge(route_hash_dep_first, on= common_columns)
    routes_hash = routes_hash.merge(route_hash_dep_last, on= common_columns)
    return routes_hash

'Put the different trip_ids in a list and add the departure_time first and last lists'

In [339]:
'''Create DataFrames that will be used for the route_creation process'''

def prepartion_space(railway_system_information, calendar_dates):    
    #Sort values by the route_id, the trip_id, and the stop_sequence fields
    railway_system_information = railway_system_information.sort_values(by=['route_id', 'trip_id','service_id', 'stop_sequence'])

    trip_departure_times = create_trip_departure_times(railway_system_information)

    #Merge railway_system_information with trip_departure_times
    trip_departure_times = railway_system_information.merge(trip_departure_times[['trip_id','departure_time_first','departure_time_last']], on='trip_id')

    trip_stop_sequence = create_trip_stop_sequence(trip_departure_times)
    
    #Add the stop_sequence of stations to the trip_departure_times dataset by joining on trip_id
    trips_hash = pd.merge(trip_departure_times, trip_stop_sequence, on='trip_id')
    
    service_id_dates = create_service_id_dates(calendar_dates)
    
    #Merge trips_hash with service_id_dates
    trips_hash = pd.merge(trips_hash, service_id_dates, on='service_id', how='left')
    
    #Calculate generic_trips_information
    generic_trips_information = trips_hash.groupby(['route_id', 'trip_id', 'service_id', 'hash', 'hash_inverse', 'departure_time_first','departure_time_last', 'count_service_id'], as_index=False)[['stops_sequence', 'dates']].first()
    
    routes_hash = create_routes_hash(generic_trips_information)
    
    #Add the sequence of stops, dates and service_id_count to the route_hash_freq_dep dataset
    routes_hash = pd.merge(routes_hash, trips_hash[['route_id','hash', 'hash_inverse', 'service_id','stops_sequence','dates','count_service_id']], on=['route_id', 'hash', 'hash_inverse', 'service_id'], how='left')
    routes_hash = routes_hash.drop_duplicates( subset = ['route_id', 'hash', 'service_id'], keep = 'first')
    
    return trips_hash, generic_trips_information, routes_hash

'Create DataFrames that will be used for the route_creation process'

## Acutal preparation

### Belgium

In [344]:
trips_hash_Belgium, generic_trips_information_Belgium, routes_hash_Belgium = prepartion_space(railway_system_information_Belgium, calendar_dates_cleaned_Belgium)

### Netherlands

In [345]:
trips_hash_Netherlands, generic_trips_information_Netherlands, routes_hash_Netherlands = prepartion_space(railway_system_information_Netherlands, calendar_dates_cleaned_Netherlands)

### Switzerland

In [346]:
trips_hash_Switzerland, generic_trips_information_Switzerland, routes_hash_Switzerland = prepartion_space(railway_system_information_Switzerland, calendar_dates_cleaned_Switzerland)

# Route Creation

## Functions

In [212]:
'''Some functions to better factorise the functions in the coming cells'''

def select_stops_sequences(stops_sequences_df, route_id):
    '''retruns the stop sequences with the selected route_id'''
    return stops_sequences_df[stops_sequences_df['route_id'] == route_id].copy()


def take_leftovers_list_c_from_intersection_AAndB(list_a, list_b, list_c):
    '''take the indexes of the intersection of list a with list b and retain the elments of list c with that index'''
    ind_dict = dict((k,i) for i,k in enumerate(list_a))
    return [list_c[ind_dict[x]] for x in (set(list_a).intersection(list_b))]

def get_extentions (after_or_behind, route_sequences_route_id, trip):
    '''returns the extentions for the trip (behind or after)'''
    if after_or_behind == 'after':
        #checks the extentions possible for the trip that can follow after its last stop
        possible_extentions = route_sequences_route_id[route_sequences_route_id['stops_sequence'].apply(lambda x: any(item for item in [trip['stops_sequence'][-1]] if (item == x[0]) and not(set(x[1:]) & set(trip['stops_sequence']))))].copy()
    elif after_or_behind == 'behind':
        #checks the extentions possible for the trip that can follow before its first stop
        possible_extentions = route_sequences_route_id[route_sequences_route_id['stops_sequence'].apply(lambda x: any(item for item in [trip['stops_sequence'][0]] if (item == x[-1]) and not(set(x[:-1]) & set(trip['stops_sequence']))))].copy()        
    #checks that those extentions have a common date as the trip
    possible_extentions = possible_extentions[possible_extentions['dates'].apply(lambda x: any(item for item in trip['dates'] if item in x))].copy()   
    if not possible_extentions.empty: 
        if after_or_behind == 'after':
            #checks that those extentions have a matching time schedule as the trip
            possible_extentions = possible_extentions[possible_extentions['departure_time_first'].apply(lambda x: any(item for item in trip['departure_time_last'] if item in x))].copy()
        elif after_or_behind == 'behind':
            #checks that those extentions have a matching time schedule as the trip
            possible_extentions = possible_extentions[possible_extentions['departure_time_last'].apply(lambda x: any(item for item in trip['departure_time_first'] if item in x))].copy()
    return possible_extentions      

def calculate_frequency (sequences_df):
    '''calculate the frequency based on the length of the dates and departure_time and put the hash in as a column of list'''
    sequences_df['number_dates'] = sequences_df['dates'].apply(lambda x: len(x))
    sequences_df['number_times'] = sequences_df['departure_time_last'].apply(lambda x: len(x))
    sequences_df['frequency'] = sequences_df['number_dates']* sequences_df['number_times'] 
    sequences_df = sequences_df.drop(['dates', 'departure_time_last', 'number_dates', 'number_times'], axis=1)
    sequences_df['hash'] = sequences_df['hash'].apply(lambda x: [x])
    return sequences_df.copy()
         
from datetime import datetime
from datetime import timedelta
FMT = '%H:%M:%S'
day_in_seconds = timedelta(days=1).total_seconds()
def calculate_time_difference(time_df, later_time, earlier_time, column_name):
    '''calculates the time difference between later time and earlier time and put it in time_df[column_name]'''
    #transform 24:00:00 into 00:00:00
    time_df['departure_time'] = time_df['departure_time'].apply(lambda x: str(int(x[:2])-24) + x[2:] if int(x[:2]) >= 24 else x)
    time_df['arrival_time'] = time_df['arrival_time'].apply(lambda x: str(int(x[:2])-24) + x[2:] if int(x[:2]) >=  24 else x)
    #calculate the waiting_time
    time_df[column_name] = time_df[['arrival_time','departure_time']].apply(lambda x: int((datetime.strptime(x[later_time], FMT) - datetime.strptime(x[earlier_time], FMT)).total_seconds()/60), axis=1)
    #if one day as past, take it into consideration
    time_df[column_name] = time_df[column_name].apply(lambda x: day_in_seconds/60 + x if x < 0 else x)
    return time_df            

'Some functions to better factorise the functions in the coming cells'

In [213]:
'''Finds the routes that can be either extended from behind or from after and those which are complete sequences'''

def get_extention_indexes(stop_sequences_df):
    '''returns the tree indexes: index_of_extendable, index_of_begin_sequences, index_of_complete_sequences'''
    #intiate the dictionnaries, that will be used to retrieve different rows later on
    index_of_extendable = {}
    index_of_begin_sequences = {}
    index_of_complete_sequences = {}
    for route_id in stop_sequences_df['route_id'].unique():
        #select the route with the route_id selected by the loop iteration
        route_sequences_route_id = select_stop_sequences(stop_sequences_df, route_id)
        for index_trip, trip in route_sequences_route_id.iterrows():
            #checks the extentions possible for the trip that can follow after its last stop
            possible_extentions_after = get_extentions('after', route_sequences_route_id, trip)
            #checks the extentions possible for the trip that can follow before its first stop
            possible_extentions_behind = get_extentions('behind', route_sequences_route_id, trip)
            #put all the sequences that can be extended either from the beginning either from the end together
            possible_extentions = possible_extentions_after.append(possible_extentions_behind, ignore_index = True)
            if not possible_extentions.empty:
                if route_id not in index_of_extendable:
                    index_of_extendable[route_id] = []
                index_of_extendable[route_id].append(index_trip)
                if possible_extentions_behind.empty:
                    if route_id not in index_of_begin_sequences:
                        index_of_begin_sequences[route_id] = []
                    index_of_begin_sequences[route_id].append(index_trip)
            elif possible_extentions.empty:
                if route_id not in index_of_complete_sequences:
                    index_of_complete_sequences[route_id] = []
                index_of_complete_sequences[route_id].append(index_trip)
                
    return index_of_extendable, index_of_begin_sequences, index_of_complete_sequences

'Finds the routes that can be either extended from behind or from after and those which are complete sequences'

In [214]:
'''Creates all the sequences of routes possible to reconstruct the real route and calculates their frequency'''

def possible_sequences_construction(stops_sequences_df, index_of_extendable, index_of_begin_sequences, index_of_complete_sequences):
    '''returns the first part of the route_creation, two others need to be added'''
    import copy
    #create an empty df for the process of route creation
    route_creation  = pd.DataFrame()
    for route_id in index_of_extendable:
        #checks if some parts are begin sequences, if not, then we can't build routes with multiple sequences
        if route_id in index_of_begin_sequences:
            #create a copy of the df with only the route considered in the loop iteration
            routes_with_route_id = select_stops_sequences(stops_sequences_df, route_id)
            #set default frequency to NaN
            routes_with_route_id['frequency'] = np.nan
            #create a df where only the routes that have an end stop as their first element of the sequence
            route_creation_route_id = routes_with_route_id.loc[index_of_begin_sequences[route_id]][['route_id', 'hash', 'stops_sequence', 'dates', 'departure_time_last','frequency']]
            #create a df with the exentable sequences for that route_id
            route_creation_extensions_route_id = routes_with_route_id.loc[index_of_extendable[route_id]][['route_id', 'hash', 'stops_sequence', 'dates', 'departure_time_first', 'departure_time_last','frequency']]    
            #make the hash column as a column of lists
            route_creation_route_id['hash'] = route_creation_route_id['hash'].apply(lambda x: [x])
            route_creation_route_id = route_creation_route_id.reset_index(drop=True)
            #to stop the while loop when all the routes are complete in the df for the route_id of the loop iteration
            complete_routes = 0
            while complete_routes < len(route_creation_route_id.index):
                #use a deepcopy to not impact the iterrows of the main loop
                route_creation_deep_copy = copy.deepcopy(route_creation_route_id)
                for index_original, route_part in route_creation_deep_copy.iterrows():
                    #create a dataframe of the possible extentions for each route_part
                    #select an extention only if the extention is the next part of the route 
                    #and also that no other station are repeated in the sequence if this extention is added(otherwise it might cause an infinite loop)
                    possible_extentions = get_extentions('after', route_creation_extensions_route_id, route_part)
                    #checks whether any extention fullfilling the criterias has been found
                    if not possible_extentions.empty:
                        #if so, extend it with every single possibilities
                        for index_extention, possible_extention in possible_extentions.iterrows():
                            #must create a deepcopy, otherwise the orignal hash list will change as well (mutable)
                            updated_hash = copy.deepcopy(route_part['hash'])
                            updated_hash.append(possible_extention['hash'])
                            updated_route_sequence = route_part['stops_sequence'] + possible_extention['stops_sequence'][1:]
                            common_dates = possible_extention['dates'] & route_part['dates']
                            new_departure_time_last = take_leftovers_list_c_from_intersection_AAndB(list(possible_extentions['departure_time_first'])[0], list(route_part['departure_time_last']), list(possible_extentions['departure_time_last'])[0])
                            new_frequency = len(new_departure_time_last) * len(common_dates)
                            route_creation_route_id.loc[max(route_creation_route_id.index)+1] = [route_id, updated_hash, updated_route_sequence, common_dates, new_departure_time_last, new_frequency]
                        #then delete the route with the index (see loop here above)
                        route_creation_route_id = route_creation_route_id.drop(index = index_original)            
                    #the route can't be extended anymore
                    else:
                        complete_routes += 1
            #adds all the possible routes created with the trips of the route_id of the main loop
            route_creation = route_creation.append(route_creation_route_id, ignore_index = True)
    if 'departure_time_last' in route_creation.columns:
        route_creation = route_creation.drop(['dates', 'departure_time_last'], axis=1)
    route_creation = route_creation.reindex(columns=['route_id','hash','stops_sequence', 'frequency'])
    return route_creation

'Creates all the sequences of routes possible to reconstruct the real route and calculates their frequency'

In [349]:
'''Adds the full sequences to the route_creation dataframe'''

def add_full_sequences(stop_sequences_df, route_creation, index_of_complete_sequences):
    '''returns the second part of the route_creation, one other needs to be added'''
    for route_id in index_of_complete_sequences:
        #findes all the complete sequences for that route_id
        copy_complete_sequences_df = stop_sequences_df.loc[index_of_complete_sequences[route_id]][['route_id','hash','stops_sequence', 'dates', 'departure_time_last']].copy()
        copy_complete_sequences_df = calculate_frequency(copy_complete_sequences_df)
        #adds each of them in the route_creation dataframe
        for index_complete_sequence, complete_sequence in copy_complete_sequences_df.iterrows():
            route_creation = route_creation.append(complete_sequence, ignore_index = True)
    route_creation = route_creation.sort_values(by=['route_id'], ignore_index = True)
    return route_creation 

'Adds the full sequences to the route_creation dataframe'

In [355]:
'''Adds the sequences that were not yet added in the route_creation dataframe'''

def add_unused_sequences(stop_sequences_df, route_creation):
    '''returns the third part of the route_creation'''
    for route_id in route_creation['route_id'].unique():
        #get a set of the hashes that were employed to create the routes for that route_id
        used_sequences_hash = set(route_creation[route_creation['route_id'] == route_id].apply(lambda x: pd.Series(x['hash']),axis=1).stack().reset_index(level=1, drop=True))
        copy_sequences_route_id = select_stop_sequences(stop_sequences_df, route_id)[['route_id','hash','stops_sequence', 'dates', 'departure_time_last']]
        copy_sequences_route_id = calculate_frequency(copy_sequences_route_id)
        #adds the hashes that were not employed in any route creations for that route_id
        for index_trip, trip in copy_sequences_route_id.iterrows():
            #first element of the list because there is always only one element
            if float(trip['hash'][0]) not in used_sequences_hash:
                route_creation = route_creation.append(trip, ignore_index = True)
    return route_creation

'Adds the sequences that were not yet added in the route_creation dataframe'

In [231]:
'''Creates a column in the df that calculates the travel time between the first and last stop (waiting time included)
and another column with the waiting time (calculated with a weighted average based on the frequency)'''
from datetime import datetime
from datetime import timedelta
FMT = '%H:%M:%S'
day_in_seconds = timedelta(days=1).total_seconds()

def give_begin_end_time(route_creation_frequency_single, trips_hash_stops_sequence, stops_cleaned_stop_times_trips_merge_dates):
    #create a copy to not change the input DataFrame
    route_creation_frequency_single = route_creation_frequency_single.copy()
    #makes a column with the a representative begin time and end time of the route
    route_creation_frequency_single['travel_time'] = np.nan
    for index_sequence, sequence in route_creation_frequency_single.iterrows():
        constructed_route = pd.DataFrame()
        for index_hash, hash_value in enumerate(sequence['hash']):
            index_plus_one = index_hash + 1
            #take all the trips with that hash
            next_representative_trips = trips_hash_stops_sequence[(trips_hash_stops_sequence['hash'] == hash_value) & (trips_hash_stops_sequence['route_id'] == sequence['route_id'])].copy()['trip_id']
            #take all the stop sequences and their time that belongs 
            full_times = stops_cleaned_stop_times_trips_merge_dates[stops_cleaned_stop_times_trips_merge_dates['trip_id'].isin(next_representative_trips)].copy()
            #select) only the last stop sequences of full_times for each trip_id
            new_index_max_per_trip_id = full_times.reset_index().groupby(['route_id', 'trip_id'])['stop_sequence'].idxmax()
            max_per_trip_id = full_times.reset_index().loc[new_index_max_per_trip_id]
            #select only the first stop sequences of full_times for each trip_id            
            new_index_min_per_trip_id = full_times.reset_index().groupby(['route_id', 'trip_id'])['stop_sequence'].idxmin()            
            min_per_trip_id = full_times.reset_index().loc[new_index_min_per_trip_id]
            #merge max_per_trip_id and min_per_trip_id
            merged = min_per_trip_id[['trip_id', 'dates', 'departure_time']].merge(max_per_trip_id[['trip_id', 'arrival_time', 'departure_time']], on='trip_id')
            #take all the stop sequences except the first one, and the last one if it is not the last sequence of the route
            if index_hash == len(sequence['hash']) - 1:
                rest_per_trip_id = full_times.reset_index().drop(pd.concat([new_index_min_per_trip_id,new_index_max_per_trip_id]))
            else:
                rest_per_trip_id = full_times.reset_index().drop(new_index_min_per_trip_id)            
            #ONLY NEEDED FOR SWITZERLAND
            rest_per_trip_id = rest_per_trip_id.dropna()
            if not rest_per_trip_id.empty:
                rest_per_trip_id = calculate_time_difference(rest_per_trip_id, 'departure_time', 'arrival_time', 'waiting_time')
                #calculate the total waiting_time
                rest_per_trip_id_grouped = rest_per_trip_id.groupby(['trip_id'], as_index=False)['waiting_time'].sum()
                merged_waiting_time = merged.merge(rest_per_trip_id_grouped, on='trip_id')
            #in case there are only two stops in for the hash
            else:
                merged_waiting_time = merged.copy()
                merged_waiting_time['waiting_time'] = 0
            #rename the columns     
            merged_waiting_time = merged_waiting_time.rename(columns = {'trip_id': 'trip_id_' + str(index_plus_one),'departure_time_x':'departure_time_'+ str(index_plus_one), 'arrival_time':'arrival_time_'+ str(index_plus_one),
                                          'departure_time_y':'departure_time_'+ str(index_plus_one + 1), 'waiting_time': 'waiting_time_' + str(index_plus_one)})
            if index_hash == 0:
                constructed_route = merged_waiting_time
            elif index_hash > 0:
                constructed_route = constructed_route.merge(merged_waiting_time, how='inner', on=['departure_time_' + str(index_plus_one)])
                #take the intersection of the dates => only get the common dates and retain those rows with common dates
                constructed_route['dates'] = [a & b for a,b in zip(constructed_route['dates_x'], constructed_route['dates_y'])]
                constructed_route = constructed_route[constructed_route['dates'].map(lambda d: len(d)) > 0]
                constructed_route = constructed_route.drop(['dates_x','dates_y'], axis=1)        
        #make a list of all the columns of waiting_times
        list_column_waiting_time = []
        for i in range(1, index_plus_one + 1):
            list_column_waiting_time.append('waiting_time_' + str(i))
        #sum all the waiting times together for each route itinerary
        constructed_route['waiting_time'] = constructed_route[list_column_waiting_time].astype(int).sum(1)
        
        #sometimes it is impossible to find trips that follow each other
        if not constructed_route.empty:
            #when the loop is finished, take the last arrival time, that will be used to calculate the travel time
            time_constructed_route = constructed_route[['departure_time_1', 'arrival_time_' + str(index_plus_one), 'waiting_time', 'dates']]
            time_constructed_route = time_constructed_route.rename(columns = {'departure_time_1':'departure_time', 'arrival_time_' + str(index_plus_one):'arrival_time'})
            time_constructed_route = calculate_time_difference(time_constructed_route, 'arrival_time', 'departure_time', 'time_diff_min')
            #add here a new column count dates that is the sum of the common dates
            time_constructed_route['count_dates'] = time_constructed_route['dates'].apply(lambda x: len(x))
            sum_count_dates = time_constructed_route['count_dates'].sum()
            #take the first most frequent one
            #create the weighted sum
            time_constructed_route['WS_travel_time'] = (time_constructed_route['time_diff_min'] * time_constructed_route['count_dates'])/sum_count_dates
            time_constructed_route['WS_waiting_time'] = (time_constructed_route['waiting_time'] * time_constructed_route['count_dates'])/sum_count_dates    
            weighted_sum_tt = time_constructed_route['WS_travel_time'].sum()
            weighted_sum_wt = time_constructed_route['WS_waiting_time'].sum()
            #Add this to the first dataframe
            route_creation_frequency_single.loc[index_sequence,'travel_time'] = weighted_sum_tt
            route_creation_frequency_single.loc[index_sequence,'waiting_time'] = weighted_sum_wt
        #if there is no trips that follow each other with the hash from the array
        else:
            route_creation_frequency_single = route_creation_frequency_single.drop(index_sequence)
            
    return route_creation_frequency_single

'Creates a column in the df that calculates the travel time between the first and last stop (waiting time included)\nand another column with the waiting time (calculated with a weighted average based on the frequency)'

In [225]:
def calculate_hash_route_creation(route_creation): 
    '''calculates the hash and the hash inverse of the route_creation'''
    #copy the route_creation dataFrame
    route_creation_hash = route_creation.copy()
    #calculate the hash and the hash inverse using the lists in stop_sequence
    route_creation_hash['hash'] = route_creation_hash['stops_sequence'].apply(lambda x: hash(tuple(x)))
    route_creation_hash['hash_inverse'] = route_creation_hash['stops_sequence'].apply(lambda x: hash(tuple(x[::-1])))
    return route_creation_hash

In [236]:
'''Regroup the routes that are the same (even though they are in the opposite direction)'''

def regroup_same_stops_sequences(route_creation_hash):
    '''regroups the stops_sequences that are the same'''
    
    route_creation_max_hash = route_creation_hash.copy()
    route_creation_max_hash['max_hash'] = route_creation_max_hash[['hash', 'hash_inverse']].max(axis=1)
    #create a df that sums the frequence of the trips going from opposite directions
    route_creation_max_hash_freq = route_creation_max_hash.groupby(['route_id','max_hash'], as_index = False)[['frequency']].sum()
    #renames the max_hash column into hash so it the dataframe can be merged with route_hash_without_freq
    route_creation_max_hash_freq = route_creation_max_hash_freq.rename(columns = {'max_hash':'hash'})
    #drops the column freq_sequence_route because the one that is of interest is in route_creation_max_hash_freq
    route_hash_without_freq = route_creation_hash.copy().drop(['frequency'], axis = 1)
    route_hash_without_freq = route_hash_without_freq.drop_duplicates(subset=['route_id', 'hash'])
    route_hash_freq_combined_first_merge = pd.merge(route_creation_max_hash_freq, route_hash_without_freq, on=['route_id', 'hash'], how='left')
    #selects the part of the dataset that doesn't have NaN (because for the NaN, their hash_value that was max was the one in hash_inverse and it didn't exist in the other df), so we can concatenate it with the part that had NaN later
    route_hash_freq_first_part = route_hash_freq_combined_first_merge[pd.notnull(route_hash_freq_combined_first_merge['stops_sequence'])]
    #selects one part the part of the dataset that does have NaN, so we can concatenate it with the part that has no NaN later on.
    #but first, we will need to fill those NaN values (done in the code lines behind this one)
    route_hash_freq_second_part = route_hash_freq_combined_first_merge[pd.isnull(route_hash_freq_combined_first_merge['stops_sequence'])][['route_id', 'hash', 'frequency']]
    #renames the hash column into hash_inverse so it the dataframe can be merged with route_hash_without_freq (because it didn't work with 'hash' on the first merge)
    route_hash_freq_second_part = route_hash_freq_second_part.rename(columns = {'hash':'hash_inverse'})
    route_hash_freq_second_part = pd.merge(route_hash_freq_second_part, route_hash_without_freq, on=['route_id', 'hash_inverse'], how='left')
    #the hash that is of interest in the final df will be hash and not hash_inverse
    route_hash_freq_combined_not_sorted = pd.concat([route_hash_freq_first_part, route_hash_freq_second_part])
    route_hash_freq_combined = route_hash_freq_combined_not_sorted.sort_values(by = ['route_id'])
    route_hash_freq_combined = route_hash_freq_combined.reset_index(drop = True)
    return route_hash_freq_combined

'Regroup the routes that are the same (even though they are in the opposite direction)'

In [268]:
'''Deletes the routes that do not represent 10% of the total route frequency and creates new route, if some of them are different'''

def apply_treshold_route_creation(route_hash_freq_combined): 
    #calculates the total frequency per route_id
    frequency_each_route = route_hash_freq_combined.groupby(['route_id'], as_index = False)['frequency'].sum()
    frequency_treshold = frequency_each_route.copy()
    #calculates the treshold (here 10%)
    frequency_treshold['frequency'] = frequency_treshold['frequency']/10
    frequency_treshold.rename(columns = {'frequency':'frequency_treshold'}, inplace = True)
    route_hash_freq_treshold = route_hash_freq_combined.merge(frequency_treshold, on='route_id', how = 'left')
    #find the sequences that are not more than 10% of the route frequency and delete them
    index_names = route_hash_freq_treshold[route_hash_freq_treshold['frequency'] < route_hash_freq_treshold['frequency_treshold']].index
    route_hash_freq_treshold.drop(index_names, inplace = True)
    #drop the routes with the same hash as others
    route_hash_freq_treshold['max_hash'] = route_hash_freq_treshold[['hash', 'hash_inverse']].max(axis=1)
    route_hash_freq_treshold = route_hash_freq_treshold.drop_duplicates(subset='max_hash')
    route_hash_freq_treshold  = route_hash_freq_treshold.drop(['hash_inverse', 'max_hash'], axis = 1)
    #selects the sequences that are not the first most frequent per route_id
    sequences_max_freq = route_hash_freq_treshold.groupby(['route_id'],as_index = False)['frequency'].max()
    sequences_max_freq.rename(columns = {'frequency':'max_frequency'}, inplace = True)
    sequences_max_freq_merged = route_hash_freq_treshold.merge(sequences_max_freq, on='route_id', how='left')
    sequences_max_freq_index = sequences_max_freq_merged[sequences_max_freq_merged['frequency'] == sequences_max_freq_merged['max_frequency']].drop_duplicates(subset='route_id').index
    sequences_non_max_freq_index = sequences_max_freq_merged[~sequences_max_freq_merged.index.isin(sequences_max_freq_index)].index
    #those selected sequences get a new route_id that starts from routes['route_id'].max() + 1 and increments by one for each new route
    if route_hash_freq_combined['route_id'].dtype == np.int64:
        route_id_creation = route_hash_freq_combined['route_id'].max() + 1
    else:
        route_id_creation =  0 + 1
    new_route_id_column = list(range(route_id_creation, route_id_creation + len(sequences_non_max_freq_index)))    
    sequences_max_freq_merged.loc[sequences_non_max_freq_index, 'route_id'] = new_route_id_column
    #keep only the column route_id and stops_sequence
    final_routes = sequences_max_freq_merged.drop(sequences_max_freq_merged[sequences_max_freq_merged['frequency'] == 0].index)
    final_routes = final_routes.drop(columns=['hash', 'frequency', 'frequency_treshold', 'max_frequency'])
    return final_routes

'Deletes the routes that do not represent 10% of the total route frequency and creates new route, if some of them are different'

In [227]:
''' To keep only the routes that have at least one country station in their route_sequence'''

def keep_country_routes(final_routes, stops_country_series):
    non_country_routes = set()
    for index_route, route in final_routes.iterrows():
        is_in_country = False
        for stop in route['stops_sequence']:
            if stop in set(stops_country_series):
                is_in_country = True
                break
        if not is_in_country:
            route_id = route['route_id']
            non_country_routes.add(route_id)
    country_routes = final_routes.loc[~final_routes['route_id'].isin(non_country_routes)]    

    return country_routes

' To keep only the routes that have at least one country station in their route_sequence'

In [258]:
'''Calculates the distances of the trip, by taking the distance between each stop of the stop_sequence'''

def calculate_distance_from_lat_long(name_first, name_second, stop_df):
        lon_first, lat_first = math.radians(stop_df[stop_df['stop_name'] == name_first].iloc[0]['stop_lon']), math.radians(stop_df[stop_df['stop_name'] == name_first].iloc[0]['stop_lat'])
        lon_second, lat_second = math.radians(stop_df[stop_df['stop_name'] == name_second].iloc[0]['stop_lon']), math.radians(stop_df[stop_df['stop_name'] == name_second].iloc[0]['stop_lat'])
        # The radius of the earth
        R = 6373.0 
        # To calculate the change in coordinates
        dlon = lon_second - lon_first
        dlat = lat_second - lat_first
        # To use the Haversine formula to get the distance in kilometers between the starting_station and the ending_station
        a = math.sin(dlat / 2)**2 + math.cos(lat_first) * math.cos(lat_second) * math.sin(dlon / 2)**2
        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
        # To calculate the distance
        distance = R * c
        return distance

def calculate_distance(stop_sequence, stop_df):
    distance = 0
    for index_stop ,stop in enumerate(stop_sequence):
        index_plus_one = index_stop + 1
        if index_plus_one <= len(stop_sequence) - 1:
            distance += calculate_distance_from_lat_long(stop, stop_sequence[index_plus_one], stop_df)
    return distance

'Calculates the distances of the trip, by taking the distance between each stop of the stop_sequence'

In [229]:
'''Makes a df that can be used for building the nodes and edges of the graph using Networkx package'''

def create_df_for_Networkx(final_routes):
    '''return df_for_edges a df that can be used to build a Networkx L-space graph'''
    #takes the list stop sequence and make it a new column for each stop
    stops_sequence_values = final_routes.apply(lambda x: pd.Series(x['stops_sequence']),axis=1).stack().reset_index(level=1, drop=True)
    stops_sequence_values.name = 'stops_sequence'
    final_routes_stops = final_routes.drop('stops_sequence', axis=1).join(stops_sequence_values)
    final_routes_stops = final_routes_stops.reset_index(drop=True)
    #Creates a shifted instance of the df to use it for the final result
    final_routes_stops_shifted = final_routes_stops.shift()
    #Check if which of the rows are followed by a row with the same trip_id
    final_routes_stops_shifted['match'] = final_routes_stops_shifted['route_id'].eq(final_routes_stops['route_id'])
    #Drop the rows for which this condition is not satisfied
    final_routes_stops_shifted.drop(final_routes_stops_shifted[final_routes_stops_shifted['match'] == False].index, inplace = True)
    final_routes_stops_shifted.rename(columns=
      {"stops_sequence": "stop_name_1",
      "stop_name": "stop_name_1"}, inplace=True)
    #joins the df with its shifted version sothat each sequence of two stations is represented in the table as a row
    df_for_edges = final_routes_stops_shifted.join(final_routes_stops[['stops_sequence']], lsuffix='_caller', rsuffix='_other', how='left')
    df_for_edges.rename(columns=
      {"stops_sequence": "stop_name_2",
      "stop_name": "stop_name_2"}, inplace=True)

    df_for_edges = df_for_edges.drop_duplicates()
    df_for_edges = df_for_edges[['route_id','stop_name_1', 'stop_name_2']]
    df_for_edges = df_for_edges.reset_index(drop=True)
    return df_for_edges

'Makes a df that can be used for building the nodes and edges of the graph using Networkx package'

In [243]:
'''Applies all the functions from 1 get_extention_indexes to 11 create_df_for_Networkx'''

def full_route_creation(stops_sequences_df, trips_hash_stops_sequence, stops_cleaned_stop_times_trips_merge_dates, stops_cleaned, stops_country_series):
    '''return a df that can be used to make a Networkx L-space (with treshold applied of 10%)'''
    index_of_extendable, index_of_begin_sequences, index_of_complete_sequences = get_extention_indexes(stops_sequences_df)
    route_creation_first = possible_sequences_construction(stops_sequences_df, index_of_extendable, index_of_begin_sequences, index_of_complete_sequences)
    route_creation_second = add_full_sequences(stops_sequences_df, route_creation_first, index_of_complete_sequences)
    route_creation_third = add_unused_sequences(stops_sequences_df, route_creation_second)
    route_creation_frequency_single_travel_time = give_begin_end_time(route_creation_third, trips_hash_stops_sequence, stops_cleaned_stop_times_trips_merge_dates)
    route_creation_hash = calculate_hash_route_creation(route_creation_frequency_single_travel_time)
    route_hash_freq_combined = regroup_same_stops_sequences(route_creation_hash)
    final_routes = apply_treshold_route_creation(route_hash_freq_combined)
    country_routes = keep_country_routes(final_routes, stops_country_series)
    country_routes['distance'] = country_routes['stops_sequence'].apply(lambda x: calculate_distance(x, stops_cleaned))
    df_for_edges = create_df_for_Networkx(country_routes)
    
    return country_routes, df_for_edges

'Applies all the functions from 1 get_extention_indexes to 11 create_df_for_Networkx'

## Acutal route creation

In [None]:
belgian_routes, df_for_edges_Belgium = full_route_creation(routes_hash_Belgium, generic_trips_information_Belgium, trips_hash_Belgium, stops_cleaned_Belgium, stops_Belgium_series)

In [None]:
'''Save those two last DataFrames as .csv files'''
#belgian_routes.to_csv(f'{routes_loc}belgian_routes_Belgium.csv')
#df_for_edges_Belgium.to_csv(f'{df_for_edges_loc}df_for_edges_Belgium.csv')

In [None]:
stops_sequences_df, trips_hash_stops_sequence, stops_cleaned_stop_times_trips_merge_dates, stops_cleaned, stops_country_series = routes_hash_Belgium, generic_trips_information_Belgium, trips_hash_Belgium, stops_cleaned_Belgium, stops_Belgium_series

index_of_extendable, index_of_begin_sequences, index_of_complete_sequences = get_extention_indexes(stops_sequences_df)
route_creation_first = possible_sequences_construction(stops_sequences_df, index_of_extendable, index_of_begin_sequences, index_of_complete_sequences)
route_creation_second = add_full_sequences(stops_sequences_df, route_creation_first, index_of_complete_sequences)
route_creation_third = add_unused_sequences(stops_sequences_df, route_creation_second)
route_creation_frequency_single_travel_time = give_begin_end_time(route_creation_third, trips_hash_stops_sequence, stops_cleaned_stop_times_trips_merge_dates)
route_creation_hash = calculate_hash_route_creation(route_creation_frequency_single_travel_time)
route_hash_freq_combined = regroup_same_stops_sequences(route_creation_hash)
final_routes = apply_treshold_route_creation(route_hash_freq_combined)
country_routes = keep_country_routes(final_routes, stops_country_series)
country_routes['distance'] = country_routes['stops_sequence'].apply(lambda x: calculate_distance(x, stops_cleaned))
df_for_edges = create_df_for_Networkx(country_routes)

In [269]:
stops_sequences_df, trips_hash_stops_sequence, stops_cleaned_stop_times_trips_merge_dates, stops_cleaned, stops_country_series = routes_hash_Belgium, generic_trips_information_Belgium, trips_hash_Belgium, stops_cleaned_Belgium, stops_Belgium_series

final_routes = apply_treshold_route_creation(route_hash_freq_combined)
country_routes = keep_country_routes(final_routes, stops_country_series)
country_routes['distance'] = country_routes['stops_sequence'].apply(lambda x: calculate_distance(x, stops_cleaned))
df_for_edges = create_df_for_Networkx(country_routes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  country_routes['distance'] = country_routes['stops_sequence'].apply(lambda x: calculate_distance(x, stops_cleaned))


In [260]:
stops_country_series

0        SCHAERBEEK
1             EVERE
2         HAREN-SUD
3              BUDA
4             HAREN
           ...     
559           YPRES
560       POPERINGE
561         ROULERS
562          IZEGEM
563    INGELMUNSTER
Name: stop_name, Length: 564, dtype: object

In [270]:
final_routes[final_routes['route_id'] == 167]

Unnamed: 0,route_id,stops_sequence,travel_time,waiting_time
50,167,"[ANVERS-CENTRAL, ANVERS-BERCHEM, MORTSEL, BOEC...",52.30303,6.69697


In [247]:
country_routes = keep_country_routes(final_routes, stops_country_series)


In [271]:
country_routes

Unnamed: 0,route_id,stops_sequence,travel_time,waiting_time,distance
0,115,"[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]",16.000000,0.000000,18.856662
1,116,"[KNOKKE, DUINBERGEN, HEIST, BRUGES-SAINT-PIERR...",21.608187,0.690058,18.936973
2,117,"[SPA-GERONSTERE, SPA, FRANCHIMONT, THEUX, JUSL...",28.034483,3.034483,14.412581
3,118,"[GAND-SAINT-PIERRE, DE PINTE, DEINZE, AARSELE,...",69.000000,4.000000,83.752756
4,119,"[GRAMMONT, SCHENDELBEKE, IDEGEM, ZANDBERGEN, A...",27.000000,1.000000,20.655690
...,...,...,...,...,...
614,730,"[HAVERSIN, AYE, MARLOIE, ROCHEFORT-JEMELLE, FO...",44.000000,2.000000,44.867352
615,900,"[PARIS NORD (FR), HAL]",142.203125,0.000000,248.752866
616,732,"[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",150.203125,0.000000,261.930239
617,901,"[HAL, BRUXELLES-MIDI]",8.000000,0.000000,13.177373


In [273]:
route_hash_freq_combined[route_hash_freq_combined['route_id'] == 732]

Unnamed: 0,route_id,hash,frequency,stops_sequence,travel_time,waiting_time,hash_inverse
1431,732,-1407187577167226770,128.0,"[PARIS NORD (FR), HAL]",142.203125,0.0,-4.238869e+18
1432,732,-995155914250935308,454.0,"[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",150.203125,0.0,-3.401468e+18
1433,732,6765455899021930200,128.0,"[HAL, BRUXELLES-MIDI]",8.0,0.0,-3.216745e+18


In [334]:
routes_hash_Belgium[routes_hash_Belgium['route_id'] == 115]

Unnamed: 0,route_id,hash,hash_inverse,service_id,trip_id,departure_time_first,departure_time_last,stops_sequence,dates,count_service_id
0,115,-1054191588631304569,8515140851473385823,14,"[88____:007::8885704:8885001:4:1023:20210418, ...","[10:07:00, 10:07:00, 10:07:00, 10:07:00, 11:07...","[10:23:00, 10:23:00, 10:23:00, 10:23:00, 11:23...","[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]","{20210417, 20210418}",2
48,115,-1054191588631304569,8515140851473385823,25,"[88____:007::8885704:8885001:4:23:20210418, 88...","[00:07:00, 00:07:00, 00:07:00, 00:07:00]","[00:23:00, 00:23:00, 00:23:00, 00:23:00]","[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",{20210418},1
52,115,-1054191588631304569,8515140851473385823,31,"[88____:007::8885704:8885001:4:1723:20210417, ...","[17:07:00, 17:07:00, 17:07:00, 17:07:00, 18:07...","[17:23:00, 17:23:00, 17:23:00, 17:23:00, 18:23...","[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",{20210417},1
80,115,-1054191588631304569,8515140851473385823,42,"[88____:007::8885704:8885001:4:1023:20210530, ...","[10:07:00, 10:07:00, 10:07:00, 10:07:00, 11:07...","[10:23:00, 10:23:00, 10:23:00, 10:23:00, 11:23...","[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]","{20210529, 20210530}",2
128,115,-1054191588631304569,8515140851473385823,93,"[88____:007::8885704:8885001:4:23:20210530, 88...","[00:07:00, 00:07:00, 00:07:00, 00:07:00]","[00:23:00, 00:23:00, 00:23:00, 00:23:00]","[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",{20210530},1
132,115,-1054191588631304569,8515140851473385823,122,"[88____:007::8885704:8885001:4:1723:20210529, ...","[17:07:00, 17:07:00, 17:07:00, 17:07:00, 18:07...","[17:23:00, 17:23:00, 17:23:00, 17:23:00, 18:23...","[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",{20210529},1
160,115,8515140851473385823,-1054191588631304569,14,"[88____:007::8885001:8885704:4:1052:20210418, ...","[10:36:00, 10:36:00, 10:36:00, 10:36:00, 11:36...","[10:52:00, 10:52:00, 10:52:00, 10:52:00, 11:52...","[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]","{20210417, 20210418}",2
208,115,8515140851473385823,-1054191588631304569,25,"[88____:007::8885001:8885704:4:52:20210418, 88...","[00:36:00, 00:36:00, 00:36:00, 00:36:00]","[00:52:00, 00:52:00, 00:52:00, 00:52:00]","[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]",{20210418},1
212,115,8515140851473385823,-1054191588631304569,31,"[88____:007::8885001:8885704:4:1752:20210417, ...","[17:36:00, 17:36:00, 17:36:00, 17:36:00, 18:36...","[17:52:00, 17:52:00, 17:52:00, 17:52:00, 18:52...","[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]",{20210417},1
240,115,8515140851473385823,-1054191588631304569,42,"[88____:007::8885001:8885704:4:1052:20210530, ...","[10:36:00, 10:36:00, 10:36:00, 10:36:00, 11:36...","[10:52:00, 10:52:00, 10:52:00, 10:52:00, 11:52...","[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]","{20210529, 20210530}",2


In [280]:
route_creation_first[route_creation_first['route_id'] == 732]

Unnamed: 0,route_id,hash,stops_sequence,frequency
1073,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",1.0
1074,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",26.0
1075,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",11.0
1076,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",3.0
1077,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",23.0


In [281]:
route_creation_third[route_creation_third['route_id'] == 732]

Unnamed: 0,route_id,hash,stops_sequence,frequency
4609,732,[-3401467598889741995],"[BRUXELLES-MIDI, HAL, PARIS NORD (FR)]",81.0
4610,732,[-3401467598889741995],"[BRUXELLES-MIDI, HAL, PARIS NORD (FR)]",78.0
4611,732,[-3401467598889741995],"[BRUXELLES-MIDI, HAL, PARIS NORD (FR)]",36.0
4612,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",1.0
4613,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",26.0
4614,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",11.0
4615,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",3.0
4616,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",23.0
8392,732,[-3401467598889741995],"[BRUXELLES-MIDI, HAL, PARIS NORD (FR)]",81.0
8393,732,[-3401467598889741995],"[BRUXELLES-MIDI, HAL, PARIS NORD (FR)]",36.0


In [326]:
'''Adds the sequences that were not yet added in the route_creation dataframe'''

def add_unused_sequences(stop_sequences_df, route_creation):
    '''returns the third part of the route_creation'''
    for route_id in route_creation['route_id'].unique():
        #get a set of the hashes that were employed to create the routes for that route_id
        used_sequences_hash = set(route_creation[route_creation['route_id'] == route_id].apply(lambda x: pd.Series(x['hash']),axis=1).stack().reset_index(level=1, drop=True))
        copy_sequences_route_id = select_stop_sequences(stop_sequences_df, route_id)[['route_id','hash','stops_sequence', 'dates', 'departure_time_last']]
        copy_sequences_route_id = calculate_frequency(copy_sequences_route_id)
        #adds the hashes that were not employed in any route creations for that route_id
        for index_trip, trip in copy_sequences_route_id.iterrows():
            #first element of the list because there is always only one element
            if trip['hash'][0] not in used_sequences_hash:
                route_creation = route_creation.append(trip, ignore_index = True)
    return route_creation, used_sequences_hash

'Adds the sequences that were not yet added in the route_creation dataframe'

In [327]:
route_creation_third, used_sequences_hash = add_unused_sequences(stops_sequences_df, route_creation_second)
route_creation_third[route_creation_third['route_id'] == 732]

Unnamed: 0,route_id,hash,stops_sequence,frequency
4609,732,[-3401467598889741995],"[BRUXELLES-MIDI, HAL, PARIS NORD (FR)]",81.0
4610,732,[-3401467598889741995],"[BRUXELLES-MIDI, HAL, PARIS NORD (FR)]",78.0
4611,732,[-3401467598889741995],"[BRUXELLES-MIDI, HAL, PARIS NORD (FR)]",36.0
4612,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",1.0
4613,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",26.0
4614,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",11.0
4615,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",3.0
4616,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",23.0
8392,732,[-3401467598889741995],"[BRUXELLES-MIDI, HAL, PARIS NORD (FR)]",81.0
8393,732,[-3401467598889741995],"[BRUXELLES-MIDI, HAL, PARIS NORD (FR)]",36.0


In [284]:
route_creation_second[route_creation_second['route_id'] == 732]

Unnamed: 0,route_id,hash,stops_sequence,frequency
4609,732,[-3401467598889741995],"[BRUXELLES-MIDI, HAL, PARIS NORD (FR)]",81.0
4610,732,[-3401467598889741995],"[BRUXELLES-MIDI, HAL, PARIS NORD (FR)]",78.0
4611,732,[-3401467598889741995],"[BRUXELLES-MIDI, HAL, PARIS NORD (FR)]",36.0
4612,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",1.0
4613,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",26.0
4614,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",11.0
4615,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",3.0
4616,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",23.0


In [305]:
used_sequences_hash

{-3.401467598889742e+18, -1.407187577167227e+18, 6.76545589902193e+18}

In [318]:
float(route_creation_third.iloc[0]['hash'][0]) in used_sequences_hash

True

In [331]:
stops_sequences_df = routes_hash_Belgium.loc[[416827, 416817]]

index_of_extendable, index_of_begin_sequences, index_of_complete_sequences = get_extention_indexes(stops_sequences_df)
route_creation_first = possible_sequences_construction(stops_sequences_df, index_of_extendable, index_of_begin_sequences, index_of_complete_sequences)
route_creation_second = add_full_sequences(stops_sequences_df, route_creation_first, index_of_complete_sequences)
route_creation_third = add_unused_sequences(stops_sequences_df, route_creation_second)

route_creation_first
route_creation_third

Unnamed: 0,route_id,hash,stops_sequence,frequency
0,732,"[-1407187577167226770, 6765455899021930200]","[PARIS NORD (FR), HAL, BRUXELLES-MIDI]",3.0


(   route_id                                         hash  \
 0       732  [-1407187577167226770, 6765455899021930200]   
 
                            stops_sequence  frequency  
 0  [PARIS NORD (FR), HAL, BRUXELLES-MIDI]        3.0  ,
 {-1407187577167226770, 6765455899021930200})

In [None]:
416827
416817

In [328]:
test = routes_hash_Belgium.loc[[416827, 416817]]

In [329]:
test

Unnamed: 0,route_id,hash,hash_inverse,service_id,trip_id,departure_time_first,departure_time_last,stops_sequence,dates,count_service_id
416827,732,6765455899021930200,-3216744546913690617,1575,"[88____:005::8814308:8814001:2:2119:20210430, ...","[21:11:00, 21:11:00]","[21:19:00, 21:19:00]","[HAL, BRUXELLES-MIDI]","{20210425, 20210430, 20210423}",3
416817,732,-1407187577167226770,-4238868800365589660,1575,"[87____:005::8727100:8814308:2:2111:20210430, ...","[18:49:00, 18:49:00]","[21:11:00, 21:11:00]","[PARIS NORD (FR), HAL]","{20210425, 20210430, 20210423}",3
