# Import of packages


In [1]:
'''Import the required packages.'''
import pandas as pd
import numpy as np
import networkx as nx
import collections
import matplotlib.pyplot as plt
import math
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import os

# Settings

In [2]:
'''Display all output results of a Jupyter cell.'''
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
'''Ensure that the output results of extensive output results are not truncated.'''
#pd.options.display.max_rows = 4000

'Ensure that the output results of extensive output results are not truncated.'

In [4]:
'''Change the width of the Notebook to see the output on the screen'''
#from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:100% !important; }</style>"))

'Change the width of the Notebook to see the output on the screen'

# File locations

In [5]:
'''Register the GitHub link or the file relative location'''
#the Github link
#repository_loc = "https://raw.githubusercontent.com/polkuleuven/Thesis_Train/main"
#the local link
repository_loc = os.getcwd()

'Register the GitHub link or the file relative location'

In [6]:
'''Get the other folder locations'''
belgian_GTFS_loc = repository_loc + '/gtfs_train_Belgium_1503/'
dutch_GTFS_loc = repository_loc + '/gtfs_train_Netherlands_1503/'
swiss_GTFS_loc = repository_loc + '/gtfs_train_Switzerland_1503/'

stops_series_loc = repository_loc + '/country_stops_series/'
stops_cleaned_loc = repository_loc + '/stops_cleaned/'

'Get the other folder locations'

# Import of the datasets

## Functions

In [7]:
'''Import all the DataFrames that are common for the three train networks'''

def common_imports(datalink):
    #To import the agency dataset that contains limited information about the railway agency.
    agency = pd.read_csv(datalink + "agency.txt", sep=",")
    #To import the calendar_dates dataset that gives for each service_id all the exact dates when that service_id is valid.
    calendar_dates = pd.read_csv(datalink + "calendar_dates.txt", sep=",")
    #To import the routes dataset that provides the id, the name and the type of vehicle used for all railway routes.
    routes = pd.read_csv(datalink + "routes.txt", sep=",")
    #To import the stops dataset that contains information about the ids, the names and the geographical coordinates of the railway stations.
    stops = pd.read_csv(datalink + "stops.txt", sep=",")
    #To import the transfers dataset that gives the minimum transfer time to switch routes at each railway station.
    transfers = pd.read_csv(datalink + "transfers.txt", sep=",")
    #To import the trips dataset that gives for all routes an overview of the trips and the headsigns of these trips belonging to the railway route.
    #The service_id is an indication of all the dates this trip is valid (consultable in the calendar_dates dataset).
    trips = pd.read_csv(datalink + "trips.txt", sep=",")
    return agency, calendar_dates, routes, stops, transfers, trips

'Import all the DataFrames that are common for the three train networks'

## Acutal imports

### Belgium

In [8]:
'''Apply common_import()'''
agency_Belgium, calendar_dates_Belgium, routes_Belgium, stops_Belgium, transfers_Belgium, trips_Belgium = common_imports(belgian_GTFS_loc)

'Apply common_import()'

In [9]:
'''Import other DataFrames'''
#To import the translations dataset that provides the French-, Dutch-, German- and English-language translations of the Belgian railway stations.
translations_Belgium = pd.read_csv(belgian_GTFS_loc + "translations.txt", sep=",")
#To import the stop_times dataset that gives for all trips an overview of the ids of the stations served and the sequence in which these stations are served. 
#In addition, for all the trips the arrival and departure times at the stations served are given.
stop_times_Belgium = pd.read_csv(belgian_GTFS_loc + "stop_times.txt", sep=",")
#To import the calendar dataset that gives the first and last date of all data observations.
calendar_Belgium = pd.read_csv(belgian_GTFS_loc + "calendar.txt", sep=",")
#To import the stop_time_overrides dataset 
stop_time_overrides_Belgium = pd.read_csv(belgian_GTFS_loc + "stop_time_overrides.txt", sep=",")

'Import other DataFrames'

### Netherlands

In [10]:
'''Apply common_import()'''
agency_Netherlands, calendar_dates_Netherlands, routes_Netherlands, stops_Netherlands, transfers_not_cleaned_Netherlands, trips_Netherlands = common_imports(dutch_GTFS_loc)

'Apply common_import()'

In [11]:
'''Import other DataFrames'''
#To import the feed_info dataset that contains limited information about the Dutch NS railway feed.
feed_info_Netherlands = pd.read_csv(dutch_GTFS_loc + "feed_info.txt", sep=",")
#To import the stop_times dataset that gives for all trips an overview of the ids of the stations served and the sequence in which these stations are served. 
#In addition, for all the trips the arrival and departure times at the stations served are given.
stop_times_range = [*range(2, 19)]
stop_times_Netherlands = pd.read_csv(dutch_GTFS_loc + "stop_times-1.csv", sep=",")
for index in stop_times_range:
    stop_times_Netherlands = pd.concat([stop_times_Netherlands, pd.read_csv(dutch_GTFS_loc + "stop_times-" + str(index)+ ".csv", sep=",")])

'Import other DataFrames'

### Switzerland

In [12]:
'''Apply common_import()'''
agency_Switzerland, calendar_dates_Switzerland, routes_Switzerland, stops_Switzerland, transfers_not_cleaned_Switzerland, trips_Switzerland = common_imports(swiss_GTFS_loc)

'Apply common_import()'

In [13]:
'''Import other DataFrames'''
#To import the feed_info dataset that contains limited information about the Swiss SBB railway feed.
feed_info_Switzerland = pd.read_csv(swiss_GTFS_loc + "feed_info.txt", sep=",")
#To import the stop_times dataset that gives for all trips an overview of the ids of the stations served and the sequence in which these stations are served. 
#In addition, for all the trips the arrival and departure times at the stations served are given.
stop_times_Switzerland = pd.read_csv(swiss_GTFS_loc + "stop_times.txt", sep=",")
#To import the calendar dataset that gives the first and last date of all data observations.
calendar_Switzerland = pd.read_csv(swiss_GTFS_loc + "calendar.txt", sep=",")

'Import other DataFrames'

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Cleaning of the railway data

## Functions 

In [14]:
'''Clean the calendar_dates DataFrame'''

def clean_calendar_dates(calendar_dates):
    #To filter the dates from the selected begin to the end date
    begin_date = 20210314
    end_date = 20210713
    calendar_dates_cleaned = calendar_dates.copy()
    calendar_dates_cleaned = calendar_dates_cleaned.drop(calendar_dates_cleaned[(calendar_dates_cleaned['date'] > end_date) | (calendar_dates_cleaned['date'] < begin_date)].index)
    return calendar_dates_cleaned

'Clean the calendar_dates DataFrame'

In [15]:
'''Add the country to the stops DataFrame and returns the country filtered DataFrame of stops and the serie of those stops'''

def country_information(stops, country_name, stops_cleaned_loc, stops_series_loc):
    #To initialize the Nominatim API to get the location from the input string 
    geolocator = Nominatim(user_agent="application")
    reverse = RateLimiter(geolocator.reverse, min_delay_seconds=0.2)

    #To get the location with the geolocator.reverse() function and to extract the country from the location instance
    country_list = []
    for index, row in stops.iterrows():
        latitude = row['stop_lat']
        longitude = row['stop_lon']
        # To assign the latitude and longitude into a geolocator.reverse() method
        location = reverse((latitude, longitude), language='en', exactly_one=True)
        # To get the country from the given list and parsed into a dictionary with raw function()
        address = location.raw['address']
        country = address.get('country', '')
        country_list.append(country)

    #To add the values of country_list as a new attribute country 
    stops.loc[:,'country'] = country_list

    #To calculate the total number of Belgian stations in the stops dataset
    country_stops = stops[stops['country'] == country_name]
    country_stops_series = stops.loc[stops['country'] == country_name, 'stop_name']
    
    stops.to_csv(f'{stops_cleaned_loc}stops_cleaned_{country_name}.csv')
    country_stops_series.to_csv(f'{stops_series_loc}stops_{country_name}_series.csv')

'Add the country to the stops DataFrame and returns the country filtered DataFrame of stops and the serie of those stops'

In [16]:
'''Remove the accents from a string'''

def remove_accents(text):
    import unicodedata
    try:
        text = unicode(text, 'utf-8')
    except NameError:
        pass
    text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode("utf-8")
    return str(text)

'Remove the accents from a string'

## Acutal cleaning

### Belgium

In [17]:
'''Clean the routes_Belgium df'''
allowed_route_type = {'IC', 'L', 'P', 'ICT', 'IZY'}
routes_cleaned_Belgium = routes_Belgium[(routes_Belgium['route_short_name'].isin(allowed_route_type)) | (routes_Belgium['route_short_name'].str.startswith('S'))]

'Clean the routes_Belgium df'

In [18]:
'''Apply clean_calendar_dates()'''
calendar_dates_cleaned_Belgium = clean_calendar_dates(calendar_dates_Belgium)

'Apply clean_calendar_dates()'

In [19]:
'''Clean the stops_Belgium df.''' 
#To eliminate the stop_ids in the stops dataset that contain an underscore or that start with a character 'S'. 
stops_cleaned_Belgium = stops_Belgium[(~stops_Belgium['stop_id'].str.contains('_')) & (~stops_Belgium['stop_id'].str.contains('S'))]

#To modify the object datatype of the stop_id column to the NumPy int64 datatype
stops_cleaned_Belgium.loc[:,'stop_id'] = stops_cleaned_Belgium.loc[:,'stop_id'].astype(np.int64)

#To remove the accents from the stop_name and to change to uppercase
stops_cleaned_Belgium.loc[:,'stop_name'] = stops_cleaned_Belgium.loc[:,'stop_name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
stops_cleaned_Belgium.loc[:,'stop_name'] = stops_cleaned_Belgium.loc[:,'stop_name'].str.upper()

'Clean the stops_Belgium df.'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [20]:
'''Apply country_information() and take the DataFrames from the files'''
country_name = 'Belgium'
#country_information(stops_cleaned_Belgium, country_name, stops_cleaned_loc, stops_series_loc)
stops_cleaned_Belgium = pd.read_csv(f"{stops_cleaned_loc}stops_cleaned_{country_name}.csv", sep=",")

'Apply country_information() and take the DataFrames from the files'

### Netherlands

In [21]:
'''Clean the routes_Netherlands DataFrame'''
#To keep the train routes
routes_cleaned_Netherlands = routes_Netherlands[routes_Netherlands['route_type'] == 2]
routes_cleaned_Netherlands = routes_cleaned_Netherlands.astype(str)
routes_cleaned_Netherlands.describe(include=['object'])

#To change the route_id object datatype to a NumPy int64 datatype
routes_cleaned_Netherlands.loc[:,'route_id'] = routes_cleaned_Netherlands.loc[:,'route_id'].astype(np.int64)

'Clean the routes_Netherlands DataFrame'

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_color,route_text_color,route_url
count,145,145,145,145,145.0,145,145.0,145.0,145.0
unique,145,11,15,144,1.0,1,1.0,1.0,1.0
top,17803,IFF:NS,Sprinter,Nachtnettrein Utrecht Centraal <-> Rotterdam C...,,2,,,
freq,1,87,47,2,145.0,145,145.0,145.0,145.0


In [22]:
'''Apply clean_calendar_dates()'''
calendar_dates_cleaned_Netherlands = clean_calendar_dates(calendar_dates_Netherlands)

'Apply clean_calendar_dates()'

In [23]:
'''Clean the stops DataFrame'''
#To take from the stops_initial_Netherlands df all stop_ids that contain a 'stoparea:' to get the correct stop coordinates
stops_cleaned_Netherlands = stops_Netherlands[stops_Netherlands['stop_id'].str.contains('stoparea:')]

#To remove the accents from the accented characters and to convert the remaining characters to uppercase characters
stops_cleaned_Netherlands.loc[:,'stop_name'] = stops_cleaned_Netherlands.loc[:,'stop_name'].apply(remove_accents)
stops_cleaned_Netherlands.loc[:,'stop_name'] = stops_cleaned_Netherlands.loc[:,'stop_name'].str.upper()

'Clean the stops DataFrame'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [24]:
'''Apply country_information() and take the DataFrames from the files'''
country_name = 'Netherlands'
#country_information(stops_cleaned_Netherlands, country_name, stops_cleaned_loc, stops_series_loc)
stops_cleaned_Netherlands = pd.read_csv(f"{stops_cleaned_loc}stops_cleaned_{country_name}.csv", sep=",")

'Apply country_information() and take the DataFrames from the files'

In [25]:
'''Clean the stop_times df'''
stop_times_cleaned_Netherlands = stop_times_Netherlands.copy()
stop_times_cleaned_Netherlands.loc[:,'stop_id'] = stop_times_cleaned_Netherlands.stop_id.apply(str)
stop_times_cleaned_Netherlands = pd.merge(stop_times_cleaned_Netherlands, stops_Netherlands[['stop_id', 'stop_name']], on='stop_id')
stop_times_cleaned_Netherlands.loc[:,'stop_name'] = stop_times_cleaned_Netherlands.loc[:,'stop_name'].apply(remove_accents)
stop_times_cleaned_Netherlands.loc[:,'stop_name'] = stop_times_cleaned_Netherlands.loc[:,'stop_name'].str.upper()

'Clean the stop_times df'

### Switzerland

In [26]:
'''Clean the routes_Switzerland DataFrame'''
#To keep the train routes
routes_cleaned_Switzerland = routes_Switzerland[routes_Switzerland['route_type'] == 2]

'Clean the routes_Switzerland DataFrame'

In [27]:
'''Apply clean_calendar_dates()'''
calendar_dates_cleaned_Switzerland = clean_calendar_dates(calendar_dates_Switzerland)

'Apply clean_calendar_dates()'

In [28]:
'''Clean the stop_times_Switzerland DataFrame'''
# To remove the superfluous characters of the stop_id (platform codes)
stop_times_cleaned_Switzerland = stop_times_Switzerland.copy()
stop_times_cleaned_Switzerland_column = stop_times_cleaned_Switzerland['stop_id'].str.split(':').str[0]
stop_times_cleaned_Switzerland.loc[:,'stop_id'] = stop_times_cleaned_Switzerland_column

# To make the stop_ids numerical 
stop_times_cleaned_Switzerland.loc[:,'stop_id'] = stop_times_cleaned_Switzerland.loc[:,'stop_id'].astype(np.int64)

'Clean the stop_times_Switzerland DataFrame'

In [29]:
'''Clean the stops_Switzerland DataFrame'''
#To remove the superfluous characters (platform codes)
stops_cleaned_Switzerland_column = stops_Switzerland['stop_id'].str.split(':').str[0]
stops_cleaned_Switzerland = stops_Switzerland.copy()
stops_cleaned_Switzerland.loc[:,'stop_id'] = stops_cleaned_Switzerland_column

#To make the stop_ids numerical and to remove the duplicate stop_ids
stops_cleaned_Switzerland = stops_cleaned_Switzerland[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']]
stops_cleaned_Switzerland.loc[:,'stop_id'] = stops_cleaned_Switzerland.loc[:,'stop_id'].astype(np.int64)
stops_cleaned_Switzerland = stops_cleaned_Switzerland.drop_duplicates()

#To remove the accents from the stop_name and to change to uppercase
stops_cleaned_Switzerland.loc[:,'stop_name'] = stops_cleaned_Switzerland.loc[:,'stop_name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
stops_cleaned_Switzerland.loc[:,'stop_name'] = stops_cleaned_Switzerland.loc[:,'stop_name'].str.upper()

'Clean the stops_Switzerland DataFrame'

In [30]:
'''Apply country_information() and take the DataFrames from the files'''
country_name = 'Switzerland'
#country_information(stops_cleaned_Switzerland, country_name, stops_cleaned_loc, stops_series_loc)
stops_cleaned_Switzerland = pd.read_csv(f"{stops_cleaned_loc}stops_cleaned_{country_name}.csv", sep=",")

'Apply country_information() and take the DataFrames from the files'

# Merge the DataFrames

## Functions

In [59]:
'''Merge the DataFrames'''

def merge_df(stop_times, stops, routes, trips, calendar_dates, on_stop):
    list_columns = ['stop_name', 'stop_lat', 'stop_lon', 'country']
    if on_stop == 'stop_id':
        list_columns.append('stop_id')
    #To merge the stop_times df with the stops df on stop_id
    stop_times_stops = pd.merge(stop_times, stops[list_columns], on= on_stop)

    #To merge the trips df with the routes df on route_id
    routes_trips = pd.merge(routes[['route_id']], trips, on='route_id')

    #To merge the stop_times_stops df with the trips_routes df on trip_id
    uncleaned_railway_system_information = pd.merge(routes_trips, stop_times_stops, on='trip_id')

    #To take only the service_ids present in both the routes_trips_stop_times_stops df and the calendar_dates df into account
    calendar_dates_unique = calendar_dates['service_id'].unique()
    railway_system_information = uncleaned_railway_system_information[(uncleaned_railway_system_information['service_id'].isin(calendar_dates_unique))]
    
    return railway_system_information

'Merge the DataFrames'

## Actual merging

### Belgium 

In [60]:
'''Select all required fields'''
agency_cleaned_Belgium = agency_Belgium[['agency_id', 'agency_name', 'agency_url', 'agency_timezone']]
routes_cleaned_Belgium = routes_cleaned_Belgium[['route_id', 'agency_id', 'route_short_name', 'route_long_name', 'route_type']]
trips_cleaned_Belgium = trips_Belgium[['trip_id', 'route_id', 'service_id', 'trip_headsign']]
calendar_dates_cleaned_Belgium = calendar_dates_cleaned_Belgium[['service_id', 'date']]
stops_cleaned_Belgium = stops_cleaned_Belgium[['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'country']]
stop_times_cleaned_Belgium = stop_times_Belgium[['trip_id', 'stop_id', 'arrival_time', 'departure_time', 'stop_sequence']]

'Select all required fields'

In [61]:
'''Apply merge_df()'''
railway_system_information_Belgium = merge_df(stop_times_cleaned_Belgium, stops_cleaned_Belgium, routes_cleaned_Belgium, trips_cleaned_Belgium, calendar_dates_cleaned_Belgium, 'stop_id')

'Apply merge_df()'

### Netherlands

In [62]:
'''Select all required fields'''
agency_cleaned_Netherlands = agency_Netherlands[['agency_id', 'agency_name', 'agency_url', 'agency_timezone']]
routes_cleaned_Netherlands = routes_cleaned_Netherlands[['route_id', 'agency_id', 'route_short_name', 'route_long_name', 'route_type']]
trips_cleaned_Netherlands = trips_Netherlands[['trip_id', 'route_id', 'service_id', 'trip_headsign']]
calendar_dates_cleaned_Netherlands = calendar_dates_cleaned_Netherlands[['service_id', 'date']]
stops_cleaned_Netherlands = stops_cleaned_Netherlands[['stop_name', 'stop_lat', 'stop_lon', 'country']]
stop_times_cleaned_Netherlands = stop_times_cleaned_Netherlands[['trip_id', 'stop_name', 'arrival_time', 'departure_time', 'stop_sequence']]

'Select all required fields'

In [63]:
'''Apply merge_df()'''
railway_system_information_Netherlands = merge_df(stop_times_cleaned_Netherlands, stops_cleaned_Netherlands, routes_cleaned_Netherlands, trips_cleaned_Netherlands, calendar_dates_cleaned_Netherlands, 'stop_name')

'Apply merge_df()'

### Switzerland

In [64]:
'''Select all required fields'''
agency_cleaned_Switzerland = agency_Switzerland[['agency_id', 'agency_name', 'agency_url', 'agency_timezone']]
routes_cleaned_Switzerland = routes_cleaned_Switzerland[['route_id', 'agency_id', 'route_short_name', 'route_long_name', 'route_type']]
trips_cleaned_Switzerland = trips_Switzerland[['trip_id', 'route_id', 'service_id', 'trip_headsign']]
calendar_dates_cleaned_Switzerland = calendar_dates_cleaned_Switzerland[['service_id', 'date']]
stops_cleaned_Switzerland = stops_cleaned_Switzerland[['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'country']]
stop_times_cleaned_Switzerland = stop_times_cleaned_Switzerland[['trip_id', 'stop_id', 'arrival_time', 'departure_time', 'stop_sequence']]

'Select all required fields'

In [65]:
'''Apply merge_df()'''
railway_system_information_Switzerland = merge_df(stop_times_cleaned_Switzerland, stops_cleaned_Switzerland, routes_cleaned_Switzerland, trips_cleaned_Switzerland, calendar_dates_cleaned_Switzerland, 'stop_id')

'Apply merge_df()'

# Preparation for the space-of-stops representation of the railway systems


In [66]:
railway_system_information_Belgium

Unnamed: 0,route_id,trip_id,service_id,trip_headsign,stop_id,arrival_time,departure_time,stop_sequence,stop_name,stop_lat,stop_lon,country
0,115,88____:007::8885704:8885001:4:523:20210418,14,Tournai,8885001,05:23:00,05:23:00,4,TOURNAI,50.61313,3.396940,Belgium
1,115,88____:007::8885704:8885001:4:523:20210418,14,Tournai,8885068,05:19:00,05:19:00,3,FROYENNES,50.62989,3.354835,Belgium
2,115,88____:007::8885704:8885001:4:523:20210418,14,Tournai,8885753,05:12:00,05:12:00,2,HERSEAUX,50.71390,3.245961,Belgium
3,115,88____:007::8885704:8885001:4:523:20210418,14,Tournai,8885704,05:07:00,05:07:00,1,MOUSCRON,50.74100,3.228449,Belgium
4,115,88____:007::8885704:8885001:4:623:20210418,14,Tournai,8885001,06:23:00,06:23:00,4,TOURNAI,50.61313,3.396940,Belgium
...,...,...,...,...,...,...,...,...,...,...,...,...
431487,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8811270,17:01:00,17:01:00,19,VELTEM,50.90052,4.633520,Belgium
431488,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8811288,16:59:00,16:59:00,18,HERENT,50.90353,4.672190,Belgium
431489,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8819406,17:10:00,17:12:00,23,BRUSSELS AIRPORT-ZAVENTEM,50.89646,4.482072,Belgium
431490,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8821063,16:11:00,16:11:00,5,ANVERS-LUCHTBAL,51.24413,4.425033,Belgium


# Preparation for the space-of-stops representation of the railway systems


## Functions

In [68]:
'''Create a DataFrame with the departure time form the first stop sequence and with the one from last stop sequence for each trip_id'''

def create_trip_departure_times(railway_system_information):
    departure_time_first = railway_system_information.reset_index().loc[railway_system_information.reset_index().groupby(['trip_id'])['stop_sequence'].idxmin()][['route_id', 'trip_id', 'departure_time']].copy()
    departure_time_first = departure_time_first.rename(columns = {'departure_time': 'departure_time_first'})
    departure_time_last = railway_system_information.reset_index().loc[railway_system_information.reset_index().groupby(['trip_id'])['stop_sequence'].idxmax()][['route_id', 'trip_id', 'departure_time']].copy()
    departure_time_last = departure_time_last.rename(columns = {'departure_time': 'departure_time_last'})
    trip_departure_times = departure_time_first.merge(departure_time_last[['trip_id', 'departure_time_last']], on='trip_id')
    return trip_departure_times

'Create a DataFrame with the departure time form the first stop sequence and with the one from last stop sequence for each trip_id'

In [69]:
'''Put the stop_names per trip_id in a list in the new trip_stops_sequence DataFrame and
Calculate the hash of the stop sequence in both order (ascending and descending)'''

def create_trip_stop_sequence(trip_departure_times):    
    #Put the stop_names per trip_id in a list in the new trip_stops_sequence DataFrame
    trip_stop_sequence = trip_departure_times.groupby('trip_id')['stop_name'].apply(lambda group_series: group_series.tolist()).reset_index()
    trip_stop_sequence.rename(columns={'stop_name':'stop_sequence'}, inplace=True)
    #Calculate the hash of the stop sequence in both order (ascending and descending)
    trip_stop_sequence['hash'] = trip_stop_sequence['stop_sequence'].apply(lambda x: hash(tuple(x)))
    trip_stop_sequence['hash_inverse'] = trip_stop_sequence['stop_sequence'].apply(lambda x: hash(tuple(x[::-1])))

'Put the stop_names per trip_id in a list in the new trip_stops_sequence DataFrame and\nCalculate the hash of the stop sequence in both order (ascending and descending)'

In [70]:
'''Regroup the days per service id in a set and count them'''

def create_service_id_dates(calendar_dates):
    service_id_dates = calendar_dates.groupby('service_id')['date'].apply(lambda group_series: set(group_series.tolist())).reset_index()
    service_id_dates.rename(columns={'date':'dates'}, inplace=True)
    service_id_dates['count_service_id'] = service_id_dates['dates'].apply(lambda x: len(x))
    return service_id_dates

'Regroup the days per service id in a set and count them'

In [71]:
'''Put the different trip_ids in a list and add the departure_time first and last lists'''

def create_routes_hash(trips_hash):
    common_columns = ['route_id','route_long_name','hash', 'hash_inverse', 'service_id']
    routes_hash = trips_hash.groupby(common_columns)['trip_id'].apply(lambda group_series: group_series.tolist()).reset_index()
    route_hash_dep_first = trips_hash.groupby(common_columns)['departure_time_first'].apply(lambda group_series: group_series.tolist()).reset_index()
    route_hash_dep_last = trips_hash.groupby(common_columns)['departure_time_last'].apply(lambda group_series: group_series.tolist()).reset_index()
    routes_hash = route_hash.merge(route_hash_dep_first, on= common_columns)
    routes_hash = route_hash.merge(route_hash_dep_last, on= common_columns)
    return routes_hash

'Put the different trip_ids in a list and add the departure_time first and last lists'

In [73]:
'''Create DataFrames that will be used for the route_creation process'''

def prepartion_space(railway_system_information, calendar_dates):    
    #Sort values by the route_id, the trip_id, and the stop_sequence fields
    railway_system_information.sort_values(by=['route_id', 'trip_id', 'stop_sequence'])

    trip_departure_times = create_trip_departure_times(railway_system_information)

    #Merge railway_system_information with trip_departure_times
    trip_departure_times = railway_system_information.merge(trip_departure_times[['trip_id','departure_time_first','departure_time_last']], on='trip_id')

    trip_stop_sequence = create_trip_stop_sequence(trip_departure_times)
    
    #Add the stop_sequence of stations to the trip_departure_times dataset by joining on trip_id
    trips_hash = pd.merge(trip_departure_times, trip_stop_sequence, on='trip_id')
    
    service_id_dates = create_service_id_dates(calendar_dates)
    
    #Merge trips_hash with service_id_dates
    trips_hash = pd.merge(trips_hash, service_id_dates, on='service_id', how='left')
    
    routes_hash = create_routes_hash(trips_hash)
    
    #Add the sequence of stops, dates and service_id_count to the route_hash_freq_dep dataset
    routes_hash = pd.merge(routes_hash, trips_hash[['route_id','hash', 'hash_inverse', 'service_id','stop_sequence','dates','service_id_count']], on=['route_id', 'hash', 'hash_inverse', 'service_id'])
    
    #route_hash = route_hash_freq_seq.drop_duplicates( subset = ['route_id', 'hash', 'service_id'], keep = 'first')
    
    return route_hash

'Create DataFrames that will be used for the route_creation process'

## Acutal preparation

### Belgium