# Import of packages


In [1]:
'''Import the required packages.'''
import pandas as pd
import numpy as np
import networkx as nx
import collections
import matplotlib.pyplot as plt
import math
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import os
import itertools

# Settings

In [2]:
'''Display all output results of a Jupyter cell.'''
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
'''Ensure that the output results of extensive output results are not truncated.'''
#pd.options.display.max_rows = 4000

'Ensure that the output results of extensive output results are not truncated.'

In [4]:
'''Change the width of the Notebook to see the output on the screen'''
#from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:100% !important; }</style>"))

'Change the width of the Notebook to see the output on the screen'

# File locations

In [5]:
'''If your computer is a Windows and if you are using the file locally (repository_loc == os.getcwd()) put True, False otherwise'''
windows_locally = False 

'If your computer is a Windows and if you are using the file locally (repository_loc == os.getcwd()) put True, False otherwise'

In [6]:
'''Register the GitHub link or the file relative location'''
#the Github link
#repository_loc, windows_locally = "https://raw.githubusercontent.com/polkuleuven/Thesis_Train/main", False
#the local link
repository_loc = os.getcwd()

'Register the GitHub link or the file relative location'

In [7]:
'''Get the other folder locations'''

belgian_GTFS_loc = repository_loc + '/gtfs_train_Belgium_1503/'
dutch_GTFS_loc = repository_loc + '/gtfs_train_Netherlands_1503/'
swiss_GTFS_loc = repository_loc + '/gtfs_train_Switzerland_1503/'

stops_series_loc = repository_loc + '/country_stops_series/'
stops_cleaned_loc = repository_loc + '/stops_cleaned/'
df_for_edges_loc = repository_loc + '/df_for_edges/'
routes_loc = repository_loc + '/routes/'

if windows_locally:
    belgian_GTFS_loc = belgian_GTFS_loc.replace('/', " \\ ").replace(' ', "")
    dutch_GTFS_loc = dutch_GTFS_loc.replace('/', " \\ ").replace(' ', "")
    swiss_GTFS_loc = swiss_GTFS_loc.replace('/', " \\ ").replace(' ', "")

    stops_series_loc = stops_series_loc.replace('/', " \\ ").replace(' ', "")
    stops_cleaned_loc = stops_cleaned_loc.replace('/', " \\ ").replace(' ', "")
    df_for_edges_loc = df_for_edges_loc.replace('/', " \\ ").replace(' ', "")
    routes_loc = routes_loc.replace('/', " \\ ").replace(' ', "")

'Get the other folder locations'

# Import of the datasets

## Functions

In [8]:
'''Import all the DataFrames that are common for the three train networks (except stop_times)'''

def common_imports(datalink):
    #To import the agency dataset that contains limited information about the railway agency.
    agency = pd.read_csv(datalink + "agency.txt", sep=",")
    #To import the calendar_dates dataset that gives for each service_id all the exact dates when that service_id is valid.
    calendar_dates = pd.read_csv(datalink + "calendar_dates.txt", sep=",")
    #To import the routes dataset that provides the id, the name and the type of vehicle used for all railway routes.
    routes = pd.read_csv(datalink + "routes.txt", sep=",")
    #To import the stops dataset that contains information about the ids, the names and the geographical coordinates of the railway stations.
    stops = pd.read_csv(datalink + "stops.txt", sep=",")
    #To import the transfers dataset that gives the minimum transfer time to switch routes at each railway station.
    transfers = pd.read_csv(datalink + "transfers.txt", sep=",")
    #To import the trips dataset that gives for all routes an overview of the trips and the headsigns of these trips belonging to the railway route.
    #The service_id is an indication of all the dates this trip is valid (consultable in the calendar_dates dataset).
    trips = pd.read_csv(datalink + "trips.txt", sep=",")
    return agency, calendar_dates, routes, stops, transfers, trips

'Import all the DataFrames that are common for the three train networks (except stop_times)'

## Actual imports

### Belgium

In [9]:
'''Apply common_import()'''
agency_Belgium, calendar_dates_Belgium, routes_Belgium, stops_Belgium, transfers_Belgium, trips_Belgium = common_imports(belgian_GTFS_loc)
agency_Belgium
calendar_dates_Belgium
routes_Belgium
stops_Belgium
trips_Belgium

'Apply common_import()'

Unnamed: 0,agency_id,agency_name,agency_url,agency_timezone,agency_lang,agency_phone
0,NMBS/SNCB,NMBS/SNCB,http://www.belgiantrain.be/,Europe/Brussels,fr,


Unnamed: 0,service_id,date,exception_type
0,1,20210314,1
1,2,20210315,1
2,2,20210316,1
3,2,20210317,1
4,2,20210318,1
...,...,...,...
487715,0,20211207,1
487716,0,20211208,1
487717,0,20211209,1
487718,0,20211210,1


Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,1,NMBS/SNCB,BUS,Lichtervelde -- La Panne,,700,,,
1,10,NMBS/SNCB,BUS,Charleroi-Sud -- Tamines,,700,,,
2,100,NMBS/SNCB,BUS,Courtrai -- Roulers,,700,,,
3,101,NMBS/SNCB,BUS,Mons -- La Louvière-Sud,,700,,,
4,102,NMBS/SNCB,BUS,Lierre -- Aarschot,,700,,,
...,...,...,...,...,...,...,...,...,...
729,95,NMBS/SNCB,BUS,Bruxelles-Midi -- Nivelles,,700,,,
730,96,NMBS/SNCB,BUS,Bertrix -- Virton,,700,,,
731,97,NMBS/SNCB,BUS,Audenarde -- Renaix,,700,,,
732,98,NMBS/SNCB,BUS,Mons -- Jurbise,,700,,,


Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,platform_code
0,8015345,,Aachen Hbf (DE),,50.77083,6.105277,,,0,,
1,8200100,,Luxembourg (LU),,49.60000,6.133333,,,0,,
2,8200101,,Dommeldange (LU),,49.63390,6.136765,,,0,,
3,8200102,,Pfaffenthal-Kirchberg (LU),,49.61913,6.132853,,,0,,
4,8200110,,Mersch (LU),,49.74889,6.106111,,,0,,
...,...,...,...,...,...,...,...,...,...,...,...
2638,8896909,,Izegem,,50.92115,3.212089,,,0,S8896909,
2639,S8896925,,Ingelmunster,,50.91433,3.255416,,,1,,
2640,8896925_1,,Ingelmunster,,50.91433,3.255416,,,0,S8896925,1
2641,8896925_2,,Ingelmunster,,50.91433,3.255416,,,0,S8896925,2


Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,trip_type
0,1,1,88____:049::8892338:8892205:6:1925:20210314,Lichtervelde,11000,,1,,1
1,2,2,88____:049::8883006:8881000:8:2446:20210319,Mons,11000,,2,,1
2,3,3,82____:049::8200100:8869054:2:614:20210321,Arlon,11002,,3,,1
3,3,3,88____:049::8869054:8866001:2:624:20210321,Arlon,11002,,3,,1
4,3,3,82____:049::8200100:8869054:2:714:20210321,Arlon,11003,,4,,1
...,...,...,...,...,...,...,...,...,...
30831,731,598,88____:046::8865003:8864345:6:1730:20211211,Marloie,987,,25450,,1
30832,731,212,88____:046::8865003:8864345:6:1830:20211210,Marloie,988,,25451,,1
30833,731,598,88____:046::8865003:8864345:6:1930:20211211,Marloie,989,,25452,,1
30834,731,212,88____:046::8865003:8864345:6:2030:20211210,Marloie,990,,25453,,1


In [10]:
'''Import other DataFrames'''
#To import the translations dataset that provides the French-, Dutch-, German- and English-language translations of the Belgian railway stations.
translations_Belgium = pd.read_csv(belgian_GTFS_loc + "translations.txt", sep=",")
#To import the stop_times dataset that gives for all trips an overview of the ids of the stations served and the sequence in which these stations are served. 
#In addition, for all the trips the arrival and departure times at the stations served are given.
stop_times_Belgium = pd.read_csv(belgian_GTFS_loc + "stop_times.txt", sep=",")
#To import the calendar dataset that gives the first and last date of all data observations.
calendar_Belgium = pd.read_csv(belgian_GTFS_loc + "calendar.txt", sep=",")
#To import the stop_time_overrides dataset 
stop_time_overrides_Belgium = pd.read_csv(belgian_GTFS_loc + "stop_time_overrides.txt", sep=",")

stop_times_Belgium

'Import other DataFrames'

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
0,88____:049::8892338:8892205:6:1925:20210314,17:35:00,17:35:00,8892338,1,,0,1,
1,88____:049::8892338:8892205:6:1925:20210314,17:50:00,17:50:00,8892320,2,,0,0,
2,88____:049::8892338:8892205:6:1925:20210314,18:05:00,18:05:00,8892304,3,,0,0,
3,88____:049::8892338:8892205:6:1925:20210314,18:35:00,18:35:00,8892452,4,,0,0,
4,88____:049::8892338:8892205:6:1925:20210314,19:05:00,19:05:00,8892403,5,,0,0,
...,...,...,...,...,...,...,...,...,...
460669,88____:046::8865003:8864345:6:2130:20211211,21:06:00,21:06:00,8865227,2,,0,0,
460670,88____:046::8865003:8864345:6:2130:20211211,21:16:00,21:16:00,8864337,3,,0,0,
460671,88____:046::8865003:8864345:6:2130:20211211,21:21:00,21:21:00,8864311,4,,0,0,
460672,88____:046::8865003:8864345:6:2130:20211211,21:24:00,21:25:00,8864006,5,,0,0,


### Netherlands

In [11]:
'''Apply common_import()'''
agency_Netherlands, calendar_dates_Netherlands, routes_Netherlands, stops_Netherlands, transfers_not_cleaned_Netherlands, trips_Netherlands = common_imports(dutch_GTFS_loc)
agency_Netherlands
calendar_dates_Netherlands
routes_Netherlands
stops_Netherlands
trips_Netherlands

'Apply common_import()'

Unnamed: 0,agency_id,agency_name,agency_url,agency_timezone,agency_phone
0,ALLGO,allGo (Keolis),http://www.allgobus.nl,Europe/Amsterdam,003188-0331360
1,ARR,Arriva,https://www.arriva.nl,Europe/Amsterdam,0031900-2022022
2,BRAVO:ARR,Bravo (Arriva),http://www.bravo.info,Europe/Amsterdam,0031800-0232545
3,BRAVO:CXX,Bravo (Hermes),http://www.bravo.info,Europe/Amsterdam,0031800-0222277
4,BRENG,Breng,http://www.breng.nl,Europe/Amsterdam,003126-2142140
5,CXX,Connexxion,http://www.connexxion.nl,Europe/Amsterdam,0031900-2666399
6,DELIJN,De Lijn,http://www.delijn.be,Europe/Amsterdam,0031900-7289965
7,EBS,EBS,http://www.ebs-ov.nl,Europe/Amsterdam,0031800-0327
8,FF,Westerschelde Ferry,http://www.westerscheldeferry.nl,Europe/Amsterdam,003188-0760400
9,GVB,GVB,http://www.gvb.nl,Europe/Amsterdam,0031900-8011


Unnamed: 0,service_id,date,exception_type
0,1,20210314,1
1,2,20210314,1
2,2,20210412,1
3,2,20210419,1
4,2,20210426,1
...,...,...,...
181865,4090,20210802,1
181866,4090,20210803,1
181867,4090,20210804,1
181868,4090,20210805,1


Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_color,route_text_color,route_url
0,75937,ARR,2,Brunssum/Hoensbroek via Heerlen - Bleijerheide,,3,,,
1,74020,RET,574,STOPenGO Lansingerland,,3,,,
2,74018,RET,526,STOPenGO Maassluis,,3,,,
3,73358,BRENG,78,Papendal - Beekdal Lyceum,,3,,,http://www.breng.nl/dienstregeling/lijn?ID=A078
4,67306,BRENG,352,Wageningen Station - Arnhem CS,,3,,,http://www.breng.nl/dienstregeling/lijn?ID=A352
...,...,...,...,...,...,...,...,...,...
2391,62989,ARR,6381,"Nijetrijne - Wolvega, Station",,3,,,
2392,62990,ARR,6371,"Nijelamer - Wolvega, Station",,3,,,
2393,62991,ARR,7133,Jislum - Oentsjerk,,3,,,
2394,62992,ARR,7152,"Burdaard - Dokkum, Sionsberg",,3,,,


Unnamed: 0,stop_id,stop_code,stop_name,stop_lat,stop_lon,location_type,parent_station,stop_timezone,wheelchair_boarding,platform_code,zone_id
0,2343279,61400100,"Zetten, Hoofdstraat",51.931488,5.711353,0,stoparea:372303,,1.0,,
1,2324685,,Warffum,53.390191,6.566536,0,stoparea:17893,,,,IFF:wfm
2,2324787,,Zutphen,52.145054,6.195361,0,stoparea:18004,,,1b,IFF:zp
3,2323270,,Amsterdam Sloterdijk,52.388946,4.838405,0,stoparea:18177,,,6,IFF:ass
4,2385461,HA6030,"Onbekend, Zwembad Krimpen",51.910786,4.593716,0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
61569,stoparea:195075,,"Weert, Bassin",51.257289,5.706368,1,,,0.0,,
61570,stoparea:32953,,"Amsterdam, Artis",52.366650,4.911110,1,,,0.0,,
61571,stoparea:2612,,"Amsterdam, Lutmastraat",52.351663,4.903220,1,,,0.0,,
61572,stoparea:1133,,"IJlst, Station",53.014301,5.615621,1,,,0.0,,


Unnamed: 0,route_id,service_id,trip_id,realtime_trip_id,trip_headsign,trip_short_name,trip_long_name,direction_id,block_id,shape_id,wheelchair_accessible,bikes_allowed
0,60662,1,127986932,GVB:1:203,Osdorp De Aker,203.0,,1,,954059.0,1,
1,60662,1,127986896,GVB:1:131,Osdorp De Aker,131.0,,1,,954059.0,1,
2,60662,1,127986851,GVB:1:42,Muiderpoortstation,42.0,,0,,936112.0,1,
3,60662,1,127986878,GVB:1:95,Osdorp De Aker,95.0,,1,,954059.0,1,
4,60662,1,127986921,GVB:1:181,Muiderpoortstation,181.0,,0,,936112.0,1,
...,...,...,...,...,...,...,...,...,...,...,...,...
622313,67301,4088,128464524,CXX:L402:4047,Eindhoven Station,4047.0,,0,,934473.0,1,
622314,67301,4088,128464526,CXX:L402:4051,Eindhoven Station,4051.0,,0,,934473.0,1,
622315,67301,4088,128464528,CXX:L402:4055,Eindhoven Station,4055.0,,0,,934473.0,1,
622316,67301,4088,128464529,CXX:L402:4056,Veldhoven via PR Meerhoven,4056.0,,1,,899806.0,1,


In [12]:
'''Import other DataFrames'''
#To import the feed_info dataset that contains limited information about the Dutch NS railway feed.
feed_info_Netherlands = pd.read_csv(dutch_GTFS_loc + "feed_info.txt", sep=",")
#To import the stop_times dataset that gives for all trips an overview of the ids of the stations served and the sequence in which these stations are served. 
#In addition, for all the trips the arrival and departure times at the stations served are given.
stop_times_range = [*range(2, 19)]
stop_times_Netherlands = pd.read_csv(dutch_GTFS_loc + "stop_times-1.csv", sep=",")
for index in stop_times_range:
    stop_times_Netherlands = pd.concat([stop_times_Netherlands, pd.read_csv(dutch_GTFS_loc + "stop_times-" + str(index)+ ".csv", sep=",")])

stop_times_Netherlands

'Import other DataFrames'

Unnamed: 0,trip_id,stop_sequence,stop_id,stop_headsign,arrival_time,departure_time,pickup_type,drop_off_type,timepoint,shape_dist_traveled,fare_units_traveled
0,127986896,24,15250,,13:36:00,13:36:00,1,0,0.0,12272.0,12272.0
1,127986896,11,15131,,13:15:24,13:15:42,0,0,0.0,5407.0,5407.0
2,127986896,8,15661,,13:09:12,13:09:30,0,0,0.0,3632.0,3632.0
3,127986896,6,14784,,13:05:36,13:05:54,0,0,0.0,2855.0,2855.0
4,127986896,4,15774,,13:00:38,13:00:56,0,0,0.0,1704.0,1704.0
...,...,...,...,...,...,...,...,...,...,...,...
507727,128464528,16,537637,,16:02:00,16:02:00,0,0,0.0,9600.0,9903.0
507728,128464529,14,1420429,Veldhoven Sondervick,15:28:00,15:28:00,0,0,0.0,7745.0,8044.0
507729,128464529,3,939748,,15:12:00,15:12:00,0,0,0.0,1146.0,1168.0
507730,128464537,14,1420429,Veldhoven Sondervick,17:28:00,17:28:00,0,0,0.0,7745.0,8044.0


### Switzerland

In [13]:
'''Apply common_import()'''
agency_Switzerland, calendar_dates_Switzerland, routes_Switzerland, stops_Switzerland, transfers_not_cleaned_Switzerland, trips_Switzerland = common_imports(swiss_GTFS_loc)
agency_Switzerland
calendar_dates_Switzerland
routes_Switzerland
stops_Switzerland
trips_Switzerland

'Apply common_import()'

Unnamed: 0,agency_id,agency_name,agency_url,agency_timezone,agency_lang,agency_phone
0,000038,ASM-bti (Aare Seeland mobil (bti)),http://www.sbb.ch,Europe/Berlin,,
1,000056,ASM-rvo (Aare Seeland mobil (rvo)),http://www.sbb.ch,Europe/Berlin,,
2,000081,ASM-snb (Aare Seeland mobil (snb)),http://www.sbb.ch,Europe/Berlin,,
3,000031,AVA-bd (Aargau Verkehr AG),http://www.sbb.ch,Europe/Berlin,,
4,000096,AVA-wsb (Aargau Verkehr AG),http://www.sbb.ch,Europe/Berlin,,
...,...,...,...,...,...,...
57,000053,TPF (Transports publics fribourgeois),http://www.sbb.ch,Europe/Berlin,,
58,327000,TN (TreNord),http://www.sbb.ch,Europe/Berlin,,
59,009014,VDBB (Verein Dampfbahn Bern),http://www.sbb.ch,Europe/Berlin,,
60,000157,WAB (Wengernalpbahn),http://www.sbb.ch,Europe/Berlin,,


Unnamed: 0,service_id,date,exception_type
0,5,20201214,1
1,5,20201215,1
2,5,20201216,1
3,5,20201217,1
4,5,20201218,1
...,...,...,...
1536338,12559,20211208,1
1536339,12559,20211209,1
1536340,12559,20211210,1
1536341,12560,20210603,1


Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,04236.06____.020:4236,06____,RE,RE 4236,,2,,,
1,04700.06____.001:4700,06____,RE,RE 4700,,2,,,
2,04700.06____.014:4700,06____,RE,RE 4700,,2,,,
3,04701.06____.002:4701,06____,RE,RE 4701,,2,,,
4,04701.06____.015:4701,06____,RE,RE 4701,,2,,,
...,...,...,...,...,...,...,...,...,...
49346,87945.L7____.001:5,L7____,S5,S 5,,2,,FFEA00,000000
49347,87946.L7____.001:5,L7____,S5,S 5,,2,,FFEA00,000000
49348,87947.L7____.001:5,L7____,S5,S 5,,2,,FFEA00,000000
49349,87948.L7____.001:5,L7____,S5,S 5,,2,,FFEA00,000000


Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,stop_elevation,zone_id,stop_url,location_type,parent_station,platform_code,ch_station_long_name,ch_station_synonym1,ch_station_synonym2,ch_station_synonym3,ch_station_synonym4
0,0000132,,Bahn-2000-Strecke,,47.196374,7.689360,0,,,0,,,Bahn-2000-Strecke,,,,
1,0000133,,Centovalli,,46.154371,8.603653,0,,,0,,,Centovalli,,,,
2,0000134,,Furka,,46.538322,8.435913,0,,,0,,,Furka,,,,
3,0000135,,Lötschberg-Basistunnel,,46.356888,7.773846,0,,,0,,,Lötschberg-Basistunnel,,,,
4,0000136,,Lötschberg-Bergstrecke,,46.433756,7.717215,0,,,0,,,Lötschberg-Bergstrecke,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6371,8083840:,,Schopfheim-West,,47.645541,7.801715,0,,,0,8083840.0,,,,,,
6372,8021703:,SSCH,Schopfheim-Schlattholz,,47.653802,7.833793,382,,,0,8021703.0,,,,,,
6373,8014448:,FN,Fahrnau,,47.661560,7.839425,374,,,0,8014448.0,,,,,,
6374,8014449:,HSN,Hausen-Raitbach,,47.679310,7.846041,400,,,0,8014449.0,,,,,,


Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,bikes_allowed,attributes_ch
0,04236.06____.020:4236,19311,0:1,Karlsruhe Hbf,4236,,,,0,MO
1,04700.06____.001:4700,133763,1:1,Karlsruhe Hbf,4700,,,,0,MO
2,04700.06____.001:4700,1,1:2,Karlsruhe Hbf,4700,,,,0,MO
3,04700.06____.001:4700,13336,1:3,Karlsruhe Hbf,4700,,,,0,MO
4,04700.06____.001:4700,2610,1:4,Karlsruhe Hbf,4700,,,,0,MO
...,...,...,...,...,...,...,...,...,...,...
90223,87947.L7____.001:5,2364,49348:1,Weil am Rhein,5,,,,0,MO
90224,87948.L7____.001:5,56518,49349:1,Lörrach Hbf,5,,,,0,MO
90225,87948.L7____.001:5,17102,49349:2,Zell (Wiesental),5,,,,0,MO
90226,87949.L7____.001:5,17102,49350:1,Weil am Rhein,5,,,,0,MO


In [14]:
'''Import other DataFrames'''
#To import the feed_info dataset that contains limited information about the Swiss SBB railway feed.
feed_info_Switzerland = pd.read_csv(swiss_GTFS_loc + "feed_info.txt", sep=",")
#To import the stop_times dataset that gives for all trips an overview of the ids of the stations served and the sequence in which these stations are served. 
#In addition, for all the trips the arrival and departure times at the stations served are given.
stop_times_Switzerland = pd.read_csv(swiss_GTFS_loc + "stop_times.txt", sep=",")
#To import the calendar dataset that gives the first and last date of all data observations.
calendar_Switzerland = pd.read_csv(swiss_GTFS_loc + "calendar.txt", sep=",")

stop_times_Switzerland

'Import other DataFrames'

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,attributes_ch
0,0:1,18:16:00,18:16:00,8014554:1,0,,0,0,,
1,0:1,18:28:00,18:30:00,8014534:3,1,,0,0,,
2,0:1,18:40:00,18:40:00,8014529:1,2,,0,0,,
3,0:1,18:49:00,18:50:00,8014521:2,3,,0,0,,
4,0:1,18:58:00,18:59:00,8014518:1,4,,0,0,,
...,...,...,...,...,...,...,...,...,...,...
1034556,49350:2,10:38:00,10:38:00,8069220:1,3,,3,3,,X
1034557,49350:2,10:40:00,10:40:00,8014429:1,4,,3,3,,X
1034558,49350:2,10:41:00,10:42:00,8060979:1,5,,3,3,,X
1034559,49350:2,10:43:00,10:43:00,8060978:1,6,,3,3,,X


# Cleaning of the railway data

## Functions 

In [15]:
'''Clean the calendar_dates DataFrame'''

def clean_calendar_dates(calendar_dates):
    #To filter the dates from the selected begin to the end date
    begin_date = 20210314
    end_date = 20210713
    calendar_dates_cleaned = calendar_dates.copy()
    calendar_dates_cleaned = calendar_dates_cleaned.drop(calendar_dates_cleaned[(calendar_dates_cleaned['date'] > end_date) | (calendar_dates_cleaned['date'] < begin_date)].index)
    return calendar_dates_cleaned

'Clean the calendar_dates DataFrame'

In [16]:
'''Add the country to the stops DataFrame and returns the country filtered DataFrame of stops and the serie of those stops'''

def country_information(stops, country_name, stops_cleaned_loc, stops_series_loc):
    #To initialize the Nominatim API to get the location from the input string 
    geolocator = Nominatim(user_agent="application")
    reverse = RateLimiter(geolocator.reverse, min_delay_seconds=0.2)

    #To get the location with the geolocator.reverse() function and to extract the country from the location instance
    country_list = []
    for index, row in stops.iterrows():
        latitude = row['stop_lat']
        longitude = row['stop_lon']
        # To assign the latitude and longitude into a geolocator.reverse() method
        location = reverse((latitude, longitude), language='en', exactly_one=True)
        # To get the country from the given list and parsed into a dictionary with raw function()
        address = location.raw['address']
        country = address.get('country', '')
        country_list.append(country)

    #To add the values of country_list as a new attribute country 
    stops.loc[:,'country'] = country_list

    #To calculate the total number of Belgian stations in the stops dataset
    country_stops = stops[stops['country'] == country_name]
    country_stops_series = stops.loc[stops['country'] == country_name, 'stop_name']
    
    stops.to_csv(f'{stops_cleaned_loc}stops_cleaned_{country_name}.csv')
    country_stops_series.to_csv(f'{stops_series_loc}stops_{country_name}_series.csv')

'Add the country to the stops DataFrame and returns the country filtered DataFrame of stops and the serie of those stops'

In [17]:
'''Remove the accents from a string'''

def remove_accents(text):
    import unicodedata
    try:
        text = unicode(text, 'utf-8')
    except NameError:
        pass
    text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode("utf-8")
    return str(text)

'Remove the accents from a string'

## Actual cleaning

### Belgium

In [18]:
'''Clean the routes_Belgium df'''
allowed_route_type = {'IC', 'L', 'P', 'ICT', 'IZY'}
routes_cleaned_Belgium = routes_Belgium[(routes_Belgium['route_short_name'].isin(allowed_route_type)) | (routes_Belgium['route_short_name'].str.startswith('S'))]
routes_cleaned_Belgium

'Clean the routes_Belgium df'

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
18,115,NMBS/SNCB,IC,Tournai -- Mouscron,,103,,,
19,116,NMBS/SNCB,IC,Bruges -- Knokke,,103,,,
20,117,NMBS/SNCB,L,Verviers-Central -- Spa-Geronstère,,100,,,
21,118,NMBS/SNCB,IC,Knokke -- Gand-Saint-Pierre,,103,,,
22,119,NMBS/SNCB,L,Grammont -- Denderleeuw,,100,,,
...,...,...,...,...,...,...,...,...,...
701,730,NMBS/SNCB,L,Haversin -- Libramont,,100,,,
702,731,NMBS/SNCB,L,Marloie -- Libramont,,100,,,
703,732,NMBS/SNCB,IZY,Paris Nord (FR) -- Bruxelles-Midi,,101,,,
704,733,NMBS/SNCB,IC,Den Haag HS (NL) -- Bruxelles-Midi,,103,,,


In [19]:
'''Apply clean_calendar_dates()'''
calendar_dates_cleaned_Belgium = clean_calendar_dates(calendar_dates_Belgium)
calendar_dates_cleaned_Belgium

'Apply clean_calendar_dates()'

Unnamed: 0,service_id,date,exception_type
0,1,20210314,1
1,2,20210315,1
2,2,20210316,1
3,2,20210317,1
4,2,20210318,1
...,...,...,...
487564,0,20210709,1
487565,0,20210710,1
487566,0,20210711,1
487567,0,20210712,1


In [20]:
'''Clean the stops_Belgium df''' 
#To eliminate the stop_ids in the stops dataset that contain an underscore or that start with a character 'S'. 
stops_cleaned_Belgium = stops_Belgium[(~stops_Belgium['stop_id'].str.contains('_')) & (~stops_Belgium['stop_id'].str.contains('S'))]

#To modify the object datatype of the stop_id column to the NumPy int64 datatype
stops_cleaned_Belgium.loc[:,'stop_id'] = stops_cleaned_Belgium.loc[:,'stop_id'].astype(np.int64)

#To remove the accents from the stop_name and to change to uppercase
stops_cleaned_Belgium.loc[:,'stop_name'] = stops_cleaned_Belgium.loc[:,'stop_name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
stops_cleaned_Belgium.loc[:,'stop_name'] = stops_cleaned_Belgium.loc[:,'stop_name'].str.upper()

'Clean the stops_Belgium df'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [21]:
'''Apply country_information() and take the DataFrames from the files'''
country_name = 'Belgium'
#THE FOLLOWING LINE MIGHT BE RUNNED IF WANTED TO RECALCULATE THE COUNTRIES, BUT IT TAKES A LONG TIME
#country_information(stops_cleaned_Belgium, country_name, stops_cleaned_loc, stops_series_loc)
stops_cleaned_Belgium = pd.read_csv(f"{stops_cleaned_loc}stops_cleaned_{country_name}.csv", sep=",")
stops_Belgium_series = pd.read_csv(f"{stops_series_loc}stops_{country_name}_series.csv", sep=",")['stop_name']
stops_cleaned_Belgium.iloc[:,1:]

'Apply country_information() and take the DataFrames from the files'

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,platform_code,country
0,8015345,,AACHEN HBF (DE),,50.77083,6.105277,,,0,,,Germany
1,8200100,,LUXEMBOURG (LU),,49.60000,6.133333,,,0,,,Luxembourg
2,8200101,,DOMMELDANGE (LU),,49.63390,6.136765,,,0,,,Luxembourg
3,8200102,,PFAFFENTHAL-KIRCHBERG (LU),,49.61913,6.132853,,,0,,,Luxembourg
4,8200110,,MERSCH (LU),,49.74889,6.106111,,,0,,,Luxembourg
...,...,...,...,...,...,...,...,...,...,...,...,...
603,8896503,,YPRES,,50.84740,2.876590,,,0,S8896503,,Belgium
604,8896735,,POPERINGE,,50.85445,2.736345,,,0,S8896735,,Belgium
605,8896800,,ROULERS,,50.94903,3.130415,,,0,S8896800,,Belgium
606,8896909,,IZEGEM,,50.92115,3.212089,,,0,S8896909,,Belgium


### Netherlands

In [22]:
'''Clean the routes_Netherlands DataFrame'''
#To keep the train routes
routes_cleaned_Netherlands = routes_Netherlands[routes_Netherlands['route_type'] == 2]
routes_cleaned_Netherlands = routes_cleaned_Netherlands.astype(str)

#To change the route_id object datatype to a NumPy int64 datatype
routes_cleaned_Netherlands.loc[:,'route_id'] = routes_cleaned_Netherlands.loc[:,'route_id'].astype(np.int64)
routes_cleaned_Netherlands

'Clean the routes_Netherlands DataFrame'

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_color,route_text_color,route_url
11,67394,IFF:NS,Intercity,Den Haag Centraal <-> Eindhoven Centraal IC1100,,2,,,
13,67395,IFF:NS,Intercity,Nachtnettrein Rotterdam Centraal <-> Eindhoven...,,2,,,
17,67399,IFF:NS,Intercity,Den Haag Centraal <-> Amersfoort Centraal IC2000,,2,,,
19,67400,IFF:NS,Intercity,Eindhoven Centraal <-> Venlo IC13500,,2,,,
20,67402,IFF:NS,Intercity,Nachtnettrein Utrecht Centraal <-> Rotterdam C...,,2,,,
...,...,...,...,...,...,...,...,...,...
2332,41238,IFF:NS,Intercity,Lelystad Centrum <-> Dordrecht IC2400,,2,,,
2333,41239,IFF:NS,Sprinter,Den Haag Centraal <-> Dordrecht SPR5100,,2,,,
2337,18752,IFF:NS,Intercity,Nachtnettrein Utrecht Centraal <-> Nijmegen,,2,,,
2338,17828,IFF:NS,Sprinter,Haarlem <-> Den Haag Centraal SPR6300,,2,,,


In [23]:
'''Apply clean_calendar_dates()'''
calendar_dates_cleaned_Netherlands = clean_calendar_dates(calendar_dates_Netherlands)
calendar_dates_cleaned_Netherlands

'Apply clean_calendar_dates()'

Unnamed: 0,service_id,date,exception_type
0,1,20210314,1
1,2,20210314,1
2,2,20210412,1
3,2,20210419,1
4,2,20210426,1
...,...,...,...
181776,4074,20210713,1
181779,4075,20210712,1
181780,4075,20210713,1
181784,4076,20210712,1


In [24]:
'''Clean the stops DataFrame'''
#To take from the stops_initial_Netherlands df all stop_ids that contain a 'stoparea:' to get the correct stop coordinates
stops_cleaned_Netherlands = stops_Netherlands[stops_Netherlands['stop_id'].str.contains('stoparea:')]

#To remove the accents from the accented characters and to convert the remaining characters to uppercase characters
stops_cleaned_Netherlands.loc[:,'stop_name'] = stops_cleaned_Netherlands.loc[:,'stop_name'].apply(remove_accents)
stops_cleaned_Netherlands.loc[:,'stop_name'] = stops_cleaned_Netherlands.loc[:,'stop_name'].str.upper()

'Clean the stops DataFrame'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [25]:
'''Apply country_information() and take the DataFrames from the files'''
country_name = 'Netherlands'
#THE FOLLOWING LINE MIGHT BE RUNNED IF WANTED TO RECALCULATE THE COUNTRIES, BUT IT TAKES A LONG TIME
#country_information(stops_cleaned_Netherlands, country_name, stops_cleaned_loc, stops_series_loc)
stops_cleaned_Netherlands = pd.read_csv(f"{stops_cleaned_loc}stops_cleaned_{country_name}.csv", sep=",")
stops_Netherlands_series = pd.read_csv(f"{stops_series_loc}stops_{country_name}_series.csv", sep=",")['stop_name']
stops_cleaned_Netherlands

'Apply country_information() and take the DataFrames from the files'

Unnamed: 0,stop_id,stop_code,stop_name,stop_lat,stop_lon,location_type,parent_station,stop_timezone,wheelchair_boarding,platform_code,zone_id,country
0,stoparea:37036,lillee,LILLE EUROPE,50.639444,3.075000,1,,Europe/Amsterdam,0.0,,,France
1,stoparea:18059,gerp,GRONINGEN EUROPAPARK,53.204708,6.585811,1,,Europe/Amsterdam,0.0,,,Netherlands
2,stoparea:111951,buende,BUNDE (WESTF.),52.202170,8.574140,1,,Europe/Amsterdam,0.0,,,Germany
3,stoparea:345453,amf,AMERSFOORT CENTRAAL,52.153418,5.373988,1,,Europe/Amsterdam,0.0,,,Netherlands
4,stoparea:42566,ehw,ROSENDAHL-HOLTWICK,51.998474,7.122610,1,,Europe/Amsterdam,0.0,,,Germany
...,...,...,...,...,...,...,...,...,...,...,...,...
524,stoparea:226115,esn,ESSEN (B),51.462691,4.451378,1,,Europe/Amsterdam,0.0,,,Belgium
525,stoparea:226117,eenp,ENNEPETAL,51.304434,7.343200,1,,Europe/Amsterdam,0.0,,,Germany
526,stoparea:226121,kkd,KOLN MESSE/DEUTZ,50.940906,6.974600,1,,Europe/Amsterdam,0.0,,,Germany
527,stoparea:226123,minden,MINDEN (WESTF),52.290439,8.934465,1,,Europe/Amsterdam,0.0,,,Germany


In [26]:
'''Clean the stop_times df'''
stop_times_cleaned_Netherlands = stop_times_Netherlands.copy()
stop_times_cleaned_Netherlands.loc[:,'stop_id'] = stop_times_cleaned_Netherlands.stop_id.apply(str)
stop_times_cleaned_Netherlands = pd.merge(stop_times_cleaned_Netherlands, stops_Netherlands[['stop_id', 'stop_name']], on='stop_id')
stop_times_cleaned_Netherlands.loc[:,'stop_name'] = stop_times_cleaned_Netherlands.loc[:,'stop_name'].apply(remove_accents)
stop_times_cleaned_Netherlands.loc[:,'stop_name'] = stop_times_cleaned_Netherlands.loc[:,'stop_name'].str.upper()
stop_times_cleaned_Netherlands

'Clean the stop_times df'

Unnamed: 0,trip_id,stop_sequence,stop_id,stop_headsign,arrival_time,departure_time,pickup_type,drop_off_type,timepoint,shape_dist_traveled,fare_units_traveled,stop_name
0,127986896,24,15250,,13:36:00,13:36:00,1,0,0.0,12272.0,12272.0,"AMSTERDAM, MATTERHORN"
1,127986859,24,15250,,10:01:00,10:01:00,1,0,0.0,12272.0,12272.0,"AMSTERDAM, MATTERHORN"
2,127986890,24,15250,,13:06:00,13:06:00,1,0,0.0,12272.0,12272.0,"AMSTERDAM, MATTERHORN"
3,127986998,24,15250,,22:05:00,22:05:00,1,0,0.0,12272.0,12272.0,"AMSTERDAM, MATTERHORN"
4,127986861,24,15250,,10:32:00,10:32:00,1,0,0.0,12272.0,12272.0,"AMSTERDAM, MATTERHORN"
...,...,...,...,...,...,...,...,...,...,...,...,...
10945566,130566937,28,2383583,,09:32:00,09:32:00,0,0,0.0,,16928.0,"HEES, HEESSTRAAT"
10945567,130566801,26,2383715,,09:31:00,09:31:00,0,0,0.0,,16085.0,"HEES, TOMBESTRAAT"
10945568,130566937,26,2383715,,09:31:00,09:31:00,0,0,0.0,,16085.0,"HEES, TOMBESTRAAT"
10945569,130566801,3,2383782,,08:56:00,08:56:00,0,0,0.0,,972.0,"SCHOONBEEK, APPELVELDSTRAAT"


### Switzerland

In [27]:
'''Clean the routes_Switzerland DataFrame'''
#To keep the train routes
routes_cleaned_Switzerland = routes_Switzerland[routes_Switzerland['route_type'] == 2]
routes_cleaned_Switzerland

'Clean the routes_Switzerland DataFrame'

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,04236.06____.020:4236,06____,RE,RE 4236,,2,,,
1,04700.06____.001:4700,06____,RE,RE 4700,,2,,,
2,04700.06____.014:4700,06____,RE,RE 4700,,2,,,
3,04701.06____.002:4701,06____,RE,RE 4701,,2,,,
4,04701.06____.015:4701,06____,RE,RE 4701,,2,,,
...,...,...,...,...,...,...,...,...,...
49346,87945.L7____.001:5,L7____,S5,S 5,,2,,FFEA00,000000
49347,87946.L7____.001:5,L7____,S5,S 5,,2,,FFEA00,000000
49348,87947.L7____.001:5,L7____,S5,S 5,,2,,FFEA00,000000
49349,87948.L7____.001:5,L7____,S5,S 5,,2,,FFEA00,000000


In [28]:
'''Apply clean_calendar_dates()'''
calendar_dates_cleaned_Switzerland = clean_calendar_dates(calendar_dates_Switzerland)
calendar_dates_cleaned_Switzerland

'Apply clean_calendar_dates()'

Unnamed: 0,service_id,date,exception_type
63,5,20210315,1
64,5,20210316,1
65,5,20210317,1
66,5,20210318,1
67,5,20210319,1
...,...,...,...
1536188,12559,20210711,1
1536189,12559,20210712,1
1536190,12559,20210713,1
1536341,12560,20210603,1


In [29]:
'''Clean the stop_times_Switzerland DataFrame'''
# To remove the superfluous characters of the stop_id (platform codes)
stop_times_cleaned_Switzerland = stop_times_Switzerland.copy()
stop_times_cleaned_Switzerland_column = stop_times_cleaned_Switzerland['stop_id'].str.split(':').str[0]
stop_times_cleaned_Switzerland.loc[:,'stop_id'] = stop_times_cleaned_Switzerland_column

# To make the stop_ids numerical 
stop_times_cleaned_Switzerland.loc[:,'stop_id'] = stop_times_cleaned_Switzerland.loc[:,'stop_id'].astype(np.int64)
stop_times_cleaned_Switzerland

'Clean the stop_times_Switzerland DataFrame'

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,attributes_ch
0,0:1,18:16:00,18:16:00,8014554,0,,0,0,,
1,0:1,18:28:00,18:30:00,8014534,1,,0,0,,
2,0:1,18:40:00,18:40:00,8014529,2,,0,0,,
3,0:1,18:49:00,18:50:00,8014521,3,,0,0,,
4,0:1,18:58:00,18:59:00,8014518,4,,0,0,,
...,...,...,...,...,...,...,...,...,...,...
1034556,49350:2,10:38:00,10:38:00,8069220,3,,3,3,,X
1034557,49350:2,10:40:00,10:40:00,8014429,4,,3,3,,X
1034558,49350:2,10:41:00,10:42:00,8060979,5,,3,3,,X
1034559,49350:2,10:43:00,10:43:00,8060978,6,,3,3,,X


In [30]:
'''Clean the stops_Switzerland DataFrame'''
#To remove the superfluous characters (platform codes)
stops_cleaned_Switzerland_column = stops_Switzerland['stop_id'].str.split(':').str[0]
stops_cleaned_Switzerland = stops_Switzerland.copy()
stops_cleaned_Switzerland.loc[:,'stop_id'] = stops_cleaned_Switzerland_column

#To make the stop_ids numerical and to remove the duplicate stop_ids
stops_cleaned_Switzerland = stops_cleaned_Switzerland[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']]
stops_cleaned_Switzerland.loc[:,'stop_id'] = stops_cleaned_Switzerland.loc[:,'stop_id'].astype(np.int64)
stops_cleaned_Switzerland = stops_cleaned_Switzerland.drop_duplicates()

#To remove the accents from the stop_name and to change to uppercase
stops_cleaned_Switzerland.loc[:,'stop_name'] = stops_cleaned_Switzerland.loc[:,'stop_name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
stops_cleaned_Switzerland.loc[:,'stop_name'] = stops_cleaned_Switzerland.loc[:,'stop_name'].str.upper()

'Clean the stops_Switzerland DataFrame'

In [31]:
'''Apply country_information() and take the DataFrames from the files'''
country_name = 'Switzerland'
#THE FOLLOWING LINE MIGHT BE RUNNED IF WANTED TO RECALCULATE THE COUNTRIES, BUT IT TAKES A LONG TIME
#country_information(stops_cleaned_Switzerland, country_name, stops_cleaned_loc, stops_series_loc)
stops_cleaned_Switzerland = pd.read_csv(f"{stops_cleaned_loc}stops_cleaned_{country_name}.csv", sep=",")
stops_Switzerland_series = pd.read_csv(f"{stops_series_loc}stops_{country_name}_series.csv", sep=",")['stop_name']
stops_cleaned_Switzerland

'Apply country_information() and take the DataFrames from the files'

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,country
0,8014554,ENGEN,47.856347,8.772786,Germany
1,8014534,IMMENDINGEN,47.936007,8.729536,Germany
2,8014529,DONAUESCHINGEN,47.947786,8.498919,Germany
3,8014521,VILLINGEN (SCHWARZW),48.058022,8.465261,Germany
4,8014518,ST GEORGEN (SCHWARZW),48.123813,8.341955,Germany
...,...,...,...,...,...
2603,8014448,FAHRNAU,47.661560,7.839425,Germany
2604,8014449,HAUSEN-RAITBACH,47.679310,7.846041,Germany
2605,8014450,ZELL (WIESENTAL),47.705626,7.849149,Germany
2606,8087021,RIEHEN NIEDERHOLZ,47.569473,7.633441,Switzerland


# Merge the DataFrames

## Functions

In [32]:
'''Merge the DataFrames'''

def merge_df(stop_times, stops, routes, trips, calendar_dates, on_stop):
    list_columns = ['stop_name', 'stop_lat', 'stop_lon', 'country']
    if on_stop == 'stop_id':
        list_columns.append('stop_id')
    #To merge the stop_times df with the stops df on stop_id
    stop_times_stops = pd.merge(stop_times, stops[list_columns], on= on_stop)

    #To merge the trips df with the routes df on route_id
    routes_trips = pd.merge(routes[['route_id']], trips, on='route_id')

    #To merge the stop_times_stops df with the trips_routes df on trip_id
    uncleaned_railway_system_information = pd.merge(routes_trips, stop_times_stops, on='trip_id')

    #To take only the service_ids present in both the routes_trips_stop_times_stops df and the calendar_dates df into account
    calendar_dates_unique = calendar_dates['service_id'].unique()
    railway_system_information = uncleaned_railway_system_information[(uncleaned_railway_system_information['service_id'].isin(calendar_dates_unique))]
    
    return railway_system_information

'Merge the DataFrames'

## Actual merging

### Belgium 

In [33]:
'''Select all required fields'''
agency_cleaned_Belgium = agency_Belgium[['agency_id', 'agency_name', 'agency_url', 'agency_timezone']]
routes_cleaned_Belgium = routes_cleaned_Belgium[['route_id', 'agency_id', 'route_short_name', 'route_long_name', 'route_type']]
trips_cleaned_Belgium = trips_Belgium[['trip_id', 'route_id', 'service_id', 'trip_headsign']]
calendar_dates_cleaned_Belgium = calendar_dates_cleaned_Belgium[['service_id', 'date']]
stops_cleaned_Belgium = stops_cleaned_Belgium[['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'country']]
stop_times_cleaned_Belgium = stop_times_Belgium[['trip_id', 'stop_id', 'arrival_time', 'departure_time', 'stop_sequence']]

'Select all required fields'

In [34]:
'''Apply merge_df()'''
railway_system_information_Belgium = merge_df(stop_times_cleaned_Belgium, stops_cleaned_Belgium, routes_cleaned_Belgium, trips_cleaned_Belgium, calendar_dates_cleaned_Belgium, 'stop_id')
railway_system_information_Belgium

'Apply merge_df()'

Unnamed: 0,route_id,trip_id,service_id,trip_headsign,stop_id,arrival_time,departure_time,stop_sequence,stop_name,stop_lat,stop_lon,country
0,115,88____:007::8885704:8885001:4:523:20210418,14,Tournai,8885001,05:23:00,05:23:00,4,TOURNAI,50.61313,3.396940,Belgium
1,115,88____:007::8885704:8885001:4:523:20210418,14,Tournai,8885068,05:19:00,05:19:00,3,FROYENNES,50.62989,3.354835,Belgium
2,115,88____:007::8885704:8885001:4:523:20210418,14,Tournai,8885753,05:12:00,05:12:00,2,HERSEAUX,50.71390,3.245961,Belgium
3,115,88____:007::8885704:8885001:4:523:20210418,14,Tournai,8885704,05:07:00,05:07:00,1,MOUSCRON,50.74100,3.228449,Belgium
4,115,88____:007::8885704:8885001:4:623:20210418,14,Tournai,8885001,06:23:00,06:23:00,4,TOURNAI,50.61313,3.396940,Belgium
...,...,...,...,...,...,...,...,...,...,...,...,...
431487,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8811270,17:01:00,17:01:00,19,VELTEM,50.90052,4.633520,Belgium
431488,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8811288,16:59:00,16:59:00,18,HERENT,50.90353,4.672190,Belgium
431489,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8819406,17:10:00,17:12:00,23,BRUSSELS AIRPORT-ZAVENTEM,50.89646,4.482072,Belgium
431490,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8821063,16:11:00,16:11:00,5,ANVERS-LUCHTBAL,51.24413,4.425033,Belgium


### Netherlands

In [35]:
'''Select all required fields'''
agency_cleaned_Netherlands = agency_Netherlands[['agency_id', 'agency_name', 'agency_url', 'agency_timezone']]
routes_cleaned_Netherlands = routes_cleaned_Netherlands[['route_id', 'agency_id', 'route_short_name', 'route_long_name', 'route_type']]
trips_cleaned_Netherlands = trips_Netherlands[['trip_id', 'route_id', 'service_id', 'trip_headsign']]
calendar_dates_cleaned_Netherlands = calendar_dates_cleaned_Netherlands[['service_id', 'date']]
stops_cleaned_Netherlands = stops_cleaned_Netherlands[['stop_name', 'stop_lat', 'stop_lon', 'country']]
stop_times_cleaned_Netherlands = stop_times_cleaned_Netherlands[['trip_id', 'stop_name', 'arrival_time', 'departure_time', 'stop_sequence']]

'Select all required fields'

In [36]:
'''Apply merge_df()'''
railway_system_information_Netherlands = merge_df(stop_times_cleaned_Netherlands, stops_cleaned_Netherlands, routes_cleaned_Netherlands, trips_cleaned_Netherlands, calendar_dates_cleaned_Netherlands, 'stop_name')
railway_system_information_Netherlands

'Apply merge_df()'

Unnamed: 0,route_id,trip_id,service_id,trip_headsign,stop_name,arrival_time,departure_time,stop_sequence,stop_lat,stop_lon,country
0,67394,121351438,1810,Eindhoven Centraal,DEN HAAG CENTRAAL,20:47:00,20:47:00,1,52.081131,4.324054,Netherlands
1,67394,121351438,1810,Eindhoven Centraal,TILBURG,21:51:00,21:53:00,18,51.560548,5.083457,Netherlands
2,67394,121351438,1810,Eindhoven Centraal,EINDHOVEN CENTRAAL,22:15:00,22:15:00,23,51.442376,5.479941,Netherlands
3,67394,121351438,1810,Eindhoven Centraal,ROTTERDAM CENTRAAL,21:12:00,21:14:00,8,51.924383,4.469746,Netherlands
4,67394,121351438,1810,Eindhoven Centraal,DELFT,21:00:00,21:00:00,5,52.006539,4.356516,Netherlands
...,...,...,...,...,...,...,...,...,...,...,...
341374,20895,123745346,1868,Leiden Centraal,AMSTERDAM BIJLMER ARENA,26:39:00,26:39:00,1,52.312204,4.947109,Netherlands
341375,20895,123745346,1868,Leiden Centraal,SCHIPHOL AIRPORT,26:50:00,27:03:00,4,52.309456,4.762284,Netherlands
341376,20895,123745344,1868,Amsterdam Bijlmer ArenA,LEIDEN CENTRAAL,25:47:00,25:47:00,1,52.166353,4.482068,Netherlands
341377,20895,123745344,1868,Amsterdam Bijlmer ArenA,AMSTERDAM BIJLMER ARENA,26:22:00,26:22:00,8,52.312204,4.947109,Netherlands


### Switzerland

In [37]:
'''Select all required fields'''
agency_cleaned_Switzerland = agency_Switzerland[['agency_id', 'agency_name', 'agency_url', 'agency_timezone']]
routes_cleaned_Switzerland = routes_cleaned_Switzerland[['route_id', 'agency_id', 'route_short_name', 'route_long_name', 'route_type']]
trips_cleaned_Switzerland = trips_Switzerland[['trip_id', 'route_id', 'service_id', 'trip_headsign']]
calendar_dates_cleaned_Switzerland = calendar_dates_cleaned_Switzerland[['service_id', 'date']]
stops_cleaned_Switzerland = stops_cleaned_Switzerland[['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'country']]
stop_times_cleaned_Switzerland = stop_times_cleaned_Switzerland[['trip_id', 'stop_id', 'arrival_time', 'departure_time', 'stop_sequence']]

'Select all required fields'

In [38]:
'''Apply merge_df()'''
railway_system_information_Switzerland = merge_df(stop_times_cleaned_Switzerland, stops_cleaned_Switzerland, routes_cleaned_Switzerland, trips_cleaned_Switzerland, calendar_dates_cleaned_Switzerland, 'stop_id')
railway_system_information_Switzerland

'Apply merge_df()'

Unnamed: 0,route_id,trip_id,service_id,trip_headsign,stop_id,arrival_time,departure_time,stop_sequence,stop_name,stop_lat,stop_lon,country
0,04236.06____.020:4236,0:1,19311,Karlsruhe Hbf,8014554,18:16:00,18:16:00,0,ENGEN,47.856347,8.772786,Germany
1,04236.06____.020:4236,0:1,19311,Karlsruhe Hbf,8014534,18:28:00,18:30:00,1,IMMENDINGEN,47.936007,8.729536,Germany
2,04236.06____.020:4236,0:1,19311,Karlsruhe Hbf,8014529,18:40:00,18:40:00,2,DONAUESCHINGEN,47.947786,8.498919,Germany
3,04236.06____.020:4236,0:1,19311,Karlsruhe Hbf,8014521,18:49:00,18:50:00,3,VILLINGEN (SCHWARZW),48.058022,8.465261,Germany
4,04236.06____.020:4236,0:1,19311,Karlsruhe Hbf,8014518,18:58:00,18:59:00,4,ST GEORGEN (SCHWARZW),48.123813,8.341955,Germany
...,...,...,...,...,...,...,...,...,...,...,...,...
991431,87949.L7____.001:5,49350:2,56518,Weil am Rhein,8014429,10:40:00,10:40:00,4,WEIL AM RHEIN-OST,47.590629,7.634780,Germany
991432,87949.L7____.001:5,49350:2,56518,Weil am Rhein,8069220,10:38:00,10:38:00,3,LORRACH DAMMSTRASSE,47.597713,7.655224,Germany
991433,87949.L7____.001:5,49350:2,56518,Weil am Rhein,8014440,10:37:00,10:37:00,2,LORRACH-STETTEN,47.601418,7.659032,Germany
991434,87949.L7____.001:5,49350:2,56518,Weil am Rhein,8069221,10:35:00,10:35:00,1,LORRACH MUSEUM/BURGHOF,47.607961,7.661196,Germany


# Preparation for the space-of-stops representation of the railway systems


## Functions

In [39]:
'''Create a DataFrame with the departure time form the first stop sequence and with the one from last stop sequence for each trip_id'''

def create_trip_departure_times(railway_system_information):
    departure_time_first = railway_system_information.reset_index().loc[railway_system_information.reset_index().groupby(['trip_id'])['stop_sequence'].idxmin()][['route_id', 'trip_id', 'departure_time']].copy()
    departure_time_first = departure_time_first.rename(columns = {'departure_time': 'departure_time_first'})
    departure_time_last = railway_system_information.reset_index().loc[railway_system_information.reset_index().groupby(['trip_id'])['stop_sequence'].idxmax()][['route_id', 'trip_id', 'departure_time']].copy()
    departure_time_last = departure_time_last.rename(columns = {'departure_time': 'departure_time_last'})
    trip_departure_times = departure_time_first.merge(departure_time_last[['trip_id', 'departure_time_last']], on='trip_id')
    return trip_departure_times

'Create a DataFrame with the departure time form the first stop sequence and with the one from last stop sequence for each trip_id'

In [40]:
'''Put the stop_names per trip_id in a list in the new trip_stops_sequence DataFrame and
Calculate the hash of the stop sequence in both order (ascending and descending)'''

def create_trip_stop_sequence(trip_departure_times):    
    #Put the stop_names per trip_id in a list in the new trip_stops_sequence DataFrame
    trip_stop_sequence = trip_departure_times.groupby('trip_id')['stop_name'].apply(lambda group_series: group_series.tolist()).reset_index()
    trip_stop_sequence.rename(columns={'stop_name':'stops_sequence'}, inplace=True)
    #Calculate the hash of the stop sequence in both order (ascending and descending)
    trip_stop_sequence['hash'] = trip_stop_sequence['stops_sequence'].apply(lambda x: hash(tuple(x)))
    trip_stop_sequence['hash_inverse'] = trip_stop_sequence['stops_sequence'].apply(lambda x: hash(tuple(x[::-1])))
    return trip_stop_sequence

'Put the stop_names per trip_id in a list in the new trip_stops_sequence DataFrame and\nCalculate the hash of the stop sequence in both order (ascending and descending)'

In [41]:
'''Regroup the days by service_id in a set and count them'''

def create_service_id_dates(calendar_dates):
    service_id_dates = calendar_dates.groupby('service_id')['date'].apply(lambda group_series: set(group_series.tolist())).reset_index()
    service_id_dates.rename(columns={'date':'dates'}, inplace=True)
    service_id_dates['count_service_id'] = service_id_dates['dates'].apply(lambda x: len(x))
    return service_id_dates

'Regroup the days by service_id in a set and count them'

In [42]:
'''Put the different trip_ids in a list and add the departure_time first and last lists'''

def create_routes_hash(trips_hash):
    common_columns = ['route_id','hash', 'hash_inverse', 'service_id']
    routes_hash = trips_hash.groupby(common_columns)['trip_id'].apply(lambda group_series: group_series.tolist()).reset_index()
    route_hash_dep_first = trips_hash.groupby(common_columns)['departure_time_first'].apply(lambda group_series: group_series.tolist()).reset_index()
    route_hash_dep_last = trips_hash.groupby(common_columns)['departure_time_last'].apply(lambda group_series: group_series.tolist()).reset_index()
    routes_hash = routes_hash.merge(route_hash_dep_first, on= common_columns)
    routes_hash = routes_hash.merge(route_hash_dep_last, on= common_columns)
    return routes_hash

'Put the different trip_ids in a list and add the departure_time first and last lists'

In [43]:
'''Create DataFrames that will be used for the route_creation process'''

def prepartion_space(railway_system_information, calendar_dates):    
    #Sort values by the route_id, the trip_id, and the stop_sequence fields
    railway_system_information = railway_system_information.sort_values(by=['route_id', 'trip_id','service_id', 'stop_sequence'])

    trip_departure_times = create_trip_departure_times(railway_system_information)

    #Merge railway_system_information with trip_departure_times
    trip_departure_times = railway_system_information.merge(trip_departure_times[['trip_id','departure_time_first','departure_time_last']], on='trip_id')

    trip_stop_sequence = create_trip_stop_sequence(trip_departure_times)
    
    #Add the stop_sequence of stations to the trip_departure_times dataset by joining on trip_id
    trips_hash = pd.merge(trip_departure_times, trip_stop_sequence, on='trip_id')
    
    service_id_dates = create_service_id_dates(calendar_dates)
    
    #Merge trips_hash with service_id_dates
    trips_hash = pd.merge(trips_hash, service_id_dates, on='service_id', how='left')
    
    #Calculate generic_trips_information
    generic_trips_information = trips_hash.groupby(['route_id', 'trip_id', 'service_id', 'hash', 'hash_inverse', 'departure_time_first','departure_time_last', 'count_service_id'], as_index=False)[['stops_sequence', 'dates']].first()
    
    routes_hash = create_routes_hash(generic_trips_information)
    
    #Add the sequence of stops, dates and service_id_count to the route_hash_freq_dep dataset
    routes_hash = pd.merge(routes_hash, trips_hash[['route_id','hash', 'hash_inverse', 'service_id','stops_sequence','dates','count_service_id']], on=['route_id', 'hash', 'hash_inverse', 'service_id'], how='left')
    routes_hash = routes_hash.drop_duplicates( subset = ['route_id', 'hash', 'service_id'], keep = 'first')
    
    #Sort the rows, so that they will always output on the same order
    routes_hash['departure_time_first_first'] = routes_hash['departure_time_first'].apply(lambda x: x[0])
    routes_hash = routes_hash.sort_values(by=['route_id', 'service_id', 'departure_time_first_first'])
    routes_hash = routes_hash.drop(columns=['departure_time_first_first'])
    routes_hash = routes_hash.reset_index(drop=True)
    
    return trips_hash, generic_trips_information, routes_hash

'Create DataFrames that will be used for the route_creation process'

## Actual preparation

### Belgium

In [44]:
trips_hash_Belgium, generic_trips_information_Belgium, routes_hash_Belgium = prepartion_space(railway_system_information_Belgium, calendar_dates_cleaned_Belgium)
trips_hash_Belgium 
generic_trips_information_Belgium
routes_hash_Belgium

Unnamed: 0,route_id,trip_id,service_id,trip_headsign,stop_id,arrival_time,departure_time,stop_sequence,stop_name,stop_lat,stop_lon,country,departure_time_first,departure_time_last,stops_sequence,hash,hash_inverse,dates,count_service_id
0,115,88____:007::8885001:8885704:4:1052:20210418,14,Mouscron,8885001,10:36:00,10:36:00,1,TOURNAI,50.61313,3.396940,Belgium,10:36:00,10:52:00,"[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]",2108900864639546293,3978835794570310647,"{20210417, 20210418}",2
1,115,88____:007::8885001:8885704:4:1052:20210418,14,Mouscron,8885068,10:40:00,10:40:00,2,FROYENNES,50.62989,3.354835,Belgium,10:36:00,10:52:00,"[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]",2108900864639546293,3978835794570310647,"{20210417, 20210418}",2
2,115,88____:007::8885001:8885704:4:1052:20210418,14,Mouscron,8885753,10:47:00,10:47:00,3,HERSEAUX,50.71390,3.245961,Belgium,10:36:00,10:52:00,"[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]",2108900864639546293,3978835794570310647,"{20210417, 20210418}",2
3,115,88____:007::8885001:8885704:4:1052:20210418,14,Mouscron,8885704,10:52:00,10:52:00,4,MOUSCRON,50.74100,3.228449,Belgium,10:36:00,10:52:00,"[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]",2108900864639546293,3978835794570310647,"{20210417, 20210418}",2
4,115,88____:007::8885001:8885704:4:1052:20210530,42,Mouscron,8885001,10:36:00,10:36:00,1,TOURNAI,50.61313,3.396940,Belgium,10:36:00,10:52:00,"[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]",2108900864639546293,3978835794570310647,"{20210529, 20210530}",2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
417588,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8811254,17:05:00,17:05:00,21,KORTENBERG,50.89307,4.543300,Belgium,16:01:00,17:23:00,"[NOORDERKEMPEN (BRECHT), ANVERS-LUCHTBAL, ANVE...",2807447681034197785,-4873696204104425705,{20210418},1
417589,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8811247,17:07:00,17:07:00,22,NOSSEGEM,50.88331,4.506110,Belgium,16:01:00,17:23:00,"[NOORDERKEMPEN (BRECHT), ANVERS-LUCHTBAL, ANVE...",2807447681034197785,-4873696204104425705,{20210418},1
417590,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8819406,17:10:00,17:12:00,23,BRUSSELS AIRPORT-ZAVENTEM,50.89646,4.482072,Belgium,16:01:00,17:23:00,"[NOORDERKEMPEN (BRECHT), ANVERS-LUCHTBAL, ANVE...",2807447681034197785,-4873696204104425705,{20210418},1
417591,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8811007,17:20:00,17:20:00,24,SCHAERBEEK,50.87851,4.378640,Belgium,16:01:00,17:23:00,"[NOORDERKEMPEN (BRECHT), ANVERS-LUCHTBAL, ANVE...",2807447681034197785,-4873696204104425705,{20210418},1


Unnamed: 0,route_id,trip_id,service_id,hash,hash_inverse,departure_time_first,departure_time_last,count_service_id,stops_sequence,dates
0,115,88____:007::8885001:8885704:4:1052:20210418,14,2108900864639546293,3978835794570310647,10:36:00,10:52:00,2,"[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]","{20210417, 20210418}"
1,115,88____:007::8885001:8885704:4:1052:20210530,42,2108900864639546293,3978835794570310647,10:36:00,10:52:00,2,"[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]","{20210529, 20210530}"
2,115,88____:007::8885001:8885704:4:1152:20210418,14,2108900864639546293,3978835794570310647,11:36:00,11:52:00,2,"[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]","{20210417, 20210418}"
3,115,88____:007::8885001:8885704:4:1152:20210530,42,2108900864639546293,3978835794570310647,11:36:00,11:52:00,2,"[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]","{20210529, 20210530}"
4,115,88____:007::8885001:8885704:4:1252:20210418,14,2108900864639546293,3978835794570310647,12:36:00,12:52:00,2,"[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]","{20210417, 20210418}"
...,...,...,...,...,...,...,...,...,...,...
25178,734,84____:007::8400280:8821105:4:1600:20210418,25,-7504798920915766465,8899951303900289287,14:40:00,16:01:00,1,"[DEN HAAG HS (NL), ROTTERDAM CS (NL), BREDA (N...",{20210418}
25179,734,88____:007::8812005:8400131:23:1218:20210418,25,4275775146826484424,-441074925134835438,10:37:00,12:26:00,1,"[BRUXELLES-NORD, SCHAERBEEK, BRUSSELS AIRPORT-...",{20210418}
25180,734,88____:007::8812005:8400131:23:1618:20210418,25,4275775146826484424,-441074925134835438,14:37:00,16:26:00,1,"[BRUXELLES-NORD, SCHAERBEEK, BRUSSELS AIRPORT-...",{20210418}
25181,734,88____:007::8821105:8812005:22:1323:20210418,25,2807447681034197785,-4873696204104425705,12:01:00,13:23:00,1,"[NOORDERKEMPEN (BRECHT), ANVERS-LUCHTBAL, ANVE...",{20210418}


Unnamed: 0,route_id,hash,hash_inverse,service_id,trip_id,departure_time_first,departure_time_last,stops_sequence,dates,count_service_id
0,115,3978835794570310647,2108900864639546293,14,"[88____:007::8885704:8885001:4:1023:20210418, ...","[10:07:00, 11:07:00, 12:07:00, 13:07:00, 14:07...","[10:23:00, 11:23:00, 12:23:00, 13:23:00, 14:23...","[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]","{20210417, 20210418}",2
1,115,2108900864639546293,3978835794570310647,14,"[88____:007::8885001:8885704:4:1052:20210418, ...","[10:36:00, 11:36:00, 12:36:00, 13:36:00, 14:36...","[10:52:00, 11:52:00, 12:52:00, 13:52:00, 14:52...","[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]","{20210417, 20210418}",2
2,115,3978835794570310647,2108900864639546293,25,[88____:007::8885704:8885001:4:23:20210418],[00:07:00],[00:23:00],"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",{20210418},1
3,115,2108900864639546293,3978835794570310647,25,[88____:007::8885001:8885704:4:52:20210418],[00:36:00],[00:52:00],"[TOURNAI, FROYENNES, HERSEAUX, MOUSCRON]",{20210418},1
4,115,3978835794570310647,2108900864639546293,31,"[88____:007::8885704:8885001:4:1723:20210417, ...","[17:07:00, 18:07:00, 19:07:00, 20:07:00, 21:07...","[17:23:00, 18:23:00, 19:23:00, 20:23:00, 21:23...","[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",{20210417},1
...,...,...,...,...,...,...,...,...,...,...
5874,733,6480905685706156076,-6077219170471498809,1566,"[88____:007::8821105:8814001:22:1716:20211211,...","[16:01:00, 20:01:00]","[17:16:00, 21:16:00]","[NOORDERKEMPEN (BRECHT), ANVERS-LUCHTBAL, ANVE...","{20210703, 20210320, 20210321, 20210704, 20210...",36
5875,734,4275775146826484424,-441074925134835438,25,"[88____:007::8812005:8400131:23:1218:20210418,...","[10:37:00, 14:37:00]","[12:26:00, 16:26:00]","[BRUXELLES-NORD, SCHAERBEEK, BRUSSELS AIRPORT-...",{20210418},1
5876,734,-7504798920915766465,8899951303900289287,25,"[84____:007::8400280:8821105:4:1200:20210418, ...","[10:40:00, 14:40:00]","[12:01:00, 16:01:00]","[DEN HAAG HS (NL), ROTTERDAM CS (NL), BREDA (N...",{20210418},1
5877,734,2807447681034197785,-4873696204104425705,25,"[88____:007::8821105:8812005:22:1323:20210418,...","[12:01:00, 16:01:00]","[13:23:00, 17:23:00]","[NOORDERKEMPEN (BRECHT), ANVERS-LUCHTBAL, ANVE...",{20210418},1


### Netherlands

In [45]:
trips_hash_Netherlands, generic_trips_information_Netherlands, routes_hash_Netherlands = prepartion_space(railway_system_information_Netherlands, calendar_dates_cleaned_Netherlands)
trips_hash_Netherlands 
generic_trips_information_Netherlands
routes_hash_Netherlands 

Unnamed: 0,route_id,trip_id,service_id,trip_headsign,stop_name,arrival_time,departure_time,stop_sequence,stop_lat,stop_lon,country,departure_time_first,departure_time_last,stops_sequence,hash,hash_inverse,dates,count_service_id
0,17522,121351233,3877,Amsterdam Centraal,ROTTERDAM CENTRAAL,06:55:00,06:55:00,1,51.924383,4.469746,Netherlands,06:55:00,07:35:00,"[ROTTERDAM CENTRAAL, SCHIPHOL AIRPORT, AMSTERD...",-9029127007526563089,-3134271495944065964,"{20210701, 20210702, 20210703, 20210705, 20210...",85
1,17522,121351233,3877,Amsterdam Centraal,SCHIPHOL AIRPORT,07:19:00,07:21:00,3,52.309456,4.762284,Netherlands,06:55:00,07:35:00,"[ROTTERDAM CENTRAAL, SCHIPHOL AIRPORT, AMSTERD...",-9029127007526563089,-3134271495944065964,"{20210701, 20210702, 20210703, 20210705, 20210...",85
2,17522,121351233,3877,Amsterdam Centraal,AMSTERDAM CENTRAAL,07:35:00,07:35:00,6,52.378920,4.900889,Netherlands,06:55:00,07:35:00,"[ROTTERDAM CENTRAAL, SCHIPHOL AIRPORT, AMSTERD...",-9029127007526563089,-3134271495944065964,"{20210701, 20210702, 20210703, 20210705, 20210...",85
3,17522,121351234,3877,Amsterdam Centraal,ROTTERDAM CENTRAAL,07:26:00,07:26:00,1,51.924383,4.469746,Netherlands,07:26:00,08:07:00,"[ROTTERDAM CENTRAAL, SCHIPHOL AIRPORT, AMSTERD...",-9029127007526563089,-3134271495944065964,"{20210701, 20210702, 20210703, 20210705, 20210...",85
4,17522,121351234,3877,Amsterdam Centraal,SCHIPHOL AIRPORT,07:51:00,07:53:00,3,52.309456,4.762284,Netherlands,07:26:00,08:07:00,"[ROTTERDAM CENTRAAL, SCHIPHOL AIRPORT, AMSTERD...",-9029127007526563089,-3134271495944065964,"{20210701, 20210702, 20210703, 20210705, 20210...",85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340654,77590,130698050,3519,Schagen,SCHAGEN,23:52:00,23:52:00,4,52.785308,4.805043,Netherlands,23:34:00,23:52:00,"[DEN HELDER, DEN HELDER ZUID, ANNA PAULOWNA, S...",-6332000831105432864,5734412086342458475,"{20210328, 20210327}",2
340655,77590,130698051,3519,Schagen,DEN HELDER,24:04:00,24:04:00,1,52.956821,4.760637,Netherlands,24:04:00,24:22:00,"[DEN HELDER, DEN HELDER ZUID, ANNA PAULOWNA, S...",-6332000831105432864,5734412086342458475,"{20210328, 20210327}",2
340656,77590,130698051,3519,Schagen,DEN HELDER ZUID,24:08:00,24:08:00,2,52.932919,4.764340,Netherlands,24:04:00,24:22:00,"[DEN HELDER, DEN HELDER ZUID, ANNA PAULOWNA, S...",-6332000831105432864,5734412086342458475,"{20210328, 20210327}",2
340657,77590,130698051,3519,Schagen,ANNA PAULOWNA,24:14:00,24:15:00,3,52.867698,4.811470,Netherlands,24:04:00,24:22:00,"[DEN HELDER, DEN HELDER ZUID, ANNA PAULOWNA, S...",-6332000831105432864,5734412086342458475,"{20210328, 20210327}",2


Unnamed: 0,route_id,trip_id,service_id,hash,hash_inverse,departure_time_first,departure_time_last,count_service_id,stops_sequence,dates
0,17522,121351233,3877,-9029127007526563089,-3134271495944065964,06:55:00,07:35:00,85,"[ROTTERDAM CENTRAAL, SCHIPHOL AIRPORT, AMSTERD...","{20210701, 20210702, 20210703, 20210705, 20210..."
1,17522,121351234,3877,-9029127007526563089,-3134271495944065964,07:26:00,08:07:00,85,"[ROTTERDAM CENTRAAL, SCHIPHOL AIRPORT, AMSTERD...","{20210701, 20210702, 20210703, 20210705, 20210..."
2,17522,121351235,3872,-9029127007526563089,-3134271495944065964,07:55:00,08:35:00,68,"[ROTTERDAM CENTRAAL, SCHIPHOL AIRPORT, AMSTERD...","{20210701, 20210702, 20210705, 20210706, 20210..."
3,17522,121351236,3918,-9029127007526563089,-3134271495944065964,07:55:00,08:35:00,17,"[ROTTERDAM CENTRAAL, SCHIPHOL AIRPORT, AMSTERD...","{20210529, 20210626, 20210501, 20210619, 20210..."
4,17522,121351237,3878,-3134271495944065964,-9029127007526563089,05:53:00,06:34:00,14,"[AMSTERDAM CENTRAAL, SCHIPHOL AIRPORT, ROTTERD...","{20210630, 20210407, 20210505, 20210602, 20210..."
...,...,...,...,...,...,...,...,...,...,...
33083,77590,130698022,3481,5734412086342458475,-6332000831105432864,24:06:00,24:26:00,1,"[SCHAGEN, ANNA PAULOWNA, DEN HELDER ZUID, DEN ...",{20210327}
33084,77590,130698023,3544,5734412086342458475,-6332000831105432864,24:06:00,24:25:00,1,"[SCHAGEN, ANNA PAULOWNA, DEN HELDER ZUID, DEN ...",{20210328}
33085,77590,130698032,3519,5734412086342458475,-6332000831105432864,25:12:00,25:32:00,2,"[SCHAGEN, ANNA PAULOWNA, DEN HELDER ZUID, DEN ...","{20210328, 20210327}"
33086,77590,130698050,3519,-6332000831105432864,5734412086342458475,23:34:00,23:52:00,2,"[DEN HELDER, DEN HELDER ZUID, ANNA PAULOWNA, S...","{20210328, 20210327}"


Unnamed: 0,route_id,hash,hash_inverse,service_id,trip_id,departure_time_first,departure_time_last,stops_sequence,dates,count_service_id
0,17522,-9029127007526563089,-3134271495944065964,1079,[127996391],[06:26:00],[07:07:00],"[ROTTERDAM CENTRAAL, SCHIPHOL AIRPORT, AMSTERD...","{20210315, 20210317, 20210318, 20210319, 20210...",6
1,17522,-9029127007526563089,-3134271495944065964,1857,[130675680],[06:26:00],[07:07:00],"[ROTTERDAM CENTRAAL, SCHIPHOL AIRPORT, AMSTERD...","{20210316, 20210327}",2
2,17522,-9029127007526563089,-3134271495944065964,3075,[123727419],[06:26:00],[07:07:00],"[ROTTERDAM CENTRAAL, SCHIPHOL AIRPORT, AMSTERD...","{20210401, 20210320, 20210322, 20210325, 20210...",7
3,17522,-9029127007526563089,-3134271495944065964,3272,[130675677],[06:26:00],[07:07:00],"[ROTTERDAM CENTRAAL, SCHIPHOL AIRPORT, AMSTERD...",{20210323},1
4,17522,-9029127007526563089,-3134271495944065964,3683,[130675678],[06:24:00],[07:07:00],"[ROTTERDAM CENTRAAL, SCHIPHOL AIRPORT, AMSTERD...",{20210402},1
...,...,...,...,...,...,...,...,...,...,...
12879,77589,-4354356783192865880,7095920333588572768,3705,[130695901],[06:25:00],[09:16:00],"[AMSTERDAM CENTRAAL, SCHIPHOL AIRPORT, ROTTERD...",{20210403},1
12880,77590,5734412086342458475,-6332000831105432864,3481,[130698022],[24:06:00],[24:26:00],"[SCHAGEN, ANNA PAULOWNA, DEN HELDER ZUID, DEN ...",{20210327},1
12881,77590,-6332000831105432864,5734412086342458475,3519,"[130698050, 130698051]","[23:34:00, 24:04:00]","[23:52:00, 24:22:00]","[DEN HELDER, DEN HELDER ZUID, ANNA PAULOWNA, S...","{20210328, 20210327}",2
12882,77590,5734412086342458475,-6332000831105432864,3519,[130698032],[25:12:00],[25:32:00],"[SCHAGEN, ANNA PAULOWNA, DEN HELDER ZUID, DEN ...","{20210328, 20210327}",2


### Switzerland

In [46]:
trips_hash_Switzerland, generic_trips_information_Switzerland, routes_hash_Switzerland = prepartion_space(railway_system_information_Switzerland, calendar_dates_cleaned_Switzerland)
trips_hash_Switzerland
generic_trips_information_Switzerland
routes_hash_Switzerland

Unnamed: 0,route_id,trip_id,service_id,trip_headsign,stop_id,arrival_time,departure_time,stop_sequence,stop_name,stop_lat,stop_lon,country,departure_time_first,departure_time_last,stops_sequence,hash,hash_inverse,dates,count_service_id
0,00001.000044.018:1,33248:1,936,Le Locle,8504392,05:50:00,05:50:00,0,LES BRENETS,47.067210,6.707389,Switzerland,05:50:00,05:58:00,"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...",-745358023142251116,-4890007210341936069,"{20210701, 20210702, 20210705, 20210706, 20210...",83
1,00001.000044.018:1,33248:1,936,Le Locle,8504391,05:53:00,05:54:00,1,LES FRETES,47.058580,6.725787,Switzerland,05:50:00,05:58:00,"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...",-745358023142251116,-4890007210341936069,"{20210701, 20210702, 20210705, 20210706, 20210...",83
2,00001.000044.018:1,33248:1,936,Le Locle,8530260,05:54:00,05:55:00,2,LE LOCLE LE CHALET,47.055918,6.738986,Switzerland,05:50:00,05:58:00,"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...",-745358023142251116,-4890007210341936069,"{20210701, 20210702, 20210705, 20210706, 20210...",83
3,00001.000044.018:1,33248:1,936,Le Locle,8504316,05:58:00,05:58:00,3,LE LOCLE,47.057861,6.746153,Switzerland,05:50:00,05:58:00,"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...",-745358023142251116,-4890007210341936069,"{20210701, 20210702, 20210705, 20210706, 20210...",83
4,00001.000104.001:1,4117:1,5072,Brienzer Rothorn,8508350,07:36:00,07:36:00,0,BRIENZ BRB,46.755214,8.038089,Switzerland,07:36:00,08:36:00,"[BRIENZ BRB, PLANALP, BRIENZER ROTHORN]",235957410571974363,-5468304365227969288,"{20210704, 20210711}",2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
650517,96814.000011.101:96814,23295:1,116609,Lyon Part Dieu,8774500,12:56:00,13:05:00,1,BELLEGARDE (AIN),46.110918,5.825962,France,12:25:00,14:33:00,"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]",6286760176903130451,3001381519048992360,"{20210410, 20210412, 20210413, 20210414, 20210...",8
650518,96814.000011.101:96814,23295:1,116609,Lyon Part Dieu,8772319,14:33:00,14:33:00,2,LYON PART DIEU,45.760564,4.859990,France,12:25:00,14:33:00,"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]",6286760176903130451,3001381519048992360,"{20210410, 20210412, 20210413, 20210414, 20210...",8
650519,96818.000011.101:96818,23296:1,44307,Lyon Part Dieu,8501008,19:26:00,19:26:00,0,GENEVE,46.210213,6.142452,Switzerland,19:26:00,21:34:00,"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]",6286760176903130451,3001381519048992360,"{20210418, 20210411}",2
650520,96818.000011.101:96818,23296:1,44307,Lyon Part Dieu,8774500,19:58:00,20:02:00,1,BELLEGARDE (AIN),46.110918,5.825962,France,19:26:00,21:34:00,"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]",6286760176903130451,3001381519048992360,"{20210418, 20210411}",2


Unnamed: 0,route_id,trip_id,service_id,hash,hash_inverse,departure_time_first,departure_time_last,count_service_id,stops_sequence,dates
0,00001.000044.018:1,33248:1,936,-745358023142251116,-4890007210341936069,05:50:00,05:58:00,83,"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...","{20210701, 20210702, 20210705, 20210706, 20210..."
1,00001.000104.001:1,4117:1,5072,235957410571974363,-5468304365227969288,07:36:00,08:36:00,2,"[BRIENZ BRB, PLANALP, BRIENZER ROTHORN]","{20210704, 20210711}"
2,00002.000044.017:2,33250:1,936,-4890007210341936069,-745358023142251116,06:01:00,06:08:00,83,"[LE LOCLE, LE LOCLE LE CHALET, LES FRETES, LES...","{20210701, 20210702, 20210705, 20210706, 20210..."
3,00002.000104.001:2,4118:1,5072,-5468304365227969288,235957410571974363,09:06:00,10:10:00,2,"[BRIENZER ROTHORN, PLANALP, BRIENZ BRB]","{20210704, 20210711}"
4,00003.000011.101:3,4144:1,971,3105422662556607215,3023542754790293721,07:40:00,09:00:00,5,"[BASEL BAD BF, BASEL SBB, HAUENSTEIN-BASISTUNN...","{20210315, 20210316, 20210317, 20210318, 20210..."
...,...,...,...,...,...,...,...,...,...,...
57323,96806.000011.102:96806,23292:1,44307,-3654131468586059598,759894161801594875,18:39:00,20:59:00,2,"[GENEVE, BELLEGARDE (AIN), CULOZ, GRENOBLE]","{20210418, 20210411}"
57324,96810.000011.101:96810,23293:1,116609,6286760176903130451,3001381519048992360,06:16:00,08:24:00,8,"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]","{20210410, 20210412, 20210413, 20210414, 20210..."
57325,96812.000011.101:96812,23294:1,9453,6286760176903130451,3001381519048992360,07:17:00,09:25:00,2,"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]","{20210417, 20210410}"
57326,96814.000011.101:96814,23295:1,116609,6286760176903130451,3001381519048992360,12:25:00,14:33:00,8,"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]","{20210410, 20210412, 20210413, 20210414, 20210..."


Unnamed: 0,route_id,hash,hash_inverse,service_id,trip_id,departure_time_first,departure_time_last,stops_sequence,dates,count_service_id
0,00001.000044.018:1,-745358023142251116,-4890007210341936069,936,[33248:1],[05:50:00],[05:58:00],"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...","{20210701, 20210702, 20210705, 20210706, 20210...",83
1,00001.000104.001:1,235957410571974363,-5468304365227969288,5072,[4117:1],[07:36:00],[08:36:00],"[BRIENZ BRB, PLANALP, BRIENZER ROTHORN]","{20210704, 20210711}",2
2,00002.000044.017:2,-4890007210341936069,-745358023142251116,936,[33250:1],[06:01:00],[06:08:00],"[LE LOCLE, LE LOCLE LE CHALET, LES FRETES, LES...","{20210701, 20210702, 20210705, 20210706, 20210...",83
3,00002.000104.001:2,-5468304365227969288,235957410571974363,5072,[4118:1],[09:06:00],[10:10:00],"[BRIENZER ROTHORN, PLANALP, BRIENZ BRB]","{20210704, 20210711}",2
4,00003.000011.101:3,3105422662556607215,3023542754790293721,971,[4144:1],[07:40:00],[09:00:00],"[BASEL BAD BF, BASEL SBB, HAUENSTEIN-BASISTUNN...","{20210315, 20210316, 20210317, 20210318, 20210...",5
...,...,...,...,...,...,...,...,...,...,...
57323,96806.000011.102:96806,-3654131468586059598,759894161801594875,44307,[23292:1],[18:39:00],[20:59:00],"[GENEVE, BELLEGARDE (AIN), CULOZ, GRENOBLE]","{20210418, 20210411}",2
57324,96810.000011.101:96810,6286760176903130451,3001381519048992360,116609,[23293:1],[06:16:00],[08:24:00],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]","{20210410, 20210412, 20210413, 20210414, 20210...",8
57325,96812.000011.101:96812,6286760176903130451,3001381519048992360,9453,[23294:1],[07:17:00],[09:25:00],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]","{20210417, 20210410}",2
57326,96814.000011.101:96814,6286760176903130451,3001381519048992360,116609,[23295:1],[12:25:00],[14:33:00],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]","{20210410, 20210412, 20210413, 20210414, 20210...",8


# Route Creation

## Functions

In [47]:
'''Some functions to better factorize the functions in the coming cells'''

def select_stops_sequences(stops_sequences_df, route_id):
    '''retruns the stop sequences with the selected route_id'''
    return stops_sequences_df[stops_sequences_df['route_id'] == route_id].copy()

def retrieve_matching_trip_id_and_departure_time_lasts(departure_time_last_sequence_1, departure_time_first_sequence_2, departure_time_last_sequence_2, trip_id_sequence_1, trip_id_sequence_2):
    '''returns the trip_ids of the two sequences that match with time and the remaining departure_time_last'''
    new_trip_id_sequence_1 = []
    #creates a dictionnary key = list value and dictionnary value = list index
    index_dtls1_dict = dict((k,i) for i,k in enumerate(departure_time_last_sequence_1))
    index_dtfs2_dict = dict((k,i) for i,k in enumerate(departure_time_first_sequence_2))
    #find the index of the trips that match each other with the timing
    leftover_indexes_sequence_1 = [index_dtls1_dict[x] for x in (set(departure_time_last_sequence_1).intersection(departure_time_first_sequence_2))]
    leftover_indexes_sequence_2 = [index_dtfs2_dict[x] for x in (set(departure_time_last_sequence_1).intersection(departure_time_first_sequence_2))]    
    #sort the list of indexes
    leftover_indexes_sequence_1.sort()
    leftover_indexes_sequence_2.sort() 
    #only keep the trip_id that match with the timing
    for trip_id_list in trip_id_sequence_1:
        new_trip_id_sequence_1.append([trip_id_list[index] for index in leftover_indexes_sequence_1])
    trip_id_total_sequence = new_trip_id_sequence_1.copy()
    trip_id_total_sequence.append([trip_id_sequence_2[index] for index in leftover_indexes_sequence_2])
        #only keep the departure time last that match with the timing
    departure_time_last_total_sequence = [departure_time_last_sequence_2[index] for index in leftover_indexes_sequence_2]
    return trip_id_total_sequence, departure_time_last_total_sequence

def get_extentions (after_or_before, time_compatibility, route_sequences_route_id, trip):
    '''returns the extentions for the trip (before or after)'''
    if after_or_before == 'after':
        #checks the extentions possible for the trip that can follow after its last stop
        possible_extentions = route_sequences_route_id[route_sequences_route_id['stops_sequence'].apply(lambda x: any(item for item in [trip['stops_sequence'][-1]] if (item == x[0]) and not(set(x[1:]) & set(trip['stops_sequence']))))].copy()
    elif after_or_before == 'before':
        #checks the extentions possible for the trip that can follow before its first stop
        possible_extentions = route_sequences_route_id[route_sequences_route_id['stops_sequence'].apply(lambda x: any(item for item in [trip['stops_sequence'][0]] if (item == x[-1]) and not(set(x[:-1]) & set(trip['stops_sequence']))))].copy()        
    if time_compatibility == True:    
        #checks that those extentions have a common date as the trip
        possible_extentions = possible_extentions[possible_extentions['dates'].apply(lambda x: any(item for item in trip['dates'] if item in x))].copy()   
        if not possible_extentions.empty: 
            if after_or_before == 'after':
                #checks that those extentions have a matching time schedule as the trip
                possible_extentions = possible_extentions[possible_extentions['departure_time_first'].apply(lambda x: any(item for item in trip['departure_time_last'] if item in x))].copy()
            elif after_or_before == 'before':
                #checks that those extentions have a matching time schedule as the trip
                possible_extentions = possible_extentions[possible_extentions['departure_time_last'].apply(lambda x: any(item for item in trip['departure_time_first'] if item in x))].copy()
    return possible_extentions      

def calculate_frequency (sequences_df):
    '''calculate the frequency based on the length of the dates and departure_time and put the hash in as a column of list'''
    sequences_df['number_dates'] = sequences_df['dates'].apply(lambda x: len(x))
    sequences_df['number_times'] = sequences_df['departure_time_last'].apply(lambda x: len(x))
    sequences_df['frequency'] = sequences_df['number_dates']* sequences_df['number_times'] 
    sequences_df = sequences_df.drop(['dates', 'departure_time_last', 'number_dates', 'number_times'], axis=1)
    sequences_df['hash'] = sequences_df['hash'].apply(lambda x: [x])
    return sequences_df.copy()
         
from datetime import datetime
from datetime import timedelta
FMT = '%H:%M:%S'
day_in_seconds = timedelta(days=1).total_seconds()
def calculate_time_difference(time_df, later_time, earlier_time, column_name):
    '''calculates the time difference between later time and earlier time and put it in time_df[column_name]'''
    #transform 24:00:00 into 00:00:00
    time_df['departure_time'] = time_df['departure_time'].apply(lambda x: str(int(x[:2])-24) + x[2:] if int(x[:2]) >= 24 else x)
    time_df['arrival_time'] = time_df['arrival_time'].apply(lambda x: str(int(x[:2])-24) + x[2:] if int(x[:2]) >=  24 else x)
    #calculate the waiting_time
    time_df[column_name] = time_df[['arrival_time','departure_time']].apply(lambda x: int((datetime.strptime(x[later_time], FMT) - datetime.strptime(x[earlier_time], FMT)).total_seconds()/60), axis=1)
    #if one day as past, take it into consideration
    time_df[column_name] = time_df[column_name].apply(lambda x: day_in_seconds/60 + x if x < 0 else x)
    return time_df            

'Some functions to better factorize the functions in the coming cells'

In [48]:
'''Finds the routes that can be either extended from before or from after and those which are complete sequences'''

def get_extention_indexes(stop_sequences_df):
    '''returns the four indexes: index_of_extendable, index_of_begin_sequences, index_of_complete_sequences and index_of_unused_sequences'''
    #intiate the dictionnaries, that will be used to retrieve different rows later on
    index_of_extendable = {}
    index_of_begin_sequences = {}
    index_of_complete_sequences = {}
    index_of_unused_sequences = {}
    for route_id in stop_sequences_df['route_id'].unique():
        #select the route with the route_id selected by the loop iteration
        route_sequences_route_id = select_stops_sequences(stop_sequences_df, route_id)
        for index_trip, trip in route_sequences_route_id.iterrows():
            #checks the extentions possible for the trip that can follow after its last stop
            possible_extentions_after = get_extentions('after', True, route_sequences_route_id, trip)
            #checks the extentions possible for the trip that can follow before its first stop
            possible_extentions_before = get_extentions('before', True, route_sequences_route_id, trip)
            #put all the sequences that can be extended either from the beginning either from the end together
            possible_extentions = possible_extentions_after.append(possible_extentions_before, ignore_index = True)
            if not possible_extentions.empty:
                if route_id not in index_of_extendable:
                    index_of_extendable[route_id] = []
                index_of_extendable[route_id].append(index_trip)
                #checks if it can only be extended after and not before
                if possible_extentions_before.empty:
                    if route_id not in index_of_begin_sequences:
                        index_of_begin_sequences[route_id] = []
                    index_of_begin_sequences[route_id].append(index_trip)
            elif possible_extentions.empty:
                #check if the trip is not extendable, just because it is a full sequences and not a problem of matching with time
                if (get_extentions('after', False, route_sequences_route_id, trip).empty) and (get_extentions('before', False, route_sequences_route_id, trip).empty):
                    if route_id not in index_of_complete_sequences:
                        index_of_complete_sequences[route_id] = []
                    index_of_complete_sequences[route_id].append(index_trip)
                #the trip does not match with the time of others but could have been extended
                else:
                    if route_id not in index_of_unused_sequences:
                        index_of_unused_sequences[route_id] = []
                    index_of_unused_sequences[route_id].append(index_trip)
                
    return index_of_extendable, index_of_begin_sequences, index_of_complete_sequences, index_of_unused_sequences

'Finds the routes that can be either extended from before or from after and those which are complete sequences'

In [49]:
'''Creates all the sequences of routes possible to reconstruct the real route and calculates their frequency'''

def possible_sequences_construction(stops_sequences_df, index_of_extendable, index_of_begin_sequences):
    '''returns the first part of the route_creation, two others need to be added'''
    import copy
    #create an empty df for the process of route creation
    route_creation  = pd.DataFrame()
    for route_id in index_of_extendable:
        #checks if some parts are begin sequences, if not, then we can't build routes with multiple sequences
        if route_id in index_of_begin_sequences:
            #create a copy of the df with only the route considered in the loop iteration
            routes_with_route_id = select_stops_sequences(stops_sequences_df, route_id)
            #set default frequency to NaN
            routes_with_route_id['frequency'] = np.nan
            #create a df where only the routes that have an end stop as their first element of the sequence
            route_creation_route_id = routes_with_route_id.loc[index_of_begin_sequences[route_id]][['route_id', 'hash', 'stops_sequence', 'dates', 'departure_time_last','frequency', 'trip_id']]
            #create a df with the exentable sequences for that route_id
            route_creation_extensions_route_id = routes_with_route_id.loc[index_of_extendable[route_id]][['route_id', 'hash', 'stops_sequence', 'dates', 'departure_time_first', 'departure_time_last','frequency', 'trip_id']]    
            #make the hash column as a column of lists
            route_creation_route_id['hash'] = route_creation_route_id['hash'].apply(lambda x: [x])
            route_creation_route_id['trip_id'] = route_creation_route_id['trip_id'].apply(lambda x: [x])
            route_creation_route_id = route_creation_route_id.reset_index(drop=True)
            #to stop the while loop when all the routes are complete in the df for the route_id of the loop iteration
            complete_routes = 0
            while complete_routes < len(route_creation_route_id.index):
                #use a deepcopy to not impact the iterrows of the main loop
                route_creation_deep_copy = copy.deepcopy(route_creation_route_id)
                for index_original, route_part in route_creation_deep_copy.iterrows():
                    #create a dataframe of the possible extentions for each route_part
                    #select an extention only if the extention is the next part of the route 
                    #and also that no other station are repeated in the sequence if this extention is added(otherwise it might cause an infinite loop)
                    possible_extentions = get_extentions('after', True, route_creation_extensions_route_id, route_part)
                    #checks whether any extention fullfilling the criterias has been found
                    if not possible_extentions.empty:
                        #if so, extend it with every single possibilities
                        for index_extention, possible_extention in possible_extentions.iterrows():
                            #must create a deepcopy, otherwise the orignal hash list will change as well (mutable)
                            updated_hash = copy.deepcopy(route_part['hash'])
                            updated_hash.append(possible_extention['hash'])
                            updated_route_sequence = route_part['stops_sequence'] + possible_extention['stops_sequence'][1:]
                            common_dates = possible_extention['dates'] & route_part['dates']
                            new_trip_id, new_departure_time_last = retrieve_matching_trip_id_and_departure_time_lasts(list(route_part['departure_time_last']), list(possible_extentions['departure_time_first'])[0], list(possible_extentions['departure_time_last'])[0], list(route_part['trip_id']), list(possible_extentions['trip_id'])[0])
                            new_frequency = len(new_departure_time_last) * len(common_dates)
                            route_creation_route_id.loc[max(route_creation_route_id.index)+1] = [route_id, updated_hash, updated_route_sequence, common_dates, new_departure_time_last, new_frequency, new_trip_id]
                        #then delete the route with the index (see loop here above)
                        route_creation_route_id = route_creation_route_id.drop(index = index_original)            
                    #the route can't be extended anymore
                    else:
                        complete_routes += 1
            #adds all the possible routes created with the trips of the route_id of the main loop
            route_creation = route_creation.append(route_creation_route_id, ignore_index = True)
    if 'departure_time_last' in route_creation.columns:
        route_creation = route_creation.drop(['dates', 'departure_time_last'], axis=1)
        route_creation['trip_id'] = route_creation['trip_id'].apply(lambda x: list(itertools.chain(*x)))
    route_creation = route_creation.reindex(columns=['route_id','hash','stops_sequence', 'frequency', 'trip_id'])
    return route_creation

'Creates all the sequences of routes possible to reconstruct the real route and calculates their frequency'

In [50]:
'''Adds the full sequences to the route_creation dataframe'''

def add_full_sequences(stop_sequences_df, route_creation, index_of_complete_sequences):
    '''returns the second part of the route_creation, one other needs to be added'''
    for route_id in index_of_complete_sequences:
        #finds all the complete sequences for that route_id
        copy_complete_sequences_df = stop_sequences_df.loc[index_of_complete_sequences[route_id]][['route_id','hash','stops_sequence', 'dates', 'departure_time_last', 'trip_id']].copy()
        copy_complete_sequences_df = calculate_frequency(copy_complete_sequences_df)
        #adds each of them in the route_creation dataframe
        for index_complete_sequence, complete_sequence in copy_complete_sequences_df.iterrows():
            route_creation = route_creation.append(complete_sequence, ignore_index = True)
    route_creation = route_creation.sort_values(by=['route_id', 'frequency'], ignore_index = True)
    return route_creation 

'Adds the full sequences to the route_creation dataframe'

In [51]:
'''Adds the sequences that were not yet added in the route_creation dataframe'''

def add_unused_sequences(stop_sequences_df, route_creation, index_of_unused_sequences):
    '''returns the third part of the route_creation'''
    for route_id in index_of_unused_sequences:
        #finds all the unused sequences for that route_id
        copy_sequences_route_id = select_stops_sequences(stop_sequences_df, route_id)[['route_id','hash','stops_sequence', 'dates', 'departure_time_last', 'trip_id']]
        unused_sequences_route_id = copy_sequences_route_id.loc[index_of_unused_sequences[route_id]]
        unused_sequences_route_id = calculate_frequency(unused_sequences_route_id)
        #adds the unused sequences that were not yet added in the route_creation DataFrame
        for index_trip, trip in unused_sequences_route_id.iterrows():
            route_creation = route_creation.append(trip, ignore_index = True)
    route_creation = route_creation.sort_values(by=['route_id', 'frequency'], ignore_index = True)
    return route_creation

'Adds the sequences that were not yet added in the route_creation dataframe'

In [52]:
'''Creates a column in the df that calculates the travel time between the first and last stop (waiting time included)
and another column with the waiting time (calculated with a weighted average based on the frequency)'''
from datetime import datetime
from datetime import timedelta
FMT = '%H:%M:%S'
day_in_seconds = timedelta(days=1).total_seconds()

def give_begin_end_time(route_creation_frequency_single, trips_hash_stops_sequence, stops_cleaned_stop_times_trips_merge_dates):
    #create a copy to not change the input DataFrame
    route_creation_frequency_single = route_creation_frequency_single.copy()
    #makes a column with the representative begin time and end time of the route
    route_creation_frequency_single['travel_time'] = np.nan
    for index_sequence, sequence in route_creation_frequency_single.iterrows():
        constructed_route = pd.DataFrame()
        for index_hash, hash_value in enumerate(sequence['hash']):
            index_plus_one = index_hash + 1
            #take all the trip_id of the sequence with that hash
            next_representative_trips = trips_hash_stops_sequence[(trips_hash_stops_sequence['trip_id'].isin(sequence['trip_id'])) & (trips_hash_stops_sequence['hash'] == hash_value)].copy()['trip_id']
            #take all the stop sequences and their time that belongs 
            full_times = stops_cleaned_stop_times_trips_merge_dates[stops_cleaned_stop_times_trips_merge_dates['trip_id'].isin(next_representative_trips)].copy()
            #select only the last stop sequences of full_times for each trip_id
            new_index_max_per_trip_id = full_times.reset_index().groupby(['route_id', 'trip_id'])['stop_sequence'].idxmax()
            max_per_trip_id = full_times.reset_index().loc[new_index_max_per_trip_id]
            #select only the first stop sequences of full_times for each trip_id            
            new_index_min_per_trip_id = full_times.reset_index().groupby(['route_id', 'trip_id'])['stop_sequence'].idxmin()            
            min_per_trip_id = full_times.reset_index().loc[new_index_min_per_trip_id]
            #merge max_per_trip_id and min_per_trip_id
            merged = min_per_trip_id[['trip_id', 'dates', 'departure_time']].merge(max_per_trip_id[['trip_id', 'arrival_time', 'departure_time']], on='trip_id')
            #take all the stop sequences except the first one, and the last one if it is not the last sequence of the route
            if index_hash == len(sequence['hash']) - 1:
                rest_per_trip_id = full_times.reset_index().drop(pd.concat([new_index_min_per_trip_id,new_index_max_per_trip_id]))
            else:
                rest_per_trip_id = full_times.reset_index().drop(new_index_min_per_trip_id)            
            #ONLY NEEDED FOR SWITZERLAND
            rest_per_trip_id = rest_per_trip_id.dropna()
            if not rest_per_trip_id.empty:
                rest_per_trip_id = calculate_time_difference(rest_per_trip_id, 'departure_time', 'arrival_time', 'waiting_time')
                #calculate the total waiting_time
                rest_per_trip_id_grouped = rest_per_trip_id.groupby(['trip_id'], as_index=False)['waiting_time'].sum()
                merged_waiting_time = merged.merge(rest_per_trip_id_grouped, on='trip_id')
            #in case there are only two stops in for the hash
            else:
                merged_waiting_time = merged.copy()
                merged_waiting_time['waiting_time'] = 0
            #rename the columns     
            merged_waiting_time = merged_waiting_time.rename(columns = {'trip_id': 'trip_id_' + str(index_plus_one),'departure_time_x':'departure_time_'+ str(index_plus_one), 'arrival_time':'arrival_time_'+ str(index_plus_one),
                                          'departure_time_y':'departure_time_'+ str(index_plus_one + 1), 'waiting_time': 'waiting_time_' + str(index_plus_one)})
            if index_hash == 0:
                constructed_route = merged_waiting_time
            elif index_hash > 0:
                constructed_route = constructed_route.merge(merged_waiting_time, how='inner', on=['departure_time_' + str(index_plus_one)])
                #take the intersection of the dates => only get the common dates and retain those rows with common dates
                constructed_route['dates'] = [a & b for a,b in zip(constructed_route['dates_x'], constructed_route['dates_y'])]
                constructed_route = constructed_route[constructed_route['dates'].map(lambda d: len(d)) > 0]
                constructed_route = constructed_route.drop(['dates_x','dates_y'], axis=1)        
        #make a list of all the columns of waiting_times
        list_column_waiting_time = []
        for i in range(1, index_plus_one + 1):
            list_column_waiting_time.append('waiting_time_' + str(i))
        #sum all the waiting times together for each route itinerary
        constructed_route['waiting_time'] = constructed_route[list_column_waiting_time].astype(int).sum(1)
        
        #sometimes it is impossible to find trips that follow each other
        if not constructed_route.empty:
            #when the loop is finished, take the last arrival time, that will be used to calculate the travel time
            time_constructed_route = constructed_route[['departure_time_1', 'arrival_time_' + str(index_plus_one), 'waiting_time', 'dates']]
            time_constructed_route = time_constructed_route.rename(columns = {'departure_time_1':'departure_time', 'arrival_time_' + str(index_plus_one):'arrival_time'})
            time_constructed_route = calculate_time_difference(time_constructed_route, 'arrival_time', 'departure_time', 'time_diff_min')
            #take the average of those columns
            avg_tt = time_constructed_route['time_diff_min'].mean()
            avg_wt = time_constructed_route['waiting_time'].mean()
            #Add this to the first dataframe
            route_creation_frequency_single.loc[index_sequence,'travel_time'] = avg_tt
            route_creation_frequency_single.loc[index_sequence,'waiting_time'] = avg_wt
        #if there is no trips that follow each other with the hash from the array
        else:
            route_creation_frequency_single = route_creation_frequency_single.drop(index_sequence)
    route_creation_frequency_single = route_creation_frequency_single.sort_values(by=['route_id', 'frequency', 'travel_time'], ignore_index=True)    
    route_creation_frequency_single = route_creation_frequency_single.drop(columns='trip_id')
    return route_creation_frequency_single

'Creates a column in the df that calculates the travel time between the first and last stop (waiting time included)\nand another column with the waiting time (calculated with a weighted average based on the frequency)'

In [53]:
def calculate_hash_route_creation(route_creation): 
    '''calculates the hash and the hash inverse of the route_creation'''
    #copy the route_creation DataFrame
    route_creation_hash = route_creation.copy()
    #calculate the hash and the hash inverse using the lists in stop_sequence
    route_creation_hash['hash'] = route_creation_hash['stops_sequence'].apply(lambda x: hash(tuple(x)))
    route_creation_hash['hash_inverse'] = route_creation_hash['stops_sequence'].apply(lambda x: hash(tuple(x[::-1])))
    return route_creation_hash

In [54]:
'''Regroup the routes that are the same (even though they are in the opposite direction)'''

def regroup_same_stops_sequences(route_creation_hash):
    '''regroups the stops_sequences that are the same'''
    
    route_creation_max_hash = route_creation_hash.copy()
    route_creation_max_hash['max_hash'] = route_creation_max_hash[['hash', 'hash_inverse']].max(axis=1)
    #create a df that sums the frequency of the trips going from opposite directions
    route_creation_max_hash_freq = route_creation_max_hash.groupby(['route_id','max_hash'], as_index = False)[['frequency']].sum()
    #drops the column freq_sequence_route because the one that is of interest is in route_creation_max_hash_freq
    route_hash_without_freq = route_creation_max_hash.copy().drop(['frequency', 'travel_time', 'waiting_time'], axis = 1)
    #retains only one element per pair of route_id and hash
    route_hash_without_freq = route_hash_without_freq.drop_duplicates(subset=['route_id', 'hash'])

    #creates a df for the calculation of the weighted avg of travel_time and waiting_time for each max_hash
    calculation_weighted_avg = pd.merge(route_creation_max_hash, route_creation_max_hash_freq, right_on=['route_id','max_hash'], left_on=['route_id','max_hash'])
    calculation_weighted_avg = calculation_weighted_avg.rename(columns = {'frequency_y': 'sum_frequency', 'frequency_x':'frequency'})
    calculation_weighted_avg['WS_travel_time'] = (calculation_weighted_avg['travel_time'] * calculation_weighted_avg['frequency'])/calculation_weighted_avg['sum_frequency']
    calculation_weighted_avg['WS_waiting_time'] = (calculation_weighted_avg['waiting_time'] * calculation_weighted_avg['frequency'])/calculation_weighted_avg['sum_frequency']    
    calculation_weighted_avg = calculation_weighted_avg.groupby(by=['route_id', 'max_hash'])[['WS_travel_time', 'WS_waiting_time']].sum()
    calculation_weighted_avg = calculation_weighted_avg.rename(columns = {'WS_travel_time': 'travel_time', 'WS_waiting_time':'waiting_time'})
    
    #merge the weighted avg of travel_time and waiting_time with each combination of route_id and hash
    route_hash_without_freq = pd.merge(route_hash_without_freq, calculation_weighted_avg, right_on=['route_id','max_hash'], left_on=['route_id','max_hash'] )
    route_hash_without_freq = route_hash_without_freq.drop(['max_hash'], axis=1)
    #renames the max_hash column into hash so it the dataframe can be merged with route_hash_without_freq
    route_creation_max_hash_freq = route_creation_max_hash_freq.rename(columns = {'max_hash':'hash'})
    route_hash_freq_combined_first_merge = pd.merge(route_creation_max_hash_freq, route_hash_without_freq, on=['route_id', 'hash'], how='left')
    #selects the part of the dataset that doesn't have NaN (because for the NaN, their hash_value that was max was the one in hash_inverse and it didn't exist in the other df), so we can concatenate it with the part that had NaN later
    route_hash_freq_first_part = route_hash_freq_combined_first_merge[pd.notnull(route_hash_freq_combined_first_merge['stops_sequence'])]
    #selects one part the part of the dataset that does have NaN, so we can concatenate it with the part that has no NaN later on.
    #but first, we will need to fill those NaN values (done in the code lines behind this one)
    route_hash_freq_second_part = route_hash_freq_combined_first_merge[pd.isnull(route_hash_freq_combined_first_merge['stops_sequence'])][['route_id', 'hash', 'frequency']]
    #renames the hash column into hash_inverse so it the dataframe can be merged with route_hash_without_freq (because it didn't work with 'hash' on the first merge)
    route_hash_freq_second_part = route_hash_freq_second_part.rename(columns = {'hash':'hash_inverse'})
    route_hash_freq_second_part = pd.merge(route_hash_freq_second_part, route_hash_without_freq, on=['route_id', 'hash_inverse'], how='left')
    #the hash that is of interest in the final df will be hash and not hash_inverse
    route_hash_freq_combined_not_sorted = pd.concat([route_hash_freq_first_part, route_hash_freq_second_part])
    route_hash_freq_combined = route_hash_freq_combined_not_sorted.sort_values(by = ['route_id','frequency', 'travel_time'], ignore_index=True)
    return route_hash_freq_combined

'Regroup the routes that are the same (even though they are in the opposite direction)'

In [55]:
'''Deletes the routes that do not represent 10% of the total route frequency and creates new route, if some of them are different'''

def apply_treshold_route_creation(route_hash_freq_combined): 
    #calculates the total frequency per route_id
    frequency_each_route = route_hash_freq_combined.groupby(['route_id'], as_index = False)['frequency'].sum()
    frequency_treshold = frequency_each_route.copy()
    #calculates the treshold (here 10%)
    frequency_treshold['frequency'] = frequency_treshold['frequency']/10
    frequency_treshold.rename(columns = {'frequency':'frequency_treshold'}, inplace = True)
    route_hash_freq_treshold = route_hash_freq_combined.merge(frequency_treshold, on='route_id', how = 'left')
    #find the sequences that are not more than 10% of the route frequency and delete them
    index_names = route_hash_freq_treshold[route_hash_freq_treshold['frequency'] < route_hash_freq_treshold['frequency_treshold']].index
    route_hash_freq_treshold.drop(index_names, inplace = True)
    #drop the routes with the same hash as others
    route_hash_freq_treshold['max_hash'] = route_hash_freq_treshold[['hash', 'hash_inverse']].max(axis=1)
    
    #Calculate weighted average of the sequences that have the same max hash
    route_creation_max_hash_freq = route_hash_freq_treshold.groupby(['max_hash'], as_index = False)[['frequency']].sum()
    calculation_weighted_avg = pd.merge(route_hash_freq_treshold, route_creation_max_hash_freq, on=['max_hash'])
    calculation_weighted_avg = calculation_weighted_avg.rename(columns = {'frequency_y': 'sum_frequency', 'frequency_x':'frequency'})
    calculation_weighted_avg['WS_travel_time'] = (calculation_weighted_avg['travel_time'] * calculation_weighted_avg['frequency'])/calculation_weighted_avg['sum_frequency']
    calculation_weighted_avg['WS_waiting_time'] = (calculation_weighted_avg['waiting_time'] * calculation_weighted_avg['frequency'])/calculation_weighted_avg['sum_frequency']    
    calculation_weighted_avg = calculation_weighted_avg.groupby(by=['max_hash'])[['WS_travel_time', 'WS_waiting_time']].sum()
    calculation_weighted_avg = calculation_weighted_avg.rename(columns = {'WS_travel_time': 'travel_time', 'WS_waiting_time':'waiting_time'})
    
    #change the travel time and waiting time to the weighted avg value
    route_hash_freq_treshold = route_hash_freq_treshold.drop(columns=['travel_time', 'waiting_time'])
    route_hash_freq_treshold = pd.merge(route_hash_freq_treshold, calculation_weighted_avg, on='max_hash')
    
    route_hash_freq_treshold = route_hash_freq_treshold.drop_duplicates(subset='max_hash')
    route_hash_freq_treshold  = route_hash_freq_treshold.drop(['hash_inverse', 'max_hash'], axis = 1)
    #selects the sequences that are not the first most frequent per route_id
    sequences_max_freq = route_hash_freq_treshold.groupby(['route_id'],as_index = False)['frequency'].max()
    sequences_max_freq.rename(columns = {'frequency':'max_frequency'}, inplace = True)
    sequences_max_freq_merged = route_hash_freq_treshold.merge(sequences_max_freq, on='route_id', how='left')
    sequences_max_freq_index = sequences_max_freq_merged[sequences_max_freq_merged['frequency'] == sequences_max_freq_merged['max_frequency']].drop_duplicates(subset='route_id').index
    sequences_non_max_freq_index = sequences_max_freq_merged[~sequences_max_freq_merged.index.isin(sequences_max_freq_index)].index
    #those selected sequences get a new route_id that starts from routes['route_id'].max() + 1 (except for Switzerland, wherre it starts with the index 1) and increments by one for each new route
    if route_hash_freq_combined['route_id'].dtype == np.int64:
        route_id_creation = route_hash_freq_combined['route_id'].max() + 1
    else:
        route_id_creation =  0 + 1
    new_route_id_column = list(range(route_id_creation, route_id_creation + len(sequences_non_max_freq_index)))    
    sequences_max_freq_merged.loc[sequences_non_max_freq_index, 'route_id'] = new_route_id_column
    #keep only the column route_id and stops_sequence
    final_routes = sequences_max_freq_merged.drop(sequences_max_freq_merged[sequences_max_freq_merged['frequency'] == 0].index)
    final_routes = final_routes.drop(columns=['hash', 'frequency', 'frequency_treshold', 'max_frequency'])
    return final_routes

'Deletes the routes that do not represent 10% of the total route frequency and creates new route, if some of them are different'

In [56]:
''' To keep only the routes that have at least one country station in their route_sequence'''

def keep_country_routes(final_routes, stops_country_series):
    non_country_routes = set()
    for index_route, route in final_routes.iterrows():
        is_in_country = False
        for stop in route['stops_sequence']:
            if stop in set(stops_country_series):
                is_in_country = True
                break
        if not is_in_country:
            route_id = route['route_id']
            non_country_routes.add(route_id)
    country_routes = final_routes.loc[~final_routes['route_id'].isin(non_country_routes)] 
    if country_routes['route_id'].dtype == np.int64:
        country_routes = country_routes.sort_values(by=['route_id'], ignore_index=True)

    return country_routes

' To keep only the routes that have at least one country station in their route_sequence'

In [57]:
'''Calculates the distances of the trip, by taking the distance between each stop of the stop_sequence'''

def calculate_distance_from_lat_long(name_first, name_second, stop_df):
        lon_first, lat_first = math.radians(stop_df[stop_df['stop_name'] == name_first].iloc[0]['stop_lon']), math.radians(stop_df[stop_df['stop_name'] == name_first].iloc[0]['stop_lat'])
        lon_second, lat_second = math.radians(stop_df[stop_df['stop_name'] == name_second].iloc[0]['stop_lon']), math.radians(stop_df[stop_df['stop_name'] == name_second].iloc[0]['stop_lat'])
        # The radius of the earth
        R = 6373.0 
        # To calculate the change in coordinates
        dlon = lon_second - lon_first
        dlat = lat_second - lat_first
        # To use the Haversine formula to get the distance in kilometers between the starting_station and the ending_station
        a = math.sin(dlat / 2)**2 + math.cos(lat_first) * math.cos(lat_second) * math.sin(dlon / 2)**2
        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
        # To calculate the distance
        distance = R * c
        return distance

def calculate_distance(stop_sequence, stop_df):
    distance = 0
    for index_stop ,stop in enumerate(stop_sequence):
        index_plus_one = index_stop + 1
        if index_plus_one <= len(stop_sequence) - 1:
            distance += calculate_distance_from_lat_long(stop, stop_sequence[index_plus_one], stop_df)
    return distance

'Calculates the distances of the trip, by taking the distance between each stop of the stop_sequence'

In [58]:
'''Makes a df that can be used for building the nodes and edges of the graph using Networkx package'''

def create_df_for_Networkx(final_routes):
    '''return df_for_edges a df that can be used to build a Networkx space-of-stops graph'''
    #takes the list stop sequence and make it a new column for each stop
    stops_sequence_values = final_routes.apply(lambda x: pd.Series(x['stops_sequence']),axis=1).stack().reset_index(level=1, drop=True)
    stops_sequence_values.name = 'stops_sequence'
    final_routes_stops = final_routes.drop('stops_sequence', axis=1).join(stops_sequence_values)
    final_routes_stops = final_routes_stops.reset_index(drop=True)
    #Creates a shifted instance of the df to use it for the final result
    final_routes_stops_shifted = final_routes_stops.shift()
    #Check if which of the rows are followed by a row with the same trip_id
    final_routes_stops_shifted['match'] = final_routes_stops_shifted['route_id'].eq(final_routes_stops['route_id'])
    #Drop the rows for which this condition is not satisfied
    final_routes_stops_shifted.drop(final_routes_stops_shifted[final_routes_stops_shifted['match'] == False].index, inplace = True)
    final_routes_stops_shifted.rename(columns=
      {"stops_sequence": "stop_name_1",
      "stop_name": "stop_name_1"}, inplace=True)
    #joins the df with its shifted version sothat each sequence of two stations is represented in the table as a row
    df_for_edges = final_routes_stops_shifted.join(final_routes_stops[['stops_sequence']], lsuffix='_caller', rsuffix='_other', how='left')
    df_for_edges.rename(columns=
      {"stops_sequence": "stop_name_2",
      "stop_name": "stop_name_2"}, inplace=True)

    df_for_edges = df_for_edges.drop_duplicates()
    df_for_edges = df_for_edges[['route_id','stop_name_1', 'stop_name_2']]
    df_for_edges = df_for_edges.reset_index(drop=True)
    return df_for_edges

'Makes a df that can be used for building the nodes and edges of the graph using Networkx package'

In [59]:
'''Applies all the functions from 1 get_extention_indexes to 11 create_df_for_Networkx'''

def full_route_creation(stops_sequences_df, trips_hash_stops_sequence, stops_cleaned_stop_times_trips_merge_dates, stops_cleaned, stops_country_series):
    '''return a df that can be used to make a Networkx space-of-stops (with treshold applied of 10%)'''
    index_of_extendable, index_of_begin_sequences, index_of_complete_sequences, index_of_unused_sequences = get_extention_indexes(stops_sequences_df)
    route_creation_first = possible_sequences_construction(stops_sequences_df, index_of_extendable, index_of_begin_sequences)
    route_creation_second = add_full_sequences(stops_sequences_df, route_creation_first, index_of_complete_sequences)
    route_creation_third = add_unused_sequences(stops_sequences_df, route_creation_second, index_of_unused_sequences)
    route_creation_frequency_single_travel_time = give_begin_end_time(route_creation_third, trips_hash_stops_sequence, stops_cleaned_stop_times_trips_merge_dates)
    route_creation_hash = calculate_hash_route_creation(route_creation_frequency_single_travel_time)
    route_hash_freq_combined = regroup_same_stops_sequences(route_creation_hash)
    final_routes = apply_treshold_route_creation(route_hash_freq_combined)
    country_routes = keep_country_routes(final_routes, stops_country_series)
    country_routes['distance'] = country_routes['stops_sequence'].apply(lambda x: calculate_distance(x, stops_cleaned))
    df_for_edges = create_df_for_Networkx(country_routes)
    
    return country_routes, df_for_edges

'Applies all the functions from 1 get_extention_indexes to 11 create_df_for_Networkx'

## Actual route creation

### Belgium

In [60]:
#THE FOLLOWING LINE MIGHT BE RUNNED IF WANTED TO RECALCULATE THE ROUTE CREATIONS, BUT IT TAKES A LONG TIME
belgian_routes, df_for_edges_Belgium = full_route_creation(routes_hash_Belgium, generic_trips_information_Belgium, trips_hash_Belgium, stops_cleaned_Belgium, stops_Belgium_series)

In [61]:
'''Save those two last DataFrames as .csv files'''
#belgian_routes.reset_index(drop=True).to_csv(f'{routes_loc}belgian_routes_Belgium.csv')
#df_for_edges_Belgium.reset_index(drop=True).to_csv(f'{df_for_edges_loc}df_for_edges_Belgium.csv')

'Save those two last DataFrames as .csv files'

In [62]:
#belgian_routes = pd.read_csv(routes_loc + 'belgian_routes_Belgium.csv', sep=",")
#df_for_edges_Belgium = pd.read_csv( df_for_edges_loc + 'df_for_edges_Belgium.csv', sep=",")

In [63]:
belgian_routes

Unnamed: 0,route_id,stops_sequence,travel_time,waiting_time,distance
0,115,"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI]",16.767176,0.000000,18.856662
1,116,"[KNOKKE, DUINBERGEN, HEIST, BRUGES-SAINT-PIERR...",21.202262,0.257485,18.936973
2,117,"[VERVIERS-CENTRAL, PEPINSTER, PEPINSTER-CITE, ...",28.570370,4.192593,14.412581
3,118,"[GAND-SAINT-PIERRE, DE PINTE, DEINZE, AARSELE,...",69.000000,4.000000,83.752756
4,119,"[GRAMMONT, SCHENDELBEKE, IDEGEM, ZANDBERGEN, A...",26.761134,1.000000,20.655690
...,...,...,...,...,...
564,823,"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI, LEUZE...",143.000000,24.000000,127.207623
565,824,"[MOUSCRON, HERSEAUX, FROYENNES, TOURNAI, LEUZE...",128.000000,18.000000,123.877025
566,825,"[COURTRAI, HARELBEKE, WAREGEM, DEINZE, DE PINT...",63.150485,10.480583,69.662900
567,826,"[BINCHE, LEVAL, LA LOUVIERE-SUD, LA LOUVIERE- ...",158.000000,21.000000,125.808981


### Netherlands

In [64]:
#THE FOLLOWING LINE MIGHT BE RUNNED IF WANTED TO RECALCULATE THE ROUTE CREATIONS, BUT IT TAKES A LONG TIME
dutch_routes, df_for_edges_Netherlands = full_route_creation(routes_hash_Netherlands, generic_trips_information_Netherlands, trips_hash_Netherlands, stops_cleaned_Netherlands, stops_Netherlands_series)

In [65]:
'''Save those two last DataFrames as .csv files'''
#dutch_routes.reset_index(drop=True).to_csv(f'{routes_loc}dutch_routes_Netherlands.csv')
#df_for_edges_Netherlands.reset_index(drop=True).to_csv(f'{df_for_edges_loc}df_for_edges_Netherlands.csv')

'Save those two last DataFrames as .csv files'

In [66]:
#dutch_routes = pd.read_csv(routes_loc + 'dutch_routes_Netherlands.csv', sep=",")
#df_for_edges_Netherlands = pd.read_csv(df_for_edges_loc + 'df_for_edges_Netherlands.csv', sep=",")

In [67]:
dutch_routes

Unnamed: 0,route_id,stops_sequence,travel_time,waiting_time,distance
0,17522,"[AMSTERDAM CENTRAAL, SCHIPHOL AIRPORT, ROTTERD...",40.771873,2.000000,59.444891
1,17562,"[ROTTERDAM CENTRAAL, ROTTERDAM ALEXANDER, GOUD...",37.502840,1.000000,47.994900
2,17573,"[LEIDEN CENTRAAL, LEIDEN LAMMENSCHANS, ALPHEN ...",42.499452,3.500219,46.394804
3,17599,"[ALMERE OOSTVAARDERS, ALMERE BUITEN, ALMERE PA...",62.337773,13.578904,45.962746
4,17601,"[AMERSFOORT SCHOTHORST, AMERSFOORT CENTRAAL, U...",61.028709,8.032569,78.383771
...,...,...,...,...,...
190,77642,"[EMMEN, EMMEN ZUID, COEVORDEN, HARDENBERG, OMM...",53.000000,4.000000,68.807951
191,77643,"[GRONINGEN, ZUIDHORN, FEANWALDEN, LEEUWARDEN]",34.546993,1.000000,52.851600
192,77644,"[SNEEK, SNEEK NOORD, LEEUWARDEN]",18.000000,1.384615,20.557738
193,77645,"[BAD NIEUWESCHANS, WINSCHOTEN, SCHEEMDA, ZUIDB...",43.000000,1.211538,45.893449


### Switzerland

In [68]:
#THE FOLLOWING LINE MIGHT BE RUNNED IF WANTED TO RECALCULATE THE ROUTE CREATIONS, BUT IT TAKES A LONG TIME
swiss_routes, df_for_edges_Switzerland = full_route_creation(routes_hash_Switzerland, generic_trips_information_Switzerland, trips_hash_Switzerland, stops_cleaned_Switzerland, stops_Switzerland_series)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  country_routes['distance'] = country_routes['stops_sequence'].apply(lambda x: calculate_distance(x, stops_cleaned))


In [69]:
'''Save those two last DataFrames as .csv files'''
#swiss_routes.reset_index(drop=True).to_csv(f'{routes_loc}swiss_routes_Switzerland.csv')
#df_for_edges_Switzerland.reset_index(drop=True).to_csv(f'{df_for_edges_loc}df_for_edges_Switzerland.csv')

'Save those two last DataFrames as .csv files'

In [70]:
#swiss_routes = pd.read_csv(routes_loc + 'swiss_routes_Switzerland.csv', sep=",")
#df_for_edges_Switzerland = pd.read_csv(df_for_edges_loc + 'df_for_edges_Switzerland.csv', sep=",")

In [71]:
swiss_routes

Unnamed: 0,route_id,stops_sequence,travel_time,waiting_time,distance
0,00001.000044.018:1,"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...",7.500000,2.000000,3.320147
1,00001.000104.001:1,"[BRIENZ BRB, PLANALP, BRIENZER ROTHORN]",61.030016,0.000000,4.552323
2,00003.000011.101:3,"[BASEL BAD BF, BASEL SBB, HAUENSTEIN-BASISTUNN...",77.503704,17.180000,79.239036
3,00003.000011.102:3,"[BASEL SBB, HAUENSTEIN-BASISTUNNEL, ZURICH HB,...",139.375927,14.120363,182.406957
4,00003.000011.103:3,"[BASEL SBB, HAUENSTEIN-BASISTUNNEL, ZURICH HB]",54.024419,0.000000,76.670926
...,...,...,...,...,...
2685,96801.000011.101:96801,"[VALENCE, BELLEGARDE (AIN), GENEVE]",196.000000,2.000000,177.110629
2686,96802.000011.101:96802,"[GENEVE, BELLEGARDE (AIN), GRENOBLE]",124.000000,0.000000,129.437931
2687,96804.000011.101:96804,"[GENEVE, BELLEGARDE (AIN), CULOZ, GRENOBLE]",140.000000,8.666667,129.465746
2688,96804.000011.102:96804,"[GENEVE, BELLEGARDE (AIN), CULOZ, GRENOBLE, VA...",214.000000,11.000000,200.296819
