In [1]:
!pip install geopy



In [2]:
'''To import the required packages.'''
import pandas as pd
import numpy as np
import networkx as nx
import collections
import matplotlib.pyplot as plt
import math
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Import of the Swiss railway datasets

In [3]:
'''To register the GitHub link with the Swiss data as a variable.'''
datalink = "https://raw.githubusercontent.com/polkuleuven/Thesis_Train/main/gtfs_train_Switzerland_1503/"

In [4]:
'''Import all the GTFS data'''

#To import the agency dataset that contains limited information about the Swiss SBB railway agency.
agency_Switzerland = pd.read_csv(datalink + "agency.txt", sep=",")
#To import the stops dataset that contains information about the ids, the names and the geographical coordinates of the Swiss SBB railway stations.
stops_Switzerland = pd.read_csv(datalink + "stops.txt", sep=",")
#To import the feed_info dataset that contains limited information about the Swiss SBB railway feed.
feed_info_Switzerland = pd.read_csv(datalink + "feed_info.txt", sep=",")
#To import the transfers dataset that gives the minimum transfer time to switch routes at each Swiss SBB railway station.
transfers_not_cleaned_Switzerland = pd.read_csv(datalink + "transfers.txt", sep=",")
#To import the routes dataset that provides the id, the name and the type of vehicle used for all Swiss SBB railway routes.
routes_Switzerland = pd.read_csv(datalink + "routes.txt", sep=",")
#To import the trips dataset that gives for all routes an overview of the trips and the headsigns of these trips belonging to the Swiss SBB railway route.
#The service_id is an indication of all the dates this trip is valid (consultable in the calendar_dates dataset).
trips_Switzerland = pd.read_csv(datalink + "trips.txt", sep=",")
#To import the stop_times dataset that gives for all trips an overview of the ids of the stations served and the sequence in which these stations are served. 
#In addition, for all the trips the arrival and departure times at the stations served are given.
stop_times_Switzerland = pd.read_csv(datalink + "stop_times.txt", sep=",")
#To import the calendar dataset that gives the first and last date of all data observations.
calendar_Switzerland = pd.read_csv(datalink + "calendar.txt", sep=",")
#To import the calendar_dates dataset that gives for each service_id all the exact dates when that service_id is valid.
calendar_dates_Switzerland = pd.read_csv(datalink + "calendar_dates.txt", sep=",")


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Cleaning of the Swiss railway data

In [5]:
''' To clean the routes_Switzerland df.'''
#To keep the train routes
routes_cleaned_Switzerland = routes_Switzerland[routes_Switzerland['route_type'] == 2]
routes_cleaned_Switzerland

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,04236.06____.020:4236,06____,RE,RE 4236,,2,,,
1,04700.06____.001:4700,06____,RE,RE 4700,,2,,,
2,04700.06____.014:4700,06____,RE,RE 4700,,2,,,
3,04701.06____.002:4701,06____,RE,RE 4701,,2,,,
4,04701.06____.015:4701,06____,RE,RE 4701,,2,,,
...,...,...,...,...,...,...,...,...,...
49346,87945.L7____.001:5,L7____,S5,S 5,,2,,FFEA00,000000
49347,87946.L7____.001:5,L7____,S5,S 5,,2,,FFEA00,000000
49348,87947.L7____.001:5,L7____,S5,S 5,,2,,FFEA00,000000
49349,87948.L7____.001:5,L7____,S5,S 5,,2,,FFEA00,000000


In [6]:
''' To clean the trips_Switzerland df.'''
# To remove the routes that are not train routes
no_route_id_train_route = routes_Switzerland.loc[routes_Switzerland['route_type'] != 2, 'route_id']
trips_cleaned_Switzerland = trips_Switzerland[(~trips_Switzerland['route_id'].isin(no_route_id_train_route))]

# To remove the accents from the trip_headsign and to change to uppercase
trips_cleaned_Switzerland.loc[:,'trip_headsign'] = trips_cleaned_Switzerland.loc[:,'trip_headsign'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
trips_cleaned_Switzerland.loc[:,'trip_headsign'] = trips_cleaned_Switzerland.loc[:,'trip_headsign'].str.upper()
trips_cleaned_Switzerland

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,bikes_allowed,attributes_ch
0,04236.06____.020:4236,19311,0:1,KARLSRUHE HBF,4236,,,,0,MO
1,04700.06____.001:4700,133763,1:1,KARLSRUHE HBF,4700,,,,0,MO
2,04700.06____.001:4700,1,1:2,KARLSRUHE HBF,4700,,,,0,MO
3,04700.06____.001:4700,13336,1:3,KARLSRUHE HBF,4700,,,,0,MO
4,04700.06____.001:4700,2610,1:4,KARLSRUHE HBF,4700,,,,0,MO
...,...,...,...,...,...,...,...,...,...,...
90223,87947.L7____.001:5,2364,49348:1,WEIL AM RHEIN,5,,,,0,MO
90224,87948.L7____.001:5,56518,49349:1,LORRACH HBF,5,,,,0,MO
90225,87948.L7____.001:5,17102,49349:2,ZELL (WIESENTAL),5,,,,0,MO
90226,87949.L7____.001:5,17102,49350:1,WEIL AM RHEIN,5,,,,0,MO


In [7]:
''' To clean the stop_times_Switzerland df.'''
# To remove the stop_times trip_ids that are not trip_ids that belong to train routes
no_trip_id_train_route = trips_Switzerland.loc[trips_Switzerland['route_id'].isin(no_route_id_train_route), 'trip_id']
stop_times_cleaned_Switzerland = stop_times_Switzerland[(~stop_times_Switzerland['trip_id'].isin(no_trip_id_train_route))]

# To remove the superfluous characters of the stop_id (: and platform_code)
stop_times_cleaned_Switzerland_column = stop_times_cleaned_Switzerland['stop_id'].str.split(':').str[0]
stop_times_cleaned_Switzerland.loc[:,'stop_id'] = stop_times_cleaned_Switzerland_column

# To make the stop_ids numerical and to remove the duplicate stop_ids
stop_times_cleaned_Switzerland.loc[:,'stop_id'] = stop_times_cleaned_Switzerland.loc[:,'stop_id'].astype(np.int64)
stop_times_cleaned_Switzerland

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,attributes_ch
0,0:1,18:16:00,18:16:00,8014554,0,,0,0,,
1,0:1,18:28:00,18:30:00,8014534,1,,0,0,,
2,0:1,18:40:00,18:40:00,8014529,2,,0,0,,
3,0:1,18:49:00,18:50:00,8014521,3,,0,0,,
4,0:1,18:58:00,18:59:00,8014518,4,,0,0,,
...,...,...,...,...,...,...,...,...,...,...
1034556,49350:2,10:38:00,10:38:00,8069220,3,,3,3,,X
1034557,49350:2,10:40:00,10:40:00,8014429,4,,3,3,,X
1034558,49350:2,10:41:00,10:42:00,8060979,5,,3,3,,X
1034559,49350:2,10:43:00,10:43:00,8060978,6,,3,3,,X


''' To clean the stops_Switzerland df   (1).'''
##### To select all stops that appear in the stop_times df
stops_cleaned_df_Switzerland = stop_times_cleaned_Switzerland[['stop_id']]
stops_cleaned_df_Switzerland = stops_cleaned_df_Switzerland.drop_duplicates()

##### To make the stop_id attribute in the initial stops_Switzerland df numerical
stops_initial_Switzerland_column = stops_Switzerland['stop_id'].str.split(':').str[0]
stops_initial_Switzerland = stops_Switzerland.copy()
stops_initial_Switzerland.loc[:,'stop_id'] = stops_initial_Switzerland_column

##### To make the stop_ids numerical and to remove the duplicate stop_ids
stops_initial_Switzerland = stops_initial_Switzerland[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']]
stops_initial_Switzerland.loc[:,'stop_id'] = stops_initial_Switzerland.loc[:,'stop_id'].astype(np.int64)
stops_initial_Switzerland = stops_initial_Switzerland.drop_duplicates()

##### To remove the accents from the stop_name and to change to uppercase
stops_initial_Switzerland.loc[:,'stop_name'] = stops_initial_Switzerland.loc[:,'stop_name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
stops_initial_Switzerland.loc[:,'stop_name'] = stops_initial_Switzerland.loc[:,'stop_name'].str.upper()

##### To merge the stops_cleaned_df_Switzerland and the stops_initial_Switzerland df
stops_cleaned_Switzerland = pd.merge(stops_initial_Switzerland, stops_cleaned_df_Switzerland, on = 'stop_id', how='right')

''' To clean the stops_Switzerland df   (2).'''
##### To initialize the Nominatim API to get the location from the input string 
geolocator = Nominatim(user_agent="application")
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=0.2)

##### To get the location with the geolocator.reverse() function and to extract the country from the location instance
country_list = []
for index, row in stops_cleaned_Switzerland.iterrows():
    latitude = row['stop_lat']
    longitude = row['stop_lon']
    # To assign the latitude and longitude into a geolocator.reverse() method
    location = reverse((latitude, longitude), language='en', exactly_one=True)
    # To get the country from the given list and parsed into a dictionary with raw function()
    address = location.raw['address']
    country = address.get('country', '')
    country_list.append(country)

##### To add the values of country_list as a new attribute country     
stops_cleaned_Switzerland.loc[:,'country'] = country_list
stops_cleaned_Switzerland

##### To calculate the total number of Belgian stations in the stops_cleaned dataset
swiss_stops_Switzerland = stops_cleaned_Switzerland[stops_cleaned_Switzerland['country'] == 'Switzerland']
swiss_stops_Switzerland_series = stops_cleaned_Switzerland.loc[stops_cleaned_Switzerland['country'] == 'Switzerland', 'stop_name']

swiss_stops_Switzerland_series.to_csv(r'/Users/pol/Desktop/CSV_export/swiss_stops_Switzerland_series.csv', index = False, header=True, encoding='utf-8-sig')

stops_cleaned_Switzerland.to_csv(r'/Users/pol/Desktop/CSV_export/stops_cleaned_Switzerland.csv', index = False, header=True, encoding='utf-8-sig')

In [8]:
'''import the cleaned version of the stops with their country'''
stops_cleaned_Switzerland = pd.read_csv("https://raw.githubusercontent.com/polkuleuven/Thesis_Train/main/stops_cleaned/stops_cleaned_Switzerland.csv", sep=",")
swiss_stops_Switzerland_series = pd.read_csv("https://raw.githubusercontent.com/polkuleuven/Thesis_Train/main/country_stops_series/swiss_stops_Switzerland_series.csv", sep=",")['stop_name']

In [9]:
'''filter the dats from the selected begin to the end date'''
#here we used 4 months
begin_date = 20210314
end_date = 20210713
filtered_calendar_dates_Switzerland = calendar_dates_Switzerland.copy()
filtered_calendar_dates_Switzerland = filtered_calendar_dates_Switzerland.drop(filtered_calendar_dates_Switzerland[(filtered_calendar_dates_Switzerland['date'] > end_date) |(filtered_calendar_dates_Switzerland['date'] < begin_date)].index)
filtered_calendar_dates_Switzerland

Unnamed: 0,service_id,date,exception_type
63,5,20210315,1
64,5,20210316,1
65,5,20210317,1
66,5,20210318,1
67,5,20210319,1
...,...,...,...
1536188,12559,20210711,1
1536189,12559,20210712,1
1536190,12559,20210713,1
1536341,12560,20210603,1


# Exploratory data analysis with the Swiss railway data

In [10]:
'''To calculate the number of unique route_ids '''
set_routes_Switzerland = {r for r in routes_cleaned_Switzerland['route_id']}
len(set_routes_Switzerland)

46037

In [11]:
'''To calculate the total number of stations in the stops_cleaned_Switzerland dataset'''
set_stations_Switzerland = {s for s in stops_cleaned_Switzerland['stop_id']}
len(set_stations_Switzerland)

2608

In [12]:
'''To calculate the total number of Swiss stations in the stops_cleaned dataset'''
len(swiss_stops_Switzerland_series)

1762

# **Preparation for the L-space representation of the Swiss railway system**

In [13]:
'''To merge a selection of the trips dataset and a selection of the routes dataset on route_id'''
trips_routes_Switzerland = pd.merge(trips_cleaned_Switzerland[['route_id','service_id','trip_id', 'trip_headsign']], routes_cleaned_Switzerland[['route_id', 'route_short_name', 'route_long_name']], on='route_id')
trips_routes_Switzerland

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,route_short_name,route_long_name
0,04236.06____.020:4236,19311,0:1,KARLSRUHE HBF,RE,RE 4236
1,04700.06____.001:4700,133763,1:1,KARLSRUHE HBF,RE,RE 4700
2,04700.06____.001:4700,1,1:2,KARLSRUHE HBF,RE,RE 4700
3,04700.06____.001:4700,13336,1:3,KARLSRUHE HBF,RE,RE 4700
4,04700.06____.001:4700,2610,1:4,KARLSRUHE HBF,RE,RE 4700
...,...,...,...,...,...,...
86909,87947.L7____.001:5,2364,49348:1,WEIL AM RHEIN,S5,S 5
86910,87948.L7____.001:5,56518,49349:1,LORRACH HBF,S5,S 5
86911,87948.L7____.001:5,17102,49349:2,ZELL (WIESENTAL),S5,S 5
86912,87949.L7____.001:5,17102,49350:1,WEIL AM RHEIN,S5,S 5


In [14]:
'''To merge a selection of the stop_times_cleaned_Switzerland dataset with the stops_cleaned_Switzerland dataset'''
stop_times_stops_Switzerland = pd.merge(stop_times_cleaned_Switzerland[['trip_id','arrival_time', 'departure_time','stop_id','stop_sequence']], stops_cleaned_Switzerland, on='stop_id')
stop_times_stops_Switzerland

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_name,stop_lat,stop_lon,country
0,0:1,18:16:00,18:16:00,8014554,0,ENGEN,47.856347,8.772786,Germany
1,7:1,06:32:00,06:33:00,8014554,4,ENGEN,47.856347,8.772786,Germany
2,7:2,06:32:00,06:33:00,8014554,4,ENGEN,47.856347,8.772786,Germany
3,12:1,07:18:00,07:19:00,8014554,12,ENGEN,47.856347,8.772786,Germany
4,12:2,07:18:00,07:19:00,8014554,12,ENGEN,47.856347,8.772786,Germany
...,...,...,...,...,...,...,...,...,...
991431,49275:2,24:04:00,24:04:00,8014439,2,RIEHEN,47.583156,7.652008,Switzerland
991432,49276:1,24:14:00,24:14:00,8014439,14,RIEHEN,47.583156,7.652008,Switzerland
991433,49276:2,24:14:00,24:14:00,8014439,14,RIEHEN,47.583156,7.652008,Switzerland
991434,49276:3,24:14:00,24:14:00,8014439,14,RIEHEN,47.583156,7.652008,Switzerland


In [15]:
'''To merge a selection of the stop_times_stops_Switzerland dataset with the trips_routes_Switzerland dataset.'''
trips_routes_stop_times_stops_Switzerland = pd.merge(trips_routes_Switzerland, stop_times_stops_Switzerland, on='trip_id')
trips_routes_stop_times_stops_Switzerland = trips_routes_stop_times_stops_Switzerland.sort_values(by=['route_id', 'trip_id', 'stop_sequence'])
trips_routes_stop_times_stops_Switzerland

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,route_short_name,route_long_name,arrival_time,departure_time,stop_id,stop_sequence,stop_name,stop_lat,stop_lon,country
766843,00001.000044.018:1,936,33248:1,LE LOCLE,R,R 1,05:50:00,05:50:00,8504392,0,LES BRENETS,47.067210,6.707389,Switzerland
766844,00001.000044.018:1,936,33248:1,LE LOCLE,R,R 1,05:53:00,05:54:00,8504391,1,LES FRETES,47.058580,6.725787,Switzerland
766845,00001.000044.018:1,936,33248:1,LE LOCLE,R,R 1,05:54:00,05:55:00,8530260,2,LE LOCLE LE CHALET,47.055918,6.738986,Switzerland
766842,00001.000044.018:1,936,33248:1,LE LOCLE,R,R 1,05:58:00,05:58:00,8504316,3,LE LOCLE,47.057861,6.746153,Switzerland
766847,00001.000044.028:1,869,33249:1,LE LOCLE,R,R 1,06:08:00,06:08:00,8504392,0,LES BRENETS,47.067210,6.707389,Switzerland
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631389,96814.000011.101:96814,116609,23295:1,LYON PART DIEU,TER,TER 96814,12:56:00,13:05:00,8774500,1,BELLEGARDE (AIN),46.110918,5.825962,France
631388,96814.000011.101:96814,116609,23295:1,LYON PART DIEU,TER,TER 96814,14:33:00,14:33:00,8772319,2,LYON PART DIEU,45.760564,4.859990,France
631390,96818.000011.101:96818,44307,23296:1,LYON PART DIEU,TER,TER 96818,19:26:00,19:26:00,8501008,0,GENEVE,46.210213,6.142452,Switzerland
631392,96818.000011.101:96818,44307,23296:1,LYON PART DIEU,TER,TER 96818,19:58:00,20:02:00,8774500,1,BELLEGARDE (AIN),46.110918,5.825962,France


In [16]:
'''Creates the a dataframe with the departure time form the first stop sequence and with the one from last stop sequence for each trip_id'''
departure_time_first = trips_routes_stop_times_stops_Switzerland.reset_index().loc[trips_routes_stop_times_stops_Switzerland.reset_index().groupby(['trip_id'])['stop_sequence'].idxmin()][['route_id', 'trip_id', 'departure_time']].copy()
departure_time_first = departure_time_first.rename(columns = {'departure_time': 'departure_time_first'})
departure_time_last = trips_routes_stop_times_stops_Switzerland.reset_index().loc[trips_routes_stop_times_stops_Switzerland.reset_index().groupby(['trip_id'])['stop_sequence'].idxmax()][['route_id', 'trip_id', 'departure_time']].copy()
departure_time_last = departure_time_last.rename(columns = {'departure_time': 'departure_time_last'})
departure_times_Switzerland = departure_time_first.merge(departure_time_last[['trip_id', 'departure_time_last']], on='trip_id')
departure_times_Switzerland

Unnamed: 0,route_id,trip_id,departure_time_first,departure_time_last
0,04236.06____.020:4236,0:1,18:16:00,20:49:00
1,10834.000011.101:8,10000:1,19:48:00,22:22:00
2,10834.000011.102:8,10001:1,20:32:00,21:50:00
3,10834.000011.103:8,10002:1,19:48:00,22:15:00
4,10834.000011.104:8,10003:1,19:48:00,22:22:00
...,...,...,...,...
86909,04725.06____.014:4725,99:1,12:07:00,13:20:00
86910,04704.06____.007:4704,9:1,05:35:00,07:50:00
86911,04704.06____.007:4704,9:2,05:35:00,07:50:00
86912,04704.06____.007:4704,9:3,05:35:00,07:50:00


In [17]:
'''To create a route_sequence dataset that gives for each trip_id that belongs to a route the sequence of stations served'''
route_sequence_Switzerland = trips_routes_stop_times_stops_Switzerland.groupby(['route_id','route_long_name','trip_headsign','trip_id','stop_sequence'], as_index=False)[['stop_name', 'stop_lat', 'stop_lon']].last()
route_sequence_Switzerland

Unnamed: 0,route_id,route_long_name,trip_headsign,trip_id,stop_sequence,stop_name,stop_lat,stop_lon
0,00001.000044.018:1,R 1,LE LOCLE,33248:1,0,LES BRENETS,47.067210,6.707389
1,00001.000044.018:1,R 1,LE LOCLE,33248:1,1,LES FRETES,47.058580,6.725787
2,00001.000044.018:1,R 1,LE LOCLE,33248:1,2,LE LOCLE LE CHALET,47.055918,6.738986
3,00001.000044.018:1,R 1,LE LOCLE,33248:1,3,LE LOCLE,47.057861,6.746153
4,00001.000044.028:1,R 1,LE LOCLE,33249:1,0,LES BRENETS,47.067210,6.707389
...,...,...,...,...,...,...,...,...
991431,96814.000011.101:96814,TER 96814,LYON PART DIEU,23295:1,1,BELLEGARDE (AIN),46.110918,5.825962
991432,96814.000011.101:96814,TER 96814,LYON PART DIEU,23295:1,2,LYON PART DIEU,45.760564,4.859990
991433,96818.000011.101:96818,TER 96818,LYON PART DIEU,23296:1,0,GENEVE,46.210213,6.142452
991434,96818.000011.101:96818,TER 96818,LYON PART DIEU,23296:1,1,BELLEGARDE (AIN),46.110918,5.825962


In [18]:
''' To groupby the trip_id and to order the stop_sequence in an ascending order (the stop_sequences of some
routes are initially in descending order while other stop_sequences are in ascending order) '''

trips_stop_sequence_ascending_Switzerland = stop_times_stops_Switzerland.groupby(['trip_id'], as_index=False).apply(lambda x: x.sort_values('stop_sequence'))
trips_stop_sequence_ascending_Switzerland

Unnamed: 0,Unnamed: 1,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_name,stop_lat,stop_lon,country
0,0,0:1,18:16:00,18:16:00,8014554,0,ENGEN,47.856347,8.772786,Germany
0,541,0:1,18:28:00,18:30:00,8014534,1,IMMENDINGEN,47.936007,8.729536,Germany
0,1100,0:1,18:40:00,18:40:00,8014529,2,DONAUESCHINGEN,47.947786,8.498919,Germany
0,1895,0:1,18:49:00,18:50:00,8014521,3,VILLINGEN (SCHWARZW),48.058022,8.465261,Germany
0,2592,0:1,18:58:00,18:59:00,8014518,4,ST GEORGEN (SCHWARZW),48.123813,8.341955,Germany
...,...,...,...,...,...,...,...,...,...,...
86913,7970,9:4,07:24:00,07:25:00,8014277,14,BADEN-BADEN,48.790327,8.190831,Germany
86913,8657,9:4,07:31:00,07:32:00,8014245,15,RASTATT,48.860483,8.215623,Germany
86913,10637,9:4,07:35:00,07:35:00,8014241,16,MUGGENSTURM,48.876805,8.274293,Germany
86913,10705,9:4,07:38:00,07:39:00,8014240,17,MALSCH,48.889506,8.323585,Germany


In [19]:
''' To put the stop_names of a stop sequence of a trip_id in a list '''
trips_stop_sequence_Switzerland = trips_stop_sequence_ascending_Switzerland.groupby('trip_id')['stop_name'].apply(lambda group_series: group_series.tolist()).reset_index()
trips_stop_sequence_Switzerland.rename(columns={'stop_name':'stop_sequence'}, inplace=True)
trips_stop_sequence_Switzerland

Unnamed: 0,trip_id,stop_sequence
0,0:1,"[ENGEN, IMMENDINGEN, DONAUESCHINGEN, VILLINGEN..."
1,10000:1,"[ROMANSHORN, AMRISWIL, WEINFELDEN, FRAUENFELD,..."
2,10001:1,"[ZURICH HB, OLTEN, BAHN-2000-STRECKE, BERN]"
3,10002:1,"[ROMANSHORN, AMRISWIL, WEINFELDEN, FRAUENFELD,..."
4,10003:1,"[ROMANSHORN, AMRISWIL, WEINFELDEN, FRAUENFELD,..."
...,...,...
86909,99:1,"[KARLSRUHE HBF, RASTATT, BADEN-BADEN, BUHL (BA..."
86910,9:1,"[VILLINGEN (SCHWARZW), ST GEORGEN (SCHWARZW), ..."
86911,9:2,"[VILLINGEN (SCHWARZW), ST GEORGEN (SCHWARZW), ..."
86912,9:3,"[VILLINGEN (SCHWARZW), ST GEORGEN (SCHWARZW), ..."


In [20]:
'''To calculate the hash value for the stop sequence of each trip_id'''

#To copy the filtered_trips dataset
trips_hash_Switzerland = trips_stop_sequence_Switzerland.copy()

#calculates the hash of the stop sequence in both order (ascending and descending)
trips_hash_Switzerland['hash'] = trips_hash_Switzerland['stop_sequence'].apply(lambda x: hash(tuple(x)))
trips_hash_Switzerland['hash_inverse'] = trips_hash_Switzerland['stop_sequence'].apply(lambda x: hash(tuple(x[::-1])))

In [21]:
''' To add the list of stop_sequence of stations to the trips_hash_Switzerland df by joining on trip_id'''
# To add the stop_sequence of stations to the trips_routes_Swizerland df by joining on trip_id
trips_hash_stop_sequence_Switzerland = pd.merge(trips_routes_Switzerland, trips_hash_Switzerland, on='trip_id', how='left')

# To put the columns in a more logical order
trips_hash_stop_sequence_Switzerland = trips_hash_stop_sequence_Switzerland[['route_id', 'route_long_name','service_id','trip_headsign','trip_id','hash', 'hash_inverse','stop_sequence']]
trips_hash_stop_sequence_Switzerland

Unnamed: 0,route_id,route_long_name,service_id,trip_headsign,trip_id,hash,hash_inverse,stop_sequence
0,04236.06____.020:4236,RE 4236,19311,KARLSRUHE HBF,0:1,9205176474328118748,-2877522756731758300,"[ENGEN, IMMENDINGEN, DONAUESCHINGEN, VILLINGEN..."
1,04700.06____.001:4700,RE 4700,133763,KARLSRUHE HBF,1:1,-4850981020070306687,339185184194859384,"[OFFENBURG, APPENWEIER, RENCHEN (D), ACHERN, B..."
2,04700.06____.001:4700,RE 4700,1,KARLSRUHE HBF,1:2,-4850981020070306687,339185184194859384,"[OFFENBURG, APPENWEIER, RENCHEN (D), ACHERN, B..."
3,04700.06____.001:4700,RE 4700,13336,KARLSRUHE HBF,1:3,-4850981020070306687,339185184194859384,"[OFFENBURG, APPENWEIER, RENCHEN (D), ACHERN, B..."
4,04700.06____.001:4700,RE 4700,2610,KARLSRUHE HBF,1:4,-4850981020070306687,339185184194859384,"[OFFENBURG, APPENWEIER, RENCHEN (D), ACHERN, B..."
...,...,...,...,...,...,...,...,...
86909,87947.L7____.001:5,S 5,2364,WEIL AM RHEIN,49348:1,7350775758517724160,-998733850491266350,"[LORRACH HBF, LORRACH MUSEUM/BURGHOF, LORRACH-..."
86910,87948.L7____.001:5,S 5,56518,LORRACH HBF,49349:1,-998733850491266350,7350775758517724160,"[WEIL AM RHEIN, WEIL AM RHEIN GARTENSTADT, WEI..."
86911,87948.L7____.001:5,S 5,17102,ZELL (WIESENTAL),49349:2,5974864185153478306,-438996269003196970,"[WEIL AM RHEIN, WEIL AM RHEIN GARTENSTADT, WEI..."
86912,87949.L7____.001:5,S 5,17102,WEIL AM RHEIN,49350:1,-438996269003196970,5974864185153478306,"[ZELL (WIESENTAL), HAUSEN-RAITBACH, FAHRNAU, S..."


In [22]:
'''Merges the trips_hash_stop_sequence with the departure_times'''
trips_hash_stop_sequence_departure_Switzerland = trips_hash_stop_sequence_Switzerland.merge(departure_times_Switzerland[['trip_id','departure_time_first','departure_time_last']], on='trip_id')
trips_hash_stop_sequence_departure_Switzerland

Unnamed: 0,route_id,route_long_name,service_id,trip_headsign,trip_id,hash,hash_inverse,stop_sequence,departure_time_first,departure_time_last
0,04236.06____.020:4236,RE 4236,19311,KARLSRUHE HBF,0:1,9205176474328118748,-2877522756731758300,"[ENGEN, IMMENDINGEN, DONAUESCHINGEN, VILLINGEN...",18:16:00,20:49:00
1,04700.06____.001:4700,RE 4700,133763,KARLSRUHE HBF,1:1,-4850981020070306687,339185184194859384,"[OFFENBURG, APPENWEIER, RENCHEN (D), ACHERN, B...",04:47:00,05:40:00
2,04700.06____.001:4700,RE 4700,1,KARLSRUHE HBF,1:2,-4850981020070306687,339185184194859384,"[OFFENBURG, APPENWEIER, RENCHEN (D), ACHERN, B...",04:47:00,05:40:00
3,04700.06____.001:4700,RE 4700,13336,KARLSRUHE HBF,1:3,-4850981020070306687,339185184194859384,"[OFFENBURG, APPENWEIER, RENCHEN (D), ACHERN, B...",04:47:00,05:40:00
4,04700.06____.001:4700,RE 4700,2610,KARLSRUHE HBF,1:4,-4850981020070306687,339185184194859384,"[OFFENBURG, APPENWEIER, RENCHEN (D), ACHERN, B...",04:47:00,05:40:00
...,...,...,...,...,...,...,...,...,...,...
86909,87947.L7____.001:5,S 5,2364,WEIL AM RHEIN,49348:1,7350775758517724160,-998733850491266350,"[LORRACH HBF, LORRACH MUSEUM/BURGHOF, LORRACH-...",09:34:00,09:45:00
86910,87948.L7____.001:5,S 5,56518,LORRACH HBF,49349:1,-998733850491266350,7350775758517724160,"[WEIL AM RHEIN, WEIL AM RHEIN GARTENSTADT, WEI...",09:14:00,09:28:00
86911,87948.L7____.001:5,S 5,17102,ZELL (WIESENTAL),49349:2,5974864185153478306,-438996269003196970,"[WEIL AM RHEIN, WEIL AM RHEIN GARTENSTADT, WEI...",09:14:00,09:56:00
86912,87949.L7____.001:5,S 5,17102,WEIL AM RHEIN,49350:1,-438996269003196970,5974864185153478306,"[ZELL (WIESENTAL), HAUSEN-RAITBACH, FAHRNAU, S...",10:04:00,10:45:00


In [23]:
''' To count the number of dates for each service_id '''
service_id_df_Switzerland = filtered_calendar_dates_Switzerland.groupby(['service_id'])[['service_id']].count().rename(columns={'service_id':'count_service_id'}).reset_index()
service_id_df_Switzerland

Unnamed: 0,service_id,count_service_id
0,1,90
1,2,31
2,5,83
3,6,83
4,8,5
...,...,...
13936,244127,88
13937,244129,34
13938,254888,67
13939,254889,120


In [24]:
''' To regroup the days per service_id in a set '''
service_id_dates_Switzerland = filtered_calendar_dates_Switzerland.groupby('service_id')['date'].apply(lambda group_series: set(group_series.tolist())).reset_index()
service_id_dates_Switzerland.rename(columns={'date':'dates'}, inplace=True)
service_id_dates_Switzerland = service_id_dates_Switzerland.merge(service_id_df_Switzerland, on='service_id', how='left')
service_id_dates_Switzerland

Unnamed: 0,service_id,dates,count_service_id
0,1,"{20210315, 20210316, 20210317, 20210318, 20210...",90
1,2,"{20210701, 20210702, 20210703, 20210704, 20210...",31
2,5,"{20210701, 20210702, 20210705, 20210706, 20210...",83
3,6,"{20210701, 20210702, 20210705, 20210706, 20210...",83
4,8,"{20210626, 20210627, 20210613, 20210619, 20210...",5
...,...,...,...
13936,244127,"{20210701, 20210704, 20210705, 20210706, 20210...",88
13937,244129,"{20210702, 20210319, 20210320, 20210703, 20210...",34
13938,254888,"{20210701, 20210702, 20210703, 20210704, 20210...",67
13939,254889,"{20210701, 20210702, 20210703, 20210704, 20210...",120


In [49]:
'''To merge the trips_hash_stop_sequence df with the service_id_dates to get the sets of corresponding dates'''
trips_routes_stop_times_stops_dates_Switzerland = pd.merge(trips_routes_stop_times_stops_Switzerland, service_id_dates_Switzerland, on='service_id', how='inner')
trips_routes_stop_times_stops_dates_Switzerland

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,route_short_name,route_long_name,arrival_time,departure_time,stop_id,stop_sequence,stop_name,stop_lat,stop_lon,country,dates,count_service_id
0,00001.000044.018:1,936,33248:1,LE LOCLE,R,R 1,05:50:00,05:50:00,8504392,0,LES BRENETS,47.067210,6.707389,Switzerland,"{20210701, 20210702, 20210705, 20210706, 20210...",83
1,00001.000044.018:1,936,33248:1,LE LOCLE,R,R 1,05:53:00,05:54:00,8504391,1,LES FRETES,47.058580,6.725787,Switzerland,"{20210701, 20210702, 20210705, 20210706, 20210...",83
2,00001.000044.018:1,936,33248:1,LE LOCLE,R,R 1,05:54:00,05:55:00,8530260,2,LE LOCLE LE CHALET,47.055918,6.738986,Switzerland,"{20210701, 20210702, 20210705, 20210706, 20210...",83
3,00001.000044.018:1,936,33248:1,LE LOCLE,R,R 1,05:58:00,05:58:00,8504316,3,LE LOCLE,47.057861,6.746153,Switzerland,"{20210701, 20210702, 20210705, 20210706, 20210...",83
4,00002.000044.017:2,936,33250:1,LES BRENETS,R,R 2,06:01:00,06:01:00,8504316,0,LE LOCLE,47.057861,6.746153,Switzerland,"{20210701, 20210702, 20210705, 20210706, 20210...",83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
650517,96765.000011.101:L6,117674,23267:1,GENEVE,S,L6,21:50:00,21:50:00,8501006,4,MEYRIN,46.222335,6.076900,Switzerland,"{20210315, 20210316, 20210317, 20210318, 20210...",68
650518,96765.000011.101:L6,117674,23267:1,GENEVE,S,L6,21:54:00,21:55:00,8501007,5,VERNIER,46.220719,6.093891,Switzerland,"{20210315, 20210316, 20210317, 20210318, 20210...",68
650519,96765.000011.101:L6,117674,23267:1,GENEVE,S,L6,22:00:00,22:00:00,8501008,6,GENEVE,46.210213,6.142452,Switzerland,"{20210315, 20210316, 20210317, 20210318, 20210...",68
650520,96785.87_LEX.001:L6,65904,47667:1,POUGNY-CHANCY,TER,L6,19:37:00,19:37:00,8774500,0,BELLEGARDE (AIN),46.110918,5.825962,France,"{20210412, 20210413, 20210414, 20210415, 20210...",7


In [26]:
'''To put the different trip_ids in a list after joining and add the departure_time first and last'''
common_columns = ['route_id','route_long_name','hash', 'hash_inverse', 'service_id']
route_hash_freq_Switzerland = trips_hash_stop_sequence_Switzerland.groupby(common_columns)['trip_id'].apply(lambda group_series: group_series.tolist()).reset_index()
route_hash_freq_Switzerland_dep_first = trips_hash_stop_sequence_departure_Switzerland.groupby(common_columns)['departure_time_first'].apply(lambda group_series: group_series.tolist()).reset_index()
route_hash_freq_Switzerland_dep_last = trips_hash_stop_sequence_departure_Switzerland.groupby(common_columns)['departure_time_last'].apply(lambda group_series: group_series.tolist()).reset_index()
route_hash_freq_Switzerland = route_hash_freq_Switzerland.merge(route_hash_freq_Switzerland_dep_first, on= common_columns)
route_hash_freq_Switzerland = route_hash_freq_Switzerland.merge(route_hash_freq_Switzerland_dep_last, on= common_columns)
route_hash_freq_Switzerland

Unnamed: 0,route_id,route_long_name,hash,hash_inverse,service_id,trip_id,departure_time_first,departure_time_last
0,00001.000044.018:1,R 1,-5189486139819647528,3242894917632905562,936,[33248:1],[05:50:00],[05:58:00]
1,00001.000044.028:1,R 1,-5189486139819647528,3242894917632905562,869,[33249:1],[06:08:00],[06:16:00]
2,00001.000104.001:1,CC 1,3865371173043443348,-6836560398534858013,5072,[4117:1],[07:36:00],[08:36:00]
3,00002.000044.017:2,R 2,3242894917632905562,-5189486139819647528,936,[33250:1],[06:01:00],[06:08:00]
4,00002.000044.024:2,R 2,3242894917632905562,-5189486139819647528,869,[33251:1],[06:18:00],[06:25:00]
...,...,...,...,...,...,...,...,...
86909,96806.000011.102:96806,TER 96806,1700149812652076650,256460250998497924,44307,[23292:1],[18:39:00],[20:59:00]
86910,96810.000011.101:96810,TER 96810,-4684204956159847117,-1289879366665992573,116609,[23293:1],[06:16:00],[08:24:00]
86911,96812.000011.101:96812,TER 96812,-4684204956159847117,-1289879366665992573,9453,[23294:1],[07:17:00],[09:25:00]
86912,96814.000011.101:96814,TER 96814,-4684204956159847117,-1289879366665992573,116609,[23295:1],[12:25:00],[14:33:00]


In [27]:
''' To add the sequence of stops to the route_hash_freq dataset '''
route_hash_freq_Switzerland = pd.merge(route_hash_freq_Switzerland, trips_hash_stop_sequence_Switzerland[['route_id','hash', 'hash_inverse', 'service_id','stop_sequence']], on=['route_id', 'hash', 'hash_inverse', 'service_id'], how='left')
route_hash_freq_Switzerland = route_hash_freq_Switzerland.drop_duplicates( subset = ['route_id', 'hash', 'service_id'], keep = 'first')
route_hash_freq_Switzerland

Unnamed: 0,route_id,route_long_name,hash,hash_inverse,service_id,trip_id,departure_time_first,departure_time_last,stop_sequence
0,00001.000044.018:1,R 1,-5189486139819647528,3242894917632905562,936,[33248:1],[05:50:00],[05:58:00],"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ..."
1,00001.000044.028:1,R 1,-5189486139819647528,3242894917632905562,869,[33249:1],[06:08:00],[06:16:00],"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ..."
2,00001.000104.001:1,CC 1,3865371173043443348,-6836560398534858013,5072,[4117:1],[07:36:00],[08:36:00],"[BRIENZ BRB, PLANALP, BRIENZER ROTHORN]"
3,00002.000044.017:2,R 2,3242894917632905562,-5189486139819647528,936,[33250:1],[06:01:00],[06:08:00],"[LE LOCLE, LE LOCLE LE CHALET, LES FRETES, LES..."
4,00002.000044.024:2,R 2,3242894917632905562,-5189486139819647528,869,[33251:1],[06:18:00],[06:25:00],"[LE LOCLE, LE LOCLE LE CHALET, LES FRETES, LES..."
...,...,...,...,...,...,...,...,...,...
86909,96806.000011.102:96806,TER 96806,1700149812652076650,256460250998497924,44307,[23292:1],[18:39:00],[20:59:00],"[GENEVE, BELLEGARDE (AIN), CULOZ, GRENOBLE]"
86910,96810.000011.101:96810,TER 96810,-4684204956159847117,-1289879366665992573,116609,[23293:1],[06:16:00],[08:24:00],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]"
86911,96812.000011.101:96812,TER 96812,-4684204956159847117,-1289879366665992573,9453,[23294:1],[07:17:00],[09:25:00],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]"
86912,96814.000011.101:96814,TER 96814,-4684204956159847117,-1289879366665992573,116609,[23295:1],[12:25:00],[14:33:00],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]"


In [28]:
''' To calculate the number of trip_ids in the list of trip_ids and to add it as a new attribute '''
number_trip_ids_Switzerland = []
for list_trip_ids_Switzerland in route_hash_freq_Switzerland['trip_id']:
    count_Switzerland = len(list_trip_ids_Switzerland)
    number_trip_ids_Switzerland.append(count_Switzerland)
route_hash_freq_Switzerland['number_trip_ids'] = number_trip_ids_Switzerland

route_hash_freq_Switzerland

Unnamed: 0,route_id,route_long_name,hash,hash_inverse,service_id,trip_id,departure_time_first,departure_time_last,stop_sequence,number_trip_ids
0,00001.000044.018:1,R 1,-5189486139819647528,3242894917632905562,936,[33248:1],[05:50:00],[05:58:00],"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...",1
1,00001.000044.028:1,R 1,-5189486139819647528,3242894917632905562,869,[33249:1],[06:08:00],[06:16:00],"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...",1
2,00001.000104.001:1,CC 1,3865371173043443348,-6836560398534858013,5072,[4117:1],[07:36:00],[08:36:00],"[BRIENZ BRB, PLANALP, BRIENZER ROTHORN]",1
3,00002.000044.017:2,R 2,3242894917632905562,-5189486139819647528,936,[33250:1],[06:01:00],[06:08:00],"[LE LOCLE, LE LOCLE LE CHALET, LES FRETES, LES...",1
4,00002.000044.024:2,R 2,3242894917632905562,-5189486139819647528,869,[33251:1],[06:18:00],[06:25:00],"[LE LOCLE, LE LOCLE LE CHALET, LES FRETES, LES...",1
...,...,...,...,...,...,...,...,...,...,...
86909,96806.000011.102:96806,TER 96806,1700149812652076650,256460250998497924,44307,[23292:1],[18:39:00],[20:59:00],"[GENEVE, BELLEGARDE (AIN), CULOZ, GRENOBLE]",1
86910,96810.000011.101:96810,TER 96810,-4684204956159847117,-1289879366665992573,116609,[23293:1],[06:16:00],[08:24:00],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]",1
86911,96812.000011.101:96812,TER 96812,-4684204956159847117,-1289879366665992573,9453,[23294:1],[07:17:00],[09:25:00],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]",1
86912,96814.000011.101:96814,TER 96814,-4684204956159847117,-1289879366665992573,116609,[23295:1],[12:25:00],[14:33:00],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]",1


In [29]:
''' To merge the route_hash_freq_Switzerland df with the service_id_dates to get the sets of corresponding dates '''
route_hash_service_freq_Switzerland = pd.merge(route_hash_freq_Switzerland, service_id_dates_Switzerland, on='service_id', how='inner')
route_hash_service_freq_Switzerland

Unnamed: 0,route_id,route_long_name,hash,hash_inverse,service_id,trip_id,departure_time_first,departure_time_last,stop_sequence,number_trip_ids,dates,count_service_id
0,00001.000044.018:1,R 1,-5189486139819647528,3242894917632905562,936,[33248:1],[05:50:00],[05:58:00],"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...",1,"{20210701, 20210702, 20210705, 20210706, 20210...",83
1,00002.000044.017:2,R 2,3242894917632905562,-5189486139819647528,936,[33250:1],[06:01:00],[06:08:00],"[LE LOCLE, LE LOCLE LE CHALET, LES FRETES, LES...",1,"{20210701, 20210702, 20210705, 20210706, 20210...",83
2,00005.000044.018:5,R 5,-5189486139819647528,3242894917632905562,936,[33256:1],[06:50:00],[06:58:00],"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...",1,"{20210701, 20210702, 20210705, 20210706, 20210...",83
3,00006.000044.018:6,R 6,3242894917632905562,-5189486139819647528,936,[33258:1],[07:01:00],[07:08:00],"[LE LOCLE, LE LOCLE LE CHALET, LES FRETES, LES...",1,"{20210701, 20210702, 20210705, 20210706, 20210...",83
4,00007.000044.017:7,R 7,-5189486139819647528,3242894917632905562,936,[33260:1],[07:17:00],[07:25:00],"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...",1,"{20210701, 20210702, 20210705, 20210706, 20210...",83
...,...,...,...,...,...,...,...,...,...,...,...,...
57323,96760.000011.101:L6,L6,-8393100017709903846,-7876505502063812119,28493,[23263:1],[20:18:00],[20:41:00],"[GENEVE, VERNIER, MEYRIN, ZIMEYSA, SATIGNY, RU...",1,"{20210701, 20210702, 20210705, 20210706, 20210...",78
57324,96761.000011.101:L6,L6,-5956246458137668248,3602218816774858122,28493,[23264:1],[19:36:00],[19:57:00],"[POUGNY-CHANCY, RUSSIN, SATIGNY, ZIMEYSA, MEYR...",1,"{20210701, 20210702, 20210705, 20210706, 20210...",78
57325,96763.000011.101:L6,L6,-5956246458137668248,3602218816774858122,24237,[23265:1],[20:36:00],[21:00:00],"[POUGNY-CHANCY, RUSSIN, SATIGNY, ZIMEYSA, MEYR...",1,"{20210701, 20210702, 20210705, 20210706, 20210...",78
57326,96765.000011.101:L6,L6,-5956246458137668248,3602218816774858122,117674,[23267:1],[21:36:00],[22:00:00],"[POUGNY-CHANCY, RUSSIN, SATIGNY, ZIMEYSA, MEYR...",1,"{20210315, 20210316, 20210317, 20210318, 20210...",68


## Functions for the route creation

In [30]:
'''Some functions to better factorise the functions in the coming cells'''

def select_stop_sequences(stop_sequences_df, route_id):
    '''retruns the stop sequences with the selected route_id'''
    return stop_sequences_df[stop_sequences_df['route_id'] == route_id].copy()


def take_leftovers_list_c_from_intersection_AAndB(list_a, list_b, list_c):
    '''take the indexes of the intersection of list a with list b and retain the elments of list c with that index'''
    ind_dict = dict((k,i) for i,k in enumerate(list_a))
    return [list_c[ind_dict[x]] for x in (set(list_a).intersection(list_b))]

def get_extentions (after_or_behind, route_sequences_route_id, trip):
    '''returns the extentions for the trip (behind or after)'''
    if after_or_behind == 'after':
        #checks the extentions possible for the trip that can follow after its last stop
        possible_extentions = route_sequences_route_id[route_sequences_route_id['stop_sequence'].apply(lambda x: any(item for item in [trip['stop_sequence'][-1]] if (item == x[0]) and not(set(x[1:]) & set(trip['stop_sequence']))))].copy()
    elif after_or_behind == 'behind':
        #checks the extentions possible for the trip that can follow before its first stop
        possible_extentions = route_sequences_route_id[route_sequences_route_id['stop_sequence'].apply(lambda x: any(item for item in [trip['stop_sequence'][0]] if (item == x[-1]) and not(set(x[:-1]) & set(trip['stop_sequence']))))].copy()        
    #checks that those extentions have a common date as the trip
    possible_extentions = possible_extentions[possible_extentions['dates'].apply(lambda x: any(item for item in trip['dates'] if item in x))].copy()   
    if not possible_extentions.empty: 
        if after_or_behind == 'after':
            #checks that those extentions have a matching time schedule as the trip
            possible_extentions = possible_extentions[possible_extentions['departure_time_first'].apply(lambda x: any(item for item in trip['departure_time_last'] if item in x))].copy()
        elif after_or_behind == 'behind':
            #checks that those extentions have a matching time schedule as the trip
            possible_extentions = possible_extentions[possible_extentions['departure_time_last'].apply(lambda x: any(item for item in trip['departure_time_first'] if item in x))].copy()
    return possible_extentions      

def calculate_frequency (sequences_df):
    '''calculate the frequency based on the length of the dates and departure_time and put the hash in as a column of list'''
    sequences_df['number_dates'] = sequences_df['dates'].apply(lambda x: len(x))
    sequences_df['number_times'] = sequences_df['departure_time_last'].apply(lambda x: len(x))
    sequences_df['frequency'] = sequences_df['number_dates']* sequences_df['number_times'] 
    sequences_df = sequences_df.drop(['dates', 'departure_time_last', 'number_dates', 'number_times'], axis=1)
    sequences_df['hash'] = sequences_df['hash'].apply(lambda x: [x])
    return sequences_df.copy()
         
from datetime import datetime
from datetime import timedelta
FMT = '%H:%M:%S'
day_in_seconds = timedelta(days=1).total_seconds()
def calculate_time_difference(time_df, later_time, earlier_time, column_name):
    '''calculates the time difference between later time and earlier time and put it in time_df[column_name]'''
    #transform 24:00:00 into 00:00:00
    time_df['departure_time'] = time_df['departure_time'].apply(lambda x: str(int(x[:2])-24) + x[2:] if int(x[:2]) >= 24 else x)
    time_df['arrival_time'] = time_df['arrival_time'].apply(lambda x: str(int(x[:2])-24) + x[2:] if int(x[:2]) >=  24 else x)
    #calculate the waiting_time
    time_df[column_name] = time_df[['arrival_time','departure_time']].apply(lambda x: int((datetime.strptime(x[later_time], FMT) - datetime.strptime(x[earlier_time], FMT)).total_seconds()/60), axis=1)
    #if one day as past, take it into consideration
    time_df[column_name] = time_df[column_name].apply(lambda x: day_in_seconds/60 + x if x < 0 else x)
    return time_df            


In [31]:
'''Finds the routes that can be either extended from behind or from after and those which are complete sequences'''

def get_extention_indexes(stop_sequences_df):
    '''returns the tree indexes: index_of_extendable, index_of_begin_sequences, index_of_complete_sequences'''
    #intiate the dictionnaries, that will be used to retrieve different rows later on
    index_of_extendable = {}
    index_of_begin_sequences = {}
    index_of_complete_sequences = {}
    for route_id in stop_sequences_df['route_id'].unique():
        #select the route with the route_id selected by the loop iteration
        route_sequences_route_id = select_stop_sequences(stop_sequences_df, route_id)
        for index_trip, trip in route_sequences_route_id.iterrows():
            #checks the extentions possible for the trip that can follow after its last stop
            possible_extentions_after = get_extentions('after', route_sequences_route_id, trip)
            #checks the extentions possible for the trip that can follow before its first stop
            possible_extentions_behind = get_extentions('behind', route_sequences_route_id, trip)
            #put all the sequences that can be extended either from the beginning either from the end together
            possible_extentions = possible_extentions_after.append(possible_extentions_behind, ignore_index = True)
            if not possible_extentions.empty:
                if route_id not in index_of_extendable:
                    index_of_extendable[route_id] = []
                index_of_extendable[route_id].append(index_trip)
                if possible_extentions_behind.empty:
                    if route_id not in index_of_begin_sequences:
                        index_of_begin_sequences[route_id] = []
                    index_of_begin_sequences[route_id].append(index_trip)
            elif possible_extentions.empty:
                if route_id not in index_of_complete_sequences:
                    index_of_complete_sequences[route_id] = []
                index_of_complete_sequences[route_id].append(index_trip)
                
    return index_of_extendable, index_of_begin_sequences, index_of_complete_sequences

In [32]:
'''Creates all the sequences of routes possible to reconstruct the real route'''

def possible_sequences_construction(stop_sequences_df, index_of_extendable, index_of_begin_sequences, index_of_complete_sequences):
    '''returns the first part of the route_creation, two others need to be added'''
    import copy
    #create an empty df for the process of route creation
    route_creation  = pd.DataFrame()
    for route_id in index_of_extendable:
        #checks if some parts are begin sequences, if not, then we can't build routes with multiple sequences
        if route_id in index_of_begin_sequences:
            #create a copy of the df with only the route considered in the loop iteration
            routes_with_route_id = select_stop_sequences(stop_sequences_df, route_id)
            #set default frequency to NaN
            routes_with_route_id['frequency'] = np.nan
            #create a df where only the routes that have an end stop as their first element of the sequence
            route_creation_route_id = routes_with_route_id.loc[index_of_begin_sequences[route_id]][['route_id', 'hash', 'stop_sequence', 'dates', 'departure_time_last','frequency']]
            #create a df with the exentable sequences for that route_id
            route_creation_extensions_route_id = routes_with_route_id.loc[index_of_extendable[route_id]][['route_id', 'hash', 'stop_sequence', 'dates', 'departure_time_first', 'departure_time_last','frequency']]    
            #make the hash column as a column of lists
            route_creation_route_id['hash'] = route_creation_route_id['hash'].apply(lambda x: [x])
            route_creation_route_id = route_creation_route_id.reset_index(drop=True)
            #to stop the while loop when all the routes are complete in the df for the route_id of the loop iteration
            complete_routes = 0
            while complete_routes < len(route_creation_route_id.index):
                #use a deepcopy to not impact the iterrows of the main loop
                route_creation_deep_copy = copy.deepcopy(route_creation_route_id)
                for index_original, route_part in route_creation_deep_copy.iterrows():
                    #create a dataframe of the possible extentions for each route_part
                    #select an extention only if the extention is the next part of the route and also that no other station are repeated in the sequence if this extention is added(otherwise it might cause an infinite loop)
                    possible_extentions = get_extentions('after', route_creation_extensions_route_id, route_part)
                    #checks whether any extention fullfilling the criterias has been found
                    if not possible_extentions.empty:
                        #if so, extend it with every single possibilities
                        for index_extention, possible_extention in possible_extentions.iterrows():
                            #must create a deepcopy, otherwise the orignal hash list will change as well (mutable)
                            updated_hash = copy.deepcopy(route_part['hash'])
                            updated_hash.append(possible_extention['hash'])
                            updated_route_sequence = route_part['stop_sequence'] + possible_extention['stop_sequence'][1:]
                            common_dates = possible_extention['dates'] & route_part['dates']
                            new_departure_time_last = take_leftovers_list_c_from_intersection_AAndB(list(possible_extentions['departure_time_first'])[0], list(route_part['departure_time_last']), list(possible_extentions['departure_time_last'])[0])
                            new_frequency = len(new_departure_time_last) * len(common_dates)
                            route_creation_route_id.loc[max(route_creation_route_id.index)+1] = [route_id, updated_hash, updated_route_sequence, common_dates, new_departure_time_last, new_frequency]
                        #then delete the route with the index (see loop here above)
                        route_creation_route_id = route_creation_route_id.drop(index = index_original)            
                    #the route can't be extended anymore
                    else:
                        complete_routes += 1
            #adds all the possible routes created with the trips of the route_id of the main loop
            route_creation = route_creation.append(route_creation_route_id, ignore_index = True)
    if 'departure_time_last' in route_creation.columns:
        route_creation = route_creation.drop(['dates', 'departure_time_last'], axis=1)
    route_creation = route_creation.reindex(columns=['route_id','hash','stop_sequence', 'frequency'])
    return route_creation

In [33]:
'''Adds the full sequences to the route_creation dataframe'''

def add_full_sequences(stop_sequences_df, route_creation, index_of_complete_sequences):
    '''returns the second part of the route_creation, one other needs to be added'''
    for route_id in index_of_complete_sequences:
        #findes all the complete sequences for that route_id
        copy_complete_sequences_df = stop_sequences_df.loc[index_of_complete_sequences[route_id]][['route_id','hash','stop_sequence', 'dates', 'departure_time_last']].copy()
        copy_complete_sequences_df = calculate_frequency(copy_complete_sequences_df)
        #adds each of them in the route_creation dataframe
        for index_complete_sequence, complete_sequence in copy_complete_sequences_df.iterrows():
            route_creation = route_creation.append(complete_sequence, ignore_index = True)
    route_creation = route_creation.sort_values(by=['route_id'], ignore_index = True)
    return route_creation 

In [34]:
'''Adds the sequences that were not yet added in the route_creation dataframe'''

def add_unused_sequences(stop_sequences_df, route_creation):
    '''returns the third part of the route_creation'''
    for route_id in stop_sequences_df['route_id'].unique():
        if route_id in route_creation['route_id'].unique():
            #get a set of the hashes that were employed to create the routes for that route_id
            used_sequences_hash = set(route_creation[route_creation['route_id'] == route_id].apply(lambda x: pd.Series(x['hash']),axis=1).stack().reset_index(level=1, drop=True))
            #get a tuple of all the route sequences for that route_id
            used_sequences = tuple(route_creation[route_creation['route_id'] == route_id]['stop_sequence'])
            copy_sequences_route_id = select_stop_sequences(stop_sequences_df, route_id)[['route_id','hash','stop_sequence', 'dates', 'departure_time_last']]
            copy_sequences_route_id = calculate_frequency(copy_sequences_route_id)
            #adds the hashes that were not employed in any route creations for that route_id
            for index_trip, trip in copy_sequences_route_id.iterrows():
                #first element of the list because there is always only one element
                if trip['hash'][0] not in used_sequences_hash:
                    #checks that the sequence is not a sublist of any existing sequences
                    is_subsequence = False
                    for sequence in used_sequences:
                        if set(trip['stop_sequence']).issubset(sequence):
                            is_subsequence = True
                    if not is_subsequence:
                        route_creation = route_creation.append(trip, ignore_index = True)
    return route_creation

In [35]:
'''Creates a column in the df that calculates the travel time between the first and last stop (waiting time included)'''
from datetime import datetime
from datetime import timedelta
FMT = '%H:%M:%S'
day_in_seconds = timedelta(days=1).total_seconds()

def give_begin_end_time(route_creation_frequency_single, trips_hash_stop_sequence, stops_cleaned_stop_times_trips_merge_dates):
    #create a copy to not change the input DataFrame
    route_creation_frequency_single = route_creation_frequency_single.copy()
    #makes a column with the a representative begin time and end time of the route
    route_creation_frequency_single['travel_time'] = np.nan
    for index_sequence, sequence in route_creation_frequency_single.iterrows():
        constructed_route = pd.DataFrame()
        for index_hash, hash_value in enumerate(sequence['hash']):
            index_plus_one = index_hash + 1
            #take all the trips with that hash
            next_representative_trips = trips_hash_stop_sequence[(trips_hash_stop_sequence['hash'] == hash_value) & (trips_hash_stop_sequence['route_id'] == sequence['route_id'])].copy()['trip_id']
            #take all the stop sequences and their time that belongs 
            full_times = stops_cleaned_stop_times_trips_merge_dates[stops_cleaned_stop_times_trips_merge_dates['trip_id'].isin(next_representative_trips)].copy()
            #select) only the last stop sequences of full_times for each trip_id
            new_index_max_per_trip_id = full_times.reset_index().groupby(['route_id', 'trip_id'])['stop_sequence'].idxmax()
            max_per_trip_id = full_times.reset_index().loc[new_index_max_per_trip_id]
            #select only the first stop sequences of full_times for each trip_id            
            new_index_min_per_trip_id = full_times.reset_index().groupby(['route_id', 'trip_id'])['stop_sequence'].idxmin()            
            min_per_trip_id = full_times.reset_index().loc[new_index_min_per_trip_id]
            #merge max_per_trip_id and min_per_trip_id
            merged = min_per_trip_id[['trip_id', 'dates', 'departure_time']].merge(max_per_trip_id[['trip_id', 'arrival_time', 'departure_time']], on='trip_id')
            #take all the stop sequences except the first one, and the last one if it is not the last sequence of the route
            if index_hash == len(sequence['hash']) - 1:
                rest_per_trip_id = full_times.reset_index().drop(pd.concat([new_index_min_per_trip_id,new_index_max_per_trip_id]))
            else:
                rest_per_trip_id = full_times.reset_index().drop(new_index_min_per_trip_id)            
            rest_per_trip_id = rest_per_trip_id.dropna()
            if not rest_per_trip_id.empty:
                rest_per_trip_id = calculate_time_difference(rest_per_trip_id, 'departure_time', 'arrival_time', 'waiting_time')
                #calculate the total waiting_time
                rest_per_trip_id_grouped = rest_per_trip_id.groupby(['trip_id'], as_index=False)['waiting_time'].sum()
                merged_waiting_time = merged.merge(rest_per_trip_id_grouped, on='trip_id')
            #in case there are only two stops in for the hash
            else:
                merged_waiting_time = merged.copy()
                merged_waiting_time['waiting_time'] = 0
            #rename the columns     
            merged_waiting_time = merged_waiting_time.rename(columns = {'trip_id': 'trip_id_' + str(index_plus_one),'departure_time_x':'departure_time_'+ str(index_plus_one), 'arrival_time':'arrival_time_'+ str(index_plus_one),
                                          'departure_time_y':'departure_time_'+ str(index_plus_one + 1), 'waiting_time': 'waiting_time_' + str(index_plus_one)})
            if index_hash == 0:
                constructed_route = merged_waiting_time
            elif index_hash > 0:
                constructed_route = constructed_route.merge(merged_waiting_time, how='inner', on=['departure_time_' + str(index_plus_one)])
                #take the intersection of the dates => only get the common dates and retain those rows with common dates
                constructed_route['dates'] = [a & b for a,b in zip(constructed_route['dates_x'], constructed_route['dates_y'])]
                constructed_route = constructed_route[constructed_route['dates'].map(lambda d: len(d)) > 0]
                constructed_route = constructed_route.drop(['dates_x','dates_y'], axis=1)        
        #make a list of all the columns of waiting_times
        list_column_waiting_time = []
        for i in range(1, index_plus_one + 1):
            list_column_waiting_time.append('waiting_time_' + str(i))
        #sum all the waiting times together for each route itinerary
        constructed_route['waiting_time'] = constructed_route[list_column_waiting_time].astype(int).sum(1)
        
        #sometimes it is impossible to find trips that follow each other
        if not constructed_route.empty:
            #when the loop is finished, take the last arrival time, that will be used to calculate the travel time
            time_constructed_route = constructed_route[['departure_time_1', 'arrival_time_' + str(index_plus_one), 'waiting_time', 'dates']]
            time_constructed_route = time_constructed_route.rename(columns = {'departure_time_1':'departure_time', 'arrival_time_' + str(index_plus_one):'arrival_time'})
            time_constructed_route = calculate_time_difference(time_constructed_route, 'arrival_time', 'departure_time', 'time_diff_min')
            #add here a new column count dates that is the sum of the common dates
            time_constructed_route['count_dates'] = time_constructed_route['dates'].apply(lambda x: len(x))
            sum_count_dates = time_constructed_route['count_dates'].sum()
            #take the first most frequent one
            #create the weighted sum
            time_constructed_route['WS_travel_time'] = (time_constructed_route['time_diff_min'] * time_constructed_route['count_dates'])/sum_count_dates
            time_constructed_route['WS_waiting_time'] = (time_constructed_route['waiting_time'] * time_constructed_route['count_dates'])/sum_count_dates    
            weighted_sum_tt = time_constructed_route['WS_travel_time'].sum()
            weighted_sum_wt = time_constructed_route['WS_waiting_time'].sum()
            #Add this to the first dataframe
            route_creation_frequency_single.loc[index_sequence,'travel_time'] = weighted_sum_tt
            route_creation_frequency_single.loc[index_sequence,'waiting_time'] = weighted_sum_wt
        #if there is no trips that follow each other with the hash from the array
        else:
            route_creation_frequency_single = route_creation_frequency_single.drop(index_sequence)
            
    return route_creation_frequency_single

In [36]:
def calculate_hash_route_creation(route_creation): 
    '''calculates the hash and the hash inverse of the route_creation'''
    #copy the route_creation dataFrame
    route_creation_hash = route_creation.copy()
    #calculate the hash and the hash inverse using the lists in stop_sequence
    route_creation_hash['hash'] = route_creation_hash['stop_sequence'].apply(lambda x: hash(tuple(x)))
    route_creation_hash['hash_inverse'] = route_creation_hash['stop_sequence'].apply(lambda x: hash(tuple(x[::-1])))
    return route_creation_hash

In [37]:
'''Regroup the routes that are the same (even though they are in the opposite direction)'''

def regroup_same_stop_sequences(route_creation_hash):
    '''regroups the stop_sequences that are the same'''
    
    route_creation_max_hash = route_creation_hash.copy()
    route_creation_max_hash['max_hash'] = route_creation_max_hash[['hash', 'hash_inverse']].max(axis=1)
    #create a df that sums the frequence of the trips going from opposite directions
    route_creation_max_hash_freq = route_creation_max_hash.groupby(['route_id','max_hash'], as_index = False)[['frequency']].sum()
    #renames the max_hash column into hash so it the dataframe can be merged with route_hash_without_freq
    route_creation_max_hash_freq = route_creation_max_hash_freq.rename(columns = {'max_hash':'hash'})
    #drops the column freq_sequence_route because the one that is of interest is in route_creation_max_hash_freq
    route_hash_without_freq = route_creation_hash.copy().drop(['frequency'], axis = 1)
    route_hash_without_freq = route_hash_without_freq.drop_duplicates(subset=['route_id', 'hash'])
    route_hash_freq_combined_first_merge = pd.merge(route_creation_max_hash_freq, route_hash_without_freq, on=['route_id', 'hash'], how='left')
    #selects the part of the dataset that doesn't have NaN (because for the NaN, their hash_value that was max was the one in hash_inverse and it didn't exist in the other df), so we can concatenate it with the part that had NaN later
    route_hash_freq_first_part = route_hash_freq_combined_first_merge[pd.notnull(route_hash_freq_combined_first_merge['stop_sequence'])]
    #selects one part the part of the dataset that does have NaN, so we can concatenate it with the part that has no NaN later on.
    #but first, we will need to fill those NaN values (done in the code lines behind this one)
    route_hash_freq_second_part = route_hash_freq_combined_first_merge[pd.isnull(route_hash_freq_combined_first_merge['stop_sequence'])][['route_id', 'hash', 'frequency']]
    #renames the hash column into hash_inverse so it the dataframe can be merged with route_hash_without_freq (because it didn't work with 'hash' on the first merge)
    route_hash_freq_second_part = route_hash_freq_second_part.rename(columns = {'hash':'hash_inverse'})
    route_hash_freq_second_part = pd.merge(route_hash_freq_second_part, route_hash_without_freq, on=['route_id', 'hash_inverse'], how='left')
    #the hash that is of interest in the final df will be hash and not hash_inverse
    route_hash_freq_combined_not_sorted = pd.concat([route_hash_freq_first_part, route_hash_freq_second_part])
    route_hash_freq_combined = route_hash_freq_combined_not_sorted.sort_values(by = ['route_id'])
    route_hash_freq_combined = route_hash_freq_combined.reset_index(drop = True)
    return route_hash_freq_combined

In [38]:
'''Deletes the routes that do not represent 10% of the total route frequency and creates new route, if some of them are different'''

def apply_treshold_route_creation(route_hash_freq_combined): 
    #calculates the total frequency per route_id
    frequency_each_route = route_hash_freq_combined.groupby(['route_id'], as_index = False)['frequency'].sum()
    frequency_treshold = frequency_each_route.copy()
    #calculates the treshold (here 10%)
    frequency_treshold['frequency'] = frequency_treshold['frequency']/10
    frequency_treshold.rename(columns = {'frequency':'frequency_treshold'}, inplace = True)
    route_hash_freq_treshold = route_hash_freq_combined.merge(frequency_treshold, on='route_id', how = 'left')
    #find the sequences that are not more than 10% of the route frequency and delete them
    index_names = route_hash_freq_treshold[route_hash_freq_treshold['frequency'] < route_hash_freq_treshold['frequency_treshold']].index
    route_hash_freq_treshold.drop(index_names, inplace = True)
    #drop the routes with the same hash as others
    route_hash_freq_treshold['max_hash'] = route_hash_freq_treshold[['hash', 'hash_inverse']].max(axis=1)
    route_hash_freq_treshold = route_hash_freq_treshold.drop_duplicates(subset='max_hash')
    route_hash_freq_treshold  = route_hash_freq_treshold.drop(['hash_inverse', 'max_hash'], axis = 1)
    #selects the sequences that are not the first most frequent per route_id
    sequences_max_freq = route_hash_freq_treshold.groupby(['route_id'],as_index = False)['frequency'].max()
    sequences_max_freq.rename(columns = {'frequency':'max_frequency'}, inplace = True)
    sequences_max_freq_merged = route_hash_freq_treshold.merge(sequences_max_freq, on='route_id', how='left')
    sequences_max_freq_index = sequences_max_freq_merged[sequences_max_freq_merged['frequency'] == sequences_max_freq_merged['max_frequency']].drop_duplicates(subset='route_id').index
    sequences_non_max_freq_index = sequences_max_freq_merged[~sequences_max_freq_merged.index.isin(sequences_max_freq_index)].index
    #those selected sequences get a new route_id that starts from routes['route_id'].max() + 1 and increments by one for each new route
    route_id_creation =  0 + 1
    new_route_id_column = list(range(route_id_creation, route_id_creation + len(sequences_non_max_freq_index)))    
    sequences_max_freq_merged.loc[sequences_non_max_freq_index, 'route_id'] = new_route_id_column
    #keep only the column route_id and stop_sequence
    final_routes = sequences_max_freq_merged.drop(sequences_max_freq_merged[sequences_max_freq_merged['frequency'] == 0].index)
    final_routes = final_routes.drop(columns=['hash', 'frequency', 'frequency_treshold', 'max_frequency'])
    return final_routes

In [39]:
''' To keep only the routes that have at least one Swiss station in their route_sequence'''

def keep_swiss_routes(final_routes):
    non_swiss_routes = set()
    for index_route, route in final_routes.iterrows():
        is_in_Switzerland = False
        for stop in route['stop_sequence']:
            if stop in set(swiss_stops_Switzerland_series):
                is_in_Switzerland = True
                break
        if not is_in_Switzerland:
            route_id = route['route_id']
            non_swiss_routes.add(route_id)
    swiss_routes = final_routes.loc[~final_routes['route_id'].isin(non_swiss_routes)]    

    return swiss_routes

In [40]:
'''calculates the distances of the trip'''

def calculate_distance_from_lat_long(name_first, name_second, stop_df):
        lon_first, lat_first = math.radians(stop_df[stop_df['stop_name'] == name_first].iloc[0]['stop_lon']), math.radians(stop_df[stop_df['stop_name'] == name_first].iloc[0]['stop_lat'])
        lon_second, lat_second = math.radians(stop_df[stop_df['stop_name'] == name_second].iloc[0]['stop_lon']), math.radians(stop_df[stop_df['stop_name'] == name_second].iloc[0]['stop_lat'])
        # The radius of the earth
        R = 6373.0 
        # To calculate the change in coordinates
        dlon = lon_second - lon_first
        dlat = lat_second - lat_first
        # To use the Haversine formula to get the distance in kilometers between the starting_station and the ending_station
        a = math.sin(dlat / 2)**2 + math.cos(lat_first) * math.cos(lat_second) * math.sin(dlon / 2)**2
        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
        # To calculate the distance
        distance = R * c
        return distance

def calculate_distance(stop_sequence, stop_df):
    distance = 0
    for index_stop ,stop in enumerate(stop_sequence):
        index_plus_one = index_stop + 1
        if index_plus_one <= len(stop_sequence) - 1:
            distance += calculate_distance_from_lat_long(stop, stop_sequence[index_plus_one], stop_df)
    return distance

In [41]:
'''Makes a set that can be used for building the edges of the graph using Networkx package'''

def create_df_for_Networkx(final_routes):
    '''return df_for_edges a df that can be used to build a Networkx L-space graph'''
    #takes the list stop sequence and make it a new column for each stop
    stop_sequence_values = final_routes.apply(lambda x: pd.Series(x['stop_sequence']),axis=1).stack().reset_index(level=1, drop=True)
    stop_sequence_values.name = 'stop_sequence'
    final_routes_stops = final_routes.drop('stop_sequence', axis=1).join(stop_sequence_values)
    final_routes_stops = final_routes_stops.reset_index(drop=True)
    #Creates a shifted instance of the df to use it for the final result
    final_routes_stops_shifted = final_routes_stops.shift()
    #Check if which of the rows are followed by a row with the same trip_id
    final_routes_stops_shifted['match'] = final_routes_stops_shifted['route_id'].eq(final_routes_stops['route_id'])
    #Drop the rows for which this condition is not satisfied
    final_routes_stops_shifted.drop(final_routes_stops_shifted[final_routes_stops_shifted['match'] == False].index, inplace = True)
    final_routes_stops_shifted.rename(columns=
      {"stop_sequence": "stop_name_1",
      "stop_name": "stop_name_1"}, inplace=True)
    #joins the df with its shifted version sothat each sequence of two stations is represented in the table as a row
    df_for_edges = final_routes_stops_shifted.join(final_routes_stops[['stop_sequence']], lsuffix='_caller', rsuffix='_other', how='left')
    df_for_edges.rename(columns=
      {"stop_sequence": "stop_name_2",
      "stop_name": "stop_name_2"}, inplace=True)

    df_for_edges = df_for_edges.drop_duplicates()
    df_for_edges = df_for_edges[['route_id','stop_name_1', 'stop_name_2']]
    df_for_edges = df_for_edges.reset_index(drop=True)
    return df_for_edges

## To apply the route creation function

In [43]:
def full_route_creation(stop_sequences_df, number_of_trips_per_hash, service_id_count_dates, trips_hash_stop_sequence, stops_cleaned_stop_times_trips_merge_dates, stops_cleaned):
    '''return a df that can be used to make a Networkx L-space (with treshold applied of 10%)'''
    index_of_extendable, index_of_begin_sequences, index_of_complete_sequences = get_extention_indexes(stop_sequences_df)
    route_creation_first = possible_sequences_construction(stop_sequences_df, index_of_extendable, index_of_begin_sequences, index_of_complete_sequences)
    route_creation_second = add_full_sequences(stop_sequences_df, route_creation_first, index_of_complete_sequences)
    route_creation_third = add_unused_sequences(stop_sequences_df, route_creation_second)
    route_creation_frequency_single_travel_time = give_begin_end_time(route_creation_third, trips_hash_stop_sequence, stops_cleaned_stop_times_trips_merge_dates)
    route_creation_hash = calculate_hash_route_creation(route_creation_frequency_single_travel_time)
    route_hash_freq_combined = regroup_same_stop_sequences(route_creation_hash)
    final_routes = apply_treshold_route_creation(route_hash_freq_combined)
    swiss_routes = keep_swiss_routes(final_routes)
    swiss_routes['distance'] = swiss_routes['stop_sequence'].apply(lambda x: calculate_distance(x, stops_cleaned))
    df_for_edges = create_df_for_Networkx(swiss_routes)
    
    return final_routes, swiss_routes, df_for_edges

In [44]:
#final_routes_Switzerland, swiss_routes_Switzerland, df_for_edges_Switzerland = full_route_creation(route_hash_service_freq_Switzerland, route_hash_service_freq_Switzerland.copy(), service_id_df_Switzerland, trips_hash_stop_sequence_Switzerland, trips_routes_stop_times_stops_dates_Switzerland, stops_cleaned_Switzerland))
#final_routes_Switzerland
#swiss_routes_Switzerland
#df_for_edges_Switzerland

In [45]:
#df_for_edges.to_csv(r'/Users/pol/Desktop/CSV_export/df_for_edges_Switzerland.csv', index = False, header=True, encoding='utf-8-sig')

In [46]:
#swiss_routes.to_csv(r'/Users/pol/Desktop/CSV_export/swiss_routes_Switzerland.csv', index = False, header=True, encoding='utf-8-sig')

In [48]:
stop_sequences_df, number_of_trips_per_hash, service_id_count_dates, trips_hash_stop_sequence, stops_cleaned_stop_times_trips_merge_dates, stops_cleaned = route_hash_service_freq_Switzerland, route_hash_service_freq_Switzerland.copy(), service_id_df_Switzerland, trips_hash_stop_sequence_Switzerland, trips_routes_stop_times_stops_dates_Switzerland, stops_cleaned_Switzerland
index_of_extendable, index_of_begin_sequences, index_of_complete_sequences = get_extention_indexes(stop_sequences_df)
route_creation_first = possible_sequences_construction(stop_sequences_df, index_of_extendable, index_of_begin_sequences, index_of_complete_sequences)
route_creation_second = add_full_sequences(stop_sequences_df, route_creation_first, index_of_complete_sequences)
route_creation_third = add_unused_sequences(stop_sequences_df, route_creation_second)
route_creation_frequency_single_travel_time = give_begin_end_time(route_creation_third, trips_hash_stop_sequence, stops_cleaned_stop_times_trips_merge_dates)
route_creation_hash = calculate_hash_route_creation(route_creation_frequency_single_travel_time)
route_hash_freq_combined = regroup_same_stop_sequences(route_creation_hash)
final_routes = apply_treshold_route_creation(route_hash_freq_combined)
swiss_routes = keep_swiss_routes(final_routes)
swiss_routes['distance'] = swiss_routes['stop_sequence'].apply(lambda x: calculate_distance(x, stops_cleaned))
df_for_edges = create_df_for_Networkx(swiss_routes)

ValueError: 

In [None]:
stop_sequences_df, number_of_trips_per_hash, service_id_count_dates, trips_hash_stop_sequence, stops_cleaned_stop_times_trips_merge_dates, stops_cleaned = route_hash_service_freq_Switzerland, route_hash_service_freq_Switzerland.copy(), service_id_df_Switzerland, trips_hash_stop_sequence_Switzerland, trips_routes_stop_times_stops_dates_Switzerland, stops_cleaned_Switzerland
route_creation_frequency_single_travel_time = give_begin_end_time(route_creation_third, trips_hash_stop_sequence, stops_cleaned_stop_times_trips_merge_dates)
route_creation_hash = calculate_hash_route_creation(route_creation_frequency_single_travel_time)
route_hash_freq_combined = regroup_same_stop_sequences(route_creation_hash)
final_routes = apply_treshold_route_creation(route_hash_freq_combined)
swiss_routes = keep_swiss_routes(final_routes)
swiss_routes['distance'] = swiss_routes['stop_sequence'].apply(lambda x: calculate_distance(x, stops_cleaned))
df_for_edges = create_df_for_Networkx(swiss_routes)