In [1]:
!pip install geopy



In [2]:
'''To import the required packages.'''
import pandas as pd
import numpy as np
import networkx as nx
import collections
import matplotlib.pyplot as plt
import math
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Import of the Swiss railway datasets

In [3]:
'''To register the GitHub link with the Swiss data as a variable.'''
datalink = "https://raw.githubusercontent.com/polkuleuven/Thesis_Train/main/gtfs_train_Switzerland_1503/"

In [4]:
'''Import all the GTFS data'''

#To import the agency dataset that contains limited information about the Swiss SBB railway agency.
agency_Switzerland = pd.read_csv(datalink + "agency.txt", sep=",")
#To import the stops dataset that contains information about the ids, the names and the geographical coordinates of the Swiss SBB railway stations.
stops_Switzerland = pd.read_csv(datalink + "stops.txt", sep=",")
#To import the feed_info dataset that contains limited information about the Swiss SBB railway feed.
feed_info_Switzerland = pd.read_csv(datalink + "feed_info.txt", sep=",")
#To import the transfers dataset that gives the minimum transfer time to switch routes at each Swiss SBB railway station.
transfers_not_cleaned_Switzerland = pd.read_csv(datalink + "transfers.txt", sep=",")
#To import the routes dataset that provides the id, the name and the type of vehicle used for all Swiss SBB railway routes.
routes_Switzerland = pd.read_csv(datalink + "routes.txt", sep=",")
#To import the trips dataset that gives for all routes an overview of the trips and the headsigns of these trips belonging to the Swiss SBB railway route.
#The service_id is an indication of all the dates this trip is valid (consultable in the calendar_dates dataset).
trips_Switzerland = pd.read_csv(datalink + "trips.txt", sep=",")
#To import the stop_times dataset that gives for all trips an overview of the ids of the stations served and the sequence in which these stations are served. 
#In addition, for all the trips the arrival and departure times at the stations served are given.
stop_times_Switzerland = pd.read_csv(datalink + "stop_times.txt", sep=",")
#To import the calendar dataset that gives the first and last date of all data observations.
calendar_Switzerland = pd.read_csv(datalink + "calendar.txt", sep=",")
#To import the calendar_dates dataset that gives for each service_id all the exact dates when that service_id is valid.
calendar_dates_Switzerland = pd.read_csv(datalink + "calendar_dates.txt", sep=",")


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Cleaning of the Swiss railway data

In [5]:
''' To clean the routes_Switzerland df.'''
#To keep the train routes
routes_cleaned_Switzerland = routes_Switzerland[routes_Switzerland['route_type'] == 2]
routes_cleaned_Switzerland

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,04236.06____.020:4236,06____,RE,RE 4236,,2,,,
1,04700.06____.001:4700,06____,RE,RE 4700,,2,,,
2,04700.06____.014:4700,06____,RE,RE 4700,,2,,,
3,04701.06____.002:4701,06____,RE,RE 4701,,2,,,
4,04701.06____.015:4701,06____,RE,RE 4701,,2,,,
...,...,...,...,...,...,...,...,...,...
49346,87945.L7____.001:5,L7____,S5,S 5,,2,,FFEA00,000000
49347,87946.L7____.001:5,L7____,S5,S 5,,2,,FFEA00,000000
49348,87947.L7____.001:5,L7____,S5,S 5,,2,,FFEA00,000000
49349,87948.L7____.001:5,L7____,S5,S 5,,2,,FFEA00,000000


In [6]:
''' To clean the trips_Switzerland df.'''
# To remove the routes that are not train routes
no_route_id_train_route = routes_Switzerland.loc[routes_Switzerland['route_type'] != 2, 'route_id']
trips_cleaned_Switzerland = trips_Switzerland[(~trips_Switzerland['route_id'].isin(no_route_id_train_route))]

# To remove the accents from the trip_headsign and to change to uppercase
trips_cleaned_Switzerland.loc[:,'trip_headsign'] = trips_cleaned_Switzerland.loc[:,'trip_headsign'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
trips_cleaned_Switzerland.loc[:,'trip_headsign'] = trips_cleaned_Switzerland.loc[:,'trip_headsign'].str.upper()
trips_cleaned_Switzerland

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,bikes_allowed,attributes_ch
0,04236.06____.020:4236,19311,0:1,KARLSRUHE HBF,4236,,,,0,MO
1,04700.06____.001:4700,133763,1:1,KARLSRUHE HBF,4700,,,,0,MO
2,04700.06____.001:4700,1,1:2,KARLSRUHE HBF,4700,,,,0,MO
3,04700.06____.001:4700,13336,1:3,KARLSRUHE HBF,4700,,,,0,MO
4,04700.06____.001:4700,2610,1:4,KARLSRUHE HBF,4700,,,,0,MO
...,...,...,...,...,...,...,...,...,...,...
90223,87947.L7____.001:5,2364,49348:1,WEIL AM RHEIN,5,,,,0,MO
90224,87948.L7____.001:5,56518,49349:1,LORRACH HBF,5,,,,0,MO
90225,87948.L7____.001:5,17102,49349:2,ZELL (WIESENTAL),5,,,,0,MO
90226,87949.L7____.001:5,17102,49350:1,WEIL AM RHEIN,5,,,,0,MO


In [7]:
''' To clean the stop_times_Switzerland df.'''
# To remove the stop_times trip_ids that are not trip_ids that belong to train routes
no_trip_id_train_route = trips_Switzerland.loc[trips_Switzerland['route_id'].isin(no_route_id_train_route), 'trip_id']
stop_times_cleaned_Switzerland = stop_times_Switzerland[(~stop_times_Switzerland['trip_id'].isin(no_trip_id_train_route))]

# To remove the superfluous characters of the stop_id (: and platform_code)
stop_times_cleaned_Switzerland_column = stop_times_cleaned_Switzerland['stop_id'].str.split(':').str[0]
stop_times_cleaned_Switzerland.loc[:,'stop_id'] = stop_times_cleaned_Switzerland_column

# To make the stop_ids numerical and to remove the duplicate stop_ids
stop_times_cleaned_Switzerland.loc[:,'stop_id'] = stop_times_cleaned_Switzerland.loc[:,'stop_id'].astype(np.int64)
stop_times_cleaned_Switzerland

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,attributes_ch
0,0:1,18:16:00,18:16:00,8014554,0,,0,0,,
1,0:1,18:28:00,18:30:00,8014534,1,,0,0,,
2,0:1,18:40:00,18:40:00,8014529,2,,0,0,,
3,0:1,18:49:00,18:50:00,8014521,3,,0,0,,
4,0:1,18:58:00,18:59:00,8014518,4,,0,0,,
...,...,...,...,...,...,...,...,...,...,...
1034556,49350:2,10:38:00,10:38:00,8069220,3,,3,3,,X
1034557,49350:2,10:40:00,10:40:00,8014429,4,,3,3,,X
1034558,49350:2,10:41:00,10:42:00,8060979,5,,3,3,,X
1034559,49350:2,10:43:00,10:43:00,8060978,6,,3,3,,X


In [8]:
''' To clean the stops_Switzerland df   (1).'''
# To select all stops that appear in the stop_times df
stops_cleaned_df_Switzerland = stop_times_cleaned_Switzerland[['stop_id']]
stops_cleaned_df_Switzerland = stops_cleaned_df_Switzerland.drop_duplicates()

# To make the stop_id attribute in the initial stops_Switzerland df numerical
stops_initial_Switzerland_column = stops_Switzerland['stop_id'].str.split(':').str[0]
stops_initial_Switzerland = stops_Switzerland.copy()
stops_initial_Switzerland.loc[:,'stop_id'] = stops_initial_Switzerland_column

# To make the stop_ids numerical and to remove the duplicate stop_ids
stops_initial_Switzerland = stops_initial_Switzerland[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']]
stops_initial_Switzerland.loc[:,'stop_id'] = stops_initial_Switzerland.loc[:,'stop_id'].astype(np.int64)
stops_initial_Switzerland = stops_initial_Switzerland.drop_duplicates()

# To remove the accents from the stop_name and to change to uppercase
stops_initial_Switzerland.loc[:,'stop_name'] = stops_initial_Switzerland.loc[:,'stop_name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
stops_initial_Switzerland.loc[:,'stop_name'] = stops_initial_Switzerland.loc[:,'stop_name'].str.upper()

# To merge the stops_cleaned_df_Switzerland and the stops_initial_Switzerland df
stops_cleaned_Switzerland = pd.merge(stops_initial_Switzerland, stops_cleaned_df_Switzerland, on = 'stop_id', how='right')

In [9]:
''' To clean the stops_Switzerland df   (2).'''
# To initialize the Nominatim API to get the location from the input string 
geolocator = Nominatim(user_agent="application")
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=0.2)

# To get the location with the geolocator.reverse() function and to extract the country from the location instance
country_list = []
for index, row in stops_cleaned_Switzerland.iterrows():
    latitude = row['stop_lat']
    longitude = row['stop_lon']
    # To assign the latitude and longitude into a geolocator.reverse() method
    location = reverse((latitude, longitude), language='en', exactly_one=True)
    # To get the country from the given list and parsed into a dictionary with raw function()
    address = location.raw['address']
    country = address.get('country', '')
    country_list.append(country)

# To add the values of country_list as a new attribute country     
stops_cleaned_Switzerland.loc[:,'country'] = country_list
stops_cleaned_Switzerland

# To calculate the total number of Belgian stations in the stops_cleaned dataset
swiss_stops_Switzerland = stops_cleaned_Switzerland[stops_cleaned_Switzerland['country'] == 'Switzerland']
swiss_stops_Switzerland_series = stops_cleaned_Switzerland.loc[stops_cleaned_Switzerland['country'] == 'Switzerland', 'stop_name']

In [10]:
stops_cleaned_Switzerland.to_csv(r'/Users/Jos/Documents/KU Leuven/Thesis/GitHub/Thesis_Train/stops_cleaned_18-03_Ine/stops_cleaned_Switzerland.csv', index = False, header=True, encoding='utf-8-sig')

# Exploratory data analysis with the Swiss railway data

In [11]:
'''To calculate the number of unique route_ids '''
set_routes_Switzerland = {r for r in routes_cleaned_Switzerland['route_id']}
len(set_routes_Switzerland)

46037

In [12]:
'''To calculate the total number of stations in the stops_cleaned_Switzerland dataset'''
set_stations_Switzerland = {s for s in stops_cleaned_Switzerland['stop_id']}
len(set_stations_Switzerland)

2608

In [13]:
'''To calculate the total number of Swiss stations in the stops_cleaned dataset'''
set_swiss_stations = {s for s in swiss_stops_Switzerland['stop_id']}
len(set_swiss_stations)

1762

# **Preparation for the L-space representation of the Swiss railway system**

In [14]:
'''To merge a selection of the trips dataset and a selection of the routes dataset on route_id'''
trips_routes_Switzerland = pd.merge(trips_cleaned_Switzerland[['route_id','service_id','trip_id', 'trip_headsign']], routes_cleaned_Switzerland[['route_id', 'route_short_name', 'route_long_name']], on='route_id')
trips_routes_Switzerland

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,route_short_name,route_long_name
0,04236.06____.020:4236,19311,0:1,KARLSRUHE HBF,RE,RE 4236
1,04700.06____.001:4700,133763,1:1,KARLSRUHE HBF,RE,RE 4700
2,04700.06____.001:4700,1,1:2,KARLSRUHE HBF,RE,RE 4700
3,04700.06____.001:4700,13336,1:3,KARLSRUHE HBF,RE,RE 4700
4,04700.06____.001:4700,2610,1:4,KARLSRUHE HBF,RE,RE 4700
...,...,...,...,...,...,...
86909,87947.L7____.001:5,2364,49348:1,WEIL AM RHEIN,S5,S 5
86910,87948.L7____.001:5,56518,49349:1,LORRACH HBF,S5,S 5
86911,87948.L7____.001:5,17102,49349:2,ZELL (WIESENTAL),S5,S 5
86912,87949.L7____.001:5,17102,49350:1,WEIL AM RHEIN,S5,S 5


In [15]:
'''To merge a selection of the stop_times_cleaned_Switzerland dataset with the stops_cleaned_Switzerland dataset'''
stop_times_stops_Switzerland = pd.merge(stop_times_cleaned_Switzerland[['trip_id','arrival_time', 'departure_time','stop_id','stop_sequence']], stops_cleaned_Switzerland, on='stop_id')
stop_times_stops_Switzerland

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_name,stop_lat,stop_lon,country
0,0:1,18:16:00,18:16:00,8014554,0,ENGEN,47.856347,8.772786,Germany
1,7:1,06:32:00,06:33:00,8014554,4,ENGEN,47.856347,8.772786,Germany
2,7:2,06:32:00,06:33:00,8014554,4,ENGEN,47.856347,8.772786,Germany
3,12:1,07:18:00,07:19:00,8014554,12,ENGEN,47.856347,8.772786,Germany
4,12:2,07:18:00,07:19:00,8014554,12,ENGEN,47.856347,8.772786,Germany
...,...,...,...,...,...,...,...,...,...
991431,49275:2,24:04:00,24:04:00,8014439,2,RIEHEN,47.583156,7.652008,Switzerland
991432,49276:1,24:14:00,24:14:00,8014439,14,RIEHEN,47.583156,7.652008,Switzerland
991433,49276:2,24:14:00,24:14:00,8014439,14,RIEHEN,47.583156,7.652008,Switzerland
991434,49276:3,24:14:00,24:14:00,8014439,14,RIEHEN,47.583156,7.652008,Switzerland


In [16]:
'''To merge a selection of the stop_times_stops_Switzerland dataset with the trips_routes_Switzerland dataset.'''
trips_routes_stop_times_stops_Switzerland = pd.merge(trips_routes_Switzerland, stop_times_stops_Switzerland, on='trip_id')
trips_routes_stop_times_stops_Switzerland

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,route_short_name,route_long_name,arrival_time,departure_time,stop_id,stop_sequence,stop_name,stop_lat,stop_lon,country
0,04236.06____.020:4236,19311,0:1,KARLSRUHE HBF,RE,RE 4236,18:16:00,18:16:00,8014554,0,ENGEN,47.856347,8.772786,Germany
1,04236.06____.020:4236,19311,0:1,KARLSRUHE HBF,RE,RE 4236,18:28:00,18:30:00,8014534,1,IMMENDINGEN,47.936007,8.729536,Germany
2,04236.06____.020:4236,19311,0:1,KARLSRUHE HBF,RE,RE 4236,18:40:00,18:40:00,8014529,2,DONAUESCHINGEN,47.947786,8.498919,Germany
3,04236.06____.020:4236,19311,0:1,KARLSRUHE HBF,RE,RE 4236,18:49:00,18:50:00,8014521,3,VILLINGEN (SCHWARZW),48.058022,8.465261,Germany
4,04236.06____.020:4236,19311,0:1,KARLSRUHE HBF,RE,RE 4236,18:58:00,18:59:00,8014518,4,ST GEORGEN (SCHWARZW),48.123813,8.341955,Germany
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991431,87949.L7____.001:5,56518,49350:2,WEIL AM RHEIN,S5,S 5,10:40:00,10:40:00,8014429,4,WEIL AM RHEIN-OST,47.590629,7.634780,Germany
991432,87949.L7____.001:5,56518,49350:2,WEIL AM RHEIN,S5,S 5,10:38:00,10:38:00,8069220,3,LORRACH DAMMSTRASSE,47.597713,7.655224,Germany
991433,87949.L7____.001:5,56518,49350:2,WEIL AM RHEIN,S5,S 5,10:37:00,10:37:00,8014440,2,LORRACH-STETTEN,47.601418,7.659032,Germany
991434,87949.L7____.001:5,56518,49350:2,WEIL AM RHEIN,S5,S 5,10:35:00,10:35:00,8069221,1,LORRACH MUSEUM/BURGHOF,47.607961,7.661196,Germany


In [17]:
'''To create a route_sequence dataset that gives for each trip_id that belongs to a route the sequence of stations served'''
route_sequence_Switzerland = trips_routes_stop_times_stops_Switzerland.groupby(['route_id','route_long_name','trip_headsign','trip_id','stop_sequence'], as_index=False)[['stop_name', 'stop_lat', 'stop_lon']].last()
route_sequence_Switzerland

Unnamed: 0,route_id,route_long_name,trip_headsign,trip_id,stop_sequence,stop_name,stop_lat,stop_lon
0,00001.000044.018:1,R 1,LE LOCLE,33248:1,0,LES BRENETS,47.067210,6.707389
1,00001.000044.018:1,R 1,LE LOCLE,33248:1,1,LES FRETES,47.058580,6.725787
2,00001.000044.018:1,R 1,LE LOCLE,33248:1,2,LE LOCLE LE CHALET,47.055918,6.738986
3,00001.000044.018:1,R 1,LE LOCLE,33248:1,3,LE LOCLE,47.057861,6.746153
4,00001.000044.028:1,R 1,LE LOCLE,33249:1,0,LES BRENETS,47.067210,6.707389
...,...,...,...,...,...,...,...,...
991431,96814.000011.101:96814,TER 96814,LYON PART DIEU,23295:1,1,BELLEGARDE (AIN),46.110918,5.825962
991432,96814.000011.101:96814,TER 96814,LYON PART DIEU,23295:1,2,LYON PART DIEU,45.760564,4.859990
991433,96818.000011.101:96818,TER 96818,LYON PART DIEU,23296:1,0,GENEVE,46.210213,6.142452
991434,96818.000011.101:96818,TER 96818,LYON PART DIEU,23296:1,1,BELLEGARDE (AIN),46.110918,5.825962


'''To calculate the hash and the hash_inverse values for the stop sequence of each trip_id'''

#To copy the trips_routes_Switzerland df
trips_hash_Switzerland = trips_routes_Switzerland.copy()

#To create a column called hash that contains NaN values
trips_hash_Switzerland['hash'] = np.nan

#To create a column called hash_inverse that contains NaN values
trips_hash_Switzerland['hash_inverse'] = np.nan

#For each trip_id in trips_routes_Switzerland, the stop_sequence that gets calculated is the subset of the stop_times dataset for that trip_id. 
#The tuple that results from the stop_id column of this subset dataset contains all the stop_ids that get served by this trip_id. 
 
#The hash value of the tuple of the stop_id column is calculated and is placed in the hash column of the trip_id in the trips_routes dataset
#The inverse_hash value of the tuple of the stop_id column is calculated as well.

for trip_Switzerland in trips_routes_Switzerland['trip_id'].unique():
    stop_sequence_Switzerland = stop_times_cleaned_Switzerland[stop_times_cleaned_Switzerland['trip_id'] == trip_Switzerland].sort_values(by = 'stop_sequence')
    trips_hash_Switzerland.loc[trips_hash_Switzerland['trip_id'] == trip_Switzerland, 'hash'] = hash(tuple(stop_sequence_Switzerland['stop_id']))
    trips_hash_Switzerland.loc[trips_hash_Switzerland['trip_id'] == trip_Switzerland, 'hash_inverse'] = hash(tuple(list(stop_sequence_Switzerland['stop_id'])[::-1]))

In [18]:
#trips_hash_Switzerland.to_csv(r'/Users/pol/Desktop/CSV_export/trips_hash_Switzerland.csv', index = False, header=True, encoding='utf-8-sig')

In [19]:
datalink = "https://raw.githubusercontent.com/polkuleuven/Thesis_Train/main"
trips_hash_Switzerland = pd.read_csv(datalink + "/hash_cleaning/trips_hash_Switzerland.csv", sep=",")
trips_hash_Switzerland

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,route_short_name,route_long_name,hash,hash_inverse
0,04236.06____.020:4236,19311,0:1,KARLSRUHE HBF,RE,RE 4236,4.062624e+17,6.097513e+18
1,04700.06____.001:4700,133763,1:1,KARLSRUHE HBF,RE,RE 4700,1.524342e+18,-8.976875e+18
2,04700.06____.001:4700,1,1:2,KARLSRUHE HBF,RE,RE 4700,1.524342e+18,-8.976875e+18
3,04700.06____.001:4700,13336,1:3,KARLSRUHE HBF,RE,RE 4700,1.524342e+18,-8.976875e+18
4,04700.06____.001:4700,2610,1:4,KARLSRUHE HBF,RE,RE 4700,1.524342e+18,-8.976875e+18
...,...,...,...,...,...,...,...,...
86909,87947.L7____.001:5,2364,49348:1,WEIL AM RHEIN,S5,S 5,-6.490443e+18,3.777507e+17
86910,87948.L7____.001:5,56518,49349:1,LORRACH HBF,S5,S 5,3.777507e+17,-6.490443e+18
86911,87948.L7____.001:5,17102,49349:2,ZELL (WIESENTAL),S5,S 5,9.073753e+18,-6.629309e+18
86912,87949.L7____.001:5,17102,49350:1,WEIL AM RHEIN,S5,S 5,-6.629309e+18,9.073753e+18


In [20]:
''' To groupby the trip_id and to order the stop_sequence in an ascending order (the stop_sequences of some
routes are initially in descending order while other stop_sequences are in ascending order) '''

trips_stop_sequence_ascending_Switzerland = stop_times_stops_Switzerland.groupby(['trip_id'], as_index=False).apply(lambda x: x.sort_values('stop_sequence'))
trips_stop_sequence_ascending_Switzerland

Unnamed: 0,Unnamed: 1,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_name,stop_lat,stop_lon,country
0,0,0:1,18:16:00,18:16:00,8014554,0,ENGEN,47.856347,8.772786,Germany
0,541,0:1,18:28:00,18:30:00,8014534,1,IMMENDINGEN,47.936007,8.729536,Germany
0,1100,0:1,18:40:00,18:40:00,8014529,2,DONAUESCHINGEN,47.947786,8.498919,Germany
0,1895,0:1,18:49:00,18:50:00,8014521,3,VILLINGEN (SCHWARZW),48.058022,8.465261,Germany
0,2592,0:1,18:58:00,18:59:00,8014518,4,ST GEORGEN (SCHWARZW),48.123813,8.341955,Germany
...,...,...,...,...,...,...,...,...,...,...
86913,7970,9:4,07:24:00,07:25:00,8014277,14,BADEN-BADEN,48.790327,8.190831,Germany
86913,8657,9:4,07:31:00,07:32:00,8014245,15,RASTATT,48.860483,8.215623,Germany
86913,10637,9:4,07:35:00,07:35:00,8014241,16,MUGGENSTURM,48.876805,8.274293,Germany
86913,10705,9:4,07:38:00,07:39:00,8014240,17,MALSCH,48.889506,8.323585,Germany


In [21]:
''' To put the stop_names of a stop sequence of a trip_id in a list '''
trips_stop_sequence_Switzerland = trips_stop_sequence_ascending_Switzerland.groupby('trip_id')['stop_name'].apply(lambda group_series: group_series.tolist()).reset_index()
trips_stop_sequence_Switzerland.rename(columns={'stop_name':'stop_sequence'}, inplace=True)
trips_stop_sequence_Switzerland

Unnamed: 0,trip_id,stop_sequence
0,0:1,"[ENGEN, IMMENDINGEN, DONAUESCHINGEN, VILLINGEN..."
1,10000:1,"[ROMANSHORN, AMRISWIL, WEINFELDEN, FRAUENFELD,..."
2,10001:1,"[ZURICH HB, OLTEN, BAHN-2000-STRECKE, BERN]"
3,10002:1,"[ROMANSHORN, AMRISWIL, WEINFELDEN, FRAUENFELD,..."
4,10003:1,"[ROMANSHORN, AMRISWIL, WEINFELDEN, FRAUENFELD,..."
...,...,...
86909,99:1,"[KARLSRUHE HBF, RASTATT, BADEN-BADEN, BUHL (BA..."
86910,9:1,"[VILLINGEN (SCHWARZW), ST GEORGEN (SCHWARZW), ..."
86911,9:2,"[VILLINGEN (SCHWARZW), ST GEORGEN (SCHWARZW), ..."
86912,9:3,"[VILLINGEN (SCHWARZW), ST GEORGEN (SCHWARZW), ..."


In [22]:
''' To add the list of stop_sequence of stations to the trips_hash_Switzerland df by joining on trip_id'''
# To add the stop_sequence of stations to the trips_hash_Switzerland df by joining on trip_id
trips_hash_stop_sequence_Switzerland = pd.merge(trips_hash_Switzerland, trips_stop_sequence_Switzerland, on='trip_id', how='left')

# To put the columns in a more logical order
trips_hash_stop_sequence_Switzerland = trips_hash_stop_sequence_Switzerland[['route_id', 'route_long_name','service_id','trip_headsign','trip_id','hash', 'hash_inverse','stop_sequence']]
trips_hash_stop_sequence_Switzerland

Unnamed: 0,route_id,route_long_name,service_id,trip_headsign,trip_id,hash,hash_inverse,stop_sequence
0,04236.06____.020:4236,RE 4236,19311,KARLSRUHE HBF,0:1,4.062624e+17,6.097513e+18,"[ENGEN, IMMENDINGEN, DONAUESCHINGEN, VILLINGEN..."
1,04700.06____.001:4700,RE 4700,133763,KARLSRUHE HBF,1:1,1.524342e+18,-8.976875e+18,"[OFFENBURG, APPENWEIER, RENCHEN (D), ACHERN, B..."
2,04700.06____.001:4700,RE 4700,1,KARLSRUHE HBF,1:2,1.524342e+18,-8.976875e+18,"[OFFENBURG, APPENWEIER, RENCHEN (D), ACHERN, B..."
3,04700.06____.001:4700,RE 4700,13336,KARLSRUHE HBF,1:3,1.524342e+18,-8.976875e+18,"[OFFENBURG, APPENWEIER, RENCHEN (D), ACHERN, B..."
4,04700.06____.001:4700,RE 4700,2610,KARLSRUHE HBF,1:4,1.524342e+18,-8.976875e+18,"[OFFENBURG, APPENWEIER, RENCHEN (D), ACHERN, B..."
...,...,...,...,...,...,...,...,...
86909,87947.L7____.001:5,S 5,2364,WEIL AM RHEIN,49348:1,-6.490443e+18,3.777507e+17,"[LORRACH HBF, LORRACH MUSEUM/BURGHOF, LORRACH-..."
86910,87948.L7____.001:5,S 5,56518,LORRACH HBF,49349:1,3.777507e+17,-6.490443e+18,"[WEIL AM RHEIN, WEIL AM RHEIN GARTENSTADT, WEI..."
86911,87948.L7____.001:5,S 5,17102,ZELL (WIESENTAL),49349:2,9.073753e+18,-6.629309e+18,"[WEIL AM RHEIN, WEIL AM RHEIN GARTENSTADT, WEI..."
86912,87949.L7____.001:5,S 5,17102,WEIL AM RHEIN,49350:1,-6.629309e+18,9.073753e+18,"[ZELL (WIESENTAL), HAUSEN-RAITBACH, FAHRNAU, S..."


In [23]:
''' To count the number of dates for each service_id '''
service_id_df_Switzerland = calendar_dates_Switzerland.groupby(['service_id'])[['service_id']].count().rename(columns={'service_id':'count_service_id'}).reset_index()
service_id_df_Switzerland

Unnamed: 0,service_id,count_service_id
0,1,166
1,2,113
2,3,8
3,5,253
4,6,252
...,...,...
17632,244128,12
17633,244129,89
17634,254888,182
17635,254889,359


In [24]:
''' To regroup the days per service_id in a set '''
service_id_dates_Switzerland = calendar_dates_Switzerland.groupby('service_id')['date'].apply(lambda group_series: set(group_series.tolist())).reset_index()
service_id_dates_Switzerland.rename(columns={'date':'dates'}, inplace=True)
service_id_dates_Switzerland = service_id_dates_Switzerland.merge(service_id_df_Switzerland, on='service_id', how='left')
service_id_dates_Switzerland

Unnamed: 0,service_id,dates,count_service_id
0,1,"{20210429, 20210430, 20210201, 20210202, 20210...",166
1,2,"{20211201, 20211202, 20211203, 20211204, 20211...",113
2,3,"{20210213, 20210215, 20210216, 20210217, 20210...",8
3,5,"{20211201, 20211202, 20211203, 20211206, 20211...",253
4,6,"{20211201, 20211202, 20211203, 20211206, 20211...",252
...,...,...,...
17632,244128,"{20201218, 20201219, 20210116, 20210213, 20210...",12
17633,244129,"{20211203, 20211204, 20211210, 20211211, 20210...",89
17634,254888,"{20211204, 20211205, 20211206, 20211207, 20211...",182
17635,254889,"{20210101, 20210102, 20210103, 20210104, 20210...",359


In [25]:
''' To put the different trip_ids in a list after joining on (route_id, route_long_name, hash and service_id) '''
route_hash_freq_Switzerland = trips_hash_stop_sequence_Switzerland.groupby(['route_id','route_long_name','hash', 'hash_inverse', 'service_id'])['trip_id'].apply(lambda group_series: group_series.tolist()).reset_index()
route_hash_freq_Switzerland

Unnamed: 0,route_id,route_long_name,hash,hash_inverse,service_id,trip_id
0,00001.000044.018:1,R 1,3.884346e+18,5.894307e+18,936,[33248:1]
1,00001.000044.028:1,R 1,3.884346e+18,5.894307e+18,869,[33249:1]
2,00001.000104.001:1,CC 1,3.283807e+18,-6.597664e+18,5072,[4117:1]
3,00002.000044.017:2,R 2,5.894307e+18,3.884346e+18,936,[33250:1]
4,00002.000044.024:2,R 2,5.894307e+18,3.884346e+18,869,[33251:1]
...,...,...,...,...,...,...
86909,96806.000011.102:96806,TER 96806,5.622718e+18,1.467208e+18,44307,[23292:1]
86910,96810.000011.101:96810,TER 96810,1.875898e+18,-4.169908e+18,116609,[23293:1]
86911,96812.000011.101:96812,TER 96812,1.875898e+18,-4.169908e+18,9453,[23294:1]
86912,96814.000011.101:96814,TER 96814,1.875898e+18,-4.169908e+18,116609,[23295:1]


In [26]:
''' To add the sequence of stops to the route_hash_freq dataset '''
route_hash_freq_Switzerland = pd.merge(route_hash_freq_Switzerland, trips_hash_stop_sequence_Switzerland[['route_id','hash', 'hash_inverse', 'service_id','stop_sequence']], on=['route_id', 'hash', 'hash_inverse', 'service_id'], how='left')
route_hash_freq_Switzerland = route_hash_freq_Switzerland.drop_duplicates( subset = ['route_id', 'hash', 'service_id'], keep = 'first')
route_hash_freq_Switzerland

Unnamed: 0,route_id,route_long_name,hash,hash_inverse,service_id,trip_id,stop_sequence
0,00001.000044.018:1,R 1,3.884346e+18,5.894307e+18,936,[33248:1],"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ..."
1,00001.000044.028:1,R 1,3.884346e+18,5.894307e+18,869,[33249:1],"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ..."
2,00001.000104.001:1,CC 1,3.283807e+18,-6.597664e+18,5072,[4117:1],"[BRIENZ BRB, PLANALP, BRIENZER ROTHORN]"
3,00002.000044.017:2,R 2,5.894307e+18,3.884346e+18,936,[33250:1],"[LE LOCLE, LE LOCLE LE CHALET, LES FRETES, LES..."
4,00002.000044.024:2,R 2,5.894307e+18,3.884346e+18,869,[33251:1],"[LE LOCLE, LE LOCLE LE CHALET, LES FRETES, LES..."
...,...,...,...,...,...,...,...
86909,96806.000011.102:96806,TER 96806,5.622718e+18,1.467208e+18,44307,[23292:1],"[GENEVE, BELLEGARDE (AIN), CULOZ, GRENOBLE]"
86910,96810.000011.101:96810,TER 96810,1.875898e+18,-4.169908e+18,116609,[23293:1],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]"
86911,96812.000011.101:96812,TER 96812,1.875898e+18,-4.169908e+18,9453,[23294:1],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]"
86912,96814.000011.101:96814,TER 96814,1.875898e+18,-4.169908e+18,116609,[23295:1],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]"


In [27]:
''' To calculate the number of trip_ids in the list of trip_ids and to add it as a new attribute '''
number_trip_ids_Switzerland = []
for list_trip_ids_Switzerland in route_hash_freq_Switzerland['trip_id']:
    count_Switzerland = len(list_trip_ids_Switzerland)
    number_trip_ids_Switzerland.append(count_Switzerland)
route_hash_freq_Switzerland['number_trip_ids'] = number_trip_ids_Switzerland

route_hash_freq_Switzerland

Unnamed: 0,route_id,route_long_name,hash,hash_inverse,service_id,trip_id,stop_sequence,number_trip_ids
0,00001.000044.018:1,R 1,3.884346e+18,5.894307e+18,936,[33248:1],"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...",1
1,00001.000044.028:1,R 1,3.884346e+18,5.894307e+18,869,[33249:1],"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...",1
2,00001.000104.001:1,CC 1,3.283807e+18,-6.597664e+18,5072,[4117:1],"[BRIENZ BRB, PLANALP, BRIENZER ROTHORN]",1
3,00002.000044.017:2,R 2,5.894307e+18,3.884346e+18,936,[33250:1],"[LE LOCLE, LE LOCLE LE CHALET, LES FRETES, LES...",1
4,00002.000044.024:2,R 2,5.894307e+18,3.884346e+18,869,[33251:1],"[LE LOCLE, LE LOCLE LE CHALET, LES FRETES, LES...",1
...,...,...,...,...,...,...,...,...
86909,96806.000011.102:96806,TER 96806,5.622718e+18,1.467208e+18,44307,[23292:1],"[GENEVE, BELLEGARDE (AIN), CULOZ, GRENOBLE]",1
86910,96810.000011.101:96810,TER 96810,1.875898e+18,-4.169908e+18,116609,[23293:1],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]",1
86911,96812.000011.101:96812,TER 96812,1.875898e+18,-4.169908e+18,9453,[23294:1],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]",1
86912,96814.000011.101:96814,TER 96814,1.875898e+18,-4.169908e+18,116609,[23295:1],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]",1


In [28]:
''' To merge the route_hash_freq_Switzerland df with the service_id_dates to get the sets of corresponding dates '''
route_hash_service_freq_Switzerland = pd.merge(route_hash_freq_Switzerland, service_id_dates_Switzerland, on='service_id', how='left')
route_hash_service_freq_Switzerland_copy = route_hash_service_freq_Switzerland.copy()
route_hash_service_freq_Switzerland

Unnamed: 0,route_id,route_long_name,hash,hash_inverse,service_id,trip_id,stop_sequence,number_trip_ids,dates,count_service_id
0,00001.000044.018:1,R 1,3.884346e+18,5.894307e+18,936,[33248:1],"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...",1,"{20210701, 20210702, 20210705, 20210706, 20210...",171.0
1,00001.000044.028:1,R 1,3.884346e+18,5.894307e+18,869,[33249:1],"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...",1,"{20211201, 20211202, 20211203, 20211206, 20211...",83.0
2,00001.000104.001:1,CC 1,3.283807e+18,-6.597664e+18,5072,[4117:1],"[BRIENZ BRB, PLANALP, BRIENZER ROTHORN]",1,"{20210912, 20210725, 20210822, 20210919, 20210...",13.0
3,00002.000044.017:2,R 2,5.894307e+18,3.884346e+18,936,[33250:1],"[LE LOCLE, LE LOCLE LE CHALET, LES FRETES, LES...",1,"{20210701, 20210702, 20210705, 20210706, 20210...",171.0
4,00002.000044.024:2,R 2,5.894307e+18,3.884346e+18,869,[33251:1],"[LE LOCLE, LE LOCLE LE CHALET, LES FRETES, LES...",1,"{20211201, 20211202, 20211203, 20211206, 20211...",83.0
...,...,...,...,...,...,...,...,...,...,...
86909,96806.000011.102:96806,TER 96806,5.622718e+18,1.467208e+18,44307,[23292:1],"[GENEVE, BELLEGARDE (AIN), CULOZ, GRENOBLE]",1,"{20210418, 20210411}",2.0
86910,96810.000011.101:96810,TER 96810,1.875898e+18,-4.169908e+18,116609,[23293:1],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]",1,"{20210410, 20210412, 20210413, 20210414, 20210...",8.0
86911,96812.000011.101:96812,TER 96812,1.875898e+18,-4.169908e+18,9453,[23294:1],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]",1,"{20210417, 20210410}",2.0
86912,96814.000011.101:96814,TER 96814,1.875898e+18,-4.169908e+18,116609,[23295:1],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]",1,"{20210410, 20210412, 20210413, 20210414, 20210...",8.0


In [29]:
'''Groups the service_id together for each route_id and hash combination'''
for index, combi_route_id_hash in route_hash_service_freq_Switzerland_copy.groupby(['route_id','hash'], as_index = False)['service_id'].last().iterrows():
    set_service_id = set(route_hash_service_freq_Switzerland_copy.loc[(route_hash_service_freq_Switzerland_copy['route_id'] == combi_route_id_hash['route_id']) & (route_hash_service_freq_Switzerland_copy['hash'] == combi_route_id_hash['hash'])]['service_id'])
    route_hash_service_freq_Switzerland_copy.loc[(route_hash_service_freq_Switzerland_copy['route_id'] == combi_route_id_hash['route_id']) & (route_hash_service_freq_Switzerland_copy['hash'] == combi_route_id_hash['hash']),['service_id']] = set_service_id
route_hash_service_freq_Switzerland_copy

Unnamed: 0,route_id,route_long_name,hash,hash_inverse,service_id,trip_id,stop_sequence,number_trip_ids,dates,count_service_id
0,00001.000044.018:1,R 1,3.884346e+18,5.894307e+18,{936},[33248:1],"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...",1,"{20210701, 20210702, 20210705, 20210706, 20210...",171.0
1,00001.000044.028:1,R 1,3.884346e+18,5.894307e+18,{869},[33249:1],"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...",1,"{20211201, 20211202, 20211203, 20211206, 20211...",83.0
2,00001.000104.001:1,CC 1,3.283807e+18,-6.597664e+18,{5072},[4117:1],"[BRIENZ BRB, PLANALP, BRIENZER ROTHORN]",1,"{20210912, 20210725, 20210822, 20210919, 20210...",13.0
3,00002.000044.017:2,R 2,5.894307e+18,3.884346e+18,{936},[33250:1],"[LE LOCLE, LE LOCLE LE CHALET, LES FRETES, LES...",1,"{20210701, 20210702, 20210705, 20210706, 20210...",171.0
4,00002.000044.024:2,R 2,5.894307e+18,3.884346e+18,{869},[33251:1],"[LE LOCLE, LE LOCLE LE CHALET, LES FRETES, LES...",1,"{20211201, 20211202, 20211203, 20211206, 20211...",83.0
...,...,...,...,...,...,...,...,...,...,...
86909,96806.000011.102:96806,TER 96806,5.622718e+18,1.467208e+18,{44307},[23292:1],"[GENEVE, BELLEGARDE (AIN), CULOZ, GRENOBLE]",1,"{20210418, 20210411}",2.0
86910,96810.000011.101:96810,TER 96810,1.875898e+18,-4.169908e+18,{116609},[23293:1],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]",1,"{20210410, 20210412, 20210413, 20210414, 20210...",8.0
86911,96812.000011.101:96812,TER 96812,1.875898e+18,-4.169908e+18,{9453},[23294:1],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]",1,"{20210417, 20210410}",2.0
86912,96814.000011.101:96814,TER 96814,1.875898e+18,-4.169908e+18,{116609},[23295:1],"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]",1,"{20210410, 20210412, 20210413, 20210414, 20210...",8.0


In [30]:
'''Get the distinct stop sequences for all routes to create the possible track combinations later on'''
distinct_stop_sequences_Switzerland = route_hash_service_freq_Switzerland_copy.drop_duplicates(subset = ["route_id", 'hash'])[['route_id','hash','stop_sequence', 'service_id']]
distinct_stop_sequences_Switzerland

Unnamed: 0,route_id,hash,stop_sequence,service_id
0,00001.000044.018:1,3.884346e+18,"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...",{936}
1,00001.000044.028:1,3.884346e+18,"[LES BRENETS, LES FRETES, LE LOCLE LE CHALET, ...",{869}
2,00001.000104.001:1,3.283807e+18,"[BRIENZ BRB, PLANALP, BRIENZER ROTHORN]",{5072}
3,00002.000044.017:2,5.894307e+18,"[LE LOCLE, LE LOCLE LE CHALET, LES FRETES, LES...",{936}
4,00002.000044.024:2,5.894307e+18,"[LE LOCLE, LE LOCLE LE CHALET, LES FRETES, LES...",{869}
...,...,...,...,...
86909,96806.000011.102:96806,5.622718e+18,"[GENEVE, BELLEGARDE (AIN), CULOZ, GRENOBLE]",{44307}
86910,96810.000011.101:96810,1.875898e+18,"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]",{116609}
86911,96812.000011.101:96812,1.875898e+18,"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]",{9453}
86912,96814.000011.101:96814,1.875898e+18,"[GENEVE, BELLEGARDE (AIN), LYON PART DIEU]",{116609}


##Functions for the route creation

In [31]:
'''Some functions to better factorise the functions in the coming cells'''

def select_stop_sequences(stop_sequences_df, route_id):
    '''retruns the stop sequences with the selected route_id'''
    return stop_sequences_df[stop_sequences_df['route_id'] == route_id].copy()

In [32]:
'''Finds the routes that can be either extended from behind or from after and those which are complete sequences'''

def get_extention_indexes(stop_sequences_df):
    '''returns the tree indexes: index_of_extendable, index_of_begin_sequences, index_of_complete_sequences'''
    #intiate the dictionnaries, that will be used to retrieve different rows later on
    index_of_extendable = {}
    index_of_begin_sequences = {}
    index_of_complete_sequences = {}
    for route_id in stop_sequences_df['route_id'].unique():
        #select the route with the route_id selected by the loop iteration
        route_sequences_route_id = select_stop_sequences(stop_sequences_df, route_id)
        for index_trip, trip in route_sequences_route_id.iterrows():
            #checks the extentions possible for the trip that can follow after its last stop
            possible_extentions_after = route_sequences_route_id[route_sequences_route_id['stop_sequence'].apply(lambda x: any(item for item in [trip['stop_sequence'][-1]] if (item == x[0]) and not(set(x[1:]) & set(trip['stop_sequence']))))].copy()
            #checks that those extentions have a common service_id as the trip
            possible_extentions_after = possible_extentions_after[possible_extentions_after['service_id'].apply(lambda x: any(item for item in trip['service_id'] if item in x))].copy()
            #checks the extentions possible for the trip that can follow before its first stop
            possible_extentions_behind = route_sequences_route_id[route_sequences_route_id['stop_sequence'].apply(lambda x: any(item for item in [trip['stop_sequence'][0]] if (item == x[-1]) and not(set(x[:-1]) & set(trip['stop_sequence']))))].copy()        
            #checks that those extentions have a common service_id as the trip
            possible_extentions_behind = possible_extentions_behind[possible_extentions_behind['service_id'].apply(lambda x: any(item for item in trip['service_id'] if item in x))].copy()
            #put all the sequences that can be extended either from the beginning either from the end together
            possible_extentions = possible_extentions_after.append(possible_extentions_behind, ignore_index = True)
            if not possible_extentions.empty:
                if route_id not in index_of_extendable:
                    index_of_extendable[route_id] = []
                index_of_extendable[route_id].append(index_trip)
                if possible_extentions_behind.empty:
                    if route_id not in index_of_begin_sequences:
                        index_of_begin_sequences[route_id] = []
                    index_of_begin_sequences[route_id].append(index_trip)
            elif possible_extentions.empty:
                if route_id not in index_of_complete_sequences:
                    index_of_complete_sequences[route_id] = []
                index_of_complete_sequences[route_id].append(index_trip)
                
    return index_of_extendable, index_of_begin_sequences, index_of_complete_sequences

In [33]:
'''Creates all the sequences of routes possible to reconstruct the real route'''

def possible_sequences_construction(stop_sequences_df, index_of_extendable, index_of_begin_sequences, index_of_complete_sequences):
    '''returns the first part of the route_creation, two others need to be added'''
    import copy
    #create an empty df for the process of route creation
    route_creation  = pd.DataFrame()
    for route_id in index_of_extendable:
        #checks if some parts are begin sequences, if not, then we can't build routes with multiple sequences
        if route_id in index_of_begin_sequences:
            #create a copy of the df with only the route considered in the loop iteration
            routes_with_route_id = select_stop_sequences(stop_sequences_df, route_id)
            #create a df where only the routes that have an end stop as their first element of the sequence
            route_creation_route_id = routes_with_route_id.loc[index_of_begin_sequences[route_id]][['route_id', 'hash', 'stop_sequence', 'service_id']]
            #create a df with the exentable sequences for that route_id
            route_creation_extensions_route_id = routes_with_route_id.loc[index_of_extendable[route_id]][['route_id', 'hash', 'stop_sequence','service_id']]    
            #make the hash column as a column of lists
            route_creation_route_id['hash'] = route_creation_route_id['hash'].apply(lambda x: [x])
            route_creation_route_id = route_creation_route_id.reset_index(drop=True)
            #to stop the while loop when all the routes are complete in the df for the route_id of the loop iteration
            complete_routes = 0
            while complete_routes < len(route_creation_route_id.index):
                #use a deepcopy to not impact the iterrows of the main loop
                route_creation_deep_copy = copy.deepcopy(route_creation_route_id)
                for index_original, route_part in route_creation_deep_copy.iterrows():
                    #create a dataframe of the possible extentions for each route_part
                    #select an extention only if the extention is the next part of the route and also that no other station are repeated in the sequence if this extention is added(otherwise it might cause an infinite loop)
                    possible_extentions = route_creation_extensions_route_id[route_creation_extensions_route_id['stop_sequence'].apply(lambda x: any(item for item in [route_part['stop_sequence'][-1]] if (item == x[0]) and not(set(x[1:]) & set(route_part['stop_sequence']))))].copy()
                    #take only those extentions that have a common service_id with the route_part
                    possible_extentions = possible_extentions[possible_extentions['service_id'].apply(lambda x: any(item for item in route_part['service_id'] if item in x))].copy()                
                    #checks whether any extention fullfilling the criterias has been found
                    if not (possible_extentions.empty):
                        #if so, extend it with every single possibilities
                        for index_extention, possible_extention in possible_extentions.iterrows():
                            #must create a deepcopy, otherwise the orignal hash list will change as well (mutable)
                            updated_hash = copy.deepcopy(route_part['hash'])
                            updated_hash.append(possible_extention['hash'])
                            updated_route_sequence = route_part['stop_sequence'] + possible_extention['stop_sequence'][1:]
                            common_service_id = possible_extention['service_id'] & route_part['service_id']
                            route_creation_route_id.loc[max(route_creation_route_id.index)+1] = [route_id, updated_hash, updated_route_sequence, common_service_id]
                        #then delete the route with the index (see loop here above)
                        route_creation_route_id = route_creation_route_id.drop(index = index_original)            
                    #the route can't be extended anymore
                    else:
                        complete_routes += 1
            #adds all the possible routes created with the trips of the route_id of the main loop
            route_creation = route_creation.append(route_creation_route_id, ignore_index = True)
    return route_creation

In [34]:
'''Adds the full sequences to the route_creation dataframe'''

def add_full_sequences(stop_sequences_df, route_creation, index_of_complete_sequences):
    '''returns the second part of the route_creation, one other needs to be added'''
    for route_id in index_of_complete_sequences:
        #findes all the complete sequences for that route_id
        copy_complete_sequences_df = stop_sequences_df.loc[index_of_complete_sequences[route_id]][['route_id','hash','stop_sequence', 'service_id']].copy()
        copy_complete_sequences_df['hash'] = copy_complete_sequences_df['hash'].apply(lambda x: [x])
        #adds each of them in the route_creation dataframe
        for index_complete_sequence, complete_sequence in copy_complete_sequences_df.iterrows():
            route_creation = route_creation.append(complete_sequence, ignore_index = True)
    route_creation = route_creation.sort_values(by=['route_id'], ignore_index = True)
    return route_creation 

In [35]:
'''Adds the sequences that were not yet added in the route_creation dataframe'''

def add_unused_sequences(stop_sequences_df, route_creation):
    '''returns the third part of the route_creation'''
    for route_id in stop_sequences_df['route_id'].unique():
        if route_id in route_creation['route_id'].unique():
            #get a set of the hashes that were employed to create the routes for that route_id
            used_sequences_hash = set(route_creation[route_creation['route_id'] == route_id].apply(lambda x: pd.Series(x['hash']),axis=1).stack().reset_index(level=1, drop=True))
            #get a tuple of all the route sequences for that route_id
            used_sequences = tuple(route_creation[route_creation['route_id'] == route_id]['stop_sequence'])
            copy_sequences_route_id = select_stop_sequences(stop_sequences_df, route_id)[['route_id','hash','stop_sequence', 'service_id']]
            copy_sequences_route_id['hash'] = copy_sequences_route_id['hash'].apply(lambda x: [x]) 
            #adds the hashes that were not employed in any route creations for that route_id
            for index_trip, trip in copy_sequences_route_id.iterrows():
                #first element of the list because there is always only one element
                if trip['hash'][0] not in used_sequences_hash:
                    #checks that the sequence is not a sublist of any existing sequences
                    is_subsequence = False
                    for sequence in used_sequences:
                        if set(trip['stop_sequence']).issubset(sequence):
                            is_subsequence = True
                    if not is_subsequence:
                        route_creation = route_creation.append(trip, ignore_index = True)
    return route_creation

In [36]:
'''Calculates the frequency of the constructed routes just made in the route_creation dataframe'''
    
def calculate_frequenty_new_sequences(number_of_trips_per_hash, service_id_count_dates, route_creation):
    '''calculates the frequencies of route_construction_third'''
    #put the default value of the frequency to 0
    route_creation['frequency'] = 0
    for index_sequence, sequence in route_creation[['route_id','hash','service_id']].iterrows():
        #initialize the varibles
        sequence_frequency = 0
        set_common_service_id = sequence['service_id']
        if set_common_service_id:
            #select the number_of_trips_per_hash only for the considered route_id
            number_of_trips_per_hash_route_id = number_of_trips_per_hash[number_of_trips_per_hash['route_id'] == sequence['route_id']]
            #only select the trips with the hash value contained in the sequence and with the same route_id
            containing_hash = number_of_trips_per_hash_route_id[number_of_trips_per_hash_route_id['hash'].apply(lambda x: any(item for item in sequence['hash'] if x == item))]
            #loop over each service_id that were common during the trip
            for service_id in set_common_service_id:
                if not service_id_count_dates[service_id_count_dates['service_id'] == service_id].empty:
                    service_id_number_days = service_id_count_dates[service_id_count_dates['service_id'] == service_id].iloc[0]['count_service_id']
                    #adds the minimum number of trips per day multiplied by the number of days in the service_id
                    sequence_frequency += containing_hash[containing_hash['service_id'] == service_id]['number_trip_ids'].min() * service_id_number_days
            #adds the frequency in of the new route sequence
            route_creation.loc[index_sequence, 'frequency'] = sequence_frequency
    return route_creation

In [37]:
def calculate_hash_route_creation(route_creation): 
    '''calculates the hash and the hash inverse of the route_creation'''
    #copy the route_creation dataFrame
    route_creation_hash = route_creation.copy()
    #create a column called hash and hash_invese that contains NaN values
    route_creation_hash['hash'] = np.nan
    route_creation_hash['hash_inverse'] = np.nan
    #calculate the hash and the hash inverse using the lists in stop_sequence
    for index, route_sequence in route_creation_hash.iterrows():
        route_creation_hash.loc[index, 'hash'] = hash(tuple(route_sequence['stop_sequence']))
        route_creation_hash.loc[index, 'hash_inverse'] = hash(tuple(list(route_sequence['stop_sequence'])[::-1]))
    return route_creation_hash

In [38]:
'''Regroup the routes that are the same (even though they are in the opposite direction)'''

def regroup_same_stop_sequences(route_creation_hash):
    '''regroups the stop_sequences that are the same'''
    
    route_creation_max_hash = route_creation_hash.copy()
    route_creation_max_hash['max_hash'] = route_creation_max_hash[['hash', 'hash_inverse']].max(axis=1)
    #create a df that sums the frequence of the trips going from opposite directions
    route_creation_max_hash_freq = route_creation_max_hash.groupby(['route_id','max_hash'], as_index = False)[['frequency']].sum()
    #renames the max_hash column into hash so it the dataframe can be merged with route_hash_without_freq
    route_creation_max_hash_freq = route_creation_max_hash_freq.rename(columns = {'max_hash':'hash'})
    #drops the column freq_sequence_route because the one that is of interest is in route_creation_max_hash_freq
    route_hash_without_freq = route_creation_hash.copy().drop(['frequency'], axis = 1)
    route_hash_without_freq = route_hash_without_freq.drop_duplicates(subset=['route_id', 'hash'])
    route_hash_freq_combined_first_merge = pd.merge(route_creation_max_hash_freq, route_hash_without_freq, on=['route_id', 'hash'], how='left')
    route_hash_freq_combined_first_merge = route_hash_freq_combined_first_merge.drop(['hash_inverse'], axis = 1)
    #selects the part of the dataset that doesn't have NaN (because for the NaN, their hash_value that was max was the one in hash_inverse and it didn't exist in the other df), so we can concatenate it with the part that had NaN later
    route_hash_freq_first_part = route_hash_freq_combined_first_merge[pd.notnull(route_hash_freq_combined_first_merge['stop_sequence'])]
    #selects one part the part of the dataset that does have NaN, so we can concatenate it with the part that has no NaN later on.
    #but first, we will need to fill those NaN values (done in the code lines behind this one)
    route_hash_freq_second_part = route_hash_freq_combined_first_merge[pd.isnull(route_hash_freq_combined_first_merge['stop_sequence'])][['route_id', 'hash', 'frequency']]
    #renames the hash column into hash_inverse so it the dataframe can be merged with route_hash_without_freq (because it didn't work with 'hash' on the first merge)
    route_hash_freq_second_part = route_hash_freq_second_part.rename(columns = {'hash':'hash_inverse'})
    route_hash_freq_second_part = pd.merge(route_hash_freq_second_part, route_hash_without_freq, on=['route_id', 'hash_inverse'], how='left')
    #the hash that is of interest in the final df will be hash and not hash_inverse
    route_hash_freq_second_part  = route_hash_freq_second_part.drop(['hash_inverse'], axis = 1)
    route_hash_freq_combined_not_sorted = pd.concat([route_hash_freq_first_part, route_hash_freq_second_part])
    route_hash_freq_combined = route_hash_freq_combined_not_sorted.sort_values(by = ['route_id'])
    route_hash_freq_combined = route_hash_freq_combined.reset_index(drop = True)
    return route_hash_freq_combined

In [39]:
'''Deletes the routes that do not represent 10% of the total route frequency and creates new route, if some of them are different'''

def apply_treshold_route_creation(route_hash_freq_combined): 
    #calculates the total frequency per route_id
    frequency_each_route = route_hash_freq_combined.groupby(['route_id'], as_index = False)['frequency'].sum()
    frequency_treshold = frequency_each_route.copy()
    #calculates the treshold (here 10%)
    frequency_treshold['frequency'] = frequency_treshold['frequency']/10
    frequency_treshold.rename(columns = {'frequency':'frequency_treshold'}, inplace = True)
    route_hash_freq_treshold = route_hash_freq_combined.merge(frequency_treshold, on='route_id', how = 'left')
    #find the sequences that are not more than 10% of the route frequency and delete them
    index_names = route_hash_freq_treshold[route_hash_freq_treshold['frequency'] < route_hash_freq_treshold['frequency_treshold']].index
    route_hash_freq_treshold.drop(index_names, inplace = True)
    #selects the sequences that are not the most frequent per route_id
    sequences_max_freq = route_hash_freq_treshold.groupby(['route_id'],as_index = False)['frequency'].max()
    sequences_max_freq.rename(columns = {'frequency':'max_frequency'}, inplace = True)
    sequences_max_freq_merged = route_hash_freq_treshold.merge(sequences_max_freq, on='route_id', how='left')
    sequences_non_max_freq_index = sequences_max_freq_merged[sequences_max_freq_merged['frequency'] != sequences_max_freq_merged['max_frequency']].index
    #those selected sequences get a new route_id that starts from routes['route_id'].max() + 1 and increments by one for each new route
    route_id_creation =  0 + 1
    new_route_id_column = list(range(route_id_creation, route_id_creation + len(sequences_non_max_freq_index)))    
    sequences_max_freq_merged.loc[sequences_non_max_freq_index, 'route_id'] = new_route_id_column
    sequences_max_freq_merged = sequences_max_freq_merged.reset_index(drop=True)
    #keep only the column route_id and stop_sequence
    final_routes = sequences_max_freq_merged.drop(columns=['hash', 'frequency', 'frequency_treshold', 'max_frequency', 'service_id'])
    return final_routes

In [40]:
''' To keep only the routes that have at least one Swiss station in their route_sequence'''

def keep_swiss_routes(final_routes):
    non_swiss_routes = set()
    for index_route, route in final_routes.iterrows():
        is_in_Switzerland = False
        for stop in route['stop_sequence']:
            if stop in set(swiss_stops_Switzerland_series):
                is_in_Switzerland = True
                break
        if not is_in_Switzerland:
            route_id = route['route_id']
            non_swiss_routes.add(route_id)
    swiss_routes = final_routes.loc[~final_routes['route_id'].isin(non_swiss_routes)]    

  return swiss_routes

In [41]:
'''Makes a set that can be used for building the edges of the graph using Networkx package'''

def create_df_for_Networkx(final_routes):
    '''return df_for_edges a df that can be used to build a Networkx L-space graph'''
    #takes the list stop sequence and make it a new column for each stop
    stop_sequence_values = final_routes.apply(lambda x: pd.Series(x['stop_sequence']),axis=1).stack().reset_index(level=1, drop=True)
    stop_sequence_values.name = 'stop_sequence'
    final_routes_stops = final_routes.drop('stop_sequence', axis=1).join(stop_sequence_values)
    final_routes_stops = final_routes_stops.reset_index(drop=True)
    #Creates a shifted instance of the df to use it for the final result
    final_routes_stops_shifted = final_routes_stops.shift()
    #Check if which of the rows are followed by a row with the same trip_id
    final_routes_stops_shifted['match'] = final_routes_stops_shifted['route_id'].eq(final_routes_stops['route_id'])
    #Drop the rows for which this condition is not satisfied
    final_routes_stops_shifted.drop(final_routes_stops_shifted[final_routes_stops_shifted['match'] == False].index, inplace = True)
    final_routes_stops_shifted.rename(columns=
      {"stop_sequence": "stop_name_1",
      "stop_name": "stop_name_1"}, inplace=True)
    #joins the df with its shifted version sothat each sequence of two stations is represented in the table as a row
    df_for_edges = final_routes_stops_shifted.join(final_routes_stops[['stop_sequence']], lsuffix='_caller', rsuffix='_other', how='left')
    df_for_edges.rename(columns=
      {"stop_sequence": "stop_name_2",
      "stop_name": "stop_name_2"}, inplace=True)

    df_for_edges = df_for_edges.drop_duplicates()
    df_for_edges = df_for_edges[['route_id','stop_name_1', 'stop_name_2']]
    df_for_edges = df_for_edges.reset_index(drop=True)
    return df_for_edges

In [42]:
def full_route_creation(stop_sequences_df, number_of_trips_per_hash, service_id_count_dates):
    '''return a df that can be used to make a Networkx L-space (with treshold applied of 10%)'''
    index_of_extendable, index_of_begin_sequences, index_of_complete_sequences = get_extention_indexes(stop_sequences_df)
    route_creation_first = possible_sequences_construction(stop_sequences_df, index_of_extendable, index_of_begin_sequences, index_of_complete_sequences)
    route_creation_second = add_full_sequences(stop_sequences_df, route_creation_first, index_of_complete_sequences)
    route_creation_third = add_unused_sequences(stop_sequences_df, route_creation_second)
    route_creation_frequency_single = calculate_frequenty_new_sequences(number_of_trips_per_hash, service_id_count_dates, route_creation_third)
    route_creation_hash = calculate_hash_route_creation(route_creation_frequency_single)
    route_hash_freq_combined = regroup_same_stop_sequences(route_creation_hash)
    final_routes = apply_treshold_route_creation(route_hash_freq_combined)
    swiss_routes = keep_swiss_routes(final_routes)
    df_for_edges = create_df_for_Networkx(swiss_routes)
    
    return final_routes, swiss_routes, df_for_edges

In [44]:
final_routes_Switzerland, swiss_routes_Switzerland, df_for_edges_Switzerland = full_route_creation(distinct_stop_sequences_Switzerland, route_hash_service_freq_Switzerland.copy(), service_id_df_Switzerland)
final_routes_Switzerland
swiss_routes_Switzerland
df_for_edges_Switzerland

KeyboardInterrupt: 

In [None]:
#df_for_edges.to_csv(r'/Users/pol/Desktop/CSV_export/df_for_edges_Switzerland.csv', index = False, header=True, encoding='utf-8-sig')