# Import of packages

In [1]:
'''To import the required packages.'''
import pandas as pd
import numpy as np
import networkx as nx
import collections
import matplotlib.pyplot as plt
import math
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

from kneed import KneeLocator
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA

#import osmnx as ox

# Settings

In [2]:
'''To display all output results of a Jupyter cell.'''
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
'''To ensure that the output results of extensive output results are not truncated.'''
#pd.options.display.max_rows = 4000

'To ensure that the output results of extensive output results are not truncated.'

In [4]:
'''To change the width of the Notebook to see the output on the screen'''
#from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:100% !important; }</style>"))

'To change the width of the Notebook to see the output on the screen'

# **Belgian railway system**

# Import of the Belgian railway datasets

In [5]:
'''To register the GitHub link with the Belgian data as a variable.'''
datalink = "https://raw.githubusercontent.com/polkuleuven/Thesis_Train/main/gtfs_train_Belgium_1503/"

'To register the GitHub link with the Belgian data as a variable.'

In [6]:
'''Import all the GTFS data'''
#To import the agency dataset that contains limited information about Belgian NMBS/SNCB railway agency
agency_Belgium = pd.read_csv(datalink + "agency.txt", sep=",")
#To import the stops dataset that contains information about the ids, the names and the geographical coordinates of the Belgian railway stations.
stops_Belgium = pd.read_csv(datalink + "stops.txt", sep=",")
#To import the translations dataset that provides the French-, Dutch-, German- and English-language translations of the Belgian railway stations.
translations_Belgium = pd.read_csv(datalink + "translations.txt", sep=",")
#To import the transfers dataset that gives the minimum transfer time to switch routes at each Belgian railway station.
transfers_Belgium = pd.read_csv(datalink + "transfers.txt", sep=",")
#To import the routes dataset that provides the id, the name and the type of vehicle used for all Belgian railway routes.
routes_Belgium = pd.read_csv(datalink + "routes.txt", sep=",")
#To import the trips dataset that gives for all routes an overview of the trips and the headsigns of these trips belonging to the Belgian railway route.
#The service_id is an indication of all the dates this trip is valid (consultable in the calendar_dates dataset).
trips_Belgium = pd.read_csv(datalink + "trips.txt", sep=",")
#To import the stop_times dataset that gives for all trips an overview of the ids of the stations served and the sequence in which these stations are served. 
#In addition, for all the trips the arrival and departure times at the stations served are given.
stop_times_Belgium = pd.read_csv(datalink + "stop_times.txt", sep=",")
#To import the calendar dataset that gives the first and last date of all data observations.
calendar_Belgium = pd.read_csv(datalink + "calendar.txt", sep=",")
#To import the calendar_dates dataset that gives for each service_id all the exact dates when that service_id is valid.
calendar_dates_Belgium = pd.read_csv(datalink + "calendar_dates.txt", sep=",")
#To import the stop_time_overrides dataset 
stop_time_overrides_Belgium = pd.read_csv(datalink + "stop_time_overrides.txt", sep=",")

'Import all the GTFS data'

# Cleaning of the Belgian railway data

In [7]:
'''To clean the routes_Belgium df'''
allowed_route_type = {'IC', 'L', 'P', 'ICT', 'IZY'}
routes_cleaned_Belgium = routes_Belgium[(routes_Belgium['route_short_name'].isin(allowed_route_type)) | (routes_Belgium['route_short_name'].str.startswith('S'))]
routes_cleaned_Belgium

'To clean the routes_Belgium df'

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
18,115,NMBS/SNCB,IC,Tournai -- Mouscron,,103,,,
19,116,NMBS/SNCB,IC,Bruges -- Knokke,,103,,,
20,117,NMBS/SNCB,L,Verviers-Central -- Spa-Geronstère,,100,,,
21,118,NMBS/SNCB,IC,Knokke -- Gand-Saint-Pierre,,103,,,
22,119,NMBS/SNCB,L,Grammont -- Denderleeuw,,100,,,
...,...,...,...,...,...,...,...,...,...
701,730,NMBS/SNCB,L,Haversin -- Libramont,,100,,,
702,731,NMBS/SNCB,L,Marloie -- Libramont,,100,,,
703,732,NMBS/SNCB,IZY,Paris Nord (FR) -- Bruxelles-Midi,,101,,,
704,733,NMBS/SNCB,IC,Den Haag HS (NL) -- Bruxelles-Midi,,103,,,


In [8]:
'''To clean the calendar_dates_Belgium df'''
#To filter the dates from the selected begin to the end date
begin_date = 20210314
end_date = 20210713
calendar_dates_cleaned_Belgium = calendar_dates_Belgium.copy()
calendar_dates_cleaned_Belgium = calendar_dates_cleaned_Belgium.drop(calendar_dates_cleaned_Belgium[(calendar_dates_cleaned_Belgium['date'] > end_date) | (calendar_dates_cleaned_Belgium['date'] < begin_date)].index)
calendar_dates_cleaned_Belgium

'To clean the calendar_dates_Belgium df'

Unnamed: 0,service_id,date,exception_type
0,1,20210314,1
1,2,20210315,1
2,2,20210316,1
3,2,20210317,1
4,2,20210318,1
...,...,...,...
487564,0,20210709,1
487565,0,20210710,1
487566,0,20210711,1
487567,0,20210712,1


In [10]:
'''To clean the stops_Belgium df.''' 
#To eliminate the stop_ids in the stops dataset that contain an underscore or that start with a character 'S'. 
stops_cleaned_Belgium = stops_Belgium[(~stops_Belgium['stop_id'].str.contains('_')) & (~stops_Belgium['stop_id'].str.contains('S'))]

#To modify the object datatype of the stop_id column to the NumPy int64 datatype
stops_cleaned_Belgium.loc[:,'stop_id'] = stops_cleaned_Belgium.loc[:,'stop_id'].astype(np.int64)

#To remove the accents from the stop_name and to change to uppercase
stops_cleaned_Belgium.loc[:,'stop_name'] = stops_cleaned_Belgium.loc[:,'stop_name'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
stops_cleaned_Belgium.loc[:,'stop_name'] = stops_cleaned_Belgium.loc[:,'stop_name'].str.upper()

#To initialize the Nominatim API to get the location from the input string 
geolocator = Nominatim(user_agent="application")
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=0.2)

#To get the location with the geolocator.reverse() function and to extract the country from the location instance
country_list = []
for index, row in stops_cleaned_Belgium.iterrows():
    latitude = row['stop_lat']
    longitude = row['stop_lon']
    # To assign the latitude and longitude into a geolocator.reverse() method
    location = reverse((latitude, longitude), language='en', exactly_one=True)
    # To get the country from the given list and parsed into a dictionary with raw function()
    address = location.raw['address']
    country = address.get('country', '')
    country_list.append(country)

#To add the values of country_list as a new attribute country 
stops_cleaned_Belgium.loc[:,'country'] = country_list
stops_cleaned_Belgium

#To calculate the total number of Belgian stations in the stops_cleaned dataset
belgian_stops_Belgium = stops_cleaned_Belgium[stops_cleaned_Belgium['country'] == 'Belgium']
belgian_stops_Belgium_series = stops_cleaned_Belgium.loc[stops_cleaned_Belgium['country'] == 'Belgium', 'stop_name']

'To clean the stops_Belgium df.'

RateLimiter caught an error, retrying (0/2 tries). Called with (*((50.77083, 6.105277),), **{'language': 'en', 'exactly_one': True}).
Traceback (most recent call last):
  File "/Users/pol/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 426, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/pol/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 421, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 1347, in getresponse
    response.begin()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 268, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/Users/pol/opt/anaconda3/lib/python3.8/socket.py", line 669, in readinto
    return s

RateLimiter caught an error, retrying (0/2 tries). Called with (*((49.61913, 6.132853),), **{'language': 'en', 'exactly_one': True}).
Traceback (most recent call last):
  File "/Users/pol/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 426, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/pol/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 421, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 1347, in getresponse
    response.begin()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 268, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/Users/pol/opt/anaconda3/lib/python3.8/socket.py", line 669, in readinto
    return s

RateLimiter caught an error, retrying (0/2 tries). Called with (*((49.897290000000005, 6.09149),), **{'language': 'en', 'exactly_one': True}).
Traceback (most recent call last):
  File "/Users/pol/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 426, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/pol/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 421, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 1347, in getresponse
    response.begin()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 268, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/Users/pol/opt/anaconda3/lib/python3.8/socket.py", line 669, in readinto
   

RateLimiter caught an error, retrying (1/2 tries). Called with (*((49.95278, 6.02),), **{'language': 'en', 'exactly_one': True}).
Traceback (most recent call last):
  File "/Users/pol/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 426, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/pol/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 421, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 1347, in getresponse
    response.begin()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 268, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/Users/pol/opt/anaconda3/lib/python3.8/socket.py", line 669, in readinto
    return self.

RateLimiter caught an error, retrying (0/2 tries). Called with (*((50.05472, 6.031389),), **{'language': 'en', 'exactly_one': True}).
Traceback (most recent call last):
  File "/Users/pol/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 426, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/pol/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 421, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 1347, in getresponse
    response.begin()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 268, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/Users/pol/opt/anaconda3/lib/python3.8/socket.py", line 669, in readinto
    return s

RateLimiter caught an error, retrying (0/2 tries). Called with (*((49.61111, 6.05),), **{'language': 'en', 'exactly_one': True}).
Traceback (most recent call last):
  File "/Users/pol/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 426, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/pol/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 421, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 1347, in getresponse
    response.begin()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 268, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/Users/pol/opt/anaconda3/lib/python3.8/socket.py", line 669, in readinto
    return self.

RateLimiter caught an error, retrying (0/2 tries). Called with (*((49.62587, 6.020131),), **{'language': 'en', 'exactly_one': True}).
Traceback (most recent call last):
  File "/Users/pol/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 426, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/pol/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 421, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 1347, in getresponse
    response.begin()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 268, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/Users/pol/opt/anaconda3/lib/python3.8/socket.py", line 669, in readinto
    return s

RateLimiter swallowed an error after 2 retries. Called with (*((49.62587, 6.020131),), **{'language': 'en', 'exactly_one': True}).
Traceback (most recent call last):
  File "/Users/pol/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 426, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "/Users/pol/opt/anaconda3/lib/python3.8/site-packages/urllib3/connectionpool.py", line 421, in _make_request
    httplib_response = conn.getresponse()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 1347, in getresponse
    response.begin()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "/Users/pol/opt/anaconda3/lib/python3.8/http/client.py", line 268, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/Users/pol/opt/anaconda3/lib/python3.8/socket.py", line 669, in readinto
    return self

AttributeError: 'NoneType' object has no attribute 'raw'

### To merge the files

In [12]:
'''To select all required fields'''
agency_cleaned_Belgium = agency_Belgium[['agency_id', 'agency_name', 'agency_url', 'agency_timezone']]
routes_cleaned_Belgium = routes_cleaned_Belgium[['route_id', 'agency_id', 'route_short_name', 'route_long_name', 'route_type']]
trips_cleaned_Belgium = trips_Belgium[['trip_id', 'route_id', 'service_id', 'trip_headsign']]
calendar_dates_cleaned_Belgium = calendar_dates_cleaned_Belgium[['service_id', 'date']]
stops_cleaned_Belgium = stops_cleaned_Belgium[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']]
stop_times_cleaned_Belgium = stop_times_Belgium[['trip_id', 'stop_id', 'arrival_time', 'departure_time', 'stop_sequence']]

'To select all required fields'

In [14]:
''' To merge the Belgian files '''
#To merge the stop_times df with the stops df on stop_id
stop_times_stops_Belgium = pd.merge(stop_times_cleaned_Belgium, stops_cleaned_Belgium[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']], on='stop_id')

#To merge the trips df with the routes df on route_id
routes_trips_Belgium = pd.merge(routes_cleaned_Belgium[['route_id']], trips_cleaned_Belgium, on='route_id')

#To merge the stop_times_stops df with the trips_routes df on trip_id
routes_trips_stop_times_stops_Belgium = pd.merge(routes_trips_Belgium, stop_times_stops_Belgium, on='trip_id')

#To take only the service_ids present in both the routes_trips_stop_times_stops df and the calendar_dates df into account
calendar_dates_cleaned_unique_Belgium = calendar_dates_cleaned_Belgium['service_id'].unique()
routes_trips_stop_times_stops_calendar_dates_Belgium = routes_trips_stop_times_stops_Belgium[(routes_trips_stop_times_stops_Belgium['service_id'].isin(calendar_dates_cleaned_unique_Belgium))]

' To merge the Belgian files '

In [16]:
routes_trips_stop_times_stops_calendar_dates_Belgium

Unnamed: 0,route_id,trip_id,service_id,trip_headsign,stop_id,arrival_time,departure_time,stop_sequence,stop_name,stop_lat,stop_lon
0,115,88____:007::8885704:8885001:4:523:20210418,14,Tournai,8885001,05:23:00,05:23:00,4,TOURNAI,50.61313,3.396940
1,115,88____:007::8885704:8885001:4:523:20210418,14,Tournai,8885068,05:19:00,05:19:00,3,FROYENNES,50.62989,3.354835
2,115,88____:007::8885704:8885001:4:523:20210418,14,Tournai,8885753,05:12:00,05:12:00,2,HERSEAUX,50.71390,3.245961
3,115,88____:007::8885704:8885001:4:523:20210418,14,Tournai,8885704,05:07:00,05:07:00,1,MOUSCRON,50.74100,3.228449
4,115,88____:007::8885704:8885001:4:623:20210418,14,Tournai,8885001,06:23:00,06:23:00,4,TOURNAI,50.61313,3.396940
...,...,...,...,...,...,...,...,...,...,...,...
431487,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8811270,17:01:00,17:01:00,19,VELTEM,50.90052,4.633520
431488,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8811288,16:59:00,16:59:00,18,HERENT,50.90353,4.672190
431489,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8819406,17:10:00,17:12:00,23,BRUSSELS AIRPORT-ZAVENTEM,50.89646,4.482072
431490,734,88____:007::8821105:8812005:22:1723:20210418,25,Bruxelles-Nord,8821063,16:11:00,16:11:00,5,ANVERS-LUCHTBAL,51.24413,4.425033


In [15]:
# Stuur je mij een foto van deze output?

#test
test = routes_trips_stop_times_stops_calendar_dates_Belgium.copy()
test = test.drop(['trip_headsign', 'arrival_time', 'departure_time', 'stop_sequence', 'stop_lat', 'stop_lon'], axis=1)
test = test.astype(str)
test.describe(include=['object'])

Unnamed: 0,route_id,trip_id,service_id,stop_id,stop_name
count,417593,417593,417593,417593,417593
unique,591,25183,1125,608,608
top,471,88____:007::8844628:8891702:73:2559:20210325,187,8814001,BRUXELLES-MIDI
freq,17798,73,23367,7679,7679


# Preparation space-of-stops

In [None]:
#verderwerken met routes_trips_stop_times_stops_calendar_dates_Belgium (evt te hernoemen)