# Import of packages

In [1]:
'''To import the required packages.'''
import collections
import math
import unicodedata

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler

#import osmnx as ox

# Settings

In [2]:
'''To display all output results of a Jupyter cell.'''
# With "all", every top-level expression statement of a cell is echoed, not only the last one.
# This is also why the bare description strings that open the cells below appear in their outputs.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
'''To ensure that the output results of extensive output results are not truncated.'''
#pd.options.display.max_rows = 4000

In [None]:
'''To change the width of the Notebook to see the output on the screen'''
#from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:100% !important; }</style>"))

# **Dutch railway system**

# Import of the Dutch railway datasets

In [3]:
'''To register the GitHub link with the Dutch data as a variable.'''
# Base URL (ending in a slash) of the raw GTFS files; the loading cells append a file name to it.
datalink = "https://raw.githubusercontent.com/polkuleuven/Thesis_Train/main/gtfs_train_Netherlands_1503/"

'To register the GitHub link with the Dutch data as a variable.'

In [4]:
'''Import all the GTFS data'''
def _read_gtfs_table(file_name):
    """Read the comma-separated GTFS file `file_name` located under `datalink` into a DataFrame."""
    return pd.read_csv(datalink + file_name, sep=",")

# agency: limited information about the Dutch NS railway agency.
agency_Netherlands = _read_gtfs_table("agency.txt")
# stops: ids, names and geographical coordinates of the Dutch NS railway stations.
stops_Netherlands = _read_gtfs_table("stops.txt")
# feed_info: limited information about the Dutch NS railway feed.
feed_info_Netherlands = _read_gtfs_table("feed_info.txt")
# transfers: minimum transfer time to switch routes at each Dutch railway station.
transfers_not_cleaned_Netherlands = _read_gtfs_table("transfers.txt")
# routes: id, name and type of vehicle used for all Dutch NS railway routes.
routes_Netherlands = _read_gtfs_table("routes.txt")
# trips: for every route, its trips and their headsigns.
# The service_id of a trip identifies the dates on which the trip is valid (see calendar_dates).
trips_Netherlands = _read_gtfs_table("trips.txt")
# calendar_dates: for each service_id, the exact dates on which that service_id is valid.
calendar_dates_Netherlands = _read_gtfs_table("calendar_dates.txt")

'Import all the GTFS data'

In [5]:
#stop_times_Netherlands = pd.read_csv("stop_times_Netherlands.csv", sep=",")

In [None]:
#To import the stop_times dataset that gives for all trips an overview of the ids of the stations served and the sequence in which these stations are served.
#In addition, for all the trips the arrival and departure times at the stations served are given.
#The dataset is split over the files stop_times-1.csv ... stop_times-18.csv. All parts are read first and
#concatenated once: calling pd.concat inside the loop would re-copy the growing frame on every iteration.
stop_times_range = [*range(2, 19)]  # numbers of the parts that follow stop_times-1.csv
stop_times_parts = [
    pd.read_csv(datalink + "stop_times-" + str(part_number) + ".csv", sep=",")
    for part_number in [1, *stop_times_range]
]
#ignore_index=True gives the combined frame one unique 0..n-1 index instead of repeating each part's own row labels.
stop_times_Netherlands = pd.concat(stop_times_parts, ignore_index=True)
stop_times_Netherlands

# Cleaning of the Dutch railway data

In [6]:
'''To clean the routes_Netherlands df'''
#To keep the train routes (route_type 2 is "rail" in the GTFS specification)
routes_cleaned_Netherlands = routes_Netherlands[routes_Netherlands['route_type'] == 2]
#To cast every column to string so that describe() summarises all of them as categorical data
routes_cleaned_Netherlands = routes_cleaned_Netherlands.astype(str)
routes_cleaned_Netherlands.describe(include=['object'])

#To change the route_id object datatype to a NumPy int64 datatype.
#astype with a column mapping returns a frame whose route_id column really has dtype int64. Assigning through
#.loc[:, 'route_id'] writes the values into the existing object column on recent pandas versions and leaves
#the dtype unchanged, which makes the later merge with the integer route_id of the trips df unreliable.
routes_cleaned_Netherlands = routes_cleaned_Netherlands.astype({'route_id': np.int64})

'To clean the routes_Netherlands df'

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_color,route_text_color,route_url
count,145,145,145,145,145.0,145,145.0,145.0,145.0
unique,145,11,15,144,1.0,1,1.0,1.0,1.0
top,67407,IFF:NS,Sprinter,Nachtnettrein Utrecht Centraal <-> Rotterdam C...,,2,,,
freq,1,87,47,2,145.0,145,145.0,145.0,145.0


In [7]:
'''To clean the calendar_dates_Netherlands df '''
#To keep only the dates from the selected begin date up to and including the selected end date
begin_date = 20210314
end_date = 20210713
service_dates = calendar_dates_Netherlands['date']
#Rows dated after the end date or before the begin date fall outside the window
outside_window = (service_dates > end_date) | (service_dates < begin_date)
rows_to_drop = calendar_dates_Netherlands[outside_window].index
#drop returns a new frame, so calendar_dates_Netherlands itself stays untouched
calendar_dates_cleaned_Netherlands = calendar_dates_Netherlands.drop(rows_to_drop)
calendar_dates_cleaned_Netherlands

'To clean the calendar_dates_Netherlands df '

Unnamed: 0,service_id,date,exception_type
0,1,20210314,1
1,2,20210314,1
2,2,20210412,1
3,2,20210419,1
4,2,20210426,1
...,...,...,...
181776,4074,20210713,1
181779,4075,20210712,1
181780,4075,20210713,1
181784,4076,20210712,1


In [8]:
def remove_accents(text):
    """Return `text` with its accents removed, as a plain ASCII string.

    The text is decomposed (Unicode NFD) so that an accented character becomes a base
    character followed by combining marks; every non-ASCII code point is then dropped.
    Characters without an ASCII base form (for example 'ß' or 'ø') are therefore removed
    entirely, not transliterated.

    Parameters
    ----------
    text : str, bytes, None, float NaN or any other object
        Text to clean. Bytes are decoded as UTF-8. Other non-string objects are
        converted with str() first.

    Returns
    -------
    str
        The ASCII-only text. Missing values (None or a float NaN) are returned unchanged
        so that the function can be applied to a pandas column that contains gaps.
    """
    # Missing values pass through untouched (NaN is the only float that differs from itself).
    if text is None or (isinstance(text, float) and text != text):
        return text
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    elif not isinstance(text, str):
        text = str(text)
    decomposed = unicodedata.normalize('NFD', text)
    return decomposed.encode('ascii', 'ignore').decode('ascii')

In [None]:
''' To clean the stops df '''
#To take from the stops_Netherlands df all stop_ids that contain a 'stoparea:' to get the correct stop coordinates.
#The explicit copy makes the column assignments below act on an independent frame (no SettingWithCopyWarning);
#na=False treats a missing stop_id as "no match" so that the boolean mask never contains NaN.
stops_cleaned_Netherlands = stops_Netherlands[stops_Netherlands['stop_id'].str.contains('stoparea:', na=False)].copy()

#To remove the accents from the accented characters and to convert the remaining characters to uppercase characters
stops_cleaned_Netherlands['stop_name'] = stops_cleaned_Netherlands['stop_name'].apply(remove_accents).str.upper()
stops_cleaned_Netherlands

#To initialize the Nominatim API to get the location from the coordinates.
#NOTE(review): Nominatim asks for a user agent that identifies the application; "application" is generic
#and should be replaced by a project-specific value.
geolocator = Nominatim(user_agent="application")
#The public Nominatim service allows at most one request per second, hence a delay of one second between calls.
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)

#To get the location with the geolocator.reverse() function and to extract the country from the location instance
country_list = []
number_of_stops = len(stops_cleaned_Netherlands)
stop_coordinates = zip(stops_cleaned_Netherlands['stop_lat'], stops_cleaned_Netherlands['stop_lon'])
for i, (latitude, longitude) in enumerate(stop_coordinates, start=1):
    # To look up the address that belongs to the latitude and longitude of the stop
    location = reverse((latitude, longitude), language='en', exactly_one=True)
    # The lookup yields None when nothing is found or when the rate limiter gave up after its retries;
    # such stops get an empty country instead of aborting the whole loop.
    address = location.raw.get('address', {}) if location is not None else {}
    country_list.append(address.get('country', ''))
    # To report the progress now and then instead of printing one line per stop
    if i % 50 == 0 or i == number_of_stops:
        print(f"{i}/{number_of_stops} stops geocoded")

#To add the values of country_list as a new attribute country
stops_cleaned_Netherlands['country'] = country_list
stops_cleaned_Netherlands
#index=False keeps the row labels out of the file, so that reading it back does not create an 'Unnamed: 0' column
stops_cleaned_Netherlands.to_csv('stops_cleaned_Netherlands_country.csv', index=False)

In [None]:
'''To clean the stop_times df'''
stop_times_cleaned_Netherlands = stop_times_Netherlands.copy()
#To cast stop_id to string so that it can be matched with the string stop_id of the stops df.
#The columns are replaced with df[col] = ... and not with df.loc[:, col] = ...: the .loc form tries to write the
#new values into the existing column, which is deprecated when the new values do not fit the column's dtype.
stop_times_cleaned_Netherlands['stop_id'] = stop_times_cleaned_Netherlands['stop_id'].astype(str)
#To add the stop_name to every stop time (inner join: stop times whose stop_id is unknown in the stops df are dropped)
stop_times_cleaned_Netherlands = pd.merge(stop_times_cleaned_Netherlands, stops_Netherlands[['stop_id', 'stop_name']], on='stop_id')
#To remove the accents from the accented characters and to convert the remaining characters to uppercase characters
stop_times_cleaned_Netherlands['stop_name'] = stop_times_cleaned_Netherlands['stop_name'].apply(remove_accents).str.upper()
stop_times_cleaned_Netherlands

In [9]:
# To reload the cleaned stops (including the 'country' column) from the CSV file written by the geocoding cell,
# presumably so that the slow reverse-geocoding loop does not have to be re-run in every session.
# NOTE: this replaces any stops_cleaned_Netherlands frame that is already held in memory.
stops_cleaned_Netherlands = pd.read_csv("stops_cleaned_Netherlands_country.csv", sep=",")

### To merge the files

In [12]:
'''To select all required fields'''
# Columns that are kept per table; all other columns are left out of the cleaned frames.
agency_fields = ['agency_id', 'agency_name', 'agency_url', 'agency_timezone']
routes_fields = ['route_id', 'agency_id', 'route_short_name', 'route_long_name', 'route_type']
trips_fields = ['trip_id', 'route_id', 'service_id', 'trip_headsign']
calendar_dates_fields = ['service_id', 'date']
stops_fields = ['stop_name', 'stop_lat', 'stop_lon', 'country']
stop_times_fields = ['trip_id', 'stop_name', 'arrival_time', 'departure_time', 'stop_sequence']

agency_cleaned_Netherlands = agency_Netherlands[agency_fields]
routes_cleaned_Netherlands = routes_cleaned_Netherlands[routes_fields]
trips_cleaned_Netherlands = trips_Netherlands[trips_fields]
calendar_dates_cleaned_Netherlands = calendar_dates_cleaned_Netherlands[calendar_dates_fields]
stops_cleaned_Netherlands = stops_cleaned_Netherlands[stops_fields]
stop_times_cleaned_Netherlands = stop_times_cleaned_Netherlands[stop_times_fields]

'To select all required fields'

In [13]:
''' To merge the Dutch files '''
#To add the coordinates and the country of the stop to every stop time; the two frames are matched on stop_name.
#NOTE(review): a stop_name that occurs more than once in the stops df duplicates the matching stop times - verify uniqueness.
stop_location_fields = stops_cleaned_Netherlands[['stop_name', 'stop_lat', 'stop_lon', 'country']]
stop_times_stops_Netherlands = stop_times_cleaned_Netherlands.merge(stop_location_fields, on='stop_name')

#To keep the trips that belong to one of the cleaned routes (matched on route_id)
train_route_ids = routes_cleaned_Netherlands[['route_id']]
routes_trips_Netherlands = train_route_ids.merge(trips_cleaned_Netherlands, on='route_id')

#To combine the trips with their stop times (matched on trip_id)
routes_trips_stop_times_stops_Netherlands = routes_trips_Netherlands.merge(stop_times_stops_Netherlands, on='trip_id')

#To keep only the rows whose service_id also occurs in the cleaned calendar_dates df
calendar_dates_cleaned_unique_Netherlands = calendar_dates_cleaned_Netherlands['service_id'].unique()
service_in_calendar = routes_trips_stop_times_stops_Netherlands['service_id'].isin(calendar_dates_cleaned_unique_Netherlands)
routes_trips_stop_times_stops_calendar_dates_Netherlands = routes_trips_stop_times_stops_Netherlands[service_in_calendar]

' To merge the Dutch files '

# Preparation space-of-stops

In [None]:
# Continue working with routes_trips_stop_times_stops_calendar_dates_Netherlands, or with this DataFrame renamed to a more convenient name