# Import of packages

In [None]:
'''To import the required packages.'''
import pandas as pd
import numpy as np
import networkx as nx
import collections
import matplotlib.pyplot as plt
import math
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

from kneed import KneeLocator
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA

#import osmnx as ox

# Settings

In [None]:
'''To display all output results of a Jupyter cell.'''
from IPython.core.interactiveshell import InteractiveShell
# "all" asks IPython to echo every top-level expression of a cell, not only the last one.
# NOTE(review): this also echoes the bare ''' ''' description strings that open the cells.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
'''To ensure that the output results of extensive output results are not truncated.'''
#pd.options.display.max_rows = 4000

In [None]:
'''To change the width of the Notebook to see the output on the screen'''
#from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:100% !important; }</style>"))

# **Swiss railway system**

# Import of the Swiss railway datasets

In [None]:
'''To register the GitHub link with the Swiss data as a variable.'''
# Base URL of the GTFS folder; it ends with '/' so that a file name can be appended directly.
datalink = "https://raw.githubusercontent.com/polkuleuven/Thesis_Train/main/gtfs_train_Switzerland_1503/"

In [None]:
'''Import all the GTFS data'''
# Every table below is a comma-separated text file located under `datalink`.

# agency: limited information about the Swiss SBB railway agency.
agency_Switzerland = pd.read_csv(f"{datalink}agency.txt", sep=",")

# stops: ids, names and geographical coordinates of the Swiss SBB railway stations.
stops_Switzerland = pd.read_csv(f"{datalink}stops.txt", sep=",")

# feed_info: limited information about the Swiss SBB railway feed.
feed_info_Switzerland = pd.read_csv(f"{datalink}feed_info.txt", sep=",")

# transfers: minimum transfer time to switch routes at each Swiss SBB railway station.
transfers_not_cleaned_Switzerland = pd.read_csv(f"{datalink}transfers.txt", sep=",")

# routes: id, name and type of vehicle used for all Swiss SBB railway routes.
routes_Switzerland = pd.read_csv(f"{datalink}routes.txt", sep=",")

# trips: for every route, its trips and the headsigns of these trips.
# The service_id indicates all the dates on which a trip is valid
# (these dates can be looked up in the calendar_dates dataset).
trips_Switzerland = pd.read_csv(f"{datalink}trips.txt", sep=",")

# stop_times: for every trip, the ids of the stations served, the sequence in which
# they are served, and the arrival and departure times at these stations.
stop_times_Switzerland = pd.read_csv(f"{datalink}stop_times.txt", sep=",")

# calendar: first and last date of all data observations.
calendar_Switzerland = pd.read_csv(f"{datalink}calendar.txt", sep=",")

# calendar_dates: for each service_id, the exact dates on which that service_id is valid.
calendar_dates_Switzerland = pd.read_csv(f"{datalink}calendar_dates.txt", sep=",")

# Cleaning of the Swiss railway data

In [None]:
'''To clean the routes_Switzerland df'''
# Only the rows whose route_type equals 2 (the train routes) are retained.
routes_cleaned_Switzerland = routes_Switzerland.loc[routes_Switzerland['route_type'].eq(2)]
routes_cleaned_Switzerland

In [None]:
'''To clean the calendar_dates_Switzerland df'''
# Bounds (inclusive) of the period to keep; compared directly against the 'date' column.
begin_date = 20210314
end_date = 20210713

# drop() returns a new frame, so calendar_dates_Switzerland itself is left untouched.
# Every row dated before begin_date or after end_date is removed by its index label.
calendar_dates_cleaned_Switzerland = calendar_dates_Switzerland.drop(
    index=calendar_dates_Switzerland[
        (calendar_dates_Switzerland['date'] > end_date)
        | (calendar_dates_Switzerland['date'] < begin_date)
    ].index
)
calendar_dates_cleaned_Switzerland

In [None]:
'''To clean the stop_times_Switzerland df'''
# To remove the superfluous characters of the stop_id (platform codes):
# only the part of the stop_id before the first ':' is kept.
stop_times_cleaned_Switzerland = stop_times_Switzerland.copy()
stop_times_cleaned_Switzerland_column = stop_times_cleaned_Switzerland['stop_id'].str.split(':').str[0]

# To make the stop_ids numerical.
# The column is replaced with a plain assignment instead of `.loc[:, 'stop_id'] = ...`:
# recent pandas versions write `.loc[:, col] = values` into the existing (object) column
# in place, which would leave the dtype as object and silently undo the int64 cast.
stop_times_cleaned_Switzerland['stop_id'] = stop_times_cleaned_Switzerland_column.astype(np.int64)
stop_times_cleaned_Switzerland

In [None]:
'''To clean the stops_Switzerland df '''
#To remove the superfluous characters (platform codes): only the part before the first ':' is kept
stops_cleaned_Switzerland_column = stops_Switzerland['stop_id'].str.split(':').str[0]
stops_cleaned_Switzerland = stops_Switzerland.copy()
stops_cleaned_Switzerland['stop_id'] = stops_cleaned_Switzerland_column

#To make the stop_ids numerical and to remove the duplicate stop_ids
#The explicit .copy() makes the column subset an independent frame, so the assignments below
#do not trigger SettingWithCopy warnings.
stops_cleaned_Switzerland = stops_cleaned_Switzerland[['stop_id', 'stop_name', 'stop_lat', 'stop_lon']].copy()
#Plain column assignment is used instead of `.loc[:, 'stop_id'] = ...`: recent pandas versions
#write `.loc[:, col] = values` into the existing object column in place, which would keep the
#object dtype and silently undo the int64 cast.
stops_cleaned_Switzerland['stop_id'] = stops_cleaned_Switzerland['stop_id'].astype(np.int64)
stops_cleaned_Switzerland = stops_cleaned_Switzerland.drop_duplicates()

#To remove the accents from the stop_name and to change to uppercase
#(NFKD splits accented characters into base letter + accent; the ASCII round trip drops the accents)
stops_cleaned_Switzerland['stop_name'] = (
    stops_cleaned_Switzerland['stop_name']
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
    .str.upper()
)

#To initialize the Nominatim API used for reverse geocoding of the station coordinates
#NOTE(review): "application" is a generic user agent; consider an identifying one.
geolocator = Nominatim(user_agent="application")
#The public Nominatim service allows at most 1 request per second, hence the 1 second delay.
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)

#To get the location with the geolocator.reverse() function and to extract the country from the location instance
country_list = []
for latitude, longitude in zip(stops_cleaned_Switzerland['stop_lat'], stops_cleaned_Switzerland['stop_lon']):
    # To assign the latitude and longitude into a geolocator.reverse() method
    location = reverse((latitude, longitude), language='en', exactly_one=True)
    # reverse() yields None when nothing is found (or when the RateLimiter swallows an error
    # after its retries); such stations get an empty country instead of aborting the whole run.
    address = location.raw.get('address', {}) if location is not None else {}
    country = address.get('country', '')
    country_list.append(country)

#To add the values of country_list as a new attribute country
stops_cleaned_Switzerland['country'] = country_list
stops_cleaned_Switzerland

#To select the stations located in Switzerland: as a dataframe and as a series of their names
swiss_stops_Switzerland = stops_cleaned_Switzerland[stops_cleaned_Switzerland['country'] == 'Switzerland']
swiss_stops_Switzerland_series = stops_cleaned_Switzerland.loc[stops_cleaned_Switzerland['country'] == 'Switzerland', 'stop_name']
#stops_cleaned_Switzerland.to_csv('stops_cleaned_Switzerland.csv')

In [None]:
#stops_cleaned_Switzerland = pd.read_csv("stops_cleaned_Switzerland.csv", sep=",")

### To merge the files

In [None]:
'''To select all required fields'''
# Each frame is narrowed to the columns that the merge step and later analysis rely on.
agency_cleaned_Switzerland = agency_Switzerland.loc[
    :, ['agency_id', 'agency_name', 'agency_url', 'agency_timezone']
]
routes_cleaned_Switzerland = routes_cleaned_Switzerland.loc[
    :, ['route_id', 'agency_id', 'route_short_name', 'route_long_name', 'route_type']
]
trips_cleaned_Switzerland = trips_Switzerland.loc[
    :, ['trip_id', 'route_id', 'service_id', 'trip_headsign']
]
calendar_dates_cleaned_Switzerland = calendar_dates_cleaned_Switzerland.loc[
    :, ['service_id', 'date']
]
stops_cleaned_Switzerland = stops_cleaned_Switzerland.loc[
    :, ['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'country']
]
stop_times_cleaned_Switzerland = stop_times_cleaned_Switzerland.loc[
    :, ['trip_id', 'stop_id', 'arrival_time', 'departure_time', 'stop_sequence']
]

In [None]:
''' To merge the Swiss files '''
# Step 1: attach the station attributes to every stop_time row (join key: stop_id).
stop_times_stops_Switzerland = stop_times_cleaned_Switzerland.merge(
    stops_cleaned_Switzerland[['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'country']],
    on='stop_id',
)

# Step 2: keep the trips that belong to one of the cleaned routes (join key: route_id).
routes_trips_Switzerland = routes_cleaned_Switzerland[['route_id']].merge(
    trips_cleaned_Switzerland,
    on='route_id',
)

# Step 3: combine both intermediate results (join key: trip_id).
routes_trips_stop_times_stops_Switzerland = routes_trips_Switzerland.merge(
    stop_times_stops_Switzerland,
    on='trip_id',
)

# Step 4: retain only the rows whose service_id also occurs in the cleaned calendar_dates df.
calendar_dates_cleaned_unique_Switzerland = calendar_dates_cleaned_Switzerland['service_id'].unique()
routes_trips_stop_times_stops_calendar_dates_Switzerland = routes_trips_stop_times_stops_Switzerland.loc[
    routes_trips_stop_times_stops_Switzerland['service_id'].isin(calendar_dates_cleaned_unique_Switzerland)
]

# Preparation space-of-stops

In [None]:
# To continue with routes_trips_stop_times_stops_calendar_dates_Switzerland, or possibly to rename it