In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Load the combined train-service data set.
# Forward slashes are used on purpose: '\c' is not a valid escape sequence in a
# normal string literal (Python emits a SyntaxWarning for it), and '/' is
# accepted as a path separator on Windows as well as on POSIX systems.
df = pd.read_csv('NS_Data/combined_trein_data_modified.csv')

  df = pd.read_csv('NS_Data\combined_trein_data_modified.csv')


In [2]:
# Show the dtype pandas inferred for every column of the freshly loaded frame.
df.dtypes

Service:RDT-ID                    int64
Service:Date                     object
Service:Type                     object
Service:Company                  object
Service:Train number              int64
Service:Completely cancelled       bool
Service:Partly cancelled           bool
Service:Maximum delay             int64
Stop:RDT-ID                       int64
Stop:Station code                object
Stop:Station name                object
Stop:Arrival time                object
Stop:Arrival delay              float64
Stop:Arrival cancelled             bool
Stop:Departure time              object
Stop:Departure delay            float64
Stop:Departure cancelled           bool
Stop:Platform change               bool
Stop:Planned platform            object
Stop:Actual platform             object
dtype: object

In [3]:
# Columns kept for feature engineering; every other column is dropped here.
selected_columns = [
    'Service:Type',
    'Service:Company',
    'Service:Completely cancelled',
    'Service:Partly cancelled',
    'Stop:Station name',
    'Stop:Departure time',
    'Stop:Departure delay',
]
df = df[selected_columns]

# Confirm which columns (and dtypes) remain after the selection.
df.dtypes

Service:Type                     object
Service:Company                  object
Service:Completely cancelled       bool
Service:Partly cancelled           bool
Stop:Station name                object
Stop:Departure time              object
Stop:Departure delay            float64
dtype: object

### Feature engineering
In this file I take a closer look at the changes I will make to some of the features, for example splitting the date-time into separate columns showing the month, the day, and whether it is a holiday. These temporal features are needed because I will be training a time-based model. I will also decide what exactly I will be predicting.

#### Temporal features
The temporal features in my dataset are the timestamps. Right now each timestamp is stored in a single column as a string in the formats yyyy-mm-dd and yyyy-mm-dd-hh-mm-ss. When training a model, separating these values is vital: the model can then learn from the individual components and find correlations between them, which is not possible when they are packed into a single long string.
<br><br>
First I need to convert the columns I plan on using to datetime; I can then use pandas functions to generate separate columns for days, hours, years, etc. For now I will only be looking at the departure time.

In [4]:
import numpy as np

# Parse the raw departure strings; anything that cannot be parsed becomes NaT.
departure = pd.to_datetime(df['Stop:Departure time'], errors='coerce')

# One numeric column per calendar component of the departure timestamp.
for part in ('year', 'month', 'day', 'hour', 'minute', 'weekday'):
    df[f'departure {part}'] = getattr(departure.dt, part)

# Season code -> months belonging to it (4 = Dec-Feb, 1 = Mar-May,
# 2 = Jun-Aug, 3 = Sep-Nov). Rows whose month is missing match none of the
# conditions and therefore receive np.select's default value of 0.
months_by_season = {
    4: [12, 1, 2],
    1: [3, 4, 5],
    2: [6, 7, 8],
    3: [9, 10, 11],
}
month = df['departure month']
df['season'] = np.select(
    [month.isin(months) for months in months_by_season.values()],
    list(months_by_season.keys()),
)

# The raw timestamp string is no longer needed once its parts are extracted.
df = df.drop(columns=["Stop:Departure time"])

df.head()

Unnamed: 0,Service:Type,Service:Company,Service:Completely cancelled,Service:Partly cancelled,Stop:Station name,Stop:Departure delay,departure year,departure month,departure day,departure hour,departure minute,departure weekday,season
0,Intercity,NS,False,False,Rotterdam Centraal,1.0,2019.0,1.0,1.0,2.0,0.0,1.0,4
1,Intercity,NS,False,False,Delft,0.0,2019.0,1.0,1.0,2.0,12.0,1.0,4
2,Intercity,NS,False,False,Den Haag HS,1.0,2019.0,1.0,1.0,2.0,21.0,1.0,4
3,Intercity,NS,False,False,Leiden Centraal,0.0,2019.0,1.0,1.0,2.0,45.0,1.0,4
4,Intercity,NS,False,False,Schiphol Airport,0.0,2019.0,1.0,1.0,3.0,2.0,1.0,4


In [5]:
# Check the dtypes of the frame, including the newly added temporal columns.
df.dtypes

Service:Type                     object
Service:Company                  object
Service:Completely cancelled       bool
Service:Partly cancelled           bool
Stop:Station name                object
Stop:Departure delay            float64
departure year                  float64
departure month                 float64
departure day                   float64
departure hour                  float64
departure minute                float64
departure weekday               float64
season                            int64
dtype: object

## TODO: Obsidian clipping about which option to use

The next feature I will use is the type of train serving a route. This lets me distinguish between different types of train and thus improve accuracy for each type. The column Service:Type provides this, but it is currently a categorical text value; I need to convert it to a numeric representation for a model to learn from this feature.

In [6]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the train type: one 0/1 column per distinct value, named
# "Service:Type_<value>".
# NOTE(review): the resulting columns include both "Service:Type_Stoptrein" and
# "Service:Type_stoptrein" -- the raw values appear to differ only by case;
# confirm whether they should be merged before encoding.
type_column = ["Service:Type"]

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded = encoder.fit_transform(df[type_column])
encoded_df = pd.DataFrame(
    data=encoded,
    columns=encoder.get_feature_names_out(type_column),
)

# Reset the index so the rows line up positionally with the encoded frame,
# append the indicator columns, then drop the original text column.
df = pd.concat([df.reset_index(drop=True), encoded_df], axis=1).drop(columns=type_column)
df.head()


Unnamed: 0,Service:Company,Service:Completely cancelled,Service:Partly cancelled,Stop:Station name,Stop:Departure delay,departure year,departure month,departure day,departure hour,departure minute,...,Service:Type_Intercity direct,Service:Type_RE 19,Service:Type_Snelbus i.p.v. trein,Service:Type_Sneltrein,Service:Type_Speciale Trein,Service:Type_Sprinter,Service:Type_Stopbus i.p.v. trein,Service:Type_Stoptrein,Service:Type_Thalys,Service:Type_stoptrein
0,NS,False,False,Rotterdam Centraal,1.0,2019.0,1.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,NS,False,False,Delft,0.0,2019.0,1.0,1.0,2.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,NS,False,False,Den Haag HS,1.0,2019.0,1.0,1.0,2.0,21.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,NS,False,False,Leiden Centraal,0.0,2019.0,1.0,1.0,2.0,45.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,NS,False,False,Schiphol Airport,0.0,2019.0,1.0,1.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The train company / operator is another feature that will be important. In my data analysis I found differences between the average delay of each operator, so using this as a feature in my model will most likely also improve accuracy.
<br> I could also use the train number to detect patterns in which trains are most likely to have failures or delays. For my first version I decided to skip this feature because it would add a lot of complexity: there are so many different trains that it would not be as helpful.

In [7]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the operator: one 0/1 column per distinct company, named
# "Service:Company_<value>".
company_column = ["Service:Company"]

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded = encoder.fit_transform(df[company_column])
encoded_df = pd.DataFrame(
    data=encoded,
    columns=encoder.get_feature_names_out(company_column),
)

# Reset the index so the rows line up positionally with the encoded frame,
# append the indicator columns, then drop the original text column.
df = pd.concat([df.reset_index(drop=True), encoded_df], axis=1).drop(columns=company_column)
df.head()

Unnamed: 0,Service:Completely cancelled,Service:Partly cancelled,Stop:Station name,Stop:Departure delay,departure year,departure month,departure day,departure hour,departure minute,departure weekday,...,Service:Company_Arriva,Service:Company_Blauwnet,Service:Company_Breng,Service:Company_DB,Service:Company_Eurobahn,Service:Company_NMBS,Service:Company_NS,Service:Company_R-net,Service:Company_Railexpert,Service:Company_Valleilijn
0,False,False,Rotterdam Centraal,1.0,2019.0,1.0,1.0,2.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,False,False,Delft,0.0,2019.0,1.0,1.0,2.0,12.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,False,False,Den Haag HS,1.0,2019.0,1.0,1.0,2.0,21.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,False,False,Leiden Centraal,0.0,2019.0,1.0,1.0,2.0,45.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,False,False,Schiphol Airport,0.0,2019.0,1.0,1.0,3.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


The last thing I need to do is encode the stations as numeric values. For this I decided to use label encoding instead of one-hot encoding because there are many different stations (100+), and one-hot encoding would create too many columns for a model.

In [9]:
from sklearn.preprocessing import LabelEncoder

# Replace each station name with an integer code in a new 'station_encoded'
# column, then drop the original text column.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels; OrdinalEncoder is the feature-side equivalent -- consider switching.
le = LabelEncoder()
station_codes = le.fit_transform(df['Stop:Station name'])

df['station_encoded'] = station_codes
df = df.drop(columns=["Stop:Station name"])
df.head()

Unnamed: 0,Service:Completely cancelled,Service:Partly cancelled,Stop:Departure delay,departure year,departure month,departure day,departure hour,departure minute,departure weekday,season,...,Service:Company_Blauwnet,Service:Company_Breng,Service:Company_DB,Service:Company_Eurobahn,Service:Company_NMBS,Service:Company_NS,Service:Company_R-net,Service:Company_Railexpert,Service:Company_Valleilijn,station_encoded
0,False,False,1.0,2019.0,1.0,1.0,2.0,0.0,1.0,4,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,287
1,False,False,0.0,2019.0,1.0,1.0,2.0,12.0,1.0,4,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,89
2,False,False,1.0,2019.0,1.0,1.0,2.0,21.0,1.0,4,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,95
3,False,False,0.0,2019.0,1.0,1.0,2.0,45.0,1.0,4,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,225
4,False,False,0.0,2019.0,1.0,1.0,3.0,2.0,1.0,4,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,302


In [10]:
# List the dtypes of all columns after the encoding steps above.
df.dtypes

Service:Completely cancelled            bool
Service:Partly cancelled                bool
Stop:Departure delay                 float64
departure year                       float64
departure month                      float64
departure day                        float64
departure hour                       float64
departure minute                     float64
departure weekday                    float64
season                                 int64
Service:Type_Alpen Express           float64
Service:Type_Bus                     float64
Service:Type_Eurostar                float64
Service:Type_Extra trein             float64
Service:Type_ICE International       float64
Service:Type_Int. Trein              float64
Service:Type_Intercity               float64
Service:Type_Intercity direct        float64
Service:Type_RE 19                   float64
Service:Type_Snelbus i.p.v. trein    float64
Service:Type_Sneltrein               float64
Service:Type_Speciale Trein          float64
Service:Ty

In [None]:
# Write the engineered feature table to disk without the index column.
# Forward slashes are used on purpose: '\c' is not a valid escape sequence in a
# normal string literal (Python emits a SyntaxWarning for it), and '/' is
# accepted as a path separator on Windows as well as on POSIX systems.
df.to_csv('NS_Data/combined_trein_data_feature_engineered.csv', index=False)

  df.to_csv('NS_Data\combined_trein_data_feature_engineered.csv', index=False)
