# Train data preparation

In [214]:
# Libraries
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET

print('Pandas {}'.format(pd.__version__))
print('Numpy {}'.format(np.__version__))

Pandas 1.3.4
Numpy 1.20.3


In [122]:
# We get the data from link
df = pd.read_csv('https://trein.fwrite.org/AMS-Aurora-archive/2016-01/DVS_2016-01-01.csv.xz')
df.columns = ["date", "data", "code"]

In [196]:
# This function iterates through each row (day) in the given dataset
# The dataset, which in this case is one whole day is parsed as a parameter to the function
# Two lists, ritstations and trains are created at the beginning. Then in the loop, ritstation and train objects are created and added to these lists.
# Later on, the lists are combined and used in dataframe for visualization.
rows = []

def GetTrainDayData(smalldf):
    # The xml data
    xml_data = smalldf["data"]
    
    # It goes through every row in the data (1 day)
    for i,train_data in enumerate(xml_data):
        root = ET.fromstring(train_data)

        # Check if there is always root
        if len(root) != 0:
            
            # This is the prefix for each element
            prefix = './/{urn:ndov:cdm:trein:reisinformatie:data:2}'
            # The data wrapper
            productDVS = root.find('{}ReisInformatieProductDVS'.format(prefix))
            # Getting the version of the data for the day (just shows how to get attributes)
            data_version = productDVS.attrib['Versie']

            # the wrapper that contain the ritstation and the train
            wrapper = root.find('{}DynamischeVertrekStaat'.format(prefix))
            
            # All data needed is extracted and put in variables here
            station_code = wrapper.find('{}StationCode'.format(prefix)).text
            station_name = wrapper.find('{}LangeNaam'.format(prefix)).text
            train_number = wrapper.find('{}TreinNummer'.format(prefix)).text
            train_type = wrapper.find('{}TreinSoort'.format(prefix)).text
            train_company = wrapper.find('{}Vervoerder'.format(prefix)).text
            train_destination = wrapper.find('{}TreinEindBestemming[@InfoStatus="Actueel"]'.format(prefix)).find('{}LangeNaam'.format(prefix)).text
            train_planned_departure = wrapper.find('{}VertrekTijd[@InfoStatus="Actueel"]'.format(prefix)).text
            train_delays = wrapper.find('{}ExacteVertrekVertraging'.format(prefix)).text
            
            rows.append({"Station Code":station_code,"Station Name":station_name,"Train Number":train_number,"Train Type":train_type,"Train Company":train_company,"Train Destination":train_destination,"Train Planned Departure":train_planned_departure,"Train Delay": train_delays})


In [202]:
smalldf = df.head(50000)

# make loop to parse few days
GetTrainDayData(smalldf)

In [203]:
data = pd.DataFrame(rows,columns=["Station Code","Station Name","Train Number","Train Type", "Train Company", "Train Destination", "Train Planned Departure","Train Delay"])
data.head(5)

Unnamed: 0,Station Code,Station Name,Train Number,Train Type,Train Company,Train Destination,Train Planned Departure,Train Delay
0,HLMS,Haarlem Spaarnwoude,14882,Sprinter,NS,Haarlem Spaarnwoude,2016-01-01T00:10:00Z,PT0S
1,NSCH,Bad Nieuweschans,37598,Stoptrein,Arriva,Bad Nieuweschans,2016-01-01T00:10:00Z,PT0S
2,DVD,Duivendrecht,7495,Sprinter,NS,Duivendrecht,2016-01-01T00:11:00Z,PT0S
3,DDRS,Dordrecht Stadspolders,36795,Stoptrein,Arriva,Dordrecht Stadspolders,2016-01-01T00:11:00Z,PT0S
4,EMNZ,Emmen Zuid,8083,Stoptrein,Arriva,Emmen Zuid,2016-01-01T00:11:00Z,PT0S


In [209]:
companies = data['Train Type'].value_counts()
companies

Sprinter             28031
Intercity            15360
Stoptrein             9624
Intercity direct       388
Sneltrein              342
Thalys                  75
ICE International       30
CityNightLine/IC        17
Speciale Trein          11
Name: Train Type, dtype: int64

In [210]:
thalys = data[data['Train Type'] == "Thalys"]
thalys['Train Destination'].value_counts()

Paris-Nord            49
Amsterdam Centraal    26
Name: Train Destination, dtype: int64