# Download this file on your local computer


In [2]:
import pandas as pd
import numpy as np
    
import xml.etree.ElementTree as ET

from datetime import datetime 

In [3]:
def extract_station_info(data, node, prefix):
  if node:
    data[prefix + 'Code'].append(node.find('./ns2:StationCode', ns).text)
    data[prefix + 'UIC'].append(int(node.find('./ns2:UICCode', ns).text))
    data[prefix + 'Type'].append(int(node.find('./ns2:Type', ns).text))
  else:
    data[prefix + 'Code'].append(np.NaN)
    data[prefix + 'UIC'].append(np.NaN)
    data[prefix + 'Type'].append(np.NaN)

def parse_timestamp(date_string):
  return datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%S.%fZ')

def encode_list(nodes, sufix):
  UIC_codes = [node.find(sufix, ns).text for node in nodes]
  return ';'.join(UIC_codes)

In [4]:
# namespace of the xml object
ns={
    'ns1d': 'urn:ndov:cdm:trein:reisinformatie:messages:5',
    'ns1a': 'urn:ndov:cdm:trein:reisinformatie:messages:dynamischeaankomststaat:1',
    'ns2': 'urn:ndov:cdm:trein:reisinformatie:data:4',
}

# dictionary that will save the data while loading it in
def arrival_data_template():

    return {
        'ObservationTime': [],

        # Ride
        'RideId':[],
        'RideTime': [],

        # Departure station
        'DepartureStationCode': [],
        'DepartureStationUIC': [],
        'DepartureStationType': [],

        # Train
        'TrainId': [],
        'TrainType': [],
        'TrainOperator': [],

        # Actual destination
        'DestinationStationCode': [],
        'DestinationStationUIC': [],
        'DestinationStationType': [],

        # Arrival times
        'PlannedArrivalTime': [],
        'ActualArrivalTime': [],

        # Arrival platform
        'PlannedArrivalPlatform': [],
        'PlannedArrivalPlatformSuffix': [],
        'ActualArrivalPlatform': [],
        'ActualArrivalPlatformSuffix': [],

#         # Departure platforms
#         'PlannedDeparturePlatform': [],
#         'ActualDeparturePlatform': [],

#         # Stop stations
#         'PlannedStopStations': [],
#         'ActualStopStations': [],

#         # Matirial type
#         'MaterialType': [],
#         'MaterialDesignation': [],
#         'MaterialLength': [],

#         'ChangeType': [],

    }

def departure_data_template():
    return {
    'ObservationTime': [],

    # Ride
    'RideId':[],
    'RideTime': [],

    # Departure station
    'DepartureStationCode': [],
    'DepartureStationUIC': [],
    'DepartureStationType': [],

    # Train
    'TrainId': [],
    'TrainType': [],
    'TrainOperator': [],

    # Actual destination
    'DestinationStationCode': [],
    'DestinationStationUIC': [],
    'DestinationStationType': [],

    # Departure times
    'PlannedDepartureTime': [],
    'ActualDepartureTime': [],

    # Departure platform
    'PlannedDeparturePlatform': [],
    'PlannedDeparturePlatformSuffix': [],
    'ActualDeparturePlatform': [],
    'ActualDeparturePlatformSuffix': [],

    # Departure platforms
    'PlannedDeparturePlatform': [],
    'ActualDeparturePlatform': [],

    # Stop stations
    'PlannedStopStations': [],
    'ActualStopStations': [],

    # Matirial type
    'MaterialType': [],
    'MaterialDesignation': [],
    'MaterialLength': [],
    
    'HasChange': [],
    'ChangeType': [],

}
  


In [5]:
das_tag = b'ReisInformatieProductDAS'
# station_amsterdam = b'<ns2:RitStation><ns2:StationCode>ASD</ns2:StationCode>'
# station_utrecht = b'<ns2:RitStation><ns2:StationCode>UT</ns2:StationCode>'
station_amsterdam = b'<ns2:StationCode>ASD</ns2:StationCode>'
station_utrecht = b'<ns2:StationCode>UT</ns2:StationCode>'
source_amsterdam = b'<ns2:TreinHerkomst InfoStatus=""Gepland""><ns2:StationCode>ASD</ns2:StationCode'
source_utrecht = b'<ns2:TreinHerkomst InfoStatus=""Gepland""><ns2:StationCode>UT</ns2:StationCode>'
destination_amsterdam = b'<ns2:TreinEindBestemming InfoStatus=""Gepland""><ns2:StationCode>AMS</ns2:StationCode>'
destination_utrecht = b'<ns2:TreinEindBestemming InfoStatus=""Gepland""><ns2:StationCode>UT</ns2:StationCode>'



In [7]:
def construct_tag(name):
    opening = b'<'+name+b'>'
    closing = b'</'+name+b'>'
    return opening, closing

In [8]:
import requests
import lzma

arrival_lines = []
departure_lines = []
temp_lines = {
    'Date': [],
    'RideId': [],
    'TrainId': [],
}

parsing_days = pd.date_range(start='2019/1/1', end='2019/6/30').strftime('%Y-%m-%d').values
print(parsing_days)

for day in parsing_days:
    print('downloading: ',day) 
    url = f'https://trein.fwrite.org/AMS-Aurora-archive/{day[:7]}/DVSPPV_{day}.csv.xz'

    req = requests.get(url, stream=True)
    with lzma.LZMAFile(req.raw) as file:
        for line in file:
            # check if this is related to arrivals
            if das_tag in line:
                # check if the data includes amsterdam and utrecht
                if (station_amsterdam in line) and (station_utrecht in line):     
                    for tag_name in [b'ns2:RitDatum', b'ns2:RitId', b'ns2:TreinNummer']:
                        # get tag and index
                        tags =  construct_tag(tag_name)
                        start = line.index(tags[0])
                        end = line.index(tags[1])
                        # set the start to the end of the start string
                        start += len(tags[0])
                        # get the text we need
                        text = line[start:end]
                        text = text.decode('utf-8')
                        # add the data to coresponding column
                        if tag_name == b'ns2:RitDatum':
                            temp_lines['Date'].append(text)
                        if tag_name == b'ns2:RitId':
                            temp_lines['RideId'].append(text)
                        if tag_name == b'ns2:TreinNummer':
                            temp_lines['TrainId'].append(text)
df = pd.DataFrame(temp_lines)

['2019-01-01' '2019-01-02' '2019-01-03' '2019-01-04' '2019-01-05'
 '2019-01-06' '2019-01-07' '2019-01-08' '2019-01-09' '2019-01-10'
 '2019-01-11' '2019-01-12' '2019-01-13' '2019-01-14' '2019-01-15'
 '2019-01-16' '2019-01-17' '2019-01-18' '2019-01-19' '2019-01-20'
 '2019-01-21' '2019-01-22' '2019-01-23' '2019-01-24' '2019-01-25'
 '2019-01-26' '2019-01-27' '2019-01-28' '2019-01-29' '2019-01-30'
 '2019-01-31' '2019-02-01' '2019-02-02' '2019-02-03' '2019-02-04'
 '2019-02-05' '2019-02-06' '2019-02-07' '2019-02-08' '2019-02-09'
 '2019-02-10' '2019-02-11' '2019-02-12' '2019-02-13' '2019-02-14'
 '2019-02-15' '2019-02-16' '2019-02-17' '2019-02-18' '2019-02-19'
 '2019-02-20' '2019-02-21' '2019-02-22' '2019-02-23' '2019-02-24'
 '2019-02-25' '2019-02-26' '2019-02-27' '2019-02-28' '2019-03-01'
 '2019-03-02' '2019-03-03' '2019-03-04' '2019-03-05' '2019-03-06'
 '2019-03-07' '2019-03-08' '2019-03-09' '2019-03-10' '2019-03-11'
 '2019-03-12' '2019-03-13' '2019-03-14' '2019-03-15' '2019-03-16'
 '2019-03-

In [9]:
df.shape

(1188666, 3)

In [10]:
df.head()

Unnamed: 0,Date,RideId,TrainId
0,2018-12-31,1405,1405
1,2018-12-31,1402,1402
2,2018-12-31,1405,1405
3,2018-12-31,1405,1405
4,2018-12-31,1405,1405


In [11]:
df.drop_duplicates(['Date', 'RideId'], inplace=True)

In [12]:
df.shape

(46399, 3)

In [90]:
df.to_csv('../assets/data/2019-UT-ASD-Full/2019-01-01--2019-06-30-')

Unnamed: 0,Date,RideId,TrainId
0,2018-12-31,1405,1405
1,2018-12-31,1402,1402
9,2019-01-01,1409,1409
10,2018-12-31,1406,1406
26,2019-01-01,1413,1413
32,2019-01-01,1410,1410
56,2019-01-01,1417,1417
61,2019-01-01,1414,1414
76,2019-01-01,1421,1421
82,2019-01-01,1418,1418


In [13]:
len(temp_lines)

4911

In [None]:
import requests
import lzma

arrival_lines = []
departure_lines = []

parsing_days = pd.date_range(start='2019/12/1', end='2019/12/31').strftime('%Y-%m-%d').values
print(parsing_days)

for day in parsing_days:
    print('downloading: ',day) 
    url = f'https://trein.fwrite.org/AMS-Aurora-archive/{day[:7]}/DVSPPV_{day}.csv.xz'

    req = requests.get(url, stream=True)

    with lzma.LZMAFile(req.raw) as file:
        for line in file:
            if (source_utrecht in line and station_amsterdam in line) or (source_amsterdam in line and station_utrecht in line) :
                arrival_lines.append(line)
            if (station_amsterdam in line and destination_utrecht in line) or (station_utrecht in line and destination_amsterdam in line) :
                departure_lines.append(line)

In [6]:
from io import StringIO

df_arrivals = pd.read_csv(StringIO(b'\n'.join(arrival_lines).decode('utf-8')), header=None, names=['date', 'xml_obj', 'uuid'])
df_departures = pd.read_csv(StringIO(b'\n'.join(departure_lines).decode('utf-8')), header=None, names=['date', 'xml_obj', 'uuid'])

In [7]:
df_arrivals.shape

(2534, 3)

In [1]:
df_departures.to_csv('../assets/data/2019 UT-ASD/2019-12-01--2019-12-31_departures.csv', index=None)
df_arrivals.to_csv('../assets/data/2019 UT-ASD/2019-12-01--2019-12-31_arrivals.csv', index=None)

NameError: name 'df_departures' is not defined

# Please keep in mind this is memory intensive good approach is to do each month seperately and later merge them

In [58]:
arrival_data = arrival_data_template()

# loop through every day

for root_text, observation_time in df_arrivals[['xml_obj', 'date']].values:  
            
    arrival_data['ObservationTime'].append(observation_time)

    root = ET.fromstring(root_text)

    # Ride
    ride = root.find('./ns2:ReisInformatieProductDAS/ns2:DynamischeAankomstStaat', ns)

    arrival_data['RideId'].append(int(ride.find('./ns2:RitId', ns).text))
    arrival_data['RideTime'].append(root.find('./ns2:ReisInformatieProductDAS/ns2:RIPAdministratie/ns2:ReisInformatieTijdstip', ns).text)

    # Stations
    train = ride.find('./ns2:TreinAankomst', ns)

    destinationStation = ride.find('./ns2:RitStation', ns)
    departureStation = train.find('./ns2:TreinHerkomst[@InfoStatus="Gepland"]',ns)
    
    extract_station_info(arrival_data, departureStation, 'DepartureStation')
    extract_station_info(arrival_data, destinationStation, 'DestinationStation')

    # Arrival times
    arrival_data['ActualArrivalTime'].append(train.find('./ns2:AankomstTijd[@InfoStatus="Actueel"]', ns).text)
    arrival_data['PlannedArrivalTime'].append(train.find('./ns2:AankomstTijd[@InfoStatus="Gepland"]', ns).text)
    
    # Train
    arrival_data['TrainId'].append(train.find('./ns2:TreinNummer', ns).text)
    arrival_data['TrainType'].append(train.find('./ns2:TreinSoort', ns).text)
    arrival_data['TrainOperator'].append(train.find('./ns2:Vervoerder', ns).text)

    arrival_data['PlannedArrivalPlatform'].append(train.find('./ns2:TreinAankomstSpoor[@InfoStatus="Gepland"]/ns2:SpoorNummer', ns).text)
    suffix = train.find('./ns2:TreinAankomstSpoor[@InfoStatus="Gepland"]/ns2:SpoorFase', ns)
    arrival_data['PlannedArrivalPlatformSuffix'].append(None if suffix is None else suffix.text)

    arrival_data['ActualArrivalPlatform'].append(train.find('./ns2:TreinAankomstSpoor[@InfoStatus="Actueel"]/ns2:SpoorNummer', ns).text)
    suffix = train.find('./ns2:TreinAankomstSpoor[@InfoStatus="Actueel"]/ns2:SpoorFase', ns)
    arrival_data['ActualArrivalPlatformSuffix'].append(None if suffix is None else suffix.text)
    
# Convert the dictionary to a dataframe
dfa = pd.DataFrame(arrival_data)    
print(dfa.shape) # show the file size kinda
# # save the file, specify your path and name for the file
# df_flat.to_csv('./data/september.csv.zip', 
#                index=False, 
#                compression=dict(method='zip', archive_name='january.csv'))



(80, 18)


In [62]:
departure_data = departure_data_template()
# Parse records about departures
for root_text, observation_time in df_departures[['xml_obj', 'date']].values:  
            
    departure_data['ObservationTime'].append(observation_time)

    root = ET.fromstring(root_text)

    # Ride
    ride = root.find('./ns2:ReisInformatieProductDVS/ns2:DynamischeVertrekStaat', ns)

    departure_data['RideId'].append(int(ride.find('./ns2:RitId', ns).text))
    departure_data['RideTime'].append(root.find('./ns2:ReisInformatieProductDVS/ns2:RIPAdministratie/ns2:ReisInformatieTijdstip', ns).text)

    # Stations
    train = ride.find('./ns2:Trein', ns)

    departureStation = ride.find('./ns2:RitStation', ns)
    destinationStation = train.find('./ns2:TreinEindBestemming[@InfoStatus="Gepland"]',ns)
    
    extract_station_info(departure_data, departureStation, 'DepartureStation')
    extract_station_info(departure_data, destinationStation, 'DestinationStation')

    # Departure times
    departure_data['ActualDepartureTime'].append(train.find('./ns2:VertrekTijd[@InfoStatus="Actueel"]', ns).text)
    departure_data['PlannedDepartureTime'].append(train.find('./ns2:VertrekTijd[@InfoStatus="Gepland"]', ns).text)
    
    # Train
    departure_data['TrainId'].append(train.find('./ns2:TreinNummer', ns).text)
    departure_data['TrainType'].append(train.find('./ns2:TreinSoort', ns).text)
    departure_data['TrainOperator'].append(train.find('./ns2:Vervoerder', ns).text)

    departure_data['PlannedDeparturePlatform'].append(train.find('./ns2:TreinVertrekSpoor[@InfoStatus="Gepland"]/ns2:SpoorNummer', ns).text)
    suffix = train.find('./ns2:TreinVertrekSpoor[@InfoStatus="Gepland"]/ns2:SpoorFase', ns)
    departure_data['PlannedDeparturePlatformSuffix'].append(None if suffix is None else suffix.text)

    departure_data['ActualDeparturePlatform'].append(train.find('./ns2:TreinVertrekSpoor[@InfoStatus="Actueel"]/ns2:SpoorNummer', ns).text)
    suffix = train.find('./ns2:TreinVertrekSpoor[@InfoStatus="Actueel"]/ns2:SpoorFase', ns)
    departure_data['ActualDeparturePlatformSuffix'].append(None if suffix is None else suffix.text)
    
    # Stop stations
    wagons = train.find('./ns2:TreinVleugel', ns)
    if not wagons: print('No wagons, wtf?')
    stop_stations = wagons.findall('./ns2:StopStations[@InfoStatus="Gepland"]/ns2:Station', ns)
    if not stop_stations: print('No stop_stations, wtf?')

    departure_data['PlannedStopStations'].append(encode_list(stop_stations, './ns2:UICCode'))

    stop_stations = wagons.findall('./ns2:StopStations[@InfoStatus="Actueel"]/ns2:Station', ns)
    departure_data['ActualStopStations'].append(encode_list(stop_stations, './ns2:UICCode'))
    if not stop_stations: print('No stop_stations2, wtf?')

    # Material
    material = wagons.find('./ns2:MaterieelDeelDVS', ns)
    if material:
        departure_data['MaterialType'].append(material.find('./ns2:MaterieelSoort', ns).text)
        departure_data['MaterialDesignation'].append(material.find('./ns2:MaterieelAanduiding', ns).text)
        departure_data['MaterialLength'].append(material.find('./ns2:MaterieelLengte', ns).text)
    else:
        departure_data['MaterialType'].append(np.NaN)
        departure_data['MaterialDesignation'].append(np.NaN)
        departure_data['MaterialLength'].append(np.NaN)

    # Change
    changes = root.findall('./ns2:Wijziging', ns)
    if changes:
        departure_data['HasChange'].append(True)
        departure_data['ChangeType'].append(encode_list(changes, './ns2:WijzigingType'))
    else:
        departure_data['HasChange'].append(False)
        departure_data['ChangeType'].append(np.NaN)
        
dfd = pd.DataFrame(departure_data)  
print(dfa.shape)

(80, 18)


In [60]:
dfa.sort_values(['RideId']).head()

Unnamed: 0,ObservationTime,RideId,RideTime,DepartureStationCode,DepartureStationUIC,DepartureStationType,TrainId,TrainType,TrainOperator,DestinationStationCode,DestinationStationUIC,DestinationStationType,PlannedArrivalTime,ActualArrivalTime,PlannedArrivalPlatform,PlannedArrivalPlatformSuffix,ActualArrivalPlatform,ActualArrivalPlatformSuffix
49,2019-03-01 07:23:10.246488+01:00,105,2019-03-01T07:33:00.000Z,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-03-01T07:33:00.000Z,2019-03-01T07:33:00.000Z,18,,18,
51,2019-03-01 08:31:34.442566+01:00,105,2019-03-01T07:33:00.000Z,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-03-01T07:33:00.000Z,2019-03-01T07:34:21.000Z,18,,18,
52,2019-03-01 08:32:04.804135+01:00,105,2019-03-01T07:33:00.000Z,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-03-01T07:33:00.000Z,2019-03-01T07:33:54.000Z,18,,18,
53,2019-03-01 08:33:26.746973+01:00,105,2019-03-01T07:33:00.000Z,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-03-01T07:33:00.000Z,2019-03-01T07:33:54.000Z,18,,18,
54,2019-03-01 08:34:09.156114+01:00,105,2019-03-01T07:33:00.000Z,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-03-01T07:33:00.000Z,2019-03-01T07:34:27.000Z,18,,18,


In [64]:
dfd.sort_values(['RideId']).head()

Unnamed: 0,ObservationTime,RideId,RideTime,DepartureStationCode,DepartureStationUIC,DepartureStationType,TrainId,TrainType,TrainOperator,DestinationStationCode,...,PlannedDeparturePlatformSuffix,ActualDeparturePlatform,ActualDeparturePlatformSuffix,PlannedStopStations,ActualStopStations,MaterialType,MaterialDesignation,MaterialLength,HasChange,ChangeType
0,2019-03-01 00:09:03.534112+01:00,1402,2019-03-01T00:19:00.000Z,ASD,8400058,6,1402,Intercity,NS,UT,...,a,7,a,8400074;8400621,8400074;8400621,VIRM,6,16210,False,
5,2019-03-01 01:19:29.990044+01:00,1402,2019-03-01T00:19:00.000Z,ASD,8400058,6,1402,Intercity,NS,UT,...,a,7,a,8400074;8400621,8400074;8400621,VIRM,6,16210,False,
4,2019-03-01 01:09:02.298013+01:00,1406,2019-03-01T01:19:00.000Z,ASD,8400058,6,1406,Intercity,NS,UT,...,a,11,a,8400621,8400621,ICM,3,8060,False,
7,2019-03-01 02:15:38.112751+01:00,1406,2019-03-01T01:19:00.000Z,ASD,8400058,6,1406,Intercity,NS,UT,...,a,11,a,8400621,8400621,ICM,3,8060,False,
8,2019-03-01 02:19:54.973100+01:00,1406,2019-03-01T01:19:00.000Z,ASD,8400058,6,1406,Intercity,NS,UT,...,a,11,a,8400621,8400621,ICM,3,8060,False,


In [69]:
dfd[['RideId','PlannedDepartureTime', 'ActualDepartureTime']]

Unnamed: 0,RideId,PlannedDepartureTime,ActualDepartureTime
0,1402,2019-03-01T00:19:00.000Z,2019-03-01T00:19:00.000Z
1,7393,2019-02-28T23:28:00.000Z,2019-02-28T23:30:00.000Z
2,7393,2019-02-28T23:28:00.000Z,2019-02-28T23:30:00.000Z
3,7395,2019-02-28T23:57:00.000Z,2019-02-28T23:57:00.000Z
4,1406,2019-03-01T01:19:00.000Z,2019-03-01T01:19:00.000Z
5,1402,2019-03-01T00:19:00.000Z,2019-03-01T00:19:00.000Z
6,1412,2019-03-01T02:19:00.000Z,2019-03-01T02:19:00.000Z
7,1406,2019-03-01T01:19:00.000Z,2019-03-01T01:19:00.000Z
8,1406,2019-03-01T01:19:00.000Z,2019-03-01T01:19:00.000Z
9,1414,2019-03-01T03:19:00.000Z,2019-03-01T03:19:00.000Z


In [None]:
# you can laod the files again in 
df_july = pd.read_csv('../data/july.csv.zip')
df_august = pd.read_csv('../data/august.csv.zip')
df_september = pd.read_csv('../data/september.csv.zip')

In [None]:
# here you merge them into one big dataframe
months = [df_july, df_august, df_september]
df_months = pd.concat(months)

In [None]:
# save the big dataframe again
df_months.to_csv('../data/july_to_september.csv.zip', 
                 index=False, 
                 compression=dict(method='zip', archive_name='july_to_september.csv'))

In [None]:
# here we filter out only all rides from utrecht to amsterdam and vise versa
df_ut_asd = df_months.loc[((df_months['DepartureStationCode'] == 'UT') & 
               (df_months['PlannedDestinationStationCode'] == 'ASD')) | 
              ((df_months['DepartureStationCode'] == 'ASD') & 
               (df_months['PlannedDestinationStationCode'] == 'UT'))]

In [None]:
# save the reduced dataframe again
df_ut_asd.to_csv('../data/july_to_september_utrecht_amsterdam.csv.zip', 
                 index=False, 
                 compression=dict(method='zip', archive_name='july_to_september_utrecht_amsterdam.csv'))