# Download this file on your local computer


In [1]:
import pandas as pd
import numpy as np
    
import xml.etree.ElementTree as ET

from datetime import datetime 

In [2]:
def extract_station_info(data, node, prefix):
  """Append the code, UIC code and type of one station element to `data`.

  Args:
    data: dict of column lists; must contain the keys prefix + 'Code',
      prefix + 'UIC' and prefix + 'Type'.
    node: ElementTree element holding ns2:StationCode, ns2:UICCode and
      ns2:Type children, or None when the station is not in the message.
    prefix: column-name prefix, e.g. 'DepartureStation'.

  One value is appended to each of the three lists so the columns stay
  aligned; np.nan stands in for a missing station or a missing child.

  Raises:
    ValueError: if the UIC code or type text is present but not an integer.
  """
  fields = (
    ('Code', './ns2:StationCode', str),
    ('UIC', './ns2:UICCode', int),
    ('Type', './ns2:Type', int),
  )
  for suffix, path, convert in fields:
    # Compare with None explicitly: the truth value of an Element reflects
    # whether it has children (and is deprecated), not whether it exists.
    child = None if node is None else node.find(path, ns)
    if child is None or child.text is None:
      data[prefix + suffix].append(np.nan)
    else:
      data[prefix + suffix].append(convert(child.text))

def parse_timestamp(date_string):
  """Parse a timestamp such as '2019-01-02T07:33:00.000Z' into a naive datetime.

  The trailing 'Z' is matched as a literal character, so the result carries
  no tzinfo. Raises ValueError when the text does not match the format.
  """
  timestamp_format = '%Y-%m-%dT%H:%M:%S.%fZ'
  parsed = datetime.strptime(date_string, timestamp_format)
  return parsed

def encode_list(nodes, sufix):
  """Join the text of one child per node into a single ';'-separated string.

  Args:
    nodes: iterable of ElementTree elements.
    sufix: path (relative to each node) of the child whose text is collected;
      it is resolved with the module-level `ns` namespace mapping.

  Returns:
    The collected texts joined with ';' ('' when `nodes` is empty).
  """
  collected = []
  for element in nodes:
    collected.append(element.find(sufix, ns).text)
  return ';'.join(collected)

In [3]:
# Prefix -> namespace-URI mapping for the XML messages. It is passed as the
# namespaces argument to the find()/findall() calls in this notebook.
# NOTE(review): the active code visible here only uses the 'ns2' prefix; the
# commented-out snippets further down use an 'ns1' prefix that is not defined
# in this mapping.
ns={
    'ns1d': 'urn:ndov:cdm:trein:reisinformatie:messages:5',
    'ns1a': 'urn:ndov:cdm:trein:reisinformatie:messages:dynamischeaankomststaat:1',
    'ns2': 'urn:ndov:cdm:trein:reisinformatie:data:4',
}

# dictionary that will save the data while loading it in
def arrival_data_template():
    """Return a new dict that maps every arrival column name to its own empty list.

    The key order is the column order of the DataFrame built from the dict.
    """
    column_names = (
        'ObservationTime',
        # Ride
        'RideId',
        'RideTime',
        # Departure station
        'DepartureStationCode',
        'DepartureStationUIC',
        'DepartureStationType',
        # Train
        'TrainId',
        'TrainType',
        'TrainOperator',
        # Actual destination
        'DestinationStationCode',
        'DestinationStationUIC',
        'DestinationStationType',
        # Arrival times
        'PlannedArrivalTime',
        'ActualArrivalTime',
        # Arrival platform
        'PlannedArrivalPlatform',
        'PlannedArrivalPlatformSuffix',
        'ActualArrivalPlatform',
        'ActualArrivalPlatformSuffix',
    )
    # Columns that are not collected (yet): PlannedDeparturePlatform,
    # ActualDeparturePlatform, PlannedStopStations, ActualStopStations,
    # MaterialType, MaterialDesignation, MaterialLength, ChangeType.

    # A comprehension (not dict.fromkeys) so every column gets a separate list.
    return {name: [] for name in column_names}

def departure_data_template():
    """Return a new dict that maps every departure column name to its own empty list.

    The keys match what the departure parsing loop in this notebook appends
    to: departure times and departure platforms. The previous version was a
    copy of the arrival template and carried '...Arrival...' keys instead, so
    the departure loop would fail with KeyError.
    """
    column_names = (
        'ObservationTime',
        # Ride
        'RideId',
        'RideTime',
        # Departure station
        'DepartureStationCode',
        'DepartureStationUIC',
        'DepartureStationType',
        # Train
        'TrainId',
        'TrainType',
        'TrainOperator',
        # Planned destination
        'DestinationStationCode',
        'DestinationStationUIC',
        'DestinationStationType',
        # Departure times
        'PlannedDepartureTime',
        'ActualDepartureTime',
        # Departure platform
        'PlannedDeparturePlatform',
        'PlannedDeparturePlatformSuffix',
        'ActualDeparturePlatform',
        'ActualDeparturePlatformSuffix',
    )
    # Columns that are not collected (yet): PlannedStopStations,
    # ActualStopStations, MaterialType, MaterialDesignation, MaterialLength,
    # ChangeType.

    # A comprehension (not dict.fromkeys) so every column gets a separate list.
    return {name: [] for name in column_names}
  


array(['2019-01-20'], dtype=object)

In [16]:
# Byte markers used to pre-filter raw archive lines before any XML parsing.
# The doubled quotes ("") are matched literally; presumably the XML sits in a
# quoted CSV field where quotes are escaped by doubling - TODO confirm.
# Amsterdam is 'ASD' and Utrecht is 'UT' in every marker.
station_amsterdam = b'<ns2:RitStation><ns2:StationCode>ASD</ns2:StationCode>'
station_utrecht = b'<ns2:RitStation><ns2:StationCode>UT</ns2:StationCode>'
# Fixed: the closing '>' of the end tag was missing (unlike the other markers).
source_amsterdam = b'<ns2:TreinHerkomst InfoStatus=""Gepland""><ns2:StationCode>ASD</ns2:StationCode>'
source_utrecht = b'<ns2:TreinHerkomst InfoStatus=""Gepland""><ns2:StationCode>UT</ns2:StationCode>'
# Fixed: this marker used 'AMS', which never matches the 'ASD' station code
# used everywhere else, so Utrecht -> Amsterdam departures were dropped.
destination_amsterdam = b'<ns2:TreinEindBestemming InfoStatus=""Gepland""><ns2:StationCode>ASD</ns2:StationCode>'
destination_utrecht = b'<ns2:TreinEindBestemming InfoStatus=""Gepland""><ns2:StationCode>UT</ns2:StationCode>'

In [26]:
import requests
import lzma

# Raw archive lines (bytes, still CSV-encoded) kept for later parsing.
arrival_lines = []
departure_lines = []

parsing_days = pd.date_range(start='2019/3/1', end='2019/3/31').strftime('%Y-%m-%d').values
print(parsing_days)

for day in parsing_days:
    print('downloading: ',day)
    # One xz-compressed CSV per day, stored in a per-month folder (YYYY-MM).
    url = f'https://trein.fwrite.org/AMS-Aurora-archive/{day[:7]}/DVSPPV_{day}.csv.xz'

    # stream=True lets the archive be decompressed while it downloads instead
    # of being loaded into memory first. The context manager releases the
    # connection even when a day fails; the timeout prevents a silent hang.
    with requests.get(url, stream=True, timeout=60) as req:
        # Fail loudly on 404/5xx rather than handing an HTML error page to
        # the LZMA decoder.
        req.raise_for_status()

        with lzma.LZMAFile(req.raw) as file:
            for line in file:
                # Cheap substring test on the raw bytes: keep only messages
                # between Utrecht and Amsterdam (either direction).
                if (source_utrecht in line and station_amsterdam in line) or (source_amsterdam in line and station_utrecht in line) :
                    arrival_lines.append(line)
                if (station_amsterdam in line and destination_utrecht in line) or (station_utrecht in line and destination_amsterdam in line) :
                    departure_lines.append(line)
    

['2019-03-01' '2019-03-02' '2019-03-03' '2019-03-04' '2019-03-05'
 '2019-03-06' '2019-03-07' '2019-03-08' '2019-03-09' '2019-03-10'
 '2019-03-11' '2019-03-12' '2019-03-13' '2019-03-14' '2019-03-15'
 '2019-03-16' '2019-03-17' '2019-03-18' '2019-03-19' '2019-03-20'
 '2019-03-21' '2019-03-22' '2019-03-23' '2019-03-24' '2019-03-25'
 '2019-03-26' '2019-03-27' '2019-03-28' '2019-03-29' '2019-03-30'
 '2019-03-31']
downloading:  2019-03-01
downloading:  2019-03-02
downloading:  2019-03-03
downloading:  2019-03-04
downloading:  2019-03-05
downloading:  2019-03-06
downloading:  2019-03-07
downloading:  2019-03-08
downloading:  2019-03-09
downloading:  2019-03-10
downloading:  2019-03-11
downloading:  2019-03-12
downloading:  2019-03-13
downloading:  2019-03-14
downloading:  2019-03-15
downloading:  2019-03-16
downloading:  2019-03-17
downloading:  2019-03-18
downloading:  2019-03-19
downloading:  2019-03-20
downloading:  2019-03-21
downloading:  2019-03-22
downloading:  2019-03-23
downloading:  

In [27]:
from io import StringIO

# Glue the kept raw lines back into CSV text and let pandas split each record
# into its three fields.
raw_columns = ['date', 'xml_obj', 'uuid']

arrivals_csv_text = b'\n'.join(arrival_lines).decode('utf-8')
df_arrivals = pd.read_csv(StringIO(arrivals_csv_text), header=None, names=raw_columns)

departures_csv_text = b'\n'.join(departure_lines).decode('utf-8')
df_departures = pd.read_csv(StringIO(departures_csv_text), header=None, names=raw_columns)

In [30]:
# Persist the filtered raw messages so the download does not have to be
# repeated. index=False is the documented way to leave out the row index
# (index=None only worked because it is falsy).
df_departures.to_csv('departures.csv', index=False)
df_arrivals.to_csv('arrivals.csv', index=False)

# Please keep in mind that this is memory-intensive; a good approach is to process each month separately and merge the results later.

In [31]:
# Fresh, empty column lists for this parsing run.
arrival_data = arrival_data_template()
departure_data = departure_data_template()

# Loop over every kept arrival message: one row of df_arrivals holds the raw
# XML text ('xml_obj') and the time it was recorded ('date').
# Each step appends to one column list, so a message that lacks a required
# element raises AttributeError (find() returns None) part-way through a row.

for root_text, observation_time in df_arrivals[['xml_obj', 'date']].values:  
            
    arrival_data['ObservationTime'].append(observation_time)

    root = ET.fromstring(root_text)

    # Ride
    ride = root.find('./ns2:ReisInformatieProductDAS/ns2:DynamischeAankomstStaat', ns)

    arrival_data['RideId'].append(int(ride.find('./ns2:RitId', ns).text))
    # RideTime is the ReisInformatieTijdstip text, stored as-is (not parsed).
    arrival_data['RideTime'].append(root.find('./ns2:ReisInformatieProductDAS/ns2:RIPAdministratie/ns2:ReisInformatieTijdstip', ns).text)

    # Stations
    train = ride.find('./ns2:TreinAankomst', ns)

    # The message's own RitStation is recorded as the destination; the planned
    # ("Gepland") TreinHerkomst is recorded as the departure station.
    destinationStation = ride.find('./ns2:RitStation', ns)
    departureStation = train.find('./ns2:TreinHerkomst[@InfoStatus="Gepland"]',ns)
    
    extract_station_info(arrival_data, departureStation, 'DepartureStation')
    extract_station_info(arrival_data, destinationStation, 'DestinationStation')

    # Arrival times: InfoStatus "Actueel" fills the Actual column, "Gepland"
    # the Planned column; both are kept as raw text.
    arrival_data['ActualArrivalTime'].append(train.find('./ns2:AankomstTijd[@InfoStatus="Actueel"]', ns).text)
    arrival_data['PlannedArrivalTime'].append(train.find('./ns2:AankomstTijd[@InfoStatus="Gepland"]', ns).text)
    
    # Train
    arrival_data['TrainId'].append(train.find('./ns2:TreinNummer', ns).text)
    arrival_data['TrainType'].append(train.find('./ns2:TreinSoort', ns).text)
    arrival_data['TrainOperator'].append(train.find('./ns2:Vervoerder', ns).text)

    # Arrival platform: SpoorNummer is required here, SpoorFase is optional
    # and stored as None when the element is absent.
    arrival_data['PlannedArrivalPlatform'].append(train.find('./ns2:TreinAankomstSpoor[@InfoStatus="Gepland"]/ns2:SpoorNummer', ns).text)
    suffix = train.find('./ns2:TreinAankomstSpoor[@InfoStatus="Gepland"]/ns2:SpoorFase', ns)
    arrival_data['PlannedArrivalPlatformSuffix'].append(None if suffix is None else suffix.text)

    arrival_data['ActualArrivalPlatform'].append(train.find('./ns2:TreinAankomstSpoor[@InfoStatus="Actueel"]/ns2:SpoorNummer', ns).text)
    suffix = train.find('./ns2:TreinAankomstSpoor[@InfoStatus="Actueel"]/ns2:SpoorFase', ns)
    arrival_data['ActualArrivalPlatformSuffix'].append(None if suffix is None else suffix.text)
    
    # # Stop stations
    # wagons = train.find('./ns1:TreinVleugel', ns)
    # if not wagons: print('No wagons, wtf?')
    # stop_stations = wagons.findall('./ns1:StopStations[@InfoStatus="Gepland"]/ns1:Station', ns)
    # if not stop_stations: print('No stop_stations, wtf?')

    # data['PlannedStopStations'].append(encode_list(stop_stations, './ns1:UICCode'))

    # stop_stations = wagons.findall('./ns1:StopStations[@InfoStatus="Actueel"]/ns1:Station', ns)
    # data['ActualStopStations'].append(encode_list(stop_stations, './ns1:UICCode'))
    # if not stop_stations: print('No stop_stations2, wtf?')

    # # Material
    # material = wagons.find('./ns1:MaterieelDeelDVS', ns)
    # if material:
    #     data['MaterialType'].append(material.find('./ns1:MaterieelSoort', ns).text)
    #     data['MaterialDesignation'].append(material.find('./ns1:MaterieelAanduiding', ns).text)
    #     data['MaterialLength'].append(material.find('./ns1:MaterieelLengte', ns).text)
    # else:
    #     data['MaterialType'].append(np.NaN)
    #     data['MaterialDesignation'].append(np.NaN)
    #     data['MaterialLength'].append(np.NaN)

    # # Change
    # changes = root.findall('.//ns1:Wijziging', ns)
    # if changes:
    #     data['HasChange'].append(True)
    #     data['ChangeType'].append(encode_list(changes, './ns1:WijzigingType'))
    # else:
    #     data['HasChange'].append(False)
    #     data['ChangeType'].append(np.NaN)

# Parse records about departures (currently disabled: the loop below is wrapped in a string literal, so it never executes)
"""for root_text, observation_time in df_departures[['xml_obj', 'date']].values:  
            
    departure_data['ObservationTime'].append(observation_time)

    root = ET.fromstring(root_text)

    # Ride
    ride = root.find('./ns2:ReisInformatieProductDVS/ns2:DynamischeVertrekStaat', ns)

    departure_data['RideId'].append(int(ride.find('./ns2:RitId', ns).text))
    departure_data['RideTime'].append(root.find('./ns2:ReisInformatieProductDVS/ns2:RIPAdministratie/ns2:ReisInformatieTijdstip', ns).text)

    # Stations
    train = ride.find('./ns2:Trein', ns)

    departureStation = ride.find('./ns2:RitStation', ns)
    destinationStation = train.find('./ns2:TreinEindBestemming[@InfoStatus="Gepland"]',ns)
    
    extract_station_info(departure_data, departureStation, 'DepartureStation')
    extract_station_info(departure_data, destinationStation, 'DestinationStation')

    # Arrival times
    departure_data['ActualDepartureTime'].append(train.find('./ns2:VertrekTijd[@InfoStatus="Actueel"]', ns).text)
    departure_data['PlannedDepartureTime'].append(train.find('./ns2:VertrekTijd[@InfoStatus="Gepland"]', ns).text)
    
    # Train
    departure_data['TrainId'].append(train.find('./ns2:TreinNummer', ns).text)
    departure_data['TrainType'].append(train.find('./ns2:TreinSoort', ns).text)
    departure_data['TrainOperator'].append(train.find('./ns2:Vervoerder', ns).text)

    departure_data['PlannedDeparturePlatform'].append(train.find('./ns2:TreinVertrekSpoor[@InfoStatus="Gepland"]/ns2:SpoorNummer', ns).text)
    suffix = train.find('./ns2:TreinVertrekSpoor[@InfoStatus="Gepland"]/ns2:SpoorFase', ns)
    departure_data['PlannedDeparturePlatformSuffix'].append(None if suffix is None else suffix.text)

    departure_data['ActualDeparturePlatform'].append(train.find('./ns2:TreinVertrekSpoor[@InfoStatus="Actueel"]/ns2:SpoorNummer', ns).text)
    suffix = train.find('./ns2:TreinVertrekSpoor[@InfoStatus="Actueel"]/ns2:SpoorFase', ns)
    departure_data['ActualDeparturePlatformSuffix'].append(None if suffix is None else suffix.text)
"""
            
# Build one DataFrame from the collected column lists.
dfa = pd.DataFrame(data=arrival_data)
print(dfa.shape)  # (number of rows, number of columns)
# # save the file, specify your path and name for the file
# df_flat.to_csv('./data/september.csv.zip', 
#                index=False, 
#                compression=dict(method='zip', archive_name='january.csv'))



(7146, 18)


In [22]:
# Preview: the first 50 rows ordered by ride and planned arrival time.
dfa.sort_values(by=['RideId', 'PlannedArrivalTime']).head(n=50)

Unnamed: 0,ObservationTime,RideId,RideTime,DepartureStationCode,DepartureStationUIC,DepartureStationType,TrainId,TrainType,TrainOperator,DestinationStationCode,DestinationStationUIC,DestinationStationType,PlannedArrivalTime,ActualArrivalTime,PlannedArrivalPlatform,PlannedArrivalPlatformSuffix,ActualArrivalPlatform,ActualArrivalPlatformSuffix
129,2019-01-02 07:23:08.704680+01:00,105,2019-01-02T07:33:00.000Z,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-02T07:33:00.000Z,2019-01-02T07:33:00.000Z,18,,18,
131,2019-01-02 08:14:53.961020+01:00,105,2019-01-02T07:33:00.000Z,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-02T07:33:00.000Z,2019-01-02T07:34:02.000Z,18,,18,
132,2019-01-02 08:16:34.890657+01:00,105,2019-01-02T07:33:00.000Z,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-02T07:33:00.000Z,2019-01-02T07:33:12.000Z,18,,18,
133,2019-01-02 08:32:18.975099+01:00,105,2019-01-02T07:33:00.000Z,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-02T07:33:00.000Z,2019-01-02T07:33:12.000Z,18,,18,
134,2019-01-02 08:37:42.943253+01:00,105,2019-01-02T07:33:00.000Z,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-02T07:33:00.000Z,2019-01-02T07:33:12.000Z,18,,18,
203,2019-01-03 07:23:10.802265+01:00,105,2019-01-03T07:33:00.000Z,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-03T07:33:00.000Z,2019-01-03T07:33:00.000Z,18,,18,
205,2019-01-03 08:32:11.170526+01:00,105,2019-01-03T07:33:00.000Z,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-03T07:33:00.000Z,2019-01-03T07:33:00.000Z,18,,18,
206,2019-01-03 08:49:03.881832+01:00,105,2019-01-03T07:33:00.000Z,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-03T07:33:00.000Z,2019-01-03T07:33:00.000Z,18,,18,
272,2019-01-04 07:23:10.747901+01:00,105,2019-01-04T07:33:00.000Z,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-04T07:33:00.000Z,2019-01-04T07:33:00.000Z,18,,18,
274,2019-01-04 08:19:24.291712+01:00,105,2019-01-04T07:33:00.000Z,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-04T07:33:00.000Z,2019-01-04T07:34:13.000Z,18,,18,


In [None]:
# Load the previously saved monthly files again.
df_july, df_august, df_september = [
    pd.read_csv(path)
    for path in (
        '../data/july.csv.zip',
        '../data/august.csv.zip',
        '../data/september.csv.zip',
    )
]

In [None]:
# Stack the monthly frames on top of each other into one big dataframe.
months = [df_july, df_august, df_september]
df_months = pd.concat(objs=months)

In [None]:
# Write the combined dataframe to a zip archive holding a single CSV.
merged_zip_options = {'method': 'zip', 'archive_name': 'july_to_september.csv'}
df_months.to_csv(
    '../data/july_to_september.csv.zip',
    index=False,
    compression=merged_zip_options,
)

In [None]:
# Keep only the rides from Utrecht to Amsterdam and vice versa.
# NOTE(review): the column templates in this notebook name the destination
# column 'DestinationStationCode'; confirm that the monthly files really
# contain 'PlannedDestinationStationCode'.
is_ut_to_asd = df_months['DepartureStationCode'].eq('UT') & df_months['PlannedDestinationStationCode'].eq('ASD')
is_asd_to_ut = df_months['DepartureStationCode'].eq('ASD') & df_months['PlannedDestinationStationCode'].eq('UT')
df_ut_asd = df_months.loc[is_ut_to_asd | is_asd_to_ut]

In [None]:
# Write the reduced dataframe to a zip archive holding a single CSV.
reduced_zip_options = {'method': 'zip', 'archive_name': 'july_to_september_utrecht_amsterdam.csv'}
df_ut_asd.to_csv(
    '../data/july_to_september_utrecht_amsterdam.csv.zip',
    index=False,
    compression=reduced_zip_options,
)