In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import xml.etree.ElementTree as ET

from functools import reduce
from datetime import datetime, timedelta

pd.set_option('display.max_columns', None)
%config IPCompleter.greedy=True

In [2]:
# load all data time chunks
dfa_1 = pd.read_csv('../assets/data/2019 UT-ASD/2019-01-01--2019-03-31 departures.csv')  
dfa_2 = pd.read_csv('../assets/data/2019 UT-ASD/2019-04-01--2019-06-30_departures.csv')  
dfa_3 = pd.read_csv('../assets/data/2019 UT-ASD/2019-07-01--2019-07-31_departures.csv')  
dfa_4 = pd.read_csv('../assets/data/2019 UT-ASD/2019-08-01--2019-08-31 departures.csv')  
dfa_5 = pd.read_csv('../assets/data/2019 UT-ASD/2019-09-01--2019-10-31_departures.csv') 
dfa_6 = pd.read_csv('../assets/data/2019 UT-ASD/2019-11-01--2019-11-30_departures.csv')  
dfa_7 = pd.read_csv('../assets/data/2019 UT-ASD/2019-12-01--2019-12-31-departures.csv')  

In [3]:
dfa_all = [dfa_1, dfa_2, dfa_3, dfa_4, dfa_5, dfa_6, dfa_7]
dfa_not_parsed = pd.concat(dfa_all)

In [4]:
dfa_not_parsed.shape

(14863, 3)

In [5]:
dfa_not_parsed.head()

Unnamed: 0,date,xml_obj,uuid
0,2019-01-01 00:08:01.533926+01:00,"<?xml version=""1.0"" encoding=""UTF-8""?><ns1:Put...",ebc0de90-0d50-11e9-b8b1-06550c001849
1,2019-01-01 01:09:01.050124+01:00,"<?xml version=""1.0"" encoding=""UTF-8""?><ns1:Put...",70feba98-0d59-11e9-b8b1-06550c001849
2,2019-01-01 01:18:41.841361+01:00,"<?xml version=""1.0"" encoding=""UTF-8""?><ns1:Put...",cb2c6e10-0d5a-11e9-b8b1-06550c001849
3,2019-01-01 02:08:20.000065+01:00,"<?xml version=""1.0"" encoding=""UTF-8""?><ns1:Put...",ba4b62f2-0d61-11e9-b8b1-06550c001849
4,2019-01-01 02:09:01.224701+01:00,"<?xml version=""1.0"" encoding=""UTF-8""?><ns1:Put...",d2ddc602-0d61-11e9-b8b1-06550c001849


# Parse into csv

In [6]:
def extract_station_info(data, node, prefix):
  if node:
    data[prefix + 'Code'].append(node.find('./ns2:StationCode', ns).text)
    data[prefix + 'UIC'].append(int(node.find('./ns2:UICCode', ns).text))
    data[prefix + 'Type'].append(int(node.find('./ns2:Type', ns).text))
  else:
    data[prefix + 'Code'].append(np.NaN)
    data[prefix + 'UIC'].append(np.NaN)
    data[prefix + 'Type'].append(np.NaN)

def parse_timestamp(date_string):
  return datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%S.%fZ')

def encode_list(nodes, sufix):
  UIC_codes = [node.find(sufix, ns).text for node in nodes]
  return ';'.join(UIC_codes)

In [7]:
station_amsterdam = b'<ns2:RitStation><ns2:StationCode>ASD</ns2:StationCode>'
station_utrecht = b'<ns2:RitStation><ns2:StationCode>UT</ns2:StationCode>'
source_amsterdam = b'<ns2:TreinHerkomst InfoStatus=""Gepland""><ns2:StationCode>ASD</ns2:StationCode'
source_utrecht = b'<ns2:TreinHerkomst InfoStatus=""Gepland""><ns2:StationCode>UT</ns2:StationCode>'
destination_amsterdam = b'<ns2:TreinEindBestemming InfoStatus=""Gepland""><ns2:StationCode>AMS</ns2:StationCode>'
destination_utrecht = b'<ns2:TreinEindBestemming InfoStatus=""Gepland""><ns2:StationCode>UT</ns2:StationCode>' 

In [8]:
# namespace of the xml object
ns={
    'ns1d': 'urn:ndov:cdm:trein:reisinformatie:messages:5',
    'ns1a': 'urn:ndov:cdm:trein:reisinformatie:messages:dynamischeaankomststaat:1',
    'ns2': 'urn:ndov:cdm:trein:reisinformatie:data:4',
}

# dictionary that will save the data while loading it in
def arrival_data_template():

    return {
        'ObservationTime': [],

        # Ride
        'RideId':[],
        'RideTime': [],

        # Departure station
        'DepartureStationCode': [],
        'DepartureStationUIC': [],
        'DepartureStationType': [],

        # Train
        'TrainId': [],
        'TrainType': [],
        'TrainOperator': [],

        # Actual destination
        'DestinationStationCode': [],
        'DestinationStationUIC': [],
        'DestinationStationType': [],

        # Arrival times
        'PlannedArrivalTime': [],
        'ActualArrivalTime': [],

        # Arrival platform
        'PlannedArrivalPlatform': [],
        'PlannedArrivalPlatformSuffix': [],
        'ActualArrivalPlatform': [],
        'ActualArrivalPlatformSuffix': [],

#         # Departure platforms
#         'PlannedDeparturePlatform': [],
#         'ActualDeparturePlatform': [],

#         # Stop stations
#         'PlannedStopStations': [],
#         'ActualStopStations': [],

#         # Matirial type
#         'MaterialType': [],
#         'MaterialDesignation': [],
#         'MaterialLength': [],

#         'ChangeType': [],

    }

def departure_data_template():
    return {
    'ObservationTime': [],

    # Ride
    'RideId':[],
    'RideTime': [],

    # Departure station
    'DepartureStationCode': [],
    'DepartureStationUIC': [],
    'DepartureStationType': [],

    # Train
    'TrainId': [],
    'TrainType': [],
    'TrainOperator': [],

    # Actual destination
    'DestinationStationCode': [],
    'DestinationStationUIC': [],
    'DestinationStationType': [],

    # Departure times
    'PlannedDepartureTime': [],
    'ActualDepartureTime': [],

    # Departure platform
    'PlannedDeparturePlatform': [],
    'PlannedDeparturePlatformSuffix': [],
    'ActualDeparturePlatform': [],
    'ActualDeparturePlatformSuffix': [],

    # Departure platforms
    'PlannedDeparturePlatform': [],
    'ActualDeparturePlatform': [],

    # Stop stations
    'PlannedStopStations': [],
    'ActualStopStations': [],

    # Matirial type
    'MaterialType': [],
    'MaterialDesignation': [],
    'MaterialLength': [],
    
    'HasChange': [],
    'ChangeType': [],

}

In [9]:
departure_data = departure_data_template()
# Parse records about departures
for root_text, observation_time in dfa_not_parsed[['xml_obj', 'date']].values:  
            
    departure_data['ObservationTime'].append(observation_time)

    root = ET.fromstring(root_text)

    # Ride
    ride = root.find('./ns2:ReisInformatieProductDVS/ns2:DynamischeVertrekStaat', ns)

    departure_data['RideId'].append(int(ride.find('./ns2:RitId', ns).text))
    departure_data['RideTime'].append(root.find('./ns2:ReisInformatieProductDVS/ns2:RIPAdministratie/ns2:ReisInformatieTijdstip', ns).text)

    # Stations
    train = ride.find('./ns2:Trein', ns)

    departureStation = ride.find('./ns2:RitStation', ns)
    destinationStation = train.find('./ns2:TreinEindBestemming[@InfoStatus="Gepland"]',ns)
    
    extract_station_info(departure_data, departureStation, 'DepartureStation')
    extract_station_info(departure_data, destinationStation, 'DestinationStation')

    # Departure times
    departure_data['ActualDepartureTime'].append(train.find('./ns2:VertrekTijd[@InfoStatus="Actueel"]', ns).text)
    departure_data['PlannedDepartureTime'].append(train.find('./ns2:VertrekTijd[@InfoStatus="Gepland"]', ns).text)
    
    # Train
    departure_data['TrainId'].append(train.find('./ns2:TreinNummer', ns).text)
    departure_data['TrainType'].append(train.find('./ns2:TreinSoort', ns).text)
    departure_data['TrainOperator'].append(train.find('./ns2:Vervoerder', ns).text)

    pd_platform = train.find('./ns2:TreinVertrekSpoor[@InfoStatus="Gepland"]/ns2:SpoorNummer', ns)
    ad_platfrom = train.find('./ns2:TreinVertrekSpoor[@InfoStatus="Actueel"]/ns2:SpoorNummer', ns)
    
    if pd_platform and ad_platf
    departure_data['PlannedDeparturePlatform'].append(train.find('./ns2:TreinVertrekSpoor[@InfoStatus="Gepland"]/ns2:SpoorNummer', ns).text)
    suffix = train.find('./ns2:TreinVertrekSpoor[@InfoStatus="Gepland"]/ns2:SpoorFase', ns)
    departure_data['PlannedDeparturePlatformSuffix'].append(None if suffix is None else suffix.text)

    departure_data['ActualDeparturePlatform'].append(train.find('./ns2:TreinVertrekSpoor[@InfoStatus="Actueel"]/ns2:SpoorNummer', ns).text)
    suffix = train.find('./ns2:TreinVertrekSpoor[@InfoStatus="Actueel"]/ns2:SpoorFase', ns)
    departure_data['ActualDeparturePlatformSuffix'].append(None if suffix is None else suffix.text)
    
    # Stop stations
    wagons = train.find('./ns2:TreinVleugel', ns)
    if not wagons: print('No wagons, wtf?')
    stop_stations = wagons.findall('./ns2:StopStations[@InfoStatus="Gepland"]/ns2:Station', ns)
    if not stop_stations: print('No stop_stations, wtf?')

    departure_data['PlannedStopStations'].append(encode_list(stop_stations, './ns2:UICCode'))

    stop_stations = wagons.findall('./ns2:StopStations[@InfoStatus="Actueel"]/ns2:Station', ns)
    departure_data['ActualStopStations'].append(encode_list(stop_stations, './ns2:UICCode'))
    if not stop_stations: print('No stop_stations2, wtf?')

    # Material
    material = wagons.find('./ns2:MaterieelDeelDVS', ns)
    if material:
        departure_data['MaterialType'].append(material.find('./ns2:MaterieelSoort', ns).text)
        departure_data['MaterialDesignation'].append(material.find('./ns2:MaterieelAanduiding', ns).text)
        departure_data['MaterialLength'].append(material.find('./ns2:MaterieelLengte', ns).text)
    else:
        departure_data['MaterialType'].append(np.NaN)
        departure_data['MaterialDesignation'].append(np.NaN)
        departure_data['MaterialLength'].append(np.NaN)

    # Change
    changes = root.findall('./ns2:Wijziging', ns)
    if changes:
        departure_data['HasChange'].append(True)
        departure_data['ChangeType'].append(encode_list(changes, './ns2:WijzigingType'))
    else:
        departure_data['HasChange'].append(False)
        departure_data['ChangeType'].append(np.NaN)
        
dfd = pd.DataFrame(departure_data)  
print(dfa.shape)

AttributeError: 'NoneType' object has no attribute 'text'

In [11]:
arrival_data = arrival_data_template()

# loop through every day

for root_text, observation_time in dfa_not_parsed[['xml_obj', 'date']].values:  
            
    arrival_data['ObservationTime'].append(observation_time)

    root = ET.fromstring(root_text)

    # Ride
    ride = root.find('./ns2:ReisInformatieProductDAS/ns2:DynamischeAankomstStaat', ns)

    arrival_data['RideId'].append(int(ride.find('./ns2:RitId', ns).text))
    arrival_data['RideTime'].append(root.find('./ns2:ReisInformatieProductDAS/ns2:RIPAdministratie/ns2:ReisInformatieTijdstip', ns).text)

    # Stations
    train = ride.find('./ns2:TreinAankomst', ns)

    destinationStation = ride.find('./ns2:RitStation', ns)
    departureStation = train.find('./ns2:TreinHerkomst[@InfoStatus="Gepland"]',ns)
    
    extract_station_info(arrival_data, departureStation, 'DepartureStation')
    extract_station_info(arrival_data, destinationStation, 'DestinationStation')

    # Arrival times
    arrival_data['ActualArrivalTime'].append(train.find('./ns2:AankomstTijd[@InfoStatus="Actueel"]', ns).text)
    arrival_data['PlannedArrivalTime'].append(train.find('./ns2:AankomstTijd[@InfoStatus="Gepland"]', ns).text)
    
    # Train
    arrival_data['TrainId'].append(train.find('./ns2:TreinNummer', ns).text)
    arrival_data['TrainType'].append(train.find('./ns2:TreinSoort', ns).text)
    arrival_data['TrainOperator'].append(train.find('./ns2:Vervoerder', ns).text)

    arrival_data['PlannedArrivalPlatform'].append(train.find('./ns2:TreinAankomstSpoor[@InfoStatus="Gepland"]/ns2:SpoorNummer', ns).text)
    suffix = train.find('./ns2:TreinAankomstSpoor[@InfoStatus="Gepland"]/ns2:SpoorFase', ns)
    arrival_data['PlannedArrivalPlatformSuffix'].append(None if suffix is None else suffix.text)

    arrival_data['ActualArrivalPlatform'].append(train.find('./ns2:TreinAankomstSpoor[@InfoStatus="Actueel"]/ns2:SpoorNummer', ns).text)
    suffix = train.find('./ns2:TreinAankomstSpoor[@InfoStatus="Actueel"]/ns2:SpoorFase', ns)
    arrival_data['ActualArrivalPlatformSuffix'].append(None if suffix is None else suffix.text)
    
# Convert the dictionary to a dataframe
dfa = pd.DataFrame(arrival_data)    
print(dfa.shape) # show the file size kinda

(32350, 18)


In [10]:
dfa.head()

NameError: name 'dfa' is not defined

In [12]:
dfa.to_csv('../assets/data/2019 UT-ASD/2019-parsed-data.csv')

# Merge data with weather

In [5]:
dfa = pd.read_csv('../assets/data/2019 UT-ASD/2019-parsed-data.csv')

In [10]:
all(dfa.RideTime == dfa.PlannedArrivalTime)

True

In [64]:
dfa.head()

Unnamed: 0.1,Unnamed: 0,ObservationTime,RideId,RideTime,DepartureStationCode,DepartureStationUIC,DepartureStationType,TrainId,TrainType,TrainOperator,DestinationStationCode,DestinationStationUIC,DestinationStationType,PlannedArrivalTime,ActualArrivalTime,PlannedArrivalPlatform,PlannedArrivalPlatformSuffix,ActualArrivalPlatform,ActualArrivalPlatformSuffix
0,0,2019-01-01 00:19:01.914659+01:00,1405,2019-01-01T00:29:00.000Z,UT,8400621,6,1405,Intercity,NS,ASD,8400058,6,2019-01-01T00:29:00.000Z,2019-01-01T00:29:00.000Z,2,a,2,a
1,1,2019-01-01 00:43:02.633669+01:00,1402,2019-01-01T00:53:00.000Z,ASD,8400058,6,1402,Intercity,NS,UT,8400621,6,2019-01-01T00:53:00.000Z,2019-01-01T00:53:00.000Z,15,,15,
2,2,2019-01-01 01:20:59.756816+01:00,1405,2019-01-01T00:29:00.000Z,UT,8400621,6,1405,Intercity,NS,ASD,8400058,6,2019-01-01T00:29:00.000Z,2019-01-01T00:30:49.000Z,2,a,2,a
3,3,2019-01-01 01:28:35.627872+01:00,1405,2019-01-01T00:29:00.000Z,UT,8400621,6,1405,Intercity,NS,ASD,8400058,6,2019-01-01T00:29:00.000Z,2019-01-01T00:29:31.000Z,2,a,2,a
4,4,2019-01-01 01:30:22.532675+01:00,1405,2019-01-01T00:29:00.000Z,UT,8400621,6,1405,Intercity,NS,ASD,8400058,6,2019-01-01T00:29:00.000Z,2019-01-01T00:29:31.000Z,2,a,2,a


In [65]:
dfa.shape

(32350, 19)

In [66]:
if 'RideInstance' in dfa:
    del dfa['RideInstance']
dfa.insert(0, 'RideInstance', dfa.RideId.astype(str) + '#' + dfa.RideTime )

In [67]:
dfa.head()

Unnamed: 0.1,RideInstance,Unnamed: 0,ObservationTime,RideId,RideTime,DepartureStationCode,DepartureStationUIC,DepartureStationType,TrainId,TrainType,TrainOperator,DestinationStationCode,DestinationStationUIC,DestinationStationType,PlannedArrivalTime,ActualArrivalTime,PlannedArrivalPlatform,PlannedArrivalPlatformSuffix,ActualArrivalPlatform,ActualArrivalPlatformSuffix
0,1405#2019-01-01T00:29:00.000Z,0,2019-01-01 00:19:01.914659+01:00,1405,2019-01-01T00:29:00.000Z,UT,8400621,6,1405,Intercity,NS,ASD,8400058,6,2019-01-01T00:29:00.000Z,2019-01-01T00:29:00.000Z,2,a,2,a
1,1402#2019-01-01T00:53:00.000Z,1,2019-01-01 00:43:02.633669+01:00,1402,2019-01-01T00:53:00.000Z,ASD,8400058,6,1402,Intercity,NS,UT,8400621,6,2019-01-01T00:53:00.000Z,2019-01-01T00:53:00.000Z,15,,15,
2,1405#2019-01-01T00:29:00.000Z,2,2019-01-01 01:20:59.756816+01:00,1405,2019-01-01T00:29:00.000Z,UT,8400621,6,1405,Intercity,NS,ASD,8400058,6,2019-01-01T00:29:00.000Z,2019-01-01T00:30:49.000Z,2,a,2,a
3,1405#2019-01-01T00:29:00.000Z,3,2019-01-01 01:28:35.627872+01:00,1405,2019-01-01T00:29:00.000Z,UT,8400621,6,1405,Intercity,NS,ASD,8400058,6,2019-01-01T00:29:00.000Z,2019-01-01T00:29:31.000Z,2,a,2,a
4,1405#2019-01-01T00:29:00.000Z,4,2019-01-01 01:30:22.532675+01:00,1405,2019-01-01T00:29:00.000Z,UT,8400621,6,1405,Intercity,NS,ASD,8400058,6,2019-01-01T00:29:00.000Z,2019-01-01T00:29:31.000Z,2,a,2,a


In [68]:
dfa = dfa.drop(columns=['Unnamed: 0'])

In [69]:
dfa.sort_values(['RideInstance', 'ActualArrivalTime'], ascending = True, inplace = True)
dfa.head()

Unnamed: 0,RideInstance,ObservationTime,RideId,RideTime,DepartureStationCode,DepartureStationUIC,DepartureStationType,TrainId,TrainType,TrainOperator,DestinationStationCode,DestinationStationUIC,DestinationStationType,PlannedArrivalTime,ActualArrivalTime,PlannedArrivalPlatform,PlannedArrivalPlatformSuffix,ActualArrivalPlatform,ActualArrivalPlatformSuffix
15090,104#2019-06-22T19:58:00.000Z,2019-06-22 21:03:14.027447+02:00,104,2019-06-22T19:58:00.000Z,UT,8400621,6,300104,ICE International,NS,ASD,8400058,6,2019-06-22T19:58:00.000Z,2019-06-22T19:58:00.000Z,7,b,7,b
15092,104#2019-06-22T19:58:00.000Z,2019-06-22 21:16:20.243615+02:00,104,2019-06-22T19:58:00.000Z,UT,8400621,6,300104,ICE International,NS,ASD,8400058,6,2019-06-22T19:58:00.000Z,2019-06-22T19:58:00.000Z,7,b,7,b
129,105#2019-01-02T07:33:00.000Z,2019-01-02 07:23:08.704680+01:00,105,2019-01-02T07:33:00.000Z,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-02T07:33:00.000Z,2019-01-02T07:33:00.000Z,18,,18,
132,105#2019-01-02T07:33:00.000Z,2019-01-02 08:16:34.890657+01:00,105,2019-01-02T07:33:00.000Z,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-02T07:33:00.000Z,2019-01-02T07:33:12.000Z,18,,18,
133,105#2019-01-02T07:33:00.000Z,2019-01-02 08:32:18.975099+01:00,105,2019-01-02T07:33:00.000Z,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-02T07:33:00.000Z,2019-01-02T07:33:12.000Z,18,,18,


In [70]:
# drop near duplicate values
dfa.drop_duplicates('RideInstance', keep='last', inplace = True)

In [71]:
dfa.shape

(8190, 19)

In [72]:
dfa['RideTime'].dtype
dfa['RideTime'] = pd.to_datetime(dfa['RideTime'])

In [73]:
dfa_weather_U = pd.read_csv('../assets/data/de_bilt_weather_2019.csv')  
dfa_weather_A = pd.read_csv('../assets/data/schiphol_weather_2019.csv') 

In [74]:
dfa_weather_A['WeatherKey'] = dfa_weather_A['Timestamp'].astype(str) + '-' + dfa_weather_A['Hour'].astype(str) + '-' + dfa_weather_A['StationCode'].astype(str)
dfa_weather_U['WeatherKey'] = dfa_weather_U['Timestamp'].astype(str) + '-' + dfa_weather_U['Hour'].astype(str)+ '-'+ dfa_weather_U['StationCode'].astype(str)

In [25]:
print(dfa_weather_A.head())
print(dfa_weather_U.head())

   StationCode   Timestamp  Hour  WindDir  WindHour  WindSpeed  MaxWindSpeed  \
0          240  2019-01-01     1      260       7.0        6.0          10.0   
1          240  2019-01-01     2      260       7.0        7.0          10.0   
2          240  2019-01-01     3      250       7.0        7.0          11.0   
3          240  2019-01-01     4      250       7.0        8.0          11.0   
4          240  2019-01-01     5      260       9.0        9.0          12.0   

   Temperature  MinTemp10M  DewPointTemp  SunshineDur  Radiation  PrecipDur  \
0          8.5         NaN           5.7          0.0          0        0.0   
1          8.6         NaN           5.1          0.0          0        0.0   
2          8.5         NaN           5.1          0.0          0        0.0   
3          8.2         NaN           5.4          0.0          0        0.0   
4          8.7         NaN           5.8          0.0          0        0.0   

   PrecipHour  AirPressure  Visibility  Clou

In [75]:
def add_uic_code(destination, weather, weather_station):
    if destination == 'ASD':
        return f'{weather[0:13]}-{weather_station[0]}'
    if destination == 'UT':
        return f'{weather[0:13]}-{weather_station[1]}'

In [76]:
# this function renames the weather columns with a specific prefix
def rename_weather(suffix, df):
    return df.rename(columns = {
        'StationCode' : f'{suffix}WeatherStationCode',
        'Timestamp' : f'{suffix}Timestamp',  # date (YYYY=year,MM=month,DD=day)
        'Hour' : f'{suffix}Hour' ,  # time (HH uur/hour, UT. 12 UT=13 MET, 14 MEZT. Hourly division 05 runs from 04.00 UT to 5.00 UT
        'WindDir' : f'{suffix}WindDir' ,  # Mean wind direction (in degrees) during the 10-minute period preceding the time of observation (360=north, 90=east, 180=south, 270=west, 0=calm 990=variable)
        'WindHour' : f'{suffix}WindHour' ,  # Hourly mean wind speed (in 0.1 m/s)
        'WindSpeed' : f'{suffix}WindSpeed' , # Mean wind speed (in 0.1 m/s) during the 10-minute period preceding the time of observation  
        'MaxWindSpeed' : f'{suffix}MaxWindSpeed' ,  # Maximum wind gust (in 0.1 m/s) during the hourly division
        'Temperature' : f'{suffix}Temperature' ,  # Temperature (in 0.1 degrees Celsius) at 1.50 m at the time of observation  
        'MinTemp10M' : f'{suffix}MinTemp10M' ,  # Minimum temperature (in 0.1 degrees Celsius) at 0.1 m in the preceding 6-hour period
        'DewPointTemp' : f'{suffix}DewPointTemp' ,  # Dew point temperature (in 0.1 degrees Celsius) at 1.50 m at the time of observation 
        'SunshineDur' : f'{suffix}SunshineDur' ,  # Sunshine duration (in 0.1 hour) during the hourly division, calculated from global radiation (-1 for <0.05 hour) 
        'Radiation' : f'{suffix}Radiation' ,  # Global radiation (in J/cm2) during the hourly division    
        'PrecipDur' : f'{suffix}PrecipDur',  # Precipitation duration (in 0.1 hour) during the hourly division
        'PrecipHour' : f'{suffix}PrecipHour',  # Hourly precipitation amount (in 0.1 mm) (-1 for <0.05 mm)
        'AirPressure' : f'{suffix}AirPressure',  # Air pressure (in 0.1 hPa) reduced to mean sea level, at the time of observation 
        'Visibility' : f'{suffix}Visibility',  # Horizontal visibility at the time of observation (0=less than 100m, 1=100-200m, 2=200-300m,..., 49=4900-5000m, 50=5-6km, 56=6-7km, 57=7-8km, ..., 79=29-30km, 80=30-35km, 81=35-40km,..., 89=more than 70km)
        'Cloudines' : f'{suffix}Cloudiness',  # Cloud cover (in octants), at the time of observation (9=sky invisible)
        'Humidity': f'{suffix}Humidity',  # Relative atmospheric humidity (in percents) at 1.50 m at the time of observation
        'WeatherCode' : f'{suffix}WeatherCode',  # Present weather code (00-99), description for the hourly division. (http://bibliotheek.knmi.nl/scholierenpdf/weercodes_Nederland)
        'WeatherCodeIndicator': f'{suffix}WeatherCodeIndicator',  # Indicator present weather code (1=manned and recorded (using code from visual observations), 2,3=manned and omitted (no significant weather phenomenon to report, not available), 4=automatically recorded (using code from visual observations), 5,6=automatically omitted (no significant weather phenomenon to report, not available), 7=automatically set (using code from automated observations) 
        'Fog' : f'{suffix}Fog',  # Fog 0=no occurrence, 1=occurred during the preceding hour and/or at the time of observation
        'Rain' : f'{suffix}Rain',  # Rainfall 0=no occurrence, 1=occurred during the preceding hour and/or at the time of observation
        'Snow' : f'{suffix}Snow',  # Snow 0=no occurrence, 1=occurred during the preceding hour and/or at the time of observation
        'Thunder' : f'{suffix}Thunder',  # Thunder  0=no occurrence, 1=occurred during the preceding hour and/or at the time of observation 
        'IceFormation' : f'{suffix}IceFormation'  # Ice formation 0=no occurrence, 1=occurred during the preceding hour and/or at the time of observation
    })

In [77]:
dfa['WeatherKey'] = dfa['RideTime'].dt.strftime('%Y-%m-%d') + '-' + (dfa['RideTime'].dt.hour.astype(int) + 1).astype(str)

In [78]:
dfa['WeatherKey'] = dfa.apply(lambda row: add_uic_code(row['DestinationStationCode'], row['WeatherKey'], [240, 260]), axis=1)

In [79]:
dfa.head()

Unnamed: 0,RideInstance,ObservationTime,RideId,RideTime,DepartureStationCode,DepartureStationUIC,DepartureStationType,TrainId,TrainType,TrainOperator,DestinationStationCode,DestinationStationUIC,DestinationStationType,PlannedArrivalTime,ActualArrivalTime,PlannedArrivalPlatform,PlannedArrivalPlatformSuffix,ActualArrivalPlatform,ActualArrivalPlatformSuffix,WeatherKey
15092,104#2019-06-22T19:58:00.000Z,2019-06-22 21:16:20.243615+02:00,104,2019-06-22 19:58:00+00:00,UT,8400621,6,300104,ICE International,NS,ASD,8400058,6,2019-06-22T19:58:00.000Z,2019-06-22T19:58:00.000Z,7,b,7,b,2019-06-22-20-240
131,105#2019-01-02T07:33:00.000Z,2019-01-02 08:14:53.961020+01:00,105,2019-01-02 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-02T07:33:00.000Z,2019-01-02T07:34:02.000Z,18,,18,,2019-01-02-8-260
206,105#2019-01-03T07:33:00.000Z,2019-01-03 08:49:03.881832+01:00,105,2019-01-03 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-03T07:33:00.000Z,2019-01-03T07:33:00.000Z,18,,18,,2019-01-03-8-260
276,105#2019-01-04T07:33:00.000Z,2019-01-04 08:37:38.178122+01:00,105,2019-01-04 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-04T07:33:00.000Z,2019-01-04T07:34:13.000Z,18,,18,,2019-01-04-8-260
354,105#2019-01-05T07:32:00.000Z,2019-01-05 08:40:55.071092+01:00,105,2019-01-05 07:32:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-05T07:32:00.000Z,2019-01-05T07:38:37.000Z,19,,19,,2019-01-05-8-260


In [80]:
 dfa.sort_values('WeatherKey').head()

Unnamed: 0,RideInstance,ObservationTime,RideId,RideTime,DepartureStationCode,DepartureStationUIC,DepartureStationType,TrainId,TrainType,TrainOperator,DestinationStationCode,DestinationStationUIC,DestinationStationType,PlannedArrivalTime,ActualArrivalTime,PlannedArrivalPlatform,PlannedArrivalPlatformSuffix,ActualArrivalPlatform,ActualArrivalPlatformSuffix,WeatherKey
2,1405#2019-01-01T00:29:00.000Z,2019-01-01 01:20:59.756816+01:00,1405,2019-01-01 00:29:00+00:00,UT,8400621,6,1405,Intercity,NS,ASD,8400058,6,2019-01-01T00:29:00.000Z,2019-01-01T00:30:49.000Z,2,a,2,a,2019-01-01-1-240
8,1402#2019-01-01T00:53:00.000Z,2019-01-01 01:51:40.442110+01:00,1402,2019-01-01 00:53:00+00:00,ASD,8400058,6,1402,Intercity,NS,UT,8400621,6,2019-01-01T00:53:00.000Z,2019-01-01T00:53:00.000Z,15,,15,,2019-01-01-1-260
50,123#2019-01-01T10:03:00.000Z,2019-01-01 11:02:25.017695+01:00,123,2019-01-01 10:03:00+00:00,ASD,8400058,6,123,ICE International,NS,UT,8400621,6,2019-01-01T10:03:00.000Z,2019-01-01T10:05:11.000Z,18,,18,,2019-01-01-11-260
56,125#2019-01-01T12:03:00.000Z,2019-01-01 13:07:58.375403+01:00,125,2019-01-01 12:03:00+00:00,ASD,8400058,6,125,ICE International,NS,UT,8400621,6,2019-01-01T12:03:00.000Z,2019-01-01T12:03:00.000Z,18,,18,,2019-01-01-13-260
59,127#2019-01-01T14:03:00.000Z,2019-01-01 15:07:55.997611+01:00,127,2019-01-01 14:03:00+00:00,ASD,8400058,6,127,ICE International,NS,UT,8400621,6,2019-01-01T14:03:00.000Z,2019-01-01T14:03:00.000Z,18,,18,,2019-01-01-15-260


In [81]:
# combine weather in one big weather dataset
result_weather = pd.concat([dfa_weather_A, dfa_weather_U])

In [83]:
result_weather.head()

Unnamed: 0,StationCode,Timestamp,Hour,WindDir,WindHour,WindSpeed,MaxWindSpeed,Temperature,MinTemp10M,DewPointTemp,SunshineDur,Radiation,PrecipDur,PrecipHour,AirPressure,Visibility,Cloudines,Humidity,WeatherCode,WeatherCodeIndicator,Fog,Rain,Snow,Thunder,IceFormation,WeatherKey
0,240,2019-01-01,1,260,7.0,6.0,10.0,8.5,,5.7,0.0,0,0.0,0.0,1030.6,69.0,8.0,82,,5,0.0,0.0,0.0,0.0,0.0,2019-01-01-1-240
1,240,2019-01-01,2,260,7.0,7.0,10.0,8.6,,5.1,0.0,0,0.0,0.0,1030.1,75.0,8.0,78,,5,0.0,0.0,0.0,0.0,0.0,2019-01-01-2-240
2,240,2019-01-01,3,250,7.0,7.0,11.0,8.5,,5.1,0.0,0,0.0,0.0,1029.5,75.0,8.0,79,,5,0.0,0.0,0.0,0.0,0.0,2019-01-01-3-240
3,240,2019-01-01,4,250,7.0,8.0,11.0,8.2,,5.4,0.0,0,0.0,0.0,1029.0,70.0,8.0,82,,5,0.0,0.0,0.0,0.0,0.0,2019-01-01-4-240
4,240,2019-01-01,5,260,9.0,9.0,12.0,8.7,,5.8,0.0,0,0.0,-0.1,1028.3,70.0,8.0,81,22.0,7,0.0,1.0,0.0,0.0,0.0,2019-01-01-5-240


In [84]:
# merge weather with train data
dfa = pd.merge(dfa,result_weather, on = 'WeatherKey', how='left')
dfa.head()

Unnamed: 0,RideInstance,ObservationTime,RideId,RideTime,DepartureStationCode,DepartureStationUIC,DepartureStationType,TrainId,TrainType,TrainOperator,DestinationStationCode,DestinationStationUIC,DestinationStationType,PlannedArrivalTime,ActualArrivalTime,PlannedArrivalPlatform,PlannedArrivalPlatformSuffix,ActualArrivalPlatform,ActualArrivalPlatformSuffix,WeatherKey,StationCode,Timestamp,Hour,WindDir,WindHour,WindSpeed,MaxWindSpeed,Temperature,MinTemp10M,DewPointTemp,SunshineDur,Radiation,PrecipDur,PrecipHour,AirPressure,Visibility,Cloudines,Humidity,WeatherCode,WeatherCodeIndicator,Fog,Rain,Snow,Thunder,IceFormation
0,104#2019-06-22T19:58:00.000Z,2019-06-22 21:16:20.243615+02:00,104,2019-06-22 19:58:00+00:00,UT,8400621,6,300104,ICE International,NS,ASD,8400058,6,2019-06-22T19:58:00.000Z,2019-06-22T19:58:00.000Z,7,b,7,b,2019-06-22-20-240,240,2019-06-22,20,50,6.0,7.0,10.0,18.3,,13.0,0.5,12,0.0,0.0,1020.5,75.0,0.0,71,,5,0.0,0.0,0.0,0.0,0.0
1,105#2019-01-02T07:33:00.000Z,2019-01-02 08:14:53.961020+01:00,105,2019-01-02 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-02T07:33:00.000Z,2019-01-02T07:34:02.000Z,18,,18,,2019-01-02-8-260,260,2019-01-02,8,340,4.0,4.0,10.0,4.9,,1.9,0.0,0,0.1,0.1,1036.8,70.0,8.0,81,23.0,7,0.0,1.0,0.0,0.0,0.0
2,105#2019-01-03T07:33:00.000Z,2019-01-03 08:49:03.881832+01:00,105,2019-01-03 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-03T07:33:00.000Z,2019-01-03T07:33:00.000Z,18,,18,,2019-01-03-8-260,260,2019-01-03,8,310,2.0,1.0,3.0,2.6,,1.5,0.0,0,0.0,0.0,1039.9,65.0,8.0,92,,5,0.0,0.0,0.0,0.0,0.0
3,105#2019-01-04T07:33:00.000Z,2019-01-04 08:37:38.178122+01:00,105,2019-01-04 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-04T07:33:00.000Z,2019-01-04T07:34:13.000Z,18,,18,,2019-01-04-8-260,260,2019-01-04,8,270,3.0,2.0,6.0,4.6,,2.4,0.0,0,0.0,0.0,1039.1,64.0,8.0,85,,5,0.0,0.0,0.0,0.0,0.0
4,105#2019-01-05T07:32:00.000Z,2019-01-05 08:40:55.071092+01:00,105,2019-01-05 07:32:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-05T07:32:00.000Z,2019-01-05T07:38:37.000Z,19,,19,,2019-01-05-8-260,260,2019-01-05,8,320,4.0,4.0,8.0,7.7,,4.4,0.0,0,0.0,0.0,1032.4,75.0,8.0,79,,5,0.0,0.0,0.0,0.0,0.0


In [85]:
dfa.iloc[1].to_frame()

Unnamed: 0,1
RideInstance,105#2019-01-02T07:33:00.000Z
ObservationTime,2019-01-02 08:14:53.961020+01:00
RideId,105
RideTime,2019-01-02 07:33:00+00:00
DepartureStationCode,ASD
DepartureStationUIC,8400058
DepartureStationType,6
TrainId,105
TrainType,ICE International
TrainOperator,NS


In [86]:
dfa = rename_weather('Departure', dfa)

In [87]:
dfa.head()

Unnamed: 0,RideInstance,ObservationTime,RideId,RideTime,DepartureStationCode,DepartureStationUIC,DepartureStationType,TrainId,TrainType,TrainOperator,DestinationStationCode,DestinationStationUIC,DestinationStationType,PlannedArrivalTime,ActualArrivalTime,PlannedArrivalPlatform,PlannedArrivalPlatformSuffix,ActualArrivalPlatform,ActualArrivalPlatformSuffix,WeatherKey,DepartureWeatherStationCode,DepartureTimestamp,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,DepartureTemperature,DepartureMinTemp10M,DepartureDewPointTemp,DepartureSunshineDur,DepartureRadiation,DeparturePrecipDur,DeparturePrecipHour,DepartureAirPressure,DepartureVisibility,DepartureCloudiness,DepartureHumidity,DepartureWeatherCode,DepartureWeatherCodeIndicator,DepartureFog,DepartureRain,DepartureSnow,DepartureThunder,DepartureIceFormation
0,104#2019-06-22T19:58:00.000Z,2019-06-22 21:16:20.243615+02:00,104,2019-06-22 19:58:00+00:00,UT,8400621,6,300104,ICE International,NS,ASD,8400058,6,2019-06-22T19:58:00.000Z,2019-06-22T19:58:00.000Z,7,b,7,b,2019-06-22-20-240,240,2019-06-22,20,50,6.0,7.0,10.0,18.3,,13.0,0.5,12,0.0,0.0,1020.5,75.0,0.0,71,,5,0.0,0.0,0.0,0.0,0.0
1,105#2019-01-02T07:33:00.000Z,2019-01-02 08:14:53.961020+01:00,105,2019-01-02 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-02T07:33:00.000Z,2019-01-02T07:34:02.000Z,18,,18,,2019-01-02-8-260,260,2019-01-02,8,340,4.0,4.0,10.0,4.9,,1.9,0.0,0,0.1,0.1,1036.8,70.0,8.0,81,23.0,7,0.0,1.0,0.0,0.0,0.0
2,105#2019-01-03T07:33:00.000Z,2019-01-03 08:49:03.881832+01:00,105,2019-01-03 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-03T07:33:00.000Z,2019-01-03T07:33:00.000Z,18,,18,,2019-01-03-8-260,260,2019-01-03,8,310,2.0,1.0,3.0,2.6,,1.5,0.0,0,0.0,0.0,1039.9,65.0,8.0,92,,5,0.0,0.0,0.0,0.0,0.0
3,105#2019-01-04T07:33:00.000Z,2019-01-04 08:37:38.178122+01:00,105,2019-01-04 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-04T07:33:00.000Z,2019-01-04T07:34:13.000Z,18,,18,,2019-01-04-8-260,260,2019-01-04,8,270,3.0,2.0,6.0,4.6,,2.4,0.0,0,0.0,0.0,1039.1,64.0,8.0,85,,5,0.0,0.0,0.0,0.0,0.0
4,105#2019-01-05T07:32:00.000Z,2019-01-05 08:40:55.071092+01:00,105,2019-01-05 07:32:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-05T07:32:00.000Z,2019-01-05T07:38:37.000Z,19,,19,,2019-01-05-8-260,260,2019-01-05,8,320,4.0,4.0,8.0,7.7,,4.4,0.0,0,0.0,0.0,1032.4,75.0,8.0,79,,5,0.0,0.0,0.0,0.0,0.0


# Create the departure time field

In [88]:
dfa['PlannedArrivalTime'] = pd.to_datetime(dfa['PlannedArrivalTime'])

In [89]:
dfa['PlannedDepartureTime'] = dfa['PlannedArrivalTime'] - timedelta(minutes = 30)

In [90]:
dfa['PlannedDepartureTime']

0      2019-06-22 19:28:00+00:00
1      2019-01-02 07:03:00+00:00
2      2019-01-03 07:03:00+00:00
3      2019-01-04 07:03:00+00:00
4      2019-01-05 07:02:00+00:00
                  ...           
8185   2019-09-29 17:01:00+00:00
8186   2019-06-30 17:31:00+00:00
8187   2019-09-29 17:31:00+00:00
8188   2019-06-30 18:01:00+00:00
8189   2019-09-29 18:01:00+00:00
Name: PlannedDepartureTime, Length: 8190, dtype: datetime64[ns, UTC]

In [91]:
dfa['PlannedArrivalTime'] 

0      2019-06-22 19:58:00+00:00
1      2019-01-02 07:33:00+00:00
2      2019-01-03 07:33:00+00:00
3      2019-01-04 07:33:00+00:00
4      2019-01-05 07:32:00+00:00
                  ...           
8185   2019-09-29 17:31:00+00:00
8186   2019-06-30 18:01:00+00:00
8187   2019-09-29 18:01:00+00:00
8188   2019-06-30 18:31:00+00:00
8189   2019-09-29 18:31:00+00:00
Name: PlannedArrivalTime, Length: 8190, dtype: datetime64[ns, UTC]

In [92]:
# create weather key for departure
dfa['WeatherKey'] = dfa['PlannedDepartureTime'].dt.strftime('%Y-%m-%d') + '-' + (dfa['PlannedDepartureTime'].dt.hour.astype(int) + 1).astype(str)

In [93]:
dfa.head()

Unnamed: 0,RideInstance,ObservationTime,RideId,RideTime,DepartureStationCode,DepartureStationUIC,DepartureStationType,TrainId,TrainType,TrainOperator,DestinationStationCode,DestinationStationUIC,DestinationStationType,PlannedArrivalTime,ActualArrivalTime,PlannedArrivalPlatform,PlannedArrivalPlatformSuffix,ActualArrivalPlatform,ActualArrivalPlatformSuffix,WeatherKey,DepartureWeatherStationCode,DepartureTimestamp,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,DepartureTemperature,DepartureMinTemp10M,DepartureDewPointTemp,DepartureSunshineDur,DepartureRadiation,DeparturePrecipDur,DeparturePrecipHour,DepartureAirPressure,DepartureVisibility,DepartureCloudiness,DepartureHumidity,DepartureWeatherCode,DepartureWeatherCodeIndicator,DepartureFog,DepartureRain,DepartureSnow,DepartureThunder,DepartureIceFormation,PlannedDepartureTime
0,104#2019-06-22T19:58:00.000Z,2019-06-22 21:16:20.243615+02:00,104,2019-06-22 19:58:00+00:00,UT,8400621,6,300104,ICE International,NS,ASD,8400058,6,2019-06-22 19:58:00+00:00,2019-06-22T19:58:00.000Z,7,b,7,b,2019-06-22-20,240,2019-06-22,20,50,6.0,7.0,10.0,18.3,,13.0,0.5,12,0.0,0.0,1020.5,75.0,0.0,71,,5,0.0,0.0,0.0,0.0,0.0,2019-06-22 19:28:00+00:00
1,105#2019-01-02T07:33:00.000Z,2019-01-02 08:14:53.961020+01:00,105,2019-01-02 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-02 07:33:00+00:00,2019-01-02T07:34:02.000Z,18,,18,,2019-01-02-8,260,2019-01-02,8,340,4.0,4.0,10.0,4.9,,1.9,0.0,0,0.1,0.1,1036.8,70.0,8.0,81,23.0,7,0.0,1.0,0.0,0.0,0.0,2019-01-02 07:03:00+00:00
2,105#2019-01-03T07:33:00.000Z,2019-01-03 08:49:03.881832+01:00,105,2019-01-03 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-03 07:33:00+00:00,2019-01-03T07:33:00.000Z,18,,18,,2019-01-03-8,260,2019-01-03,8,310,2.0,1.0,3.0,2.6,,1.5,0.0,0,0.0,0.0,1039.9,65.0,8.0,92,,5,0.0,0.0,0.0,0.0,0.0,2019-01-03 07:03:00+00:00
3,105#2019-01-04T07:33:00.000Z,2019-01-04 08:37:38.178122+01:00,105,2019-01-04 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-04 07:33:00+00:00,2019-01-04T07:34:13.000Z,18,,18,,2019-01-04-8,260,2019-01-04,8,270,3.0,2.0,6.0,4.6,,2.4,0.0,0,0.0,0.0,1039.1,64.0,8.0,85,,5,0.0,0.0,0.0,0.0,0.0,2019-01-04 07:03:00+00:00
4,105#2019-01-05T07:32:00.000Z,2019-01-05 08:40:55.071092+01:00,105,2019-01-05 07:32:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-05 07:32:00+00:00,2019-01-05T07:38:37.000Z,19,,19,,2019-01-05-8,260,2019-01-05,8,320,4.0,4.0,8.0,7.7,,4.4,0.0,0,0.0,0.0,1032.4,75.0,8.0,79,,5,0.0,0.0,0.0,0.0,0.0,2019-01-05 07:02:00+00:00


In [94]:
dfa['WeatherKey'] = dfa.apply(lambda row: add_uic_code(row['DestinationStationCode'], row['WeatherKey'], [260, 240]), axis=1)

In [95]:
dfa.head()

Unnamed: 0,RideInstance,ObservationTime,RideId,RideTime,DepartureStationCode,DepartureStationUIC,DepartureStationType,TrainId,TrainType,TrainOperator,DestinationStationCode,DestinationStationUIC,DestinationStationType,PlannedArrivalTime,ActualArrivalTime,PlannedArrivalPlatform,PlannedArrivalPlatformSuffix,ActualArrivalPlatform,ActualArrivalPlatformSuffix,WeatherKey,DepartureWeatherStationCode,DepartureTimestamp,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,DepartureTemperature,DepartureMinTemp10M,DepartureDewPointTemp,DepartureSunshineDur,DepartureRadiation,DeparturePrecipDur,DeparturePrecipHour,DepartureAirPressure,DepartureVisibility,DepartureCloudiness,DepartureHumidity,DepartureWeatherCode,DepartureWeatherCodeIndicator,DepartureFog,DepartureRain,DepartureSnow,DepartureThunder,DepartureIceFormation,PlannedDepartureTime
0,104#2019-06-22T19:58:00.000Z,2019-06-22 21:16:20.243615+02:00,104,2019-06-22 19:58:00+00:00,UT,8400621,6,300104,ICE International,NS,ASD,8400058,6,2019-06-22 19:58:00+00:00,2019-06-22T19:58:00.000Z,7,b,7,b,2019-06-22-20-260,240,2019-06-22,20,50,6.0,7.0,10.0,18.3,,13.0,0.5,12,0.0,0.0,1020.5,75.0,0.0,71,,5,0.0,0.0,0.0,0.0,0.0,2019-06-22 19:28:00+00:00
1,105#2019-01-02T07:33:00.000Z,2019-01-02 08:14:53.961020+01:00,105,2019-01-02 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-02 07:33:00+00:00,2019-01-02T07:34:02.000Z,18,,18,,2019-01-02-8-240,260,2019-01-02,8,340,4.0,4.0,10.0,4.9,,1.9,0.0,0,0.1,0.1,1036.8,70.0,8.0,81,23.0,7,0.0,1.0,0.0,0.0,0.0,2019-01-02 07:03:00+00:00
2,105#2019-01-03T07:33:00.000Z,2019-01-03 08:49:03.881832+01:00,105,2019-01-03 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-03 07:33:00+00:00,2019-01-03T07:33:00.000Z,18,,18,,2019-01-03-8-240,260,2019-01-03,8,310,2.0,1.0,3.0,2.6,,1.5,0.0,0,0.0,0.0,1039.9,65.0,8.0,92,,5,0.0,0.0,0.0,0.0,0.0,2019-01-03 07:03:00+00:00
3,105#2019-01-04T07:33:00.000Z,2019-01-04 08:37:38.178122+01:00,105,2019-01-04 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-04 07:33:00+00:00,2019-01-04T07:34:13.000Z,18,,18,,2019-01-04-8-240,260,2019-01-04,8,270,3.0,2.0,6.0,4.6,,2.4,0.0,0,0.0,0.0,1039.1,64.0,8.0,85,,5,0.0,0.0,0.0,0.0,0.0,2019-01-04 07:03:00+00:00
4,105#2019-01-05T07:32:00.000Z,2019-01-05 08:40:55.071092+01:00,105,2019-01-05 07:32:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-05 07:32:00+00:00,2019-01-05T07:38:37.000Z,19,,19,,2019-01-05-8-240,260,2019-01-05,8,320,4.0,4.0,8.0,7.7,,4.4,0.0,0,0.0,0.0,1032.4,75.0,8.0,79,,5,0.0,0.0,0.0,0.0,0.0,2019-01-05 07:02:00+00:00


In [96]:
dfa = pd.merge(dfa,result_weather, on = 'WeatherKey', how='left')
dfa.head()

Unnamed: 0,RideInstance,ObservationTime,RideId,RideTime,DepartureStationCode,DepartureStationUIC,DepartureStationType,TrainId,TrainType,TrainOperator,DestinationStationCode,DestinationStationUIC,DestinationStationType,PlannedArrivalTime,ActualArrivalTime,PlannedArrivalPlatform,PlannedArrivalPlatformSuffix,ActualArrivalPlatform,ActualArrivalPlatformSuffix,WeatherKey,DepartureWeatherStationCode,DepartureTimestamp,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,DepartureTemperature,DepartureMinTemp10M,DepartureDewPointTemp,DepartureSunshineDur,DepartureRadiation,DeparturePrecipDur,DeparturePrecipHour,DepartureAirPressure,DepartureVisibility,DepartureCloudiness,DepartureHumidity,DepartureWeatherCode,DepartureWeatherCodeIndicator,DepartureFog,DepartureRain,DepartureSnow,DepartureThunder,DepartureIceFormation,PlannedDepartureTime,StationCode,Timestamp,Hour,WindDir,WindHour,WindSpeed,MaxWindSpeed,Temperature,MinTemp10M,DewPointTemp,SunshineDur,Radiation,PrecipDur,PrecipHour,AirPressure,Visibility,Cloudines,Humidity,WeatherCode,WeatherCodeIndicator,Fog,Rain,Snow,Thunder,IceFormation
0,104#2019-06-22T19:58:00.000Z,2019-06-22 21:16:20.243615+02:00,104,2019-06-22 19:58:00+00:00,UT,8400621,6,300104,ICE International,NS,ASD,8400058,6,2019-06-22 19:58:00+00:00,2019-06-22T19:58:00.000Z,7,b,7,b,2019-06-22-20-260,240,2019-06-22,20,50,6.0,7.0,10.0,18.3,,13.0,0.5,12,0.0,0.0,1020.5,75.0,0.0,71,,5,0.0,0.0,0.0,0.0,0.0,2019-06-22 19:28:00+00:00,260.0,2019-06-22,20.0,20.0,4.0,4.0,7.0,19.0,,12.2,0.3,11.0,0.0,0.0,1020.2,83.0,0.0,64.0,,5.0,0.0,0.0,0.0,0.0,0.0
1,105#2019-01-02T07:33:00.000Z,2019-01-02 08:14:53.961020+01:00,105,2019-01-02 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-02 07:33:00+00:00,2019-01-02T07:34:02.000Z,18,,18,,2019-01-02-8-240,260,2019-01-02,8,340,4.0,4.0,10.0,4.9,,1.9,0.0,0,0.1,0.1,1036.8,70.0,8.0,81,23.0,7,0.0,1.0,0.0,0.0,0.0,2019-01-02 07:03:00+00:00,240.0,2019-01-02,8.0,350.0,6.0,6.0,11.0,5.0,,2.3,0.0,0.0,0.0,-0.1,1037.2,66.0,7.0,83.0,81.0,7.0,0.0,1.0,0.0,0.0,0.0
2,105#2019-01-03T07:33:00.000Z,2019-01-03 08:49:03.881832+01:00,105,2019-01-03 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-03 07:33:00+00:00,2019-01-03T07:33:00.000Z,18,,18,,2019-01-03-8-240,260,2019-01-03,8,310,2.0,1.0,3.0,2.6,,1.5,0.0,0,0.0,0.0,1039.9,65.0,8.0,92,,5,0.0,0.0,0.0,0.0,0.0,2019-01-03 07:03:00+00:00,240.0,2019-01-03,8.0,310.0,3.0,2.0,4.0,1.8,,0.9,0.0,0.0,0.0,-0.1,1040.0,65.0,5.0,93.0,22.0,7.0,0.0,1.0,0.0,0.0,0.0
3,105#2019-01-04T07:33:00.000Z,2019-01-04 08:37:38.178122+01:00,105,2019-01-04 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-04 07:33:00+00:00,2019-01-04T07:34:13.000Z,18,,18,,2019-01-04-8-240,260,2019-01-04,8,270,3.0,2.0,6.0,4.6,,2.4,0.0,0,0.0,0.0,1039.1,64.0,8.0,85,,5,0.0,0.0,0.0,0.0,0.0,2019-01-04 07:03:00+00:00,240.0,2019-01-04,8.0,290.0,6.0,5.0,8.0,5.0,,2.3,0.0,0.0,0.0,0.0,1039.0,70.0,8.0,82.0,,5.0,0.0,0.0,0.0,0.0,0.0
4,105#2019-01-05T07:32:00.000Z,2019-01-05 08:40:55.071092+01:00,105,2019-01-05 07:32:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-05 07:32:00+00:00,2019-01-05T07:38:37.000Z,19,,19,,2019-01-05-8-240,260,2019-01-05,8,320,4.0,4.0,8.0,7.7,,4.4,0.0,0,0.0,0.0,1032.4,75.0,8.0,79,,5,0.0,0.0,0.0,0.0,0.0,2019-01-05 07:02:00+00:00,240.0,2019-01-05,8.0,320.0,8.0,8.0,12.0,7.8,,4.4,0.0,1.0,0.0,0.0,1032.8,75.0,8.0,78.0,,5.0,0.0,0.0,0.0,0.0,0.0


In [97]:
dfa = rename_weather('Destination', dfa)

In [98]:
dfa.head()

Unnamed: 0,RideInstance,ObservationTime,RideId,RideTime,DepartureStationCode,DepartureStationUIC,DepartureStationType,TrainId,TrainType,TrainOperator,DestinationStationCode,DestinationStationUIC,DestinationStationType,PlannedArrivalTime,ActualArrivalTime,PlannedArrivalPlatform,PlannedArrivalPlatformSuffix,ActualArrivalPlatform,ActualArrivalPlatformSuffix,WeatherKey,DepartureWeatherStationCode,DepartureTimestamp,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,DepartureTemperature,DepartureMinTemp10M,DepartureDewPointTemp,DepartureSunshineDur,DepartureRadiation,DeparturePrecipDur,DeparturePrecipHour,DepartureAirPressure,DepartureVisibility,DepartureCloudiness,DepartureHumidity,DepartureWeatherCode,DepartureWeatherCodeIndicator,DepartureFog,DepartureRain,DepartureSnow,DepartureThunder,DepartureIceFormation,PlannedDepartureTime,DestinationWeatherStationCode,DestinationTimestamp,DestinationHour,DestinationWindDir,DestinationWindHour,DestinationWindSpeed,DestinationMaxWindSpeed,DestinationTemperature,DestinationMinTemp10M,DestinationDewPointTemp,DestinationSunshineDur,DestinationRadiation,DestinationPrecipDur,DestinationPrecipHour,DestinationAirPressure,DestinationVisibility,DestinationCloudiness,DestinationHumidity,DestinationWeatherCode,DestinationWeatherCodeIndicator,DestinationFog,DestinationRain,DestinationSnow,DestinationThunder,DestinationIceFormation
0,104#2019-06-22T19:58:00.000Z,2019-06-22 21:16:20.243615+02:00,104,2019-06-22 19:58:00+00:00,UT,8400621,6,300104,ICE International,NS,ASD,8400058,6,2019-06-22 19:58:00+00:00,2019-06-22T19:58:00.000Z,7,b,7,b,2019-06-22-20-260,240,2019-06-22,20,50,6.0,7.0,10.0,18.3,,13.0,0.5,12,0.0,0.0,1020.5,75.0,0.0,71,,5,0.0,0.0,0.0,0.0,0.0,2019-06-22 19:28:00+00:00,260.0,2019-06-22,20.0,20.0,4.0,4.0,7.0,19.0,,12.2,0.3,11.0,0.0,0.0,1020.2,83.0,0.0,64.0,,5.0,0.0,0.0,0.0,0.0,0.0
1,105#2019-01-02T07:33:00.000Z,2019-01-02 08:14:53.961020+01:00,105,2019-01-02 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-02 07:33:00+00:00,2019-01-02T07:34:02.000Z,18,,18,,2019-01-02-8-240,260,2019-01-02,8,340,4.0,4.0,10.0,4.9,,1.9,0.0,0,0.1,0.1,1036.8,70.0,8.0,81,23.0,7,0.0,1.0,0.0,0.0,0.0,2019-01-02 07:03:00+00:00,240.0,2019-01-02,8.0,350.0,6.0,6.0,11.0,5.0,,2.3,0.0,0.0,0.0,-0.1,1037.2,66.0,7.0,83.0,81.0,7.0,0.0,1.0,0.0,0.0,0.0
2,105#2019-01-03T07:33:00.000Z,2019-01-03 08:49:03.881832+01:00,105,2019-01-03 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-03 07:33:00+00:00,2019-01-03T07:33:00.000Z,18,,18,,2019-01-03-8-240,260,2019-01-03,8,310,2.0,1.0,3.0,2.6,,1.5,0.0,0,0.0,0.0,1039.9,65.0,8.0,92,,5,0.0,0.0,0.0,0.0,0.0,2019-01-03 07:03:00+00:00,240.0,2019-01-03,8.0,310.0,3.0,2.0,4.0,1.8,,0.9,0.0,0.0,0.0,-0.1,1040.0,65.0,5.0,93.0,22.0,7.0,0.0,1.0,0.0,0.0,0.0
3,105#2019-01-04T07:33:00.000Z,2019-01-04 08:37:38.178122+01:00,105,2019-01-04 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-04 07:33:00+00:00,2019-01-04T07:34:13.000Z,18,,18,,2019-01-04-8-240,260,2019-01-04,8,270,3.0,2.0,6.0,4.6,,2.4,0.0,0,0.0,0.0,1039.1,64.0,8.0,85,,5,0.0,0.0,0.0,0.0,0.0,2019-01-04 07:03:00+00:00,240.0,2019-01-04,8.0,290.0,6.0,5.0,8.0,5.0,,2.3,0.0,0.0,0.0,0.0,1039.0,70.0,8.0,82.0,,5.0,0.0,0.0,0.0,0.0,0.0
4,105#2019-01-05T07:32:00.000Z,2019-01-05 08:40:55.071092+01:00,105,2019-01-05 07:32:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-05 07:32:00+00:00,2019-01-05T07:38:37.000Z,19,,19,,2019-01-05-8-240,260,2019-01-05,8,320,4.0,4.0,8.0,7.7,,4.4,0.0,0,0.0,0.0,1032.4,75.0,8.0,79,,5,0.0,0.0,0.0,0.0,0.0,2019-01-05 07:02:00+00:00,240.0,2019-01-05,8.0,320.0,8.0,8.0,12.0,7.8,,4.4,0.0,1.0,0.0,0.0,1032.8,75.0,8.0,78.0,,5.0,0.0,0.0,0.0,0.0,0.0


In [99]:
dfa.head()

Unnamed: 0,RideInstance,ObservationTime,RideId,RideTime,DepartureStationCode,DepartureStationUIC,DepartureStationType,TrainId,TrainType,TrainOperator,DestinationStationCode,DestinationStationUIC,DestinationStationType,PlannedArrivalTime,ActualArrivalTime,PlannedArrivalPlatform,PlannedArrivalPlatformSuffix,ActualArrivalPlatform,ActualArrivalPlatformSuffix,WeatherKey,DepartureWeatherStationCode,DepartureTimestamp,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,DepartureTemperature,DepartureMinTemp10M,DepartureDewPointTemp,DepartureSunshineDur,DepartureRadiation,DeparturePrecipDur,DeparturePrecipHour,DepartureAirPressure,DepartureVisibility,DepartureCloudiness,DepartureHumidity,DepartureWeatherCode,DepartureWeatherCodeIndicator,DepartureFog,DepartureRain,DepartureSnow,DepartureThunder,DepartureIceFormation,PlannedDepartureTime,DestinationWeatherStationCode,DestinationTimestamp,DestinationHour,DestinationWindDir,DestinationWindHour,DestinationWindSpeed,DestinationMaxWindSpeed,DestinationTemperature,DestinationMinTemp10M,DestinationDewPointTemp,DestinationSunshineDur,DestinationRadiation,DestinationPrecipDur,DestinationPrecipHour,DestinationAirPressure,DestinationVisibility,DestinationCloudiness,DestinationHumidity,DestinationWeatherCode,DestinationWeatherCodeIndicator,DestinationFog,DestinationRain,DestinationSnow,DestinationThunder,DestinationIceFormation
0,104#2019-06-22T19:58:00.000Z,2019-06-22 21:16:20.243615+02:00,104,2019-06-22 19:58:00+00:00,UT,8400621,6,300104,ICE International,NS,ASD,8400058,6,2019-06-22 19:58:00+00:00,2019-06-22T19:58:00.000Z,7,b,7,b,2019-06-22-20-260,240,2019-06-22,20,50,6.0,7.0,10.0,18.3,,13.0,0.5,12,0.0,0.0,1020.5,75.0,0.0,71,,5,0.0,0.0,0.0,0.0,0.0,2019-06-22 19:28:00+00:00,260.0,2019-06-22,20.0,20.0,4.0,4.0,7.0,19.0,,12.2,0.3,11.0,0.0,0.0,1020.2,83.0,0.0,64.0,,5.0,0.0,0.0,0.0,0.0,0.0
1,105#2019-01-02T07:33:00.000Z,2019-01-02 08:14:53.961020+01:00,105,2019-01-02 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-02 07:33:00+00:00,2019-01-02T07:34:02.000Z,18,,18,,2019-01-02-8-240,260,2019-01-02,8,340,4.0,4.0,10.0,4.9,,1.9,0.0,0,0.1,0.1,1036.8,70.0,8.0,81,23.0,7,0.0,1.0,0.0,0.0,0.0,2019-01-02 07:03:00+00:00,240.0,2019-01-02,8.0,350.0,6.0,6.0,11.0,5.0,,2.3,0.0,0.0,0.0,-0.1,1037.2,66.0,7.0,83.0,81.0,7.0,0.0,1.0,0.0,0.0,0.0
2,105#2019-01-03T07:33:00.000Z,2019-01-03 08:49:03.881832+01:00,105,2019-01-03 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-03 07:33:00+00:00,2019-01-03T07:33:00.000Z,18,,18,,2019-01-03-8-240,260,2019-01-03,8,310,2.0,1.0,3.0,2.6,,1.5,0.0,0,0.0,0.0,1039.9,65.0,8.0,92,,5,0.0,0.0,0.0,0.0,0.0,2019-01-03 07:03:00+00:00,240.0,2019-01-03,8.0,310.0,3.0,2.0,4.0,1.8,,0.9,0.0,0.0,0.0,-0.1,1040.0,65.0,5.0,93.0,22.0,7.0,0.0,1.0,0.0,0.0,0.0
3,105#2019-01-04T07:33:00.000Z,2019-01-04 08:37:38.178122+01:00,105,2019-01-04 07:33:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-04 07:33:00+00:00,2019-01-04T07:34:13.000Z,18,,18,,2019-01-04-8-240,260,2019-01-04,8,270,3.0,2.0,6.0,4.6,,2.4,0.0,0,0.0,0.0,1039.1,64.0,8.0,85,,5,0.0,0.0,0.0,0.0,0.0,2019-01-04 07:03:00+00:00,240.0,2019-01-04,8.0,290.0,6.0,5.0,8.0,5.0,,2.3,0.0,0.0,0.0,0.0,1039.0,70.0,8.0,82.0,,5.0,0.0,0.0,0.0,0.0,0.0
4,105#2019-01-05T07:32:00.000Z,2019-01-05 08:40:55.071092+01:00,105,2019-01-05 07:32:00+00:00,ASD,8400058,6,105,ICE International,NS,UT,8400621,6,2019-01-05 07:32:00+00:00,2019-01-05T07:38:37.000Z,19,,19,,2019-01-05-8-240,260,2019-01-05,8,320,4.0,4.0,8.0,7.7,,4.4,0.0,0,0.0,0.0,1032.4,75.0,8.0,79,,5,0.0,0.0,0.0,0.0,0.0,2019-01-05 07:02:00+00:00,240.0,2019-01-05,8.0,320.0,8.0,8.0,12.0,7.8,,4.4,0.0,1.0,0.0,0.0,1032.8,75.0,8.0,78.0,,5.0,0.0,0.0,0.0,0.0,0.0


In [100]:
# save the dataset with the weate
dfa.to_csv('../assets/data/2019 UT-ASD/2019-parsed-data.csv')