In [112]:
import pandas as pd
import numpy as np
import seaborn as sns
import xml.etree.ElementTree as ET

from functools import reduce
from datetime import datetime, timedelta

pd.set_option('display.max_columns', None)
%config IPCompleter.greedy=True

In [113]:
# load all data time chunks
df = pd.read_csv('../assets/data/2019-UT-ASD-Full/2019-UT-ASD-scrapped.csv', parse_dates=['PlannedDepartureTime',
                                                                                          'PlannedArrivalTime',
                                                                                          'ActualArrivalTime',
                                                                                          'ActualDepartureTime'
                                                                                         ])

In [114]:
df.shape

(94963, 20)

In [115]:
df.head()

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00
2,2019-01-01,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,15,2a,VIRM-6 8656;VIRM-6 8730,VIRM-6 8656;VIRM-6 8730,Amsterdam Bijlmer ArenA,,2019-01-01#1409,2019-01-01 02:17:00,2019-01-01 02:44:00,2019-01-01 02:45:00,2019-01-01 02:17:00
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30
4,2019-01-01,1413,1413,Utrecht Centraal,03:11,0.0,Amsterdam Centraal,03:44,0.0,14,2a,VIRM-4 9504;VIRM-6 8648,VIRM-4 9504;VIRM-6 8648,,,2019-01-01#1413,2019-01-01 03:11:00,2019-01-01 03:44:00,2019-01-01 03:44:00,2019-01-01 03:11:00


# Merge data with weather

In [116]:
# if 'RideInstance' in dfa:
#     del dfa['RideInstance']
# dfa.insert(0, 'RideInstance', dfa.RideId.astype(str) + '#' + dfa.RideTime )

In [117]:
dfa_weather_U = pd.read_csv('../assets/data/de_bilt_weather_2019.csv')  
dfa_weather_A = pd.read_csv('../assets/data/schiphol_weather_2019.csv') 

In [118]:
dfa_weather_A['WeatherKey'] = dfa_weather_A['Timestamp'].astype(str) + '-' + dfa_weather_A['Hour'].astype(str) + '-' + dfa_weather_A['StationCode'].astype(str)
dfa_weather_U['WeatherKey'] = dfa_weather_U['Timestamp'].astype(str) + '-' + dfa_weather_U['Hour'].astype(str)+ '-'+ dfa_weather_U['StationCode'].astype(str)

In [119]:
print(dfa_weather_A.head())
print(dfa_weather_U.head())

   StationCode   Timestamp  Hour  WindDir  WindHour  WindSpeed  MaxWindSpeed  \
0          240  2019-01-01     1      260       7.0        6.0          10.0   
1          240  2019-01-01     2      260       7.0        7.0          10.0   
2          240  2019-01-01     3      250       7.0        7.0          11.0   
3          240  2019-01-01     4      250       7.0        8.0          11.0   
4          240  2019-01-01     5      260       9.0        9.0          12.0   

   Temperature  MinTemp10M  DewPointTemp  SunshineDur  Radiation  PrecipDur  \
0          8.5         NaN           5.7          0.0          0        0.0   
1          8.6         NaN           5.1          0.0          0        0.0   
2          8.5         NaN           5.1          0.0          0        0.0   
3          8.2         NaN           5.4          0.0          0        0.0   
4          8.7         NaN           5.8          0.0          0        0.0   

   PrecipHour  AirPressure  Visibility  Clou

In [120]:
def add_uic_code(date, destination, weather_station):
    if not pd.isnull(date):
        weather_key = f'{date.strftime("%Y-%m-%d")}-{int(date.hour) + 1}'
        if destination == 'Amsterdam Centraal':
            return weather_key + f'-{weather_station[0]}'
        if destination == '	Utrecht Centraal':
            return weather_key + f'-{weather_station[1]}'
    else:
        return None

In [121]:
# this function renames the weather columns with a specific prefix
def rename_weather(suffix, df):
    return df.rename(columns = {
        'StationCode' : f'{suffix}WeatherStationCode',
        'Timestamp' : f'{suffix}Timestamp',  # date (YYYY=year,MM=month,DD=day)
        'Hour' : f'{suffix}Hour' ,  # time (HH uur/hour, UT. 12 UT=13 MET, 14 MEZT. Hourly division 05 runs from 04.00 UT to 5.00 UT
        'WindDir' : f'{suffix}WindDir' ,  # Mean wind direction (in degrees) during the 10-minute period preceding the time of observation (360=north, 90=east, 180=south, 270=west, 0=calm 990=variable)
        'WindHour' : f'{suffix}WindHour' ,  # Hourly mean wind speed (in 0.1 m/s)
        'WindSpeed' : f'{suffix}WindSpeed' , # Mean wind speed (in 0.1 m/s) during the 10-minute period preceding the time of observation  
        'MaxWindSpeed' : f'{suffix}MaxWindSpeed' ,  # Maximum wind gust (in 0.1 m/s) during the hourly division
        'Temperature' : f'{suffix}Temperature' ,  # Temperature (in 0.1 degrees Celsius) at 1.50 m at the time of observation  
        'MinTemp10M' : f'{suffix}MinTemp10M' ,  # Minimum temperature (in 0.1 degrees Celsius) at 0.1 m in the preceding 6-hour period
        'DewPointTemp' : f'{suffix}DewPointTemp' ,  # Dew point temperature (in 0.1 degrees Celsius) at 1.50 m at the time of observation 
        'SunshineDur' : f'{suffix}SunshineDur' ,  # Sunshine duration (in 0.1 hour) during the hourly division, calculated from global radiation (-1 for <0.05 hour) 
        'Radiation' : f'{suffix}Radiation' ,  # Global radiation (in J/cm2) during the hourly division    
        'PrecipDur' : f'{suffix}PrecipDur',  # Precipitation duration (in 0.1 hour) during the hourly division
        'PrecipHour' : f'{suffix}PrecipHour',  # Hourly precipitation amount (in 0.1 mm) (-1 for <0.05 mm)
        'AirPressure' : f'{suffix}AirPressure',  # Air pressure (in 0.1 hPa) reduced to mean sea level, at the time of observation 
        'Visibility' : f'{suffix}Visibility',  # Horizontal visibility at the time of observation (0=less than 100m, 1=100-200m, 2=200-300m,..., 49=4900-5000m, 50=5-6km, 56=6-7km, 57=7-8km, ..., 79=29-30km, 80=30-35km, 81=35-40km,..., 89=more than 70km)
        'Cloudines' : f'{suffix}Cloudiness',  # Cloud cover (in octants), at the time of observation (9=sky invisible)
        'Humidity': f'{suffix}Humidity',  # Relative atmospheric humidity (in percents) at 1.50 m at the time of observation
        'WeatherCode' : f'{suffix}WeatherCode',  # Present weather code (00-99), description for the hourly division. (http://bibliotheek.knmi.nl/scholierenpdf/weercodes_Nederland)
        'WeatherCodeIndicator': f'{suffix}WeatherCodeIndicator',  # Indicator present weather code (1=manned and recorded (using code from visual observations), 2,3=manned and omitted (no significant weather phenomenon to report, not available), 4=automatically recorded (using code from visual observations), 5,6=automatically omitted (no significant weather phenomenon to report, not available), 7=automatically set (using code from automated observations) 
        'Fog' : f'{suffix}Fog',  # Fog 0=no occurrence, 1=occurred during the preceding hour and/or at the time of observation
        'Rain' : f'{suffix}Rain',  # Rainfall 0=no occurrence, 1=occurred during the preceding hour and/or at the time of observation
        'Snow' : f'{suffix}Snow',  # Snow 0=no occurrence, 1=occurred during the preceding hour and/or at the time of observation
        'Thunder' : f'{suffix}Thunder',  # Thunder  0=no occurrence, 1=occurred during the preceding hour and/or at the time of observation 
        'IceFormation' : f'{suffix}IceFormation'  # Ice formation 0=no occurrence, 1=occurred during the preceding hour and/or at the time of observation
    })

In [122]:
# df['WeatherKey'] = df['PlannedArrivalTime'].dt.strftime('%Y-%m-%d') + '-' + (df['PlannedArrivalTime'].dt.hour.astype(int) + 1).astype(str)

In [123]:
df['WeatherKey'] = df.apply(lambda row: add_uic_code(row['PlannedDepartureTime'], row['DestinationStation'], [260, 240]), axis=1)

In [124]:
df.head()

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime,WeatherKey
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00,2018-12-31-2-260
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00,
2,2019-01-01,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,15,2a,VIRM-6 8656;VIRM-6 8730,VIRM-6 8656;VIRM-6 8730,Amsterdam Bijlmer ArenA,,2019-01-01#1409,2019-01-01 02:17:00,2019-01-01 02:44:00,2019-01-01 02:45:00,2019-01-01 02:17:00,2019-01-01-3-260
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30,
4,2019-01-01,1413,1413,Utrecht Centraal,03:11,0.0,Amsterdam Centraal,03:44,0.0,14,2a,VIRM-4 9504;VIRM-6 8648,VIRM-4 9504;VIRM-6 8648,,,2019-01-01#1413,2019-01-01 03:11:00,2019-01-01 03:44:00,2019-01-01 03:44:00,2019-01-01 03:11:00,2019-01-01-4-260


In [125]:
 df.sort_values('PlannedArrivalTime').head()

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime,WeatherKey
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00,2018-12-31-2-260
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00,
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30,
200,2019-01-01,3091,3091,Amsterdam Centraal,00:24,0.0,Utrecht Centraal,00:52,0.0,4a,19,VIRM-6 8662,VIRM-6 8662,Amsterdam Amstel,,2019-01-01#3091,2019-01-01 00:24:00,2019-01-01 00:52:00,2019-01-01 00:52:00,2019-01-01 00:24:00,
199,2019-01-01,2986,2986,Utrecht Centraal,00:23,0.0,Amsterdam Centraal,00:52,0.0,7,10a,VIRM-4 9563,,Amsterdam Bijlmer ArenA;Amsterdam Amstel,,2019-01-01#2986,2019-01-01 00:23:00,2019-01-01 00:52:00,2019-01-01 00:52:00,2019-01-01 00:23:00,2019-01-01-1-260


In [126]:
# combine weather in one big weather dataset
result_weather = pd.concat([dfa_weather_A, dfa_weather_U])

In [127]:
result_weather.head()

Unnamed: 0,StationCode,Timestamp,Hour,WindDir,WindHour,WindSpeed,MaxWindSpeed,Temperature,MinTemp10M,DewPointTemp,SunshineDur,Radiation,PrecipDur,PrecipHour,AirPressure,Visibility,Cloudines,Humidity,WeatherCode,WeatherCodeIndicator,Fog,Rain,Snow,Thunder,IceFormation,WeatherKey
0,240,2019-01-01,1,260,7.0,6.0,10.0,8.5,,5.7,0.0,0,0.0,0.0,1030.6,69.0,8.0,82,,5,0.0,0.0,0.0,0.0,0.0,2019-01-01-1-240
1,240,2019-01-01,2,260,7.0,7.0,10.0,8.6,,5.1,0.0,0,0.0,0.0,1030.1,75.0,8.0,78,,5,0.0,0.0,0.0,0.0,0.0,2019-01-01-2-240
2,240,2019-01-01,3,250,7.0,7.0,11.0,8.5,,5.1,0.0,0,0.0,0.0,1029.5,75.0,8.0,79,,5,0.0,0.0,0.0,0.0,0.0,2019-01-01-3-240
3,240,2019-01-01,4,250,7.0,8.0,11.0,8.2,,5.4,0.0,0,0.0,0.0,1029.0,70.0,8.0,82,,5,0.0,0.0,0.0,0.0,0.0,2019-01-01-4-240
4,240,2019-01-01,5,260,9.0,9.0,12.0,8.7,,5.8,0.0,0,0.0,-0.1,1028.3,70.0,8.0,81,22.0,7,0.0,1.0,0.0,0.0,0.0,2019-01-01-5-240


In [128]:
# merge weather with train data
df = pd.merge(df,result_weather, on = 'WeatherKey', how='left')
df.head()

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime,WeatherKey,StationCode,Timestamp,Hour,WindDir,WindHour,WindSpeed,MaxWindSpeed,Temperature,MinTemp10M,DewPointTemp,SunshineDur,Radiation,PrecipDur,PrecipHour,AirPressure,Visibility,Cloudines,Humidity,WeatherCode,WeatherCodeIndicator,Fog,Rain,Snow,Thunder,IceFormation
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00,2018-12-31-2-260,,,,,,,,,,,,,,,,,,,,,,,,,
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2019-01-01,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,15,2a,VIRM-6 8656;VIRM-6 8730,VIRM-6 8656;VIRM-6 8730,Amsterdam Bijlmer ArenA,,2019-01-01#1409,2019-01-01 02:17:00,2019-01-01 02:44:00,2019-01-01 02:45:00,2019-01-01 02:17:00,2019-01-01-3-260,260.0,2019-01-01,3.0,250.0,4.0,4.0,9.0,8.4,,5.7,0.0,0.0,0.0,0.0,1030.1,57.0,8.0,83.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2019-01-01,1413,1413,Utrecht Centraal,03:11,0.0,Amsterdam Centraal,03:44,0.0,14,2a,VIRM-4 9504;VIRM-6 8648,VIRM-4 9504;VIRM-6 8648,,,2019-01-01#1413,2019-01-01 03:11:00,2019-01-01 03:44:00,2019-01-01 03:44:00,2019-01-01 03:11:00,2019-01-01-4-260,260.0,2019-01-01,4.0,250.0,4.0,4.0,8.0,8.2,,5.6,0.0,0.0,0.0,0.0,1029.3,62.0,8.0,83.0,,5.0,0.0,0.0,0.0,0.0,0.0


In [129]:
df.iloc[10].to_frame()

Unnamed: 0,10
Date,2019-01-01
RideId,1425
TrainId,1425
DepartureStation,Utrecht Centraal
DepartureTime,06:05
DepartureDelay,0
DestinationStation,Amsterdam Centraal
ArrivalTime,06:39
ArrivalDelay,0
DeparturePlatform,5


In [130]:
df = rename_weather('Departure', df)

In [131]:
df.head()

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime,WeatherKey,DepartureWeatherStationCode,DepartureTimestamp,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,DepartureTemperature,DepartureMinTemp10M,DepartureDewPointTemp,DepartureSunshineDur,DepartureRadiation,DeparturePrecipDur,DeparturePrecipHour,DepartureAirPressure,DepartureVisibility,DepartureCloudiness,DepartureHumidity,DepartureWeatherCode,DepartureWeatherCodeIndicator,DepartureFog,DepartureRain,DepartureSnow,DepartureThunder,DepartureIceFormation
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00,2018-12-31-2-260,,,,,,,,,,,,,,,,,,,,,,,,,
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2019-01-01,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,15,2a,VIRM-6 8656;VIRM-6 8730,VIRM-6 8656;VIRM-6 8730,Amsterdam Bijlmer ArenA,,2019-01-01#1409,2019-01-01 02:17:00,2019-01-01 02:44:00,2019-01-01 02:45:00,2019-01-01 02:17:00,2019-01-01-3-260,260.0,2019-01-01,3.0,250.0,4.0,4.0,9.0,8.4,,5.7,0.0,0.0,0.0,0.0,1030.1,57.0,8.0,83.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2019-01-01,1413,1413,Utrecht Centraal,03:11,0.0,Amsterdam Centraal,03:44,0.0,14,2a,VIRM-4 9504;VIRM-6 8648,VIRM-4 9504;VIRM-6 8648,,,2019-01-01#1413,2019-01-01 03:11:00,2019-01-01 03:44:00,2019-01-01 03:44:00,2019-01-01 03:11:00,2019-01-01-4-260,260.0,2019-01-01,4.0,250.0,4.0,4.0,8.0,8.2,,5.6,0.0,0.0,0.0,0.0,1029.3,62.0,8.0,83.0,,5.0,0.0,0.0,0.0,0.0,0.0


# Create the departure time field

In [132]:
# create weather key for departure
df['WeatherKey'] = df.apply(lambda row: add_uic_code(row['PlannedArrivalTime'], row['DestinationStation'], [240, 260]), axis=1)

In [134]:
df.head()

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime,WeatherKey,DepartureWeatherStationCode,DepartureTimestamp,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,DepartureTemperature,DepartureMinTemp10M,DepartureDewPointTemp,DepartureSunshineDur,DepartureRadiation,DeparturePrecipDur,DeparturePrecipHour,DepartureAirPressure,DepartureVisibility,DepartureCloudiness,DepartureHumidity,DepartureWeatherCode,DepartureWeatherCodeIndicator,DepartureFog,DepartureRain,DepartureSnow,DepartureThunder,DepartureIceFormation
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00,2018-12-31-2-240,,,,,,,,,,,,,,,,,,,,,,,,,
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2019-01-01,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,15,2a,VIRM-6 8656;VIRM-6 8730,VIRM-6 8656;VIRM-6 8730,Amsterdam Bijlmer ArenA,,2019-01-01#1409,2019-01-01 02:17:00,2019-01-01 02:44:00,2019-01-01 02:45:00,2019-01-01 02:17:00,2019-01-01-3-240,260.0,2019-01-01,3.0,250.0,4.0,4.0,9.0,8.4,,5.7,0.0,0.0,0.0,0.0,1030.1,57.0,8.0,83.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2019-01-01,1413,1413,Utrecht Centraal,03:11,0.0,Amsterdam Centraal,03:44,0.0,14,2a,VIRM-4 9504;VIRM-6 8648,VIRM-4 9504;VIRM-6 8648,,,2019-01-01#1413,2019-01-01 03:11:00,2019-01-01 03:44:00,2019-01-01 03:44:00,2019-01-01 03:11:00,2019-01-01-4-240,260.0,2019-01-01,4.0,250.0,4.0,4.0,8.0,8.2,,5.6,0.0,0.0,0.0,0.0,1029.3,62.0,8.0,83.0,,5.0,0.0,0.0,0.0,0.0,0.0


In [136]:
df = pd.merge(df, result_weather, on = 'WeatherKey', how='left')
df.head()

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime,WeatherKey,DepartureWeatherStationCode,DepartureTimestamp,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,DepartureTemperature,DepartureMinTemp10M,DepartureDewPointTemp,DepartureSunshineDur,DepartureRadiation,DeparturePrecipDur,DeparturePrecipHour,DepartureAirPressure,DepartureVisibility,DepartureCloudiness,DepartureHumidity,DepartureWeatherCode,DepartureWeatherCodeIndicator,DepartureFog,DepartureRain,DepartureSnow,DepartureThunder,DepartureIceFormation,StationCode,Timestamp,Hour,WindDir,WindHour,WindSpeed,MaxWindSpeed,Temperature,MinTemp10M,DewPointTemp,SunshineDur,Radiation,PrecipDur,PrecipHour,AirPressure,Visibility,Cloudines,Humidity,WeatherCode,WeatherCodeIndicator,Fog,Rain,Snow,Thunder,IceFormation
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00,2018-12-31-2-240,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2019-01-01,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,15,2a,VIRM-6 8656;VIRM-6 8730,VIRM-6 8656;VIRM-6 8730,Amsterdam Bijlmer ArenA,,2019-01-01#1409,2019-01-01 02:17:00,2019-01-01 02:44:00,2019-01-01 02:45:00,2019-01-01 02:17:00,2019-01-01-3-240,260.0,2019-01-01,3.0,250.0,4.0,4.0,9.0,8.4,,5.7,0.0,0.0,0.0,0.0,1030.1,57.0,8.0,83.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-01-01,3.0,250.0,7.0,7.0,11.0,8.5,,5.1,0.0,0.0,0.0,0.0,1029.5,75.0,8.0,79.0,,5.0,0.0,0.0,0.0,0.0,0.0
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2019-01-01,1413,1413,Utrecht Centraal,03:11,0.0,Amsterdam Centraal,03:44,0.0,14,2a,VIRM-4 9504;VIRM-6 8648,VIRM-4 9504;VIRM-6 8648,,,2019-01-01#1413,2019-01-01 03:11:00,2019-01-01 03:44:00,2019-01-01 03:44:00,2019-01-01 03:11:00,2019-01-01-4-240,260.0,2019-01-01,4.0,250.0,4.0,4.0,8.0,8.2,,5.6,0.0,0.0,0.0,0.0,1029.3,62.0,8.0,83.0,,5.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-01-01,4.0,250.0,7.0,8.0,11.0,8.2,,5.4,0.0,0.0,0.0,0.0,1029.0,70.0,8.0,82.0,,5.0,0.0,0.0,0.0,0.0,0.0


In [137]:
df = rename_weather('Destination', df)

In [139]:
df.head()

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime,WeatherKey,DepartureWeatherStationCode,DepartureTimestamp,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,DepartureTemperature,DepartureMinTemp10M,DepartureDewPointTemp,DepartureSunshineDur,DepartureRadiation,DeparturePrecipDur,DeparturePrecipHour,DepartureAirPressure,DepartureVisibility,DepartureCloudiness,DepartureHumidity,DepartureWeatherCode,DepartureWeatherCodeIndicator,DepartureFog,DepartureRain,DepartureSnow,DepartureThunder,DepartureIceFormation,DestinationWeatherStationCode,DestinationTimestamp,DestinationHour,DestinationWindDir,DestinationWindHour,DestinationWindSpeed,DestinationMaxWindSpeed,DestinationTemperature,DestinationMinTemp10M,DestinationDewPointTemp,DestinationSunshineDur,DestinationRadiation,DestinationPrecipDur,DestinationPrecipHour,DestinationAirPressure,DestinationVisibility,DestinationCloudiness,DestinationHumidity,DestinationWeatherCode,DestinationWeatherCodeIndicator,DestinationFog,DestinationRain,DestinationSnow,DestinationThunder,DestinationIceFormation
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00,2018-12-31-2-240,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2019-01-01,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,15,2a,VIRM-6 8656;VIRM-6 8730,VIRM-6 8656;VIRM-6 8730,Amsterdam Bijlmer ArenA,,2019-01-01#1409,2019-01-01 02:17:00,2019-01-01 02:44:00,2019-01-01 02:45:00,2019-01-01 02:17:00,2019-01-01-3-240,260.0,2019-01-01,3.0,250.0,4.0,4.0,9.0,8.4,,5.7,0.0,0.0,0.0,0.0,1030.1,57.0,8.0,83.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-01-01,3.0,250.0,7.0,7.0,11.0,8.5,,5.1,0.0,0.0,0.0,0.0,1029.5,75.0,8.0,79.0,,5.0,0.0,0.0,0.0,0.0,0.0
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2019-01-01,1413,1413,Utrecht Centraal,03:11,0.0,Amsterdam Centraal,03:44,0.0,14,2a,VIRM-4 9504;VIRM-6 8648,VIRM-4 9504;VIRM-6 8648,,,2019-01-01#1413,2019-01-01 03:11:00,2019-01-01 03:44:00,2019-01-01 03:44:00,2019-01-01 03:11:00,2019-01-01-4-240,260.0,2019-01-01,4.0,250.0,4.0,4.0,8.0,8.2,,5.6,0.0,0.0,0.0,0.0,1029.3,62.0,8.0,83.0,,5.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-01-01,4.0,250.0,7.0,8.0,11.0,8.2,,5.4,0.0,0.0,0.0,0.0,1029.0,70.0,8.0,82.0,,5.0,0.0,0.0,0.0,0.0,0.0


In [140]:
# save the dataset with the weate
df.to_csv('../assets/data/2019-UT-ASD-Full/2019-UT-ASD-scrapped.csv', index=None)