In [205]:
import pandas as pd
import numpy as np
import seaborn as sns
import xml.etree.ElementTree as ET

from functools import reduce
from datetime import datetime, timedelta

pd.set_option('display.max_columns', None)
%config IPCompleter.greedy=True

In [206]:
# load all data time chunks
df = pd.read_csv('../assets/data/2019-UT-ASD-Full/2019-UT-ASD-scrapped.csv', parse_dates=['PlannedDepartureTime',
                                                                                          'PlannedArrivalTime',
                                                                                          'ActualArrivalTime',
                                                                                          'ActualDepartureTime'
                                                                                         ])

In [207]:
df.shape

(94963, 20)

In [208]:
df.head()

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00
2,2019-01-01,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,15,2a,VIRM-6 8656;VIRM-6 8730,VIRM-6 8656;VIRM-6 8730,Amsterdam Bijlmer ArenA,,2019-01-01#1409,2019-01-01 02:17:00,2019-01-01 02:44:00,2019-01-01 02:45:00,2019-01-01 02:17:00
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30
4,2019-01-01,1413,1413,Utrecht Centraal,03:11,0.0,Amsterdam Centraal,03:44,0.0,14,2a,VIRM-4 9504;VIRM-6 8648,VIRM-4 9504;VIRM-6 8648,,,2019-01-01#1413,2019-01-01 03:11:00,2019-01-01 03:44:00,2019-01-01 03:44:00,2019-01-01 03:11:00


# Merge data with weather

In [209]:
# if 'RideInstance' in dfa:
#     del dfa['RideInstance']
# dfa.insert(0, 'RideInstance', dfa.RideId.astype(str) + '#' + dfa.RideTime )

In [210]:
dfa_weather_U = pd.read_csv('../assets/data/de_bilt_weather_2019.csv')  
dfa_weather_A = pd.read_csv('../assets/data/schiphol_weather_2019.csv') 

In [211]:
dfa_weather_A['WeatherKey'] = dfa_weather_A['Timestamp'].astype(str) + '-' + dfa_weather_A['Hour'].astype(str) + '-' + dfa_weather_A['StationCode'].astype(str)
dfa_weather_U['WeatherKey'] = dfa_weather_U['Timestamp'].astype(str) + '-' + dfa_weather_U['Hour'].astype(str)+ '-'+ dfa_weather_U['StationCode'].astype(str)

In [212]:
print(dfa_weather_A.head())
print(dfa_weather_U.head())

   StationCode   Timestamp  Hour  WindDir  WindHour  WindSpeed  MaxWindSpeed  \
0          240  2019-01-01     1      260       7.0        6.0          10.0   
1          240  2019-01-01     2      260       7.0        7.0          10.0   
2          240  2019-01-01     3      250       7.0        7.0          11.0   
3          240  2019-01-01     4      250       7.0        8.0          11.0   
4          240  2019-01-01     5      260       9.0        9.0          12.0   

   Temperature  MinTemp10M  DewPointTemp  SunshineDur  Radiation  PrecipDur  \
0          8.5         NaN           5.7          0.0          0        0.0   
1          8.6         NaN           5.1          0.0          0        0.0   
2          8.5         NaN           5.1          0.0          0        0.0   
3          8.2         NaN           5.4          0.0          0        0.0   
4          8.7         NaN           5.8          0.0          0        0.0   

   PrecipHour  AirPressure  Visibility  Clou

In [213]:
def add_uic_code(date, destination, weather_station):
    if not pd.isnull(date):
        weather_key = f'{date.strftime("%Y-%m-%d")}-{int(date.hour) + 1}'
        if destination == 'Amsterdam Centraal':
            return weather_key + f'-{weather_station[0]}'
        if destination == 'Utrecht Centraal':
            return weather_key + f'-{weather_station[1]}'
    else:
        return None

In [214]:
# this function renames the weather columns with a specific prefix
def rename_weather(suffix, df):
    return df.rename(columns = {
        'StationCode' : f'{suffix}WeatherStationCode',
        'Timestamp' : f'{suffix}Timestamp',  # date (YYYY=year,MM=month,DD=day)
        'Hour' : f'{suffix}Hour' ,  # time (HH uur/hour, UT. 12 UT=13 MET, 14 MEZT. Hourly division 05 runs from 04.00 UT to 5.00 UT
        'WindDir' : f'{suffix}WindDir' ,  # Mean wind direction (in degrees) during the 10-minute period preceding the time of observation (360=north, 90=east, 180=south, 270=west, 0=calm 990=variable)
        'WindHour' : f'{suffix}WindHour' ,  # Hourly mean wind speed (in 0.1 m/s)
        'WindSpeed' : f'{suffix}WindSpeed' , # Mean wind speed (in 0.1 m/s) during the 10-minute period preceding the time of observation  
        'MaxWindSpeed' : f'{suffix}MaxWindSpeed' ,  # Maximum wind gust (in 0.1 m/s) during the hourly division
        'Temperature' : f'{suffix}Temperature' ,  # Temperature (in 0.1 degrees Celsius) at 1.50 m at the time of observation  
        'MinTemp10M' : f'{suffix}MinTemp10M' ,  # Minimum temperature (in 0.1 degrees Celsius) at 0.1 m in the preceding 6-hour period
        'DewPointTemp' : f'{suffix}DewPointTemp' ,  # Dew point temperature (in 0.1 degrees Celsius) at 1.50 m at the time of observation 
        'SunshineDur' : f'{suffix}SunshineDur' ,  # Sunshine duration (in 0.1 hour) during the hourly division, calculated from global radiation (-1 for <0.05 hour) 
        'Radiation' : f'{suffix}Radiation' ,  # Global radiation (in J/cm2) during the hourly division    
        'PrecipDur' : f'{suffix}PrecipDur',  # Precipitation duration (in 0.1 hour) during the hourly division
        'PrecipHour' : f'{suffix}PrecipHour',  # Hourly precipitation amount (in 0.1 mm) (-1 for <0.05 mm)
        'AirPressure' : f'{suffix}AirPressure',  # Air pressure (in 0.1 hPa) reduced to mean sea level, at the time of observation 
        'Visibility' : f'{suffix}Visibility',  # Horizontal visibility at the time of observation (0=less than 100m, 1=100-200m, 2=200-300m,..., 49=4900-5000m, 50=5-6km, 56=6-7km, 57=7-8km, ..., 79=29-30km, 80=30-35km, 81=35-40km,..., 89=more than 70km)
        'Cloudines' : f'{suffix}Cloudiness',  # Cloud cover (in octants), at the time of observation (9=sky invisible)
        'Humidity': f'{suffix}Humidity',  # Relative atmospheric humidity (in percents) at 1.50 m at the time of observation
        'WeatherCode' : f'{suffix}WeatherCode',  # Present weather code (00-99), description for the hourly division. (http://bibliotheek.knmi.nl/scholierenpdf/weercodes_Nederland)
        'WeatherCodeIndicator': f'{suffix}WeatherCodeIndicator',  # Indicator present weather code (1=manned and recorded (using code from visual observations), 2,3=manned and omitted (no significant weather phenomenon to report, not available), 4=automatically recorded (using code from visual observations), 5,6=automatically omitted (no significant weather phenomenon to report, not available), 7=automatically set (using code from automated observations) 
        'Fog' : f'{suffix}Fog',  # Fog 0=no occurrence, 1=occurred during the preceding hour and/or at the time of observation
        'Rain' : f'{suffix}Rain',  # Rainfall 0=no occurrence, 1=occurred during the preceding hour and/or at the time of observation
        'Snow' : f'{suffix}Snow',  # Snow 0=no occurrence, 1=occurred during the preceding hour and/or at the time of observation
        'Thunder' : f'{suffix}Thunder',  # Thunder  0=no occurrence, 1=occurred during the preceding hour and/or at the time of observation 
        'IceFormation' : f'{suffix}IceFormation'  # Ice formation 0=no occurrence, 1=occurred during the preceding hour and/or at the time of observation
    })

In [215]:
# df['WeatherKey'] = df['PlannedArrivalTime'].dt.strftime('%Y-%m-%d') + '-' + (df['PlannedArrivalTime'].dt.hour.astype(int) + 1).astype(str)

In [216]:
df['WeatherKey'] = df.apply(lambda row: add_uic_code(row['PlannedDepartureTime'], row['DestinationStation'], [260, 240]), axis=1)

In [217]:
df.head()

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime,WeatherKey
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00,2018-12-31-2-260
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00,2018-12-31-2-240
2,2019-01-01,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,15,2a,VIRM-6 8656;VIRM-6 8730,VIRM-6 8656;VIRM-6 8730,Amsterdam Bijlmer ArenA,,2019-01-01#1409,2019-01-01 02:17:00,2019-01-01 02:44:00,2019-01-01 02:45:00,2019-01-01 02:17:00,2019-01-01-3-260
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30,2018-12-31-3-240
4,2019-01-01,1413,1413,Utrecht Centraal,03:11,0.0,Amsterdam Centraal,03:44,0.0,14,2a,VIRM-4 9504;VIRM-6 8648,VIRM-4 9504;VIRM-6 8648,,,2019-01-01#1413,2019-01-01 03:11:00,2019-01-01 03:44:00,2019-01-01 03:44:00,2019-01-01 03:11:00,2019-01-01-4-260


In [218]:
 df.sort_values('PlannedArrivalTime').head()

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime,WeatherKey
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00,2018-12-31-2-260
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00,2018-12-31-2-240
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30,2018-12-31-3-240
200,2019-01-01,3091,3091,Amsterdam Centraal,00:24,0.0,Utrecht Centraal,00:52,0.0,4a,19,VIRM-6 8662,VIRM-6 8662,Amsterdam Amstel,,2019-01-01#3091,2019-01-01 00:24:00,2019-01-01 00:52:00,2019-01-01 00:52:00,2019-01-01 00:24:00,2019-01-01-1-240
199,2019-01-01,2986,2986,Utrecht Centraal,00:23,0.0,Amsterdam Centraal,00:52,0.0,7,10a,VIRM-4 9563,,Amsterdam Bijlmer ArenA;Amsterdam Amstel,,2019-01-01#2986,2019-01-01 00:23:00,2019-01-01 00:52:00,2019-01-01 00:52:00,2019-01-01 00:23:00,2019-01-01-1-260


In [219]:
# combine weather in one big weather dataset
result_weather = pd.concat([dfa_weather_A, dfa_weather_U])

In [220]:
result_weather.head()

Unnamed: 0,StationCode,Timestamp,Hour,WindDir,WindHour,WindSpeed,MaxWindSpeed,Temperature,MinTemp10M,DewPointTemp,SunshineDur,Radiation,PrecipDur,PrecipHour,AirPressure,Visibility,Cloudines,Humidity,WeatherCode,WeatherCodeIndicator,Fog,Rain,Snow,Thunder,IceFormation,WeatherKey
0,240,2019-01-01,1,260,7.0,6.0,10.0,8.5,,5.7,0.0,0,0.0,0.0,1030.6,69.0,8.0,82,,5,0.0,0.0,0.0,0.0,0.0,2019-01-01-1-240
1,240,2019-01-01,2,260,7.0,7.0,10.0,8.6,,5.1,0.0,0,0.0,0.0,1030.1,75.0,8.0,78,,5,0.0,0.0,0.0,0.0,0.0,2019-01-01-2-240
2,240,2019-01-01,3,250,7.0,7.0,11.0,8.5,,5.1,0.0,0,0.0,0.0,1029.5,75.0,8.0,79,,5,0.0,0.0,0.0,0.0,0.0,2019-01-01-3-240
3,240,2019-01-01,4,250,7.0,8.0,11.0,8.2,,5.4,0.0,0,0.0,0.0,1029.0,70.0,8.0,82,,5,0.0,0.0,0.0,0.0,0.0,2019-01-01-4-240
4,240,2019-01-01,5,260,9.0,9.0,12.0,8.7,,5.8,0.0,0,0.0,-0.1,1028.3,70.0,8.0,81,22.0,7,0.0,1.0,0.0,0.0,0.0,2019-01-01-5-240


In [221]:
# merge weather with train data
df = pd.merge(df,result_weather, on = 'WeatherKey', how='left')
df.head()

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime,WeatherKey,StationCode,Timestamp,Hour,WindDir,WindHour,WindSpeed,MaxWindSpeed,Temperature,MinTemp10M,DewPointTemp,SunshineDur,Radiation,PrecipDur,PrecipHour,AirPressure,Visibility,Cloudines,Humidity,WeatherCode,WeatherCodeIndicator,Fog,Rain,Snow,Thunder,IceFormation
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00,2018-12-31-2-260,,,,,,,,,,,,,,,,,,,,,,,,,
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00,2018-12-31-2-240,,,,,,,,,,,,,,,,,,,,,,,,,
2,2019-01-01,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,15,2a,VIRM-6 8656;VIRM-6 8730,VIRM-6 8656;VIRM-6 8730,Amsterdam Bijlmer ArenA,,2019-01-01#1409,2019-01-01 02:17:00,2019-01-01 02:44:00,2019-01-01 02:45:00,2019-01-01 02:17:00,2019-01-01-3-260,260.0,2019-01-01,3.0,250.0,4.0,4.0,9.0,8.4,,5.7,0.0,0.0,0.0,0.0,1030.1,57.0,8.0,83.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30,2018-12-31-3-240,,,,,,,,,,,,,,,,,,,,,,,,,
4,2019-01-01,1413,1413,Utrecht Centraal,03:11,0.0,Amsterdam Centraal,03:44,0.0,14,2a,VIRM-4 9504;VIRM-6 8648,VIRM-4 9504;VIRM-6 8648,,,2019-01-01#1413,2019-01-01 03:11:00,2019-01-01 03:44:00,2019-01-01 03:44:00,2019-01-01 03:11:00,2019-01-01-4-260,260.0,2019-01-01,4.0,250.0,4.0,4.0,8.0,8.2,,5.6,0.0,0.0,0.0,0.0,1029.3,62.0,8.0,83.0,,5.0,0.0,0.0,0.0,0.0,0.0


In [222]:
df.iloc[10].to_frame()

Unnamed: 0,10
Date,2019-01-01
RideId,1425
TrainId,1425
DepartureStation,Utrecht Centraal
DepartureTime,06:05
DepartureDelay,0
DestinationStation,Amsterdam Centraal
ArrivalTime,06:39
ArrivalDelay,0
DeparturePlatform,5


In [223]:
df = rename_weather('Departure', df)

In [224]:
df.loc[df['DestinationStation'] == 'Utrecht Centraal'].isna().sum()

Date                                 0
RideId                               0
TrainId                              0
DepartureStation                     0
DepartureTime                        2
DepartureDelay                       0
DestinationStation                   0
ArrivalTime                          0
ArrivalDelay                         0
DeparturePlatform                    5
ArrivalPlatform                      0
DepartureMaterials                2070
ArrivalMaterials                  5152
InbetweenStations                 3642
FaultMessages                    45577
RideInstance                         0
PlannedDepartureTime                 2
PlannedArrivalTime                   0
ActualArrivalTime                    0
ActualDepartureTime                  2
WeatherKey                           2
DepartureWeatherStationCode          4
DepartureTimestamp                   4
DepartureHour                        4
DepartureWindDir                     4
DepartureWindHour        

In [225]:
df.head()

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime,WeatherKey,DepartureWeatherStationCode,DepartureTimestamp,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,DepartureTemperature,DepartureMinTemp10M,DepartureDewPointTemp,DepartureSunshineDur,DepartureRadiation,DeparturePrecipDur,DeparturePrecipHour,DepartureAirPressure,DepartureVisibility,DepartureCloudiness,DepartureHumidity,DepartureWeatherCode,DepartureWeatherCodeIndicator,DepartureFog,DepartureRain,DepartureSnow,DepartureThunder,DepartureIceFormation
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00,2018-12-31-2-260,,,,,,,,,,,,,,,,,,,,,,,,,
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00,2018-12-31-2-240,,,,,,,,,,,,,,,,,,,,,,,,,
2,2019-01-01,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,15,2a,VIRM-6 8656;VIRM-6 8730,VIRM-6 8656;VIRM-6 8730,Amsterdam Bijlmer ArenA,,2019-01-01#1409,2019-01-01 02:17:00,2019-01-01 02:44:00,2019-01-01 02:45:00,2019-01-01 02:17:00,2019-01-01-3-260,260.0,2019-01-01,3.0,250.0,4.0,4.0,9.0,8.4,,5.7,0.0,0.0,0.0,0.0,1030.1,57.0,8.0,83.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30,2018-12-31-3-240,,,,,,,,,,,,,,,,,,,,,,,,,
4,2019-01-01,1413,1413,Utrecht Centraal,03:11,0.0,Amsterdam Centraal,03:44,0.0,14,2a,VIRM-4 9504;VIRM-6 8648,VIRM-4 9504;VIRM-6 8648,,,2019-01-01#1413,2019-01-01 03:11:00,2019-01-01 03:44:00,2019-01-01 03:44:00,2019-01-01 03:11:00,2019-01-01-4-260,260.0,2019-01-01,4.0,250.0,4.0,4.0,8.0,8.2,,5.6,0.0,0.0,0.0,0.0,1029.3,62.0,8.0,83.0,,5.0,0.0,0.0,0.0,0.0,0.0


# Create the departure time field

In [226]:
# create weather key for departure
df['WeatherKey'] = df.apply(lambda row: add_uic_code(row['PlannedArrivalTime'], row['DestinationStation'], [240, 260]), axis=1)

In [227]:
df.head()

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime,WeatherKey,DepartureWeatherStationCode,DepartureTimestamp,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,DepartureTemperature,DepartureMinTemp10M,DepartureDewPointTemp,DepartureSunshineDur,DepartureRadiation,DeparturePrecipDur,DeparturePrecipHour,DepartureAirPressure,DepartureVisibility,DepartureCloudiness,DepartureHumidity,DepartureWeatherCode,DepartureWeatherCodeIndicator,DepartureFog,DepartureRain,DepartureSnow,DepartureThunder,DepartureIceFormation
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00,2018-12-31-2-240,,,,,,,,,,,,,,,,,,,,,,,,,
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00,2018-12-31-2-260,,,,,,,,,,,,,,,,,,,,,,,,,
2,2019-01-01,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,15,2a,VIRM-6 8656;VIRM-6 8730,VIRM-6 8656;VIRM-6 8730,Amsterdam Bijlmer ArenA,,2019-01-01#1409,2019-01-01 02:17:00,2019-01-01 02:44:00,2019-01-01 02:45:00,2019-01-01 02:17:00,2019-01-01-3-240,260.0,2019-01-01,3.0,250.0,4.0,4.0,9.0,8.4,,5.7,0.0,0.0,0.0,0.0,1030.1,57.0,8.0,83.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30,2018-12-31-3-260,,,,,,,,,,,,,,,,,,,,,,,,,
4,2019-01-01,1413,1413,Utrecht Centraal,03:11,0.0,Amsterdam Centraal,03:44,0.0,14,2a,VIRM-4 9504;VIRM-6 8648,VIRM-4 9504;VIRM-6 8648,,,2019-01-01#1413,2019-01-01 03:11:00,2019-01-01 03:44:00,2019-01-01 03:44:00,2019-01-01 03:11:00,2019-01-01-4-240,260.0,2019-01-01,4.0,250.0,4.0,4.0,8.0,8.2,,5.6,0.0,0.0,0.0,0.0,1029.3,62.0,8.0,83.0,,5.0,0.0,0.0,0.0,0.0,0.0


In [228]:
df = pd.merge(df, result_weather, on = 'WeatherKey', how='left')
df.head()

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime,WeatherKey,DepartureWeatherStationCode,DepartureTimestamp,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,DepartureTemperature,DepartureMinTemp10M,DepartureDewPointTemp,DepartureSunshineDur,DepartureRadiation,DeparturePrecipDur,DeparturePrecipHour,DepartureAirPressure,DepartureVisibility,DepartureCloudiness,DepartureHumidity,DepartureWeatherCode,DepartureWeatherCodeIndicator,DepartureFog,DepartureRain,DepartureSnow,DepartureThunder,DepartureIceFormation,StationCode,Timestamp,Hour,WindDir,WindHour,WindSpeed,MaxWindSpeed,Temperature,MinTemp10M,DewPointTemp,SunshineDur,Radiation,PrecipDur,PrecipHour,AirPressure,Visibility,Cloudines,Humidity,WeatherCode,WeatherCodeIndicator,Fog,Rain,Snow,Thunder,IceFormation
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00,2018-12-31-2-240,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00,2018-12-31-2-260,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2019-01-01,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,15,2a,VIRM-6 8656;VIRM-6 8730,VIRM-6 8656;VIRM-6 8730,Amsterdam Bijlmer ArenA,,2019-01-01#1409,2019-01-01 02:17:00,2019-01-01 02:44:00,2019-01-01 02:45:00,2019-01-01 02:17:00,2019-01-01-3-240,260.0,2019-01-01,3.0,250.0,4.0,4.0,9.0,8.4,,5.7,0.0,0.0,0.0,0.0,1030.1,57.0,8.0,83.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-01-01,3.0,250.0,7.0,7.0,11.0,8.5,,5.1,0.0,0.0,0.0,0.0,1029.5,75.0,8.0,79.0,,5.0,0.0,0.0,0.0,0.0,0.0
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30,2018-12-31-3-260,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2019-01-01,1413,1413,Utrecht Centraal,03:11,0.0,Amsterdam Centraal,03:44,0.0,14,2a,VIRM-4 9504;VIRM-6 8648,VIRM-4 9504;VIRM-6 8648,,,2019-01-01#1413,2019-01-01 03:11:00,2019-01-01 03:44:00,2019-01-01 03:44:00,2019-01-01 03:11:00,2019-01-01-4-240,260.0,2019-01-01,4.0,250.0,4.0,4.0,8.0,8.2,,5.6,0.0,0.0,0.0,0.0,1029.3,62.0,8.0,83.0,,5.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-01-01,4.0,250.0,7.0,8.0,11.0,8.2,,5.4,0.0,0.0,0.0,0.0,1029.0,70.0,8.0,82.0,,5.0,0.0,0.0,0.0,0.0,0.0


In [229]:
df = rename_weather('Destination', df)

In [230]:
df.head()

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime,WeatherKey,DepartureWeatherStationCode,DepartureTimestamp,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,DepartureTemperature,DepartureMinTemp10M,DepartureDewPointTemp,DepartureSunshineDur,DepartureRadiation,DeparturePrecipDur,DeparturePrecipHour,DepartureAirPressure,DepartureVisibility,DepartureCloudiness,DepartureHumidity,DepartureWeatherCode,DepartureWeatherCodeIndicator,DepartureFog,DepartureRain,DepartureSnow,DepartureThunder,DepartureIceFormation,DestinationWeatherStationCode,DestinationTimestamp,DestinationHour,DestinationWindDir,DestinationWindHour,DestinationWindSpeed,DestinationMaxWindSpeed,DestinationTemperature,DestinationMinTemp10M,DestinationDewPointTemp,DestinationSunshineDur,DestinationRadiation,DestinationPrecipDur,DestinationPrecipHour,DestinationAirPressure,DestinationVisibility,DestinationCloudiness,DestinationHumidity,DestinationWeatherCode,DestinationWeatherCodeIndicator,DestinationFog,DestinationRain,DestinationSnow,DestinationThunder,DestinationIceFormation
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00,2018-12-31-2-240,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00,2018-12-31-2-260,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2019-01-01,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,15,2a,VIRM-6 8656;VIRM-6 8730,VIRM-6 8656;VIRM-6 8730,Amsterdam Bijlmer ArenA,,2019-01-01#1409,2019-01-01 02:17:00,2019-01-01 02:44:00,2019-01-01 02:45:00,2019-01-01 02:17:00,2019-01-01-3-240,260.0,2019-01-01,3.0,250.0,4.0,4.0,9.0,8.4,,5.7,0.0,0.0,0.0,0.0,1030.1,57.0,8.0,83.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-01-01,3.0,250.0,7.0,7.0,11.0,8.5,,5.1,0.0,0.0,0.0,0.0,1029.5,75.0,8.0,79.0,,5.0,0.0,0.0,0.0,0.0,0.0
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30,2018-12-31-3-260,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2019-01-01,1413,1413,Utrecht Centraal,03:11,0.0,Amsterdam Centraal,03:44,0.0,14,2a,VIRM-4 9504;VIRM-6 8648,VIRM-4 9504;VIRM-6 8648,,,2019-01-01#1413,2019-01-01 03:11:00,2019-01-01 03:44:00,2019-01-01 03:44:00,2019-01-01 03:11:00,2019-01-01-4-240,260.0,2019-01-01,4.0,250.0,4.0,4.0,8.0,8.2,,5.6,0.0,0.0,0.0,0.0,1029.3,62.0,8.0,83.0,,5.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-01-01,4.0,250.0,7.0,8.0,11.0,8.2,,5.4,0.0,0.0,0.0,0.0,1029.0,70.0,8.0,82.0,,5.0,0.0,0.0,0.0,0.0,0.0


In [231]:
df.loc[df['DestinationStation'] == 'Utrecht Centraal'].isna().sum()

Date                                   0
RideId                                 0
TrainId                                0
DepartureStation                       0
DepartureTime                          2
DepartureDelay                         0
DestinationStation                     0
ArrivalTime                            0
ArrivalDelay                           0
DeparturePlatform                      5
ArrivalPlatform                        0
DepartureMaterials                  2070
ArrivalMaterials                    5152
InbetweenStations                   3642
FaultMessages                      45577
RideInstance                           0
PlannedDepartureTime                   2
PlannedArrivalTime                     0
ActualArrivalTime                      0
ActualDepartureTime                    2
WeatherKey                             0
DepartureWeatherStationCode            4
DepartureTimestamp                     4
DepartureHour                          4
DepartureWindDir

In [232]:
# save the dataset with the weate
df.to_csv('../assets/data/2019-UT-ASD-Full/2019-UT-ASD-scrapped.csv', index=None)

# Merge disruptions with train data

In [233]:
df = pd.read_csv('../assets/data/2019-UT-ASD-Full/2019-UT-ASD-scrapped.csv')
df_disruptions = pd.read_csv('../assets/original_dataset/disruptions-2019.csv', parse_dates=['start_time', 
                                                                                             'end_time'
                                                                                            ])

In [234]:
df.loc[df['DestinationStation'] == 'Utrecht Centraal'].isna().sum()

Date                                   0
RideId                                 0
TrainId                                0
DepartureStation                       0
DepartureTime                          2
DepartureDelay                         0
DestinationStation                     0
ArrivalTime                            0
ArrivalDelay                           0
DeparturePlatform                      5
ArrivalPlatform                        0
DepartureMaterials                  2070
ArrivalMaterials                    5152
InbetweenStations                   3642
FaultMessages                      45577
RideInstance                           0
PlannedDepartureTime                   2
PlannedArrivalTime                     0
ActualArrivalTime                      0
ActualDepartureTime                    2
WeatherKey                             0
DepartureWeatherStationCode            4
DepartureTimestamp                     4
DepartureHour                          4
DepartureWindDir

In [235]:
df_disruptions.shape

(5940, 14)

In [236]:
df_disruptions = df_disruptions.loc[df_disruptions['end_time'].notna()]

In [237]:
df_disruptions.fillna('not applicable', inplace=True)

In [238]:
# df_disruptions.fillna('not applicable', inplace=True)

In [239]:
df_disruptions = df_disruptions.loc[df_disruptions['rdt_lines'].str.contains('Amsterdam Centraal - Utrecht Centraal')]

In [240]:
df_disruptions.shape

(174, 14)

In [241]:
df_disruptions.head()

Unnamed: 0,rdt_id,ns_lines,rdt_lines,rdt_lines_id,rdt_station_names,rdt_station_codes,cause_nl,cause_en,statistical_cause_nl,statistical_cause_en,cause_group,start_time,end_time,duration_minutes
104,25340,Amsterdam-Utrecht,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136137141,"Abcoude,Breukelen","AC, BKL",defecte trein,broken down train,defecte trein,broken down train,rolling stock,2019-01-08 09:20:58,2019-01-08 09:21:35,1.0
106,25342,Utrecht Centraal,"'s-Hertogenbosch - Utrecht Centraal, Almere Oo...","40,44,51,133,134,136,137,142,143,147,149,150,1...",Utrecht Centraal,UT,inzet van hulpdiensten,an emergency call,inzet van hulpdiensten,an emergency call,external,2019-01-08 11:42:07,2019-01-08 11:45:17,3.0
123,25359,Breukelen-Utrecht-Veenendaal,"Amsterdam Centraal - Utrecht Centraal, Schipho...",136137,"Breukelen,Maarssen,Utrecht Centraal,Utrecht Zu...","BKL, MAS, UT, UTZL",gladde sporen,slippery railway tracks,gladde sporen,slippery railway tracks,weather,2019-01-09 06:29:40,2019-01-09 09:58:10,209.0
144,25380,Breukelen-Utrecht-Veenendaal,"Amsterdam Centraal - Utrecht Centraal, Schipho...",136137,"Breukelen,Maarssen,Utrecht Centraal,Utrecht Zu...","BKL, MAS, UT, UTZL",gladde sporen,slippery railway tracks,gladde sporen,slippery railway tracks,weather,2019-01-10 07:01:52,2019-01-10 09:14:39,133.0
155,25391,Breukelen-Utrecht-Veenendaal,"Amsterdam Centraal - Utrecht Centraal, Schipho...",136137,"Breukelen,Maarssen,Utrecht Centraal,Utrecht Zu...","BKL, MAS, UT, UTZL",gladde sporen,slippery railway tracks,gladde sporen,slippery railway tracks,weather,2019-01-10 16:13:02,2019-01-10 18:57:39,165.0


In [242]:
df['DisruptionKey'] = df['Date']

In [243]:
df.head()

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime,WeatherKey,DepartureWeatherStationCode,DepartureTimestamp,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,DepartureTemperature,DepartureMinTemp10M,DepartureDewPointTemp,DepartureSunshineDur,DepartureRadiation,DeparturePrecipDur,DeparturePrecipHour,DepartureAirPressure,DepartureVisibility,DepartureCloudiness,DepartureHumidity,DepartureWeatherCode,DepartureWeatherCodeIndicator,DepartureFog,DepartureRain,DepartureSnow,DepartureThunder,DepartureIceFormation,DestinationWeatherStationCode,DestinationTimestamp,DestinationHour,DestinationWindDir,DestinationWindHour,DestinationWindSpeed,DestinationMaxWindSpeed,DestinationTemperature,DestinationMinTemp10M,DestinationDewPointTemp,DestinationSunshineDur,DestinationRadiation,DestinationPrecipDur,DestinationPrecipHour,DestinationAirPressure,DestinationVisibility,DestinationCloudiness,DestinationHumidity,DestinationWeatherCode,DestinationWeatherCodeIndicator,DestinationFog,DestinationRain,DestinationSnow,DestinationThunder,DestinationIceFormation,DisruptionKey
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00,2018-12-31-2-240,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-12-31
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00,2018-12-31-2-260,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-12-31
2,2019-01-01,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,15,2a,VIRM-6 8656;VIRM-6 8730,VIRM-6 8656;VIRM-6 8730,Amsterdam Bijlmer ArenA,,2019-01-01#1409,2019-01-01 02:17:00,2019-01-01 02:44:00,2019-01-01 02:45:00,2019-01-01 02:17:00,2019-01-01-3-240,260.0,2019-01-01,3.0,250.0,4.0,4.0,9.0,8.4,,5.7,0.0,0.0,0.0,0.0,1030.1,57.0,8.0,83.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-01-01,3.0,250.0,7.0,7.0,11.0,8.5,,5.1,0.0,0.0,0.0,0.0,1029.5,75.0,8.0,79.0,,5.0,0.0,0.0,0.0,0.0,0.0,2019-01-01
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30,2018-12-31-3-260,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-12-31
4,2019-01-01,1413,1413,Utrecht Centraal,03:11,0.0,Amsterdam Centraal,03:44,0.0,14,2a,VIRM-4 9504;VIRM-6 8648,VIRM-4 9504;VIRM-6 8648,,,2019-01-01#1413,2019-01-01 03:11:00,2019-01-01 03:44:00,2019-01-01 03:44:00,2019-01-01 03:11:00,2019-01-01-4-240,260.0,2019-01-01,4.0,250.0,4.0,4.0,8.0,8.2,,5.6,0.0,0.0,0.0,0.0,1029.3,62.0,8.0,83.0,,5.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-01-01,4.0,250.0,7.0,8.0,11.0,8.2,,5.4,0.0,0.0,0.0,0.0,1029.0,70.0,8.0,82.0,,5.0,0.0,0.0,0.0,0.0,0.0,2019-01-01


In [244]:
df['DepartureTime'].str.slice(stop=2)

0        01
1        01
2        02
3        02
4        03
         ..
94958    19
94959    19
94960    19
94961    20
94962    20
Name: DepartureTime, Length: 94963, dtype: object

In [245]:
df_disruptions.loc[df_disruptions.start_time.dt.strftime('%Y-%m-%d') != df_disruptions.end_time.dt.strftime('%Y-%m-%d')]

Unnamed: 0,rdt_id,ns_lines,rdt_lines,rdt_lines_id,rdt_station_names,rdt_station_codes,cause_nl,cause_en,statistical_cause_nl,statistical_cause_en,cause_group,start_time,end_time,duration_minutes
666,25902,Amsterdam-Utrecht,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136137141,"Abcoude,Amsterdam Bijlmer ArenA,Amsterdam Hole...","AC, ASB, ASHD",wisselstoring,points failure,wisselstoring,points failure,infrastructure,2019-02-05 23:56:54,2019-02-06 00:06:17,9.0
1673,26909,Amsterdam-Utrecht; Amsterdam-Gouda,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136141,"Amsterdam Amstel,Amsterdam Bijlmer ArenA,Amste...","ASA, ASB, ASD, ASDM, DVD",defecte trein,broken down train,defecte trein,broken down train,rolling stock,2019-04-05 23:16:47,2019-04-06 00:42:18,86.0
2334,27570,Amsterdam-Utrecht; Amsterdam-Gouda,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136141,"Amsterdam Amstel,Amsterdam Bijlmer ArenA,Amste...","ASA, ASB, ASD, ASDM, DVD",inzet van hulpdiensten,an emergency call,inzet van hulpdiensten,an emergency call,external,2019-05-17 22:13:25,2019-05-18 07:39:02,566.0
2787,28023,Amsterdam-Utrecht,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136137141,"Abcoude,Amsterdam Bijlmer ArenA,Amsterdam Hole...","AC, ASB, ASHD",defecte trein,broken down train,defecte trein,broken down train,rolling stock,2019-06-19 19:13:57,2019-06-20 01:11:31,358.0
3329,28565,Amsterdam-Utrecht,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136141,"Amsterdam Amstel,Amsterdam Bijlmer ArenA,Amste...","ASA, ASB, ASD, ASDM, DVD",herstelwerkzaamheden,repair works,herstelwerkzaamheden,repair works,engineering work,2019-07-23 22:51:13,2019-07-24 01:11:20,140.0
4689,29925,Amsterdam-Utrecht,Amsterdam Centraal - Utrecht Centraal,136,"Abcoude,Amsterdam Amstel,Amsterdam Bijlmer Are...","AC, ASA, ASB, ASD, ASDM, ASHD, BKL, DVD, MAS, ...",aanrijding met een voertuig,collision with a vehicle,aanrijding met een voertuig,collision with a vehicle,accidents,2019-10-16 23:33:36,2019-10-17 01:33:05,119.0
5374,30610,Amsterdam-Utrecht; Amsterdam-Gouda,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136141,"Amsterdam Amstel,Amsterdam Bijlmer ArenA,Amste...","ASA, ASB, ASD, ASDM, DVD",wisselstoring,points failure,wisselstoring,points failure,infrastructure,2019-11-28 20:47:57,2019-11-29 00:50:40,243.0


In [246]:
df.head()

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime,WeatherKey,DepartureWeatherStationCode,DepartureTimestamp,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,DepartureTemperature,DepartureMinTemp10M,DepartureDewPointTemp,DepartureSunshineDur,DepartureRadiation,DeparturePrecipDur,DeparturePrecipHour,DepartureAirPressure,DepartureVisibility,DepartureCloudiness,DepartureHumidity,DepartureWeatherCode,DepartureWeatherCodeIndicator,DepartureFog,DepartureRain,DepartureSnow,DepartureThunder,DepartureIceFormation,DestinationWeatherStationCode,DestinationTimestamp,DestinationHour,DestinationWindDir,DestinationWindHour,DestinationWindSpeed,DestinationMaxWindSpeed,DestinationTemperature,DestinationMinTemp10M,DestinationDewPointTemp,DestinationSunshineDur,DestinationRadiation,DestinationPrecipDur,DestinationPrecipHour,DestinationAirPressure,DestinationVisibility,DestinationCloudiness,DestinationHumidity,DestinationWeatherCode,DestinationWeatherCodeIndicator,DestinationFog,DestinationRain,DestinationSnow,DestinationThunder,DestinationIceFormation,DisruptionKey
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00,2018-12-31-2-240,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-12-31
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00,2018-12-31-2-260,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-12-31
2,2019-01-01,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,15,2a,VIRM-6 8656;VIRM-6 8730,VIRM-6 8656;VIRM-6 8730,Amsterdam Bijlmer ArenA,,2019-01-01#1409,2019-01-01 02:17:00,2019-01-01 02:44:00,2019-01-01 02:45:00,2019-01-01 02:17:00,2019-01-01-3-240,260.0,2019-01-01,3.0,250.0,4.0,4.0,9.0,8.4,,5.7,0.0,0.0,0.0,0.0,1030.1,57.0,8.0,83.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-01-01,3.0,250.0,7.0,7.0,11.0,8.5,,5.1,0.0,0.0,0.0,0.0,1029.5,75.0,8.0,79.0,,5.0,0.0,0.0,0.0,0.0,0.0,2019-01-01
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30,2018-12-31-3-260,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-12-31
4,2019-01-01,1413,1413,Utrecht Centraal,03:11,0.0,Amsterdam Centraal,03:44,0.0,14,2a,VIRM-4 9504;VIRM-6 8648,VIRM-4 9504;VIRM-6 8648,,,2019-01-01#1413,2019-01-01 03:11:00,2019-01-01 03:44:00,2019-01-01 03:44:00,2019-01-01 03:11:00,2019-01-01-4-240,260.0,2019-01-01,4.0,250.0,4.0,4.0,8.0,8.2,,5.6,0.0,0.0,0.0,0.0,1029.3,62.0,8.0,83.0,,5.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-01-01,4.0,250.0,7.0,8.0,11.0,8.2,,5.4,0.0,0.0,0.0,0.0,1029.0,70.0,8.0,82.0,,5.0,0.0,0.0,0.0,0.0,0.0,2019-01-01


In [247]:
df_disruptions['DisruptionKey'] = df_disruptions['start_time'].dt.strftime('%Y-%m-%d')

In [68]:
df_disruptions.head()

Unnamed: 0,rdt_id,ns_lines,rdt_lines,rdt_lines_id,rdt_station_names,rdt_station_codes,cause_nl,cause_en,statistical_cause_nl,statistical_cause_en,cause_group,start_time,end_time,duration_minutes,DisruptionKey
104,25340,Amsterdam-Utrecht,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136137141,"Abcoude,Breukelen","AC, BKL",defecte trein,broken down train,defecte trein,broken down train,rolling stock,2019-01-08 09:20:58,2019-01-08 09:21:35,1.0,2019-01-08-09
106,25342,Utrecht Centraal,"'s-Hertogenbosch - Utrecht Centraal, Almere Oo...","40,44,51,133,134,136,137,142,143,147,149,150,1...",Utrecht Centraal,UT,inzet van hulpdiensten,an emergency call,inzet van hulpdiensten,an emergency call,external,2019-01-08 11:42:07,2019-01-08 11:45:17,3.0,2019-01-08-11
123,25359,Breukelen-Utrecht-Veenendaal,"Amsterdam Centraal - Utrecht Centraal, Schipho...",136137,"Breukelen,Maarssen,Utrecht Centraal,Utrecht Zu...","BKL, MAS, UT, UTZL",gladde sporen,slippery railway tracks,gladde sporen,slippery railway tracks,weather,2019-01-09 06:29:40,2019-01-09 09:58:10,209.0,2019-01-09-06
144,25380,Breukelen-Utrecht-Veenendaal,"Amsterdam Centraal - Utrecht Centraal, Schipho...",136137,"Breukelen,Maarssen,Utrecht Centraal,Utrecht Zu...","BKL, MAS, UT, UTZL",gladde sporen,slippery railway tracks,gladde sporen,slippery railway tracks,weather,2019-01-10 07:01:52,2019-01-10 09:14:39,133.0,2019-01-10-07
155,25391,Breukelen-Utrecht-Veenendaal,"Amsterdam Centraal - Utrecht Centraal, Schipho...",136137,"Breukelen,Maarssen,Utrecht Centraal,Utrecht Zu...","BKL, MAS, UT, UTZL",gladde sporen,slippery railway tracks,gladde sporen,slippery railway tracks,weather,2019-01-10 16:13:02,2019-01-10 18:57:39,165.0,2019-01-10-16


In [249]:
df_disruptions['DisruptionKey'].dtypes

dtype('O')

In [250]:
df_temp = pd.merge(df, df_disruptions, on='DisruptionKey', how='left')

In [251]:
df_temp

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime,WeatherKey,DepartureWeatherStationCode,DepartureTimestamp,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,DepartureTemperature,DepartureMinTemp10M,DepartureDewPointTemp,DepartureSunshineDur,DepartureRadiation,DeparturePrecipDur,DeparturePrecipHour,DepartureAirPressure,DepartureVisibility,DepartureCloudiness,DepartureHumidity,DepartureWeatherCode,DepartureWeatherCodeIndicator,DepartureFog,DepartureRain,DepartureSnow,DepartureThunder,DepartureIceFormation,DestinationWeatherStationCode,DestinationTimestamp,DestinationHour,DestinationWindDir,DestinationWindHour,DestinationWindSpeed,DestinationMaxWindSpeed,DestinationTemperature,DestinationMinTemp10M,DestinationDewPointTemp,DestinationSunshineDur,DestinationRadiation,DestinationPrecipDur,DestinationPrecipHour,DestinationAirPressure,DestinationVisibility,DestinationCloudiness,DestinationHumidity,DestinationWeatherCode,DestinationWeatherCodeIndicator,DestinationFog,DestinationRain,DestinationSnow,DestinationThunder,DestinationIceFormation,DisruptionKey,rdt_id,ns_lines,rdt_lines,rdt_lines_id,rdt_station_names,rdt_station_codes,cause_nl,cause_en,statistical_cause_nl,statistical_cause_en,cause_group,start_time,end_time,duration_minutes
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00,2018-12-31-2-240,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-12-31,,,,,,,,,,,,NaT,NaT,
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00,2018-12-31-2-260,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-12-31,,,,,,,,,,,,NaT,NaT,
2,2019-01-01,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,15,2a,VIRM-6 8656;VIRM-6 8730,VIRM-6 8656;VIRM-6 8730,Amsterdam Bijlmer ArenA,,2019-01-01#1409,2019-01-01 02:17:00,2019-01-01 02:44:00,2019-01-01 02:45:00,2019-01-01 02:17:00,2019-01-01-3-240,260.0,2019-01-01,3.0,250.0,4.0,4.0,9.0,8.4,,5.7,0.0,0.0,0.0,0.0,1030.1,57.0,8.0,83.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-01-01,3.0,250.0,7.0,7.0,11.0,8.5,,5.1,0.0,0.0,0.0,0.0,1029.5,75.0,8.0,79.0,,5.0,0.0,0.0,0.0,0.0,0.0,2019-01-01,,,,,,,,,,,,NaT,NaT,
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30,2018-12-31-3-260,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-12-31,,,,,,,,,,,,NaT,NaT,
4,2019-01-01,1413,1413,Utrecht Centraal,03:11,0.0,Amsterdam Centraal,03:44,0.0,14,2a,VIRM-4 9504;VIRM-6 8648,VIRM-4 9504;VIRM-6 8648,,,2019-01-01#1413,2019-01-01 03:11:00,2019-01-01 03:44:00,2019-01-01 03:44:00,2019-01-01 03:11:00,2019-01-01-4-240,260.0,2019-01-01,4.0,250.0,4.0,4.0,8.0,8.2,,5.6,0.0,0.0,0.0,0.0,1029.3,62.0,8.0,83.0,,5.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-01-01,4.0,250.0,7.0,8.0,11.0,8.2,,5.4,0.0,0.0,0.0,0.0,1029.0,70.0,8.0,82.0,,5.0,0.0,0.0,0.0,0.0,0.0,2019-01-01,,,,,,,,,,,,NaT,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111263,2019-12-31,2973,2973,Amsterdam Centraal,19:40,0.0,Utrecht Centraal,20:07,2.0,4b,18,VIRM-4 9468,VIRM-4 9468,Amsterdam Amstel,,2019-12-31#2973,2019-12-31 19:40:00,2019-12-31 20:07:00,2019-12-31 20:09:00,2019-12-31 19:40:00,2019-12-31-21-260,240.0,2019-12-31,20.0,90.0,2.0,1.0,5.0,1.1,,0.9,0.0,0.0,0.0,0.0,1035.8,0.0,9.0,98.0,32.0,7.0,1.0,0.0,0.0,0.0,0.0,260.0,2019-12-31,21.0,140.0,3.0,4.0,6.0,4.0,,3.5,0.0,0.0,0.0,0.0,1035.6,18.0,8.0,96.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,2019-12-31,31170.0,Amsterdam-Utrecht,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136141,"Amsterdam Amstel,Amsterdam Muiderpoort","ASA, ASDM",inzet van hulpdiensten,an emergency call,inzet van hulpdiensten,an emergency call,external,2019-12-31 07:28:45,2019-12-31 07:57:33,29.0
111264,2019-12-31,2968,2968,Utrecht Centraal,19:53,0.0,Amsterdam Centraal,20:18,1.0,7,8b,VIRM-6 8671,VIRM-6 8671,Amsterdam Amstel,,2019-12-31#2968,2019-12-31 19:53:00,2019-12-31 20:18:00,2019-12-31 20:19:00,2019-12-31 19:53:00,2019-12-31-21-240,260.0,2019-12-31,20.0,130.0,2.0,3.0,5.0,4.0,,3.7,0.0,0.0,0.0,0.0,1035.9,12.0,8.0,98.0,20.0,7.0,1.0,0.0,0.0,0.0,0.0,240.0,2019-12-31,21.0,130.0,2.0,3.0,4.0,2.7,,2.6,0.0,0.0,0.0,0.0,1035.3,1.0,9.0,99.0,32.0,7.0,1.0,0.0,0.0,0.0,0.0,2019-12-31,31170.0,Amsterdam-Utrecht,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136141,"Amsterdam Amstel,Amsterdam Muiderpoort","ASA, ASDM",inzet van hulpdiensten,an emergency call,inzet van hulpdiensten,an emergency call,external,2019-12-31 07:28:45,2019-12-31 07:57:33,29.0
111265,2019-12-31,3073,3073,Amsterdam Centraal,19:54,0.0,Utrecht Centraal,20:21,0.0,4b,19,VIRM-4 9576,VIRM-4 9576,Amsterdam Amstel,,2019-12-31#3073,2019-12-31 19:54:00,2019-12-31 20:21:00,2019-12-31 20:21:00,2019-12-31 19:54:00,2019-12-31-21-260,240.0,2019-12-31,20.0,90.0,2.0,1.0,5.0,1.1,,0.9,0.0,0.0,0.0,0.0,1035.8,0.0,9.0,98.0,32.0,7.0,1.0,0.0,0.0,0.0,0.0,260.0,2019-12-31,21.0,140.0,3.0,4.0,6.0,4.0,,3.5,0.0,0.0,0.0,0.0,1035.6,18.0,8.0,96.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,2019-12-31,31170.0,Amsterdam-Utrecht,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136141,"Amsterdam Amstel,Amsterdam Muiderpoort","ASA, ASDM",inzet van hulpdiensten,an emergency call,inzet van hulpdiensten,an emergency call,external,2019-12-31 07:28:45,2019-12-31 07:57:33,29.0
111266,2019-12-31,122,122,Utrecht Centraal,20:03,0.0,Amsterdam Centraal,20:28,0.0,5,7b,ICE-3M,ICE-3M,,,2019-12-31#122,2019-12-31 20:03:00,2019-12-31 20:28:00,2019-12-31 20:28:00,2019-12-31 20:03:00,2019-12-31-21-240,260.0,2019-12-31,21.0,140.0,3.0,4.0,6.0,4.0,,3.5,0.0,0.0,0.0,0.0,1035.6,18.0,8.0,96.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-12-31,21.0,130.0,2.0,3.0,4.0,2.7,,2.6,0.0,0.0,0.0,0.0,1035.3,1.0,9.0,99.0,32.0,7.0,1.0,0.0,0.0,0.0,0.0,2019-12-31,31170.0,Amsterdam-Utrecht,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136141,"Amsterdam Amstel,Amsterdam Muiderpoort","ASA, ASDM",inzet van hulpdiensten,an emergency call,inzet van hulpdiensten,an emergency call,external,2019-12-31 07:28:45,2019-12-31 07:57:33,29.0


In [252]:
df = df_temp

In [254]:
df

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime,WeatherKey,DepartureWeatherStationCode,DepartureTimestamp,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,DepartureTemperature,DepartureMinTemp10M,DepartureDewPointTemp,DepartureSunshineDur,DepartureRadiation,DeparturePrecipDur,DeparturePrecipHour,DepartureAirPressure,DepartureVisibility,DepartureCloudiness,DepartureHumidity,DepartureWeatherCode,DepartureWeatherCodeIndicator,DepartureFog,DepartureRain,DepartureSnow,DepartureThunder,DepartureIceFormation,DestinationWeatherStationCode,DestinationTimestamp,DestinationHour,DestinationWindDir,DestinationWindHour,DestinationWindSpeed,DestinationMaxWindSpeed,DestinationTemperature,DestinationMinTemp10M,DestinationDewPointTemp,DestinationSunshineDur,DestinationRadiation,DestinationPrecipDur,DestinationPrecipHour,DestinationAirPressure,DestinationVisibility,DestinationCloudiness,DestinationHumidity,DestinationWeatherCode,DestinationWeatherCodeIndicator,DestinationFog,DestinationRain,DestinationSnow,DestinationThunder,DestinationIceFormation,DisruptionKey,rdt_id,ns_lines,rdt_lines,rdt_lines_id,rdt_station_names,rdt_station_codes,cause_nl,cause_en,statistical_cause_nl,statistical_cause_en,cause_group,start_time,end_time,duration_minutes
0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,7,2a,VIRM-4 9516;VIRM-6 8671,VIRM-4 9516;VIRM-6 8671,Amsterdam Bijlmer ArenA,,2018-12-31#1405,2018-12-31 01:01:00,2018-12-31 01:29:00,2018-12-31 01:30:30,2018-12-31 01:02:00,2018-12-31-2-240,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-12-31,,,,,,,,,,,,NaT,NaT,
1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,2,15,VIRM-6 8730;VIRM-6 8656,,Amsterdam Bijlmer ArenA,,2018-12-31#1402,2018-12-31 01:18:00,2018-12-31 01:53:00,2018-12-31 01:53:00,2018-12-31 01:18:00,2018-12-31-2-260,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-12-31,,,,,,,,,,,,NaT,NaT,
2,2019-01-01,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,15,2a,VIRM-6 8656;VIRM-6 8730,VIRM-6 8656;VIRM-6 8730,Amsterdam Bijlmer ArenA,,2019-01-01#1409,2019-01-01 02:17:00,2019-01-01 02:44:00,2019-01-01 02:45:00,2019-01-01 02:17:00,2019-01-01-3-240,260.0,2019-01-01,3.0,250.0,4.0,4.0,9.0,8.4,,5.7,0.0,0.0,0.0,0.0,1030.1,57.0,8.0,83.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-01-01,3.0,250.0,7.0,7.0,11.0,8.5,,5.1,0.0,0.0,0.0,0.0,1029.5,75.0,8.0,79.0,,5.0,0.0,0.0,0.0,0.0,0.0,2019-01-01,,,,,,,,,,,,NaT,NaT,
3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,2b,14,VIRM-6 8648;VIRM-4 9504,,,,2018-12-31#1406,2018-12-31 02:19:00,2018-12-31 02:45:00,2018-12-31 02:45:00,2018-12-31 02:20:30,2018-12-31-3-260,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-12-31,,,,,,,,,,,,NaT,NaT,
4,2019-01-01,1413,1413,Utrecht Centraal,03:11,0.0,Amsterdam Centraal,03:44,0.0,14,2a,VIRM-4 9504;VIRM-6 8648,VIRM-4 9504;VIRM-6 8648,,,2019-01-01#1413,2019-01-01 03:11:00,2019-01-01 03:44:00,2019-01-01 03:44:00,2019-01-01 03:11:00,2019-01-01-4-240,260.0,2019-01-01,4.0,250.0,4.0,4.0,8.0,8.2,,5.6,0.0,0.0,0.0,0.0,1029.3,62.0,8.0,83.0,,5.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-01-01,4.0,250.0,7.0,8.0,11.0,8.2,,5.4,0.0,0.0,0.0,0.0,1029.0,70.0,8.0,82.0,,5.0,0.0,0.0,0.0,0.0,0.0,2019-01-01,,,,,,,,,,,,NaT,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111263,2019-12-31,2973,2973,Amsterdam Centraal,19:40,0.0,Utrecht Centraal,20:07,2.0,4b,18,VIRM-4 9468,VIRM-4 9468,Amsterdam Amstel,,2019-12-31#2973,2019-12-31 19:40:00,2019-12-31 20:07:00,2019-12-31 20:09:00,2019-12-31 19:40:00,2019-12-31-21-260,240.0,2019-12-31,20.0,90.0,2.0,1.0,5.0,1.1,,0.9,0.0,0.0,0.0,0.0,1035.8,0.0,9.0,98.0,32.0,7.0,1.0,0.0,0.0,0.0,0.0,260.0,2019-12-31,21.0,140.0,3.0,4.0,6.0,4.0,,3.5,0.0,0.0,0.0,0.0,1035.6,18.0,8.0,96.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,2019-12-31,31170.0,Amsterdam-Utrecht,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136141,"Amsterdam Amstel,Amsterdam Muiderpoort","ASA, ASDM",inzet van hulpdiensten,an emergency call,inzet van hulpdiensten,an emergency call,external,2019-12-31 07:28:45,2019-12-31 07:57:33,29.0
111264,2019-12-31,2968,2968,Utrecht Centraal,19:53,0.0,Amsterdam Centraal,20:18,1.0,7,8b,VIRM-6 8671,VIRM-6 8671,Amsterdam Amstel,,2019-12-31#2968,2019-12-31 19:53:00,2019-12-31 20:18:00,2019-12-31 20:19:00,2019-12-31 19:53:00,2019-12-31-21-240,260.0,2019-12-31,20.0,130.0,2.0,3.0,5.0,4.0,,3.7,0.0,0.0,0.0,0.0,1035.9,12.0,8.0,98.0,20.0,7.0,1.0,0.0,0.0,0.0,0.0,240.0,2019-12-31,21.0,130.0,2.0,3.0,4.0,2.7,,2.6,0.0,0.0,0.0,0.0,1035.3,1.0,9.0,99.0,32.0,7.0,1.0,0.0,0.0,0.0,0.0,2019-12-31,31170.0,Amsterdam-Utrecht,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136141,"Amsterdam Amstel,Amsterdam Muiderpoort","ASA, ASDM",inzet van hulpdiensten,an emergency call,inzet van hulpdiensten,an emergency call,external,2019-12-31 07:28:45,2019-12-31 07:57:33,29.0
111265,2019-12-31,3073,3073,Amsterdam Centraal,19:54,0.0,Utrecht Centraal,20:21,0.0,4b,19,VIRM-4 9576,VIRM-4 9576,Amsterdam Amstel,,2019-12-31#3073,2019-12-31 19:54:00,2019-12-31 20:21:00,2019-12-31 20:21:00,2019-12-31 19:54:00,2019-12-31-21-260,240.0,2019-12-31,20.0,90.0,2.0,1.0,5.0,1.1,,0.9,0.0,0.0,0.0,0.0,1035.8,0.0,9.0,98.0,32.0,7.0,1.0,0.0,0.0,0.0,0.0,260.0,2019-12-31,21.0,140.0,3.0,4.0,6.0,4.0,,3.5,0.0,0.0,0.0,0.0,1035.6,18.0,8.0,96.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,2019-12-31,31170.0,Amsterdam-Utrecht,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136141,"Amsterdam Amstel,Amsterdam Muiderpoort","ASA, ASDM",inzet van hulpdiensten,an emergency call,inzet van hulpdiensten,an emergency call,external,2019-12-31 07:28:45,2019-12-31 07:57:33,29.0
111266,2019-12-31,122,122,Utrecht Centraal,20:03,0.0,Amsterdam Centraal,20:28,0.0,5,7b,ICE-3M,ICE-3M,,,2019-12-31#122,2019-12-31 20:03:00,2019-12-31 20:28:00,2019-12-31 20:28:00,2019-12-31 20:03:00,2019-12-31-21-240,260.0,2019-12-31,21.0,140.0,3.0,4.0,6.0,4.0,,3.5,0.0,0.0,0.0,0.0,1035.6,18.0,8.0,96.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-12-31,21.0,130.0,2.0,3.0,4.0,2.7,,2.6,0.0,0.0,0.0,0.0,1035.3,1.0,9.0,99.0,32.0,7.0,1.0,0.0,0.0,0.0,0.0,2019-12-31,31170.0,Amsterdam-Utrecht,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136141,"Amsterdam Amstel,Amsterdam Muiderpoort","ASA, ASDM",inzet van hulpdiensten,an emergency call,inzet van hulpdiensten,an emergency call,external,2019-12-31 07:28:45,2019-12-31 07:57:33,29.0


In [255]:
df_disruptions.loc[df_disruptions.duplicated('Date')].sort_values('Date', ascending=True)

KeyError: Index(['Date'], dtype='object')

In [297]:
df.loc[df['rdt_id'] == 26712]

Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,DeparturePlatform,ArrivalPlatform,DepartureMaterials,ArrivalMaterials,InbetweenStations,FaultMessages,RideInstance,PlannedDepartureTime,PlannedArrivalTime,ActualArrivalTime,ActualDepartureTime,WeatherKey,DepartureWeatherStationCode,DepartureTimestamp,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,DepartureTemperature,DepartureMinTemp10M,DepartureDewPointTemp,DepartureSunshineDur,DepartureRadiation,DeparturePrecipDur,DeparturePrecipHour,DepartureAirPressure,DepartureVisibility,DepartureCloudiness,DepartureHumidity,DepartureWeatherCode,DepartureWeatherCodeIndicator,DepartureFog,DepartureRain,DepartureSnow,DepartureThunder,DepartureIceFormation,DestinationWeatherStationCode,DestinationTimestamp,DestinationHour,DestinationWindDir,DestinationWindHour,DestinationWindSpeed,DestinationMaxWindSpeed,DestinationTemperature,DestinationMinTemp10M,DestinationDewPointTemp,DestinationSunshineDur,DestinationRadiation,DestinationPrecipDur,DestinationPrecipHour,DestinationAirPressure,DestinationVisibility,DestinationCloudiness,DestinationHumidity,DestinationWeatherCode,DestinationWeatherCodeIndicator,DestinationFog,DestinationRain,DestinationSnow,DestinationThunder,DestinationIceFormation,rdt_id,ns_lines,rdt_lines,rdt_lines_id,rdt_station_names,rdt_station_codes,cause_nl,cause_en,statistical_cause_nl,statistical_cause_en,cause_group,start_time,end_time,duration_minutes
22793,2019-03-26,1406,1406,Amsterdam Centraal,02:19,0.0,Utrecht Centraal,02:44,1.5,7a,15,VIRM-4 9514,,,,2019-03-26#1406,2019-03-26 02:19:00,2019-03-26 02:44:00,2019-03-26 02:45:30,2019-03-26 02:19:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,26712.0,Amsterdam-Gouda,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136137141,"Abcoude,Amsterdam Bijlmer ArenA,Amsterdam Hole...","AC, ASB, ASHD, BKL",aanrijding met een persoon,person hit by a train,aanrijding met een persoon,person hit by a train,accidents,2019-03-26 09:11:52,2019-03-26 12:02:50,171.0
22796,2019-03-26,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,18,7a,VIRM-4 9426,VIRM-4 9426,,,2019-03-26#1409,2019-03-26 02:17:00,2019-03-26 02:44:00,2019-03-26 02:45:00,2019-03-26 02:17:00,2019-03-26-3-240,260.0,2019-03-26,3.0,310.0,2.0,1.0,5.0,5.6,,1.1,0.0,0.0,0.0,0.0,1031.4,75.0,8.0,72.0,,5.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-03-26,3.0,310.0,2.0,2.0,4.0,3.3,,0.6,0.0,0.0,0.0,0.0,1031.4,70.0,8.0,82.0,,5.0,0.0,0.0,0.0,0.0,0.0,26712.0,Amsterdam-Gouda,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136137141,"Abcoude,Amsterdam Bijlmer ArenA,Amsterdam Hole...","AC, ASB, ASHD, BKL",aanrijding met een persoon,person hit by a train,aanrijding met een persoon,person hit by a train,accidents,2019-03-26 09:11:52,2019-03-26 12:02:50,171.0
22799,2019-03-26,1413,1413,Utrecht Centraal,03:17,0.0,Amsterdam Centraal,03:44,0.0,15,7a,VIRM-4 9514,VIRM-4 9514,,,2019-03-26#1413,2019-03-26 03:17:00,2019-03-26 03:44:00,2019-03-26 03:44:00,2019-03-26 03:17:00,2019-03-26-4-240,260.0,2019-03-26,4.0,280.0,1.0,1.0,3.0,2.6,,0.8,0.0,0.0,0.0,0.0,1031.3,70.0,8.0,88.0,,5.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-03-26,4.0,300.0,2.0,2.0,3.0,2.8,,0.9,0.0,0.0,0.0,0.0,1031.4,69.0,8.0,87.0,,5.0,0.0,0.0,0.0,0.0,0.0,26712.0,Amsterdam-Gouda,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136137141,"Abcoude,Amsterdam Bijlmer ArenA,Amsterdam Hole...","AC, ASB, ASHD, BKL",aanrijding met een persoon,person hit by a train,aanrijding met een persoon,person hit by a train,accidents,2019-03-26 09:11:52,2019-03-26 12:02:50,171.0
22802,2019-03-26,1410,1410,Amsterdam Centraal,03:19,0.0,Utrecht Centraal,03:44,6.0,7a,15,VIRM-4 9571,,,,2019-03-26#1410,2019-03-26 03:19:00,2019-03-26 03:44:00,2019-03-26 03:50:00,2019-03-26 03:19:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,26712.0,Amsterdam-Gouda,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136137141,"Abcoude,Amsterdam Bijlmer ArenA,Amsterdam Hole...","AC, ASB, ASHD, BKL",aanrijding met een persoon,person hit by a train,aanrijding met een persoon,person hit by a train,accidents,2019-03-26 09:11:52,2019-03-26 12:02:50,171.0
22805,2019-03-26,1414,1414,Amsterdam Centraal,04:19,0.0,Utrecht Centraal,04:44,0.0,7a,15,VIRM-6 8610,,,,2019-03-26#1414,2019-03-26 04:19:00,2019-03-26 04:44:00,2019-03-26 04:44:00,2019-03-26 04:19:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,26712.0,Amsterdam-Gouda,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136137141,"Abcoude,Amsterdam Bijlmer ArenA,Amsterdam Hole...","AC, ASB, ASHD, BKL",aanrijding met een persoon,person hit by a train,aanrijding met een persoon,person hit by a train,accidents,2019-03-26 09:11:52,2019-03-26 12:02:50,171.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23606,2019-03-26,7384,7384,Utrecht Centraal,00:21,0.0,Amsterdam Centraal,01:06,0.0,14,5a,SLT-4 2458,,Utrecht Zuilen;Maarssen;Breukelen;Abcoude;Amst...,,2019-03-26#7384,2019-03-26 00:21:00,2019-03-26 01:06:00,2019-03-26 01:06:00,2019-03-26 00:21:00,2019-03-26-2-240,260.0,2019-03-26,1.0,340.0,3.0,2.0,8.0,6.7,,1.7,0.0,0.0,0.0,0.0,1031.6,74.0,8.0,70.0,,5.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-03-26,2.0,340.0,4.0,2.0,7.0,5.2,,0.9,0.0,0.0,0.0,0.0,1031.7,72.0,8.0,73.0,,5.0,0.0,0.0,0.0,0.0,0.0,26712.0,Amsterdam-Gouda,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136137141,"Abcoude,Amsterdam Bijlmer ArenA,Amsterdam Hole...","AC, ASB, ASHD, BKL",aanrijding met een persoon,person hit by a train,aanrijding met een persoon,person hit by a train,accidents,2019-03-26 09:11:52,2019-03-26 12:02:50,171.0
23609,2019-03-26,7393,7393,Amsterdam Centraal,00:28,0.0,Utrecht Centraal,01:10,0.0,4b,15,SLT-4 2442;SLT-6 2627,,Amsterdam Muiderpoort;Amsterdam Amstel;Duivend...,,2019-03-26#7393,2019-03-26 00:28:00,2019-03-26 01:10:00,2019-03-26 01:10:00,2019-03-26 00:28:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,26712.0,Amsterdam-Gouda,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136137141,"Abcoude,Amsterdam Bijlmer ArenA,Amsterdam Hole...","AC, ASB, ASHD, BKL",aanrijding met een persoon,person hit by a train,aanrijding met een persoon,person hit by a train,accidents,2019-03-26 09:11:52,2019-03-26 12:02:50,171.0
23612,2019-03-26,1405,1405,Utrecht Centraal,01:01,0.0,Amsterdam Centraal,01:28,0.0,7,4a,VIRM-6 8667,VIRM-6 8667,Amsterdam Bijlmer ArenA,,2019-03-26#1405,2019-03-26 01:01:00,2019-03-26 01:28:00,2019-03-26 01:28:00,2019-03-26 01:01:00,2019-03-26-2-240,260.0,2019-03-26,2.0,340.0,3.0,3.0,6.0,6.6,,1.1,0.0,0.0,0.0,0.0,1031.5,74.0,8.0,68.0,,5.0,0.0,0.0,0.0,0.0,0.0,240.0,2019-03-26,2.0,340.0,4.0,2.0,7.0,5.2,,0.9,0.0,0.0,0.0,0.0,1031.7,72.0,8.0,73.0,,5.0,0.0,0.0,0.0,0.0,0.0,26712.0,Amsterdam-Gouda,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136137141,"Abcoude,Amsterdam Bijlmer ArenA,Amsterdam Hole...","AC, ASB, ASHD, BKL",aanrijding met een persoon,person hit by a train,aanrijding met een persoon,person hit by a train,accidents,2019-03-26 09:11:52,2019-03-26 12:02:50,171.0
23615,2019-03-26,7395,7395,Amsterdam Centraal,00:57,0.0,Utrecht Centraal,01:40,0.0,2b,15,SGMM-2 2123;SGMM-3 2972,,Amsterdam Muiderpoort;Amsterdam Amstel;Duivend...,,2019-03-26#7395,2019-03-26 00:57:00,2019-03-26 01:40:00,2019-03-26 01:40:00,2019-03-26 00:57:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,26712.0,Amsterdam-Gouda,"Amsterdam Centraal - Utrecht Centraal, Amsterd...",136137141,"Abcoude,Amsterdam Bijlmer ArenA,Amsterdam Hole...","AC, ASB, ASHD, BKL",aanrijding met een persoon,person hit by a train,aanrijding met een persoon,person hit by a train,accidents,2019-03-26 09:11:52,2019-03-26 12:02:50,171.0


In [56]:
%%time
count = 0
for i in range(df.shape[0]):
    print(i)
    train_time = pd.to_datetime(df.iloc[i]['Date'] + '-' + "{:02d}".format(int(df.iloc[i]['DepartureTime'][:2])))
    for j in range(df_disruptions.shape[0]):
        disruption_start = pd.to_datetime(df_disruptions.iloc[j]['start_time'].strftime('%Y-%m-%d-%H'))
        disruption_end = pd.to_datetime(df_disruptions.iloc[j]['end_time'].strftime('%Y-%m-%d-%H'))
        
        if (train_time >= disruption_start) and (train_time <= disruption_end):
            count += 1
            print(count)

0
1
2


KeyboardInterrupt: 

In [40]:
train_time = df.iloc[0]['Date'] + '-' + "{:02d}".format(int(df.iloc[50]['DepartureTime'][:2]) + 1)

In [41]:
train_time

'2018-12-31-12'

In [42]:
train_time = pd.to_datetime(train_time)

In [46]:
disruption_start = df_disruptions.iloc[0]['start_time'].strftime('%Y-%m-%d-%H')


In [48]:
disruption_start

Timestamp('2019-01-01 06:00:00')

In [47]:
disruption_start = pd.to_datetime(disruption_start)

In [18]:
train_time > disruption_start

False

In [127]:
disruption_start = df.iloc[1]['start_time'].hour
disruption_end = df.iloc[1]['start_time'].hour
ride_time = int("{:01d}".format(int(df.iloc[1]['DepartureTime'][:2])))

In [128]:
type(ride_time)

int

In [257]:
dis_columns = list(df_disruptions.columns)

In [125]:
del i

In [None]:
%%time
count = 0
for i in range(df.shape[0]):
    ride_time = -2
    if pd.notna(df.iloc[i]['DepartureTime']): 
        ride_time = int("{:01d}".format(int(df.iloc[i]['DepartureTime'][:2])))
    disruption_start = df.iloc[i]['start_time'].hour if pd.notna(df.iloc[i]['start_time'].hour) else -1
    disruption_end = df.iloc[i]['end_time'].hour if pd.notna(df.iloc[i]['end_time'].hour) else -1
    count += 1
    print(count)
    if (ride_time and disruption_start and disruption_end):
        if (ride_time < disruption_start) or (ride_time > disruption_end):
            df.iloc[i][dis_columns] = None

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.loc[key] = value


109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358


1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139


In [162]:
df.iloc[480]

Unnamed: 0                             480
Date                            2019-01-02
RideId                                9387
TrainId                               9387
DepartureStation        Amsterdam Centraal
                               ...        
statistical_cause_en                   NaN
cause_group                            NaN
start_time                             NaT
end_time                               NaT
duration_minutes                       NaN
Name: 480, Length: 87, dtype: object

In [141]:
df.isna().sum()

Date                        0
RideId                      0
TrainId                     0
DepartureStation          930
DepartureTime             940
                        ...  
statistical_cause_en    62292
cause_group             62292
start_time              62292
end_time                62292
duration_minutes        62292
Length: 86, dtype: int64

In [159]:
mask1 = (df['DepartureTime'].str.slice(stop=2).astype(int) < df['start_time'].dt.hour if pd.notna(df['start_time'].dt.hour) else -1)
mask2 = (df['DepartureTime'].str.slice(stop=2).astype(int) > df['end_time'].dt.hour if pd.notna(df['end_time'].dt.hour) else -1)
df[(mask1 | mask2), dis_columns] = None

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [166]:
df.isna().sum()

Unnamed: 0                  0
Date                        0
RideId                      0
TrainId                     0
DepartureStation          930
                        ...  
statistical_cause_en    62292
cause_group             62292
start_time              62292
end_time                62292
duration_minutes        62292
Length: 87, dtype: int64

In [167]:
df.to_csv('../assets/data/2019-UT-ASD-Full/2019-UT-ASD-scrapped.csv')

In [172]:
pd.options.display.max_rows = 4000
df.loc[df['DestinationStation'] == 'Utrecht Centraal', 'DestinationRain'].isna().sum()

55924