In [137]:
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import mpl_toolkits
import numpy as np
%matplotlib inline

In [138]:
df = pd.read_csv("../assets/data/2019-UT-ASD-Full/2019-UT-ASD-scrapped.csv")

In [139]:
df.head()

Unnamed: 0.1,Unnamed: 0,Date,RideId,TrainId,DepartureStation,DepartureTime,DepartureDelay,DestinationStation,ArrivalTime,ArrivalDelay,...,rdt_station_names,rdt_station_codes,cause_nl,cause_en,statistical_cause_nl,statistical_cause_en,cause_group,start_time,end_time,duration_minutes
0,0,2018-12-31,1405,1405,Utrecht Centraal,01:01,1.0,Amsterdam Centraal,01:29,1.5,...,,,,,,,,,,
1,1,2018-12-31,1402,1402,Amsterdam Centraal,01:18,0.0,Utrecht Centraal,01:53,0.0,...,,,,,,,,,,
2,2,2019-01-01,1409,1409,Utrecht Centraal,02:17,0.0,Amsterdam Centraal,02:44,1.0,...,,,,,,,,,,
3,3,2018-12-31,1406,1406,Amsterdam Centraal,02:19,1.5,Utrecht Centraal,02:45,0.0,...,,,,,,,,,,
4,4,2019-01-01,1413,1413,Utrecht Centraal,03:11,0.0,Amsterdam Centraal,03:44,0.0,...,,,,,,,,,,


In [140]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

In [141]:
numeric_features = df.select_dtypes(include=[np.number])

In [142]:
numeric_features.columns

Index(['Unnamed: 0', 'RideId', 'TrainId', 'DepartureDelay', 'ArrivalDelay',
       'DepartureWeatherStationCode', 'DepartureHour', 'DepartureWindDir',
       'DepartureWindHour', 'DepartureWindSpeed', 'DepartureMaxWindSpeed',
       'DepartureTemperature', 'DepartureMinTemp10M', 'DepartureDewPointTemp',
       'DepartureSunshineDur', 'DepartureRadiation', 'DeparturePrecipDur',
       'DeparturePrecipHour', 'DepartureAirPressure', 'DepartureVisibility',
       'DepartureCloudiness', 'DepartureHumidity', 'DepartureWeatherCode',
       'DepartureWeatherCodeIndicator', 'DepartureFog', 'DepartureRain',
       'DepartureSnow', 'DepartureThunder', 'DepartureIceFormation',
       'DestinationWeatherStationCode', 'DestinationHour',
       'DestinationWindDir', 'DestinationWindHour', 'DestinationWindSpeed',
       'DestinationMaxWindSpeed', 'DestinationTemperature',
       'DestinationMinTemp10M', 'DestinationDewPointTemp',
       'DestinationSunshineDur', 'DestinationRadiation',
       'Destina

In [143]:
# set the target and predictors
y = df.ArrivalDelay  # target

# use only those input features with numeric data type 
df_temp = df.select_dtypes(include=["int64","float64"]) 

X = df_temp.drop(["ArrivalDelay"],axis=1)  # predictors

In [144]:
feature_list = ['RideId','DepartureStation','Date','DestinationStation', 'cause_en', 'DepartureTime', 'ArrivalTime', 'ArrivalDelay']

In [145]:
df[feature_list].isna().sum()

RideId                    0
DepartureStation        930
Date                      0
DestinationStation     1073
cause_en              62292
DepartureTime           940
ArrivalTime            1073
ArrivalDelay              0
dtype: int64

In [146]:
feature_df = df[feature_list]

In [147]:
feature_df['Maintanence'] = feature_df['cause_en'].apply(lambda cause: True if cause == 'engineering work' else False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_df['Maintanence'] = feature_df['cause_en'].apply(lambda cause: True if cause == 'engineering work' else False)


In [148]:
feature_df.drop(columns=['cause_en'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [159]:
feature_df

Unnamed: 0,RideId,DepartureStation,Date,DestinationStation,DepartureTime,ArrivalTime,ArrivalDelay,Maintanence
0,1405,Utrecht Centraal,2018-12-31,Amsterdam Centraal,01:01,01:29,1.5,False
1,1402,Amsterdam Centraal,2018-12-31,Utrecht Centraal,01:18,01:53,0.0,False
2,1409,Utrecht Centraal,2019-01-01,Amsterdam Centraal,02:17,02:44,1.0,False
3,1406,Amsterdam Centraal,2018-12-31,Utrecht Centraal,02:19,02:45,0.0,False
4,1413,Utrecht Centraal,2019-01-01,Amsterdam Centraal,03:11,03:44,0.0,False
...,...,...,...,...,...,...,...,...
111263,2973,Amsterdam Centraal,2019-12-31,Utrecht Centraal,19:40,20:07,2.0,False
111264,2968,Utrecht Centraal,2019-12-31,Amsterdam Centraal,19:53,20:18,1.0,False
111265,3073,Amsterdam Centraal,2019-12-31,Utrecht Centraal,19:54,20:21,0.0,False
111266,122,Utrecht Centraal,2019-12-31,Amsterdam Centraal,20:03,20:28,0.0,False


In [149]:
feature_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_df.dropna(inplace=True)


In [150]:
feature_df.isna().sum()

RideId                0
DepartureStation      0
Date                  0
DestinationStation    0
DepartureTime         0
ArrivalTime           0
ArrivalDelay          0
Maintanence           0
dtype: int64

In [151]:
# split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [152]:
X_train.dropna()

Unnamed: 0.1,Unnamed: 0,RideId,TrainId,DepartureDelay,DepartureWeatherStationCode,DepartureHour,DepartureWindDir,DepartureWindHour,DepartureWindSpeed,DepartureMaxWindSpeed,...,DestinationHumidity,DestinationWeatherCode,DestinationWeatherCodeIndicator,DestinationFog,DestinationRain,DestinationSnow,DestinationThunder,DestinationIceFormation,rdt_id,duration_minutes
100951,100951,839,839,0.0,240.0,12.0,350.0,4.0,3.0,8.0,...,64.0,3.0,7.0,0.0,0.0,0.0,0.0,0.0,30624.0,42.0
24945,24945,3963,3963,0.0,240.0,18.0,50.0,2.0,2.0,6.0,...,63.0,4.0,7.0,0.0,0.0,0.0,0.0,0.0,26764.0,226.0
102340,102340,2987,2987,0.0,240.0,24.0,350.0,2.0,3.0,4.0,...,97.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,30618.0,9.0
86715,86715,3080,3080,2.5,260.0,24.0,230.0,3.0,4.0,6.0,...,79.0,3.0,7.0,0.0,0.0,0.0,0.0,0.0,29870.0,14.0
88071,88071,3056,3056,0.0,260.0,18.0,190.0,6.0,5.0,13.0,...,76.0,1.0,7.0,0.0,0.0,0.0,0.0,0.0,29958.0,133.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94275,94275,3958,3958,0.0,260.0,18.0,200.0,4.0,6.0,10.0,...,86.0,1.0,7.0,0.0,0.0,0.0,0.0,0.0,30264.0,139.0
1913,1913,839,839,0.0,240.0,12.0,310.0,14.0,15.0,21.0,...,69.0,2.0,7.0,0.0,0.0,0.0,0.0,0.0,25342.0,3.0
88870,88870,2982,2982,0.0,260.0,24.0,190.0,2.0,2.0,5.0,...,96.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,30002.0,4.0
93790,93790,120,120,0.0,260.0,24.0,130.0,2.0,2.0,4.0,...,94.0,2.0,7.0,0.0,0.0,0.0,0.0,0.0,30257.0,0.0


In [153]:
y_train.dropna()

71969     0.0
106888    0.0
70214     2.0
28891     0.0
23649     0.0
         ... 
21243     9.0
45891     0.0
42613     0.0
43567     0.0
68268     0.0
Name: ArrivalDelay, Length: 89014, dtype: float64

In [154]:
y_train

71969     0.0
106888    0.0
70214     2.0
28891     0.0
23649     0.0
         ... 
21243     9.0
45891     0.0
42613     0.0
43567     0.0
68268     0.0
Name: ArrivalDelay, Length: 89014, dtype: float64

In [155]:
y_train.replace([np.inf, -np.inf], np.nan, inplace=True)

In [156]:
X_train.isnull().sum()

Unnamed: 0                             0
RideId                                 0
TrainId                                0
DepartureDelay                         0
DepartureWeatherStationCode          850
DepartureHour                        850
DepartureWindDir                     850
DepartureWindHour                    850
DepartureWindSpeed                   850
DepartureMaxWindSpeed                850
DepartureTemperature                 850
DepartureMinTemp10M                74208
DepartureDewPointTemp                850
DepartureSunshineDur                 850
DepartureRadiation                   850
DeparturePrecipDur                   850
DeparturePrecipHour                  850
DepartureAirPressure                 850
DepartureVisibility                  850
DepartureCloudiness                  951
DepartureHumidity                    850
DepartureWeatherCode               57615
DepartureWeatherCodeIndicator        850
DepartureFog                         850
DepartureRain   

In [157]:
ridge = Ridge(alpha = 1)  # sets alpha to a default value as baseline  
ridge.fit(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [158]:
#predict y_values using X_test set
yr_ridge = ridge.predict(X_test)

NotFittedError: This Ridge instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [135]:
actual_values = y_test
plt.scatter(yr_ridge, actual_values, alpha=.75,
            color='b') #alpha helps to show overlapping data
plt.xlabel('Predicted Price')
plt.ylabel('Actual Price')
plt.title('Ridge Regression Model')
plt.show()
#pltrandom_state=None.show()

NameError: name 'yr_ridge' is not defined