In [1]:
# Load in our libraries
import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm 
import plotly.offline as py
from scipy import stats
import plotly.graph_objs as go
import plotly.tools as tls
import math
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below, so API removals can go unnoticed until they break.
warnings.filterwarnings('ignore')

from sklearn.metrics import confusion_matrix
# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.model_selection import KFold

In [6]:
import matplotlib.pyplot as plt
from fbprophet import Prophet
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Use matplotlib's 'fivethirtyeight' style sheet for figures drawn after this point.
plt.style.use('fivethirtyeight')

In [2]:
import os

# Folder holding the competition CSVs. It defaults to the location the notebook
# was originally written against; set the AVTRAFFIC_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell.
DATA_DIR = os.environ.get(
    'AVTRAFFIC_DATA_DIR',
    r'C:\Users\Ashish\Desktop\Data Science A-Z\AVTraffic',
)

# DateTime is parsed while reading so that every later cell sees real
# timestamps regardless of the order in which cells are executed.
dataset_train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'), parse_dates=['DateTime'])
dataset_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'), parse_dates=['DateTime'])
# Submission template; its "Vehicles" column is filled in at the end of the notebook.
sample = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

In [3]:
# Preview the first rows of the training data.
dataset_train.head()

Unnamed: 0,DateTime,Junction,Vehicles,ID
0,2015-11-01 00:00:00,1,15,20151101001
1,2015-11-01 01:00:00,1,13,20151101011
2,2015-11-01 02:00:00,1,10,20151101021
3,2015-11-01 03:00:00,1,7,20151101031
4,2015-11-01 04:00:00,1,9,20151101041


In [10]:
# Column dtypes, non-null counts and memory footprint of the training data.
dataset_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48120 entries, 0 to 48119
Data columns (total 4 columns):
DateTime    48120 non-null datetime64[ns]
Junction    48120 non-null int64
Vehicles    48120 non-null int64
ID          48120 non-null int64
dtypes: datetime64[ns](1), int64(3)
memory usage: 1.5 MB


In [9]:

# Make sure the DateTime column of both frames holds datetime64 values,
# which the `.dt` accessors used for feature engineering require.
for _frame in (dataset_train, dataset_test):
    _frame['DateTime'] = pd.to_datetime(_frame['DateTime'])

In [90]:
def create_features(df, label=None):
    """
    Build calendar features from the ``DateTime`` column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``DateTime`` column holding datetime64 values or
        anything ``pd.to_datetime`` can parse. The frame is not modified.
    label : hashable, optional
        Name of the target column in ``df``. When given, the target is
        returned alongside the features.

    Returns
    -------
    X : pandas.DataFrame
        One row per input row (same index as ``df``) with the columns
        hour, dayofweek, quarter, month, year, dayofyear, dayofmonth,
        weekofyear, is_year_end, is_year_start, is_month_end, is_month_start.
    (X, y) : tuple
        Returned instead of ``X`` when ``label`` is given; ``y`` is ``df[label]``.
    """
    feature_order = ['hour', 'dayofweek', 'quarter', 'month', 'year',
                     'dayofyear', 'dayofmonth', 'weekofyear', 'is_year_end',
                     'is_year_start', 'is_month_end', 'is_month_start']

    # to_datetime is a no-op for datetime64 input and lets callers pass raw strings.
    when = pd.to_datetime(df['DateTime']).dt

    # `.dt.weekofyear` was deprecated and later removed from pandas;
    # `.dt.isocalendar().week` is its replacement. Fall back to the old
    # accessor only on pandas versions that predate isocalendar().
    if hasattr(when, 'isocalendar'):
        iso_week = when.isocalendar().week
        # isocalendar() yields a nullable UInt32; convert to a plain numpy dtype
        # (float only if missing timestamps force NaN) so estimators accept it.
        weekofyear = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    else:
        weekofyear = when.weekofyear

    X = pd.DataFrame(
        {
            'hour': when.hour,
            'dayofweek': when.dayofweek,
            'quarter': when.quarter,
            'month': when.month,
            'year': when.year,
            'dayofyear': when.dayofyear,
            'dayofmonth': when.day,
            'weekofyear': weekofyear,
            'is_year_end': when.is_year_end.astype(int),
            'is_year_start': when.is_year_start.astype(int),
            'is_month_end': when.is_month_end.astype(int),
            'is_month_start': when.is_month_start.astype(int),
        },
        index=df.index,
        columns=feature_order,
    )

    # Compare against None so that falsy-but-valid column names (e.g. 0) still work.
    if label is not None:
        y = df[label]
        return X, y
    return X

In [91]:
# Engineer calendar features for the training rows and keep the target
# next to them in a single frame.
X, y = create_features(dataset_train, label='Vehicles')
features_and_target = pd.concat((X, y), axis='columns')

In [92]:
# Preview the engineered features alongside the target column.
features_and_target.head()

Unnamed: 0,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear,is_year_end,is_year_start,is_month_end,is_month_start,Vehicles
0,0,6,4,11,2015,305,1,44,0,0,0,1,15
1,1,6,4,11,2015,305,1,44,0,0,0,1,13
2,2,6,4,11,2015,305,1,44,0,0,0,1,10
3,3,6,4,11,2015,305,1,44,0,0,0,1,7
4,4,6,4,11,2015,305,1,44,0,0,0,1,9


In [93]:
# Same calendar features for the competition test rows (no target available).
X_test = create_features(dataset_test, label=None)

In [94]:
# Carry the junction identifier over from each raw frame to its feature frame
# so the models can tell the junctions apart.
for _features, _source in ((features_and_target, dataset_train),
                           (X_test, dataset_test)):
    _features['Junction'] = _source['Junction']

In [95]:
# Separate the model inputs from the target.
X_train = features_and_target.drop(columns="Vehicles")
y = features_and_target['Vehicles']

In [96]:
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import train_test_split

In [97]:
# Seeded 80/20 hold-out split of the training data.
# NOTE(review): X_val / y_val are not referenced by any later cell in view.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y,
    test_size=0.2,
    random_state=42,
)

In [106]:
# 5-fold cross-validated bagging ensemble.
# Every fold fits a fresh BaggingRegressor, prints its out-of-fold RMSE and
# contributes one set of predictions for the competition test set; the
# per-fold test predictions are collected in y_pred_totxgb for averaging later.
errlgb = []          # out-of-fold RMSE of each fold
y_pred_totxgb = []   # test-set predictions of each fold
fold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in fold.split(X_train):
    # KFold yields positional indices, so rows are selected with .iloc;
    # label-based lookup is only correct while the index is a default 0..n-1 range.
    X_tr, X_tst = X_train.iloc[train_index], X_train.iloc[test_index]
    y_tr, y_tst = y.iloc[train_index], y.iloc[test_index]
    # Seed the bootstrap sampling so scores and the submission are reproducible.
    br = BaggingRegressor(n_estimators=8, random_state=42)
    br.fit(X_tr, y_tr)
    pred = br.predict(X_tst)
    rmse = np.sqrt(mean_squared_error(y_tst, pred))
    errlgb.append(rmse)
    print(rmse)
    # Predictions for the real test set; kept separate from the fold target above.
    y_test = br.predict(X_test)
    y_pred_totxgb.append(y_test)

3.6615674764449495
4.016040328158323
4.077808016897373
4.198370912483361
4.019515505208636


In [107]:
# R^2 of the most recently fitted `br` on the full training frame.
# NOTE(review): that model was fitted on 4 of the 5 folds of this same data,
# so the figure is optimistic; the per-fold RMSE values are the honest estimate.
br.score(X_train,y)

0.9869910541357273

In [75]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
# Experimental comparison: the same 5-fold procedure with a histogram-based
# gradient boosting model. Results go into their own variables so that running
# this cell cannot overwrite the bagging model `br` or the bagging predictions
# in y_pred_totxgb that the submission cells average.
# NOTE(review): the recorded RMSE (~9.67) is far worse than the bagging run
# (~4.0); learning_rate=0.01 with the default number of boosting iterations is
# presumably under-fitting. Tune before using these predictions.
err_hgb = []          # out-of-fold RMSE of each fold
y_pred_tot_hgb = []   # test-set predictions of each fold
fold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in fold.split(X_train):
    # KFold yields positional indices, hence .iloc rather than label lookup.
    X_tr, X_tst = X_train.iloc[train_index], X_train.iloc[test_index]
    y_tr, y_tst = y.iloc[train_index], y.iloc[test_index]
    hgb = HistGradientBoostingRegressor(learning_rate=0.01, random_state=42)
    hgb.fit(X_tr, y_tr)
    pred = hgb.predict(X_tst)
    rmse = np.sqrt(mean_squared_error(y_tst, pred))
    err_hgb.append(rmse)
    print(rmse)
    y_pred_tot_hgb.append(hgb.predict(X_test))

9.666432131101965
9.668608205269852


KeyboardInterrupt: 

In [108]:
# Average the per-fold test-set predictions element-wise into one vector.
final = np.mean(y_pred_totxgb, axis=0)

In [111]:
#y_test = br.predict(X_test)

In [109]:
# Write the averaged predictions into the submission template.
# assumes `final` is in the same row order as `sample` — TODO confirm
sample["Vehicles"] = final

In [110]:
# Save the submission file without the index column.
# NOTE(review): absolute, user-specific output path — the notebook will not
# run on another machine until this is made configurable.
sample.to_csv(r'C:\Users\Ashish\Desktop\Data Science A-Z\AVTraffic\submission_br_folds.csv', index=False)