In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.options.display.float_format = '{:,.2f}'.format
pd.options.display.max_rows = 100
pd.options.display.max_columns = 20

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score, mean_absolute_error

In [3]:
raw = pd.read_csv('bike-sharing-demand/Full.csv')
raw

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 0:00,1,0,0,1,9.84,14.39,81,0.00,3,13,16
1,2011-01-01 1:00,1,0,0,1,9.02,13.63,80,0.00,8,32,40
2,2011-01-01 2:00,1,0,0,1,9.02,13.63,80,0.00,5,27,32
3,2011-01-01 3:00,1,0,0,1,9.84,14.39,75,0.00,3,10,13
4,2011-01-01 4:00,1,0,0,1,9.84,14.39,75,0.00,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
17374,2012-12-31 19:00,1,0,1,2,10.66,12.88,60,11.00,11,108,119
17375,2012-12-31 20:00,1,0,1,2,10.66,12.88,60,11.00,8,81,89
17376,2012-12-31 21:00,1,0,1,1,10.66,12.88,60,11.00,7,83,90
17377,2012-12-31 22:00,1,0,1,1,10.66,13.63,56,9.00,13,48,61


In [4]:
def non_feature_engineering(raw):
    if 'datetime' in raw.columns:
        raw['datetime'] = pd.to_datetime(raw['datetime'])
        raw['DateTime'] = pd.to_datetime(raw['datetime'])

    if raw.index.dtype == 'int64':
        raw.set_index('DateTime', inplace = True)

    raw = raw.asfreq('H', method = 'ffill')
    raw_nfe = raw.copy()
    return raw_nfe

In [22]:
def feature_engineering(raw):
    if 'datetime' in raw.columns:
        raw['datetime'] = pd.to_datetime(raw['datetime'])
        raw['DateTime'] = pd.to_datetime(raw['datetime'])

    if raw.index.dtype == 'int64':
        raw.set_index('DateTime', inplace = True)

    raw = raw.asfreq('H', method = 'ffill')

    result = sm.tsa.seasonal_decompose(raw['count'], model = 'additive')
    Y_trend = pd.DataFrame(result.trend)
    Y_trend.fillna(method = 'ffill', inplace = True)
    Y_trend.fillna(method = 'bfill', inplace = True)
    Y_trend.columns = ['count_trend']
    Y_seasonal = pd.DataFrame(result.seasonal)
    Y_seasonal.fillna(method = 'ffill', inplace = True)
    Y_seasonal.fillna(method = 'bfill', inplace = True)
    Y_seasonal.columns = ['count_seasonal']

    if 'count_trend' not in raw.columns:
        if 'count_seasonal' not in raw.columns:
            raw = pd.concat([raw, Y_trend, Y_seasonal], axis = 1)

    Y_day = raw[['count']].rolling(24).mean()
    Y_day.fillna(method = 'ffill', inplace = True)
    Y_day.fillna(method = 'bfill', inplace = True)
    Y_day.columns = ['count_day']
    Y_week = raw[['count']].rolling(24).mean()
    Y_week.fillna(method = 'ffill', inplace = True)
    Y_week.fillna(method = 'bfill', inplace = True)
    Y_week.columns = ['count_week']

    if 'count_day' not in raw.columns:
        if 'count_week' not in raw.columns:
            raw = pd.concat([raw, Y_day, Y_week], axis = 1)

    Y_diff = raw[['count']].diff()
    Y_diff.fillna(method = 'bfill', inplace = True)
    Y_diff.fillna(method = 'ffill', inplace = True)
    Y_diff.columns = ['count_diff']

    if 'count_diff' not in raw.columns:
        raw = pd.concat([raw, Y_diff], axis = 1)

    raw['temp_group'] = pd.cut(raw['temp'], 10)

    raw['Year'] = raw.datetime.dt.year
    raw['Quarter'] = raw.datetime.dt.quarter
    raw['Quarter_ver2'] = raw['Quarter'] + (raw.Year - raw.Year.min()) * 4
    raw['Month'] = raw.datetime.dt.month
    raw['Day'] = raw.datetime.dt.day
    raw['Hour'] = raw.datetime.dt.hour
    raw['DayofWeek'] = raw.datetime.dt.dayofweek

    raw['count_lag1'] = raw['count'].shift(1)
    raw['count_lag2'] = raw['count'].shift(2)
    raw['count_lag1'].fillna(method = "bfill", inplace = True)
    raw['count_lag2'].fillna(method= "bfill", inplace = True)

    if 'Quarter' in raw.columns:
        raw = pd.concat([raw, pd.get_dummies(raw['Quarter'], prefix = 'Quarter_Dummy', drop_first = True)], axis = 1)
        del raw['Quarter']

    raw_fe = raw.copy()
    return raw_fe

In [6]:
raw_nfe = non_feature_engineering(raw)
raw_nfe

Unnamed: 0_level_0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2011-01-01 00:00:00,2011-01-01 00:00:00,1,0,0,1,9.84,14.39,81,0.00,3,13,16
2011-01-01 01:00:00,2011-01-01 01:00:00,1,0,0,1,9.02,13.63,80,0.00,8,32,40
2011-01-01 02:00:00,2011-01-01 02:00:00,1,0,0,1,9.02,13.63,80,0.00,5,27,32
2011-01-01 03:00:00,2011-01-01 03:00:00,1,0,0,1,9.84,14.39,75,0.00,3,10,13
2011-01-01 04:00:00,2011-01-01 04:00:00,1,0,0,1,9.84,14.39,75,0.00,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-31 19:00:00,2012-12-31 19:00:00,1,0,1,2,10.66,12.88,60,11.00,11,108,119
2012-12-31 20:00:00,2012-12-31 20:00:00,1,0,1,2,10.66,12.88,60,11.00,8,81,89
2012-12-31 21:00:00,2012-12-31 21:00:00,1,0,1,1,10.66,12.88,60,11.00,7,83,90
2012-12-31 22:00:00,2012-12-31 22:00:00,1,0,1,1,10.66,13.63,56,9.00,13,48,61


In [9]:
raw_fe = feature_engineering(raw)
raw_fe

Unnamed: 0_level_0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,...,Quarter_ver2,Month,Day,Hour,DayofWeek,count_lag1,count_lag2,Quarter_Dummy_2,Quarter_Dummy_3,Quarter_Dummy_4
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-01 00:00:00,2011-01-01 00:00:00,1,0,0,1,9.84,14.39,81,0.00,3,...,1,1,1,0,5,,,0,0,0
2011-01-01 01:00:00,2011-01-01 01:00:00,1,0,0,1,9.02,13.63,80,0.00,8,...,1,1,1,1,5,16.00,,0,0,0
2011-01-01 02:00:00,2011-01-01 02:00:00,1,0,0,1,9.02,13.63,80,0.00,5,...,1,1,1,2,5,40.00,16.00,0,0,0
2011-01-01 03:00:00,2011-01-01 03:00:00,1,0,0,1,9.84,14.39,75,0.00,3,...,1,1,1,3,5,32.00,40.00,0,0,0
2011-01-01 04:00:00,2011-01-01 04:00:00,1,0,0,1,9.84,14.39,75,0.00,0,...,1,1,1,4,5,13.00,32.00,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-31 19:00:00,2012-12-31 19:00:00,1,0,1,2,10.66,12.88,60,11.00,11,...,8,12,31,19,0,122.00,164.00,0,0,1
2012-12-31 20:00:00,2012-12-31 20:00:00,1,0,1,2,10.66,12.88,60,11.00,8,...,8,12,31,20,0,119.00,122.00,0,0,1
2012-12-31 21:00:00,2012-12-31 21:00:00,1,0,1,1,10.66,12.88,60,11.00,7,...,8,12,31,21,0,89.00,119.00,0,0,1
2012-12-31 22:00:00,2012-12-31 22:00:00,1,0,1,1,10.66,13.63,56,9.00,13,...,8,12,31,22,0,90.00,89.00,0,0,1


In [19]:
Y_colname = ['count']
X_remove = ['datetime', 'DateTime', 'temp_group', 'casual', 'registered']
X_colname = [x for x in raw_fe.columns if x not in Y_colname+X_remove]
len(X_colname)

24

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(raw_fe[X_colname], raw_fe[Y_colname], test_size = 0.2, random_state = 123)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)
X_train.info()

(14035, 26) (14035, 1)
(3509, 26) (3509, 1)
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 14035 entries, 2011-10-13 06:00:00 to 2012-10-17 05:00:00
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   season           14035 non-null  int64  
 1   holiday          14035 non-null  int64  
 2   workingday       14035 non-null  int64  
 3   weather          14035 non-null  int64  
 4   temp             14035 non-null  float64
 5   atemp            14035 non-null  float64
 6   humidity         14035 non-null  int64  
 7   windspeed        14035 non-null  float64
 8   count_trend      14035 non-null  float64
 9   count_trend      14035 non-null  float64
 10  count_seasonal   14035 non-null  float64
 11  count_day        14035 non-null  float64
 12  count_trend      14035 non-null  float64
 13  count_trend      14035 non-null  float64
 14  count_diff       14035 non-null  float64
 15  Year             14035 no

In [25]:
# for a time-serires
raw_train = raw_fe.loc[raw_fe.index < '2012-07-01',:]
raw_test = raw_fe.loc[raw_fe.index >= '2012-07-01',:]
print(raw_train.shape, raw_test.shape)

(13128, 30) (4416, 30)


In [26]:
# data split of x and y from train/test sets
X_train = raw_train[X_colname]
Y_train = raw_train[Y_colname]
X_test = raw_test[X_colname]
Y_train = raw_test[Y_colname]

print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(13128, 26) (4416, 1)
(4416, 26) (3509, 1)


In [27]:
def datasplit_cs(raw, Y_colname, X_colname):
    X_train, X_test, Y_train, Y_test = train_test_split(raw_fe[X_colname], raw_fe[Y_colname], test_size = 0.2, random_state = 123)
    print(X_train.shape, Y_train.shape)
    print(X_test.shape, Y_test.shape)

In [28]:
def datasplit_ts(raw, Y_colname, X_colname, criteria):
    raw_train = raw.loc[raw.index < criteria,:]
    raw_test = raw.loc[raw.index >= criteria,:]
    Y_train = raw_train[Y_colname]
    X_train = raw_train[X_colname]
    Y_test = raw_test[Y_colname]
    X_test = raw_test[X_colname]