In [1]:
import pandas as pd
import numpy as np
import glob
import time
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
# Initialise Plotly's offline notebook mode; the iplot() calls in the plotting cells rely on it.
init_notebook_mode(connected=True)

In [2]:
# Load the training CSV into a DataFrame (path is relative to the notebook's working directory).
df = pd.read_csv('train_6BJx641.csv')

In [3]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,ID,datetime,temperature,var1,pressure,windspeed,var2,electricity_consumption
0,0,2013-07-01 00:00:00,-11.4,-17.1,1003.0,571.91,A,216.0
1,1,2013-07-01 01:00:00,-12.1,-19.3,996.0,575.04,A,210.0
2,2,2013-07-01 02:00:00,-12.9,-20.0,1000.0,578.435,A,225.0
3,3,2013-07-01 03:00:00,-11.4,-17.1,995.0,582.58,A,216.0
4,4,2013-07-01 04:00:00,-11.4,-19.3,1005.0,586.6,A,222.0


In [4]:
# The row identifier is not used as a feature, so remove it.
df = df.drop(columns='ID')

In [5]:
# Count missing values per column.
df.isnull().sum()

datetime                   0
temperature                0
var1                       0
pressure                   0
windspeed                  0
var2                       0
electricity_consumption    0
dtype: int64

In [6]:
# Show column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26496 entries, 0 to 26495
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   datetime                 26496 non-null  object 
 1   temperature              26496 non-null  float64
 2   var1                     26496 non-null  float64
 3   pressure                 26496 non-null  float64
 4   windspeed                26496 non-null  float64
 5   var2                     26496 non-null  object 
 6   electricity_consumption  26496 non-null  float64
dtypes: float64(5), object(2)
memory usage: 1.4+ MB


In [7]:
# Derive calendar features from the timestamp so the models can capture seasonality.
# The parsed timestamps are kept in a local Series instead of a temporary column.
stamp = pd.to_datetime(df['datetime'])
for part in ('year', 'month', 'day', 'hour'):
    df[part] = getattr(stamp.dt, part)

In [8]:
# Check that the year/month/day/hour columns were added.
df.head()

Unnamed: 0,datetime,temperature,var1,pressure,windspeed,var2,electricity_consumption,year,month,day,hour
0,2013-07-01 00:00:00,-11.4,-17.1,1003.0,571.91,A,216.0,2013,7,1,0
1,2013-07-01 01:00:00,-12.1,-19.3,996.0,575.04,A,210.0,2013,7,1,1
2,2013-07-01 02:00:00,-12.9,-20.0,1000.0,578.435,A,225.0,2013,7,1,2
3,2013-07-01 03:00:00,-11.4,-17.1,995.0,582.58,A,216.0,2013,7,1,3
4,2013-07-01 04:00:00,-11.4,-19.3,1005.0,586.6,A,222.0,2013,7,1,4


In [9]:
# Order the rows by timestamp; the head()/tail() based split further down relies on row order.
# NOTE(review): 'datetime' is still a string column here, so this is a lexicographic sort.
# That matches chronological order for the zero-padded 'YYYY-MM-DD HH:MM:SS' layout seen
# in the preview — TODO confirm every row uses that layout.
df=df.sort_values(by='datetime')

In [10]:
# Preview the frame after sorting.
df.head()

Unnamed: 0,datetime,temperature,var1,pressure,windspeed,var2,electricity_consumption,year,month,day,hour
0,2013-07-01 00:00:00,-11.4,-17.1,1003.0,571.91,A,216.0,2013,7,1,0
1,2013-07-01 01:00:00,-12.1,-19.3,996.0,575.04,A,210.0,2013,7,1,1
2,2013-07-01 02:00:00,-12.9,-20.0,1000.0,578.435,A,225.0,2013,7,1,2
3,2013-07-01 03:00:00,-11.4,-17.1,995.0,582.58,A,216.0,2013,7,1,3
4,2013-07-01 04:00:00,-11.4,-19.3,1005.0,586.6,A,222.0,2013,7,1,4


In [11]:
# The raw timestamp string is dropped; its information now lives in year/month/day/hour.
df = df.drop(columns='datetime')

In [12]:
# Preview the frame that goes into encoding (timestamp string removed).
df.head()

Unnamed: 0,temperature,var1,pressure,windspeed,var2,electricity_consumption,year,month,day,hour
0,-11.4,-17.1,1003.0,571.91,A,216.0,2013,7,1,0
1,-12.1,-19.3,996.0,575.04,A,210.0,2013,7,1,1
2,-12.9,-20.0,1000.0,578.435,A,225.0,2013,7,1,2
3,-11.4,-17.1,995.0,582.58,A,216.0,2013,7,1,3
4,-11.4,-19.3,1005.0,586.6,A,222.0,2013,7,1,4


In [13]:
# Converting all categorical columns to numerical ones via one-hot encoding
# (the preview in the next output shows var2 expanded into var2_A / var2_B / var2_C).
df1=pd.get_dummies(df)

In [14]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

In [15]:
# Preview the one-hot encoded frame.
df1.head()

Unnamed: 0,temperature,var1,pressure,windspeed,electricity_consumption,year,month,day,hour,var2_A,var2_B,var2_C
0,-11.4,-17.1,1003.0,571.91,216.0,2013,7,1,0,1,0,0
1,-12.1,-19.3,996.0,575.04,210.0,2013,7,1,1,1,0,0
2,-12.9,-20.0,1000.0,578.435,225.0,2013,7,1,2,1,0,0
3,-11.4,-17.1,995.0,582.58,216.0,2013,7,1,3,1,0,0
4,-11.4,-19.3,1005.0,586.6,222.0,2013,7,1,4,1,0,0


In [16]:
# Creating target and feature objects.
# The target is selected by name rather than by column position, so the code keeps
# working if the column order of df1 ever changes.
x = df1.drop(columns=['electricity_consumption'])
y = df1['electricity_consumption']

In [17]:
# Implementing SelectKBest: score every feature against the target with f_regression
# and time the fit.
# NOTE(review): x and y are built from df1 before the train/test split, so these scores
# are computed on all rows, including the ones later held out.
# perf_counter() is the clock intended for measuring short durations; time.time() is
# wall-clock time and can be too coarse for a fit this fast.
st = time.perf_counter()
bestfeatures = SelectKBest(score_func=f_regression)
fit = bestfeatures.fit(x, y)
et = time.perf_counter() - st
print(et)

0.015002012252807617


In [18]:
# Pair each feature name with its univariate score and keep the five highest-scoring ones.
featureScores = pd.DataFrame({'Featuress': x.columns, 'Score': fit.scores_})
best_features = featureScores.nlargest(5, 'Score')

In [19]:
# Display the five highest-scoring features.
best_features

Unnamed: 0,Featuress,Score
3,windspeed,1603.378268
1,var1,483.788043
0,temperature,369.330583
6,day,256.724699
9,var2_B,74.551514


In [20]:
# (rows, columns) of the encoded frame; the split sizes used below are tied to this row count.
df1.shape

(26496, 12)

In [21]:
# Hold-out block: the last 7940 rows of df1 (split further into test and validation below).
# NOTE(review): the same assignment is repeated in the "train-test-validation split" cell.
test=df1.tail(7940)

In [22]:
# Test set: the first 7440 rows of the hold-out block.
# NOTE(review): the same assignment is repeated in the "train-test-validation split" cell.
test1=test.head(7440)

In [23]:
# Training set: the first 18556 rows of df1.
# NOTE(review): the same assignment is repeated in the "train-test-validation split" cell.
train=df1.head(18556)

In [24]:
# Validation set: the last 500 rows of the hold-out block (named `pred` in this notebook).
# NOTE(review): the same assignment is repeated in the "train-test-validation split" cell.
pred=test.tail(500)

In [25]:
# Chronological train / test / validation split (df1 keeps the time-sorted row order of df).
# Only two sizes are defined; every boundary is derived from them, so the three partitions
# are always disjoint and together cover all of df1 instead of relying on separately
# hard-coded row counts staying consistent with each other.
N_HOLDOUT = 7940      # rows at the end of the series kept out of training
N_VALIDATION = 500    # final rows of the hold-out block used as the validation set

# training set: everything before the hold-out block
train = df1.iloc[:-N_HOLDOUT]
# hold-out block (test + validation)
test = df1.iloc[-N_HOLDOUT:]
# test set
test1 = test.iloc[:-N_VALIDATION]
# validation set (named `pred` throughout the notebook)
pred = test.iloc[-N_VALIDATION:]

assert len(train) + len(test1) + len(pred) == len(df1), "split does not cover df1 exactly"

In [26]:
# Inspect the last rows of the test set.
test1.tail()

Unnamed: 0,temperature,var1,pressure,windspeed,electricity_consumption,year,month,day,hour,var2_A,var2_B,var2_C
25991,-6.4,-19.3,991.0,119.71,183.0,2017,6,2,23,1,0,0
25992,-5.0,-21.4,1008.0,127.485,195.0,2017,6,3,0,1,0,0
25993,-7.1,-20.0,1004.0,132.03,183.0,2017,6,3,1,1,0,0
25994,-6.4,-17.9,991.0,136.575,183.0,2017,6,3,2,1,0,0
25995,-5.7,-17.9,1008.0,139.83,180.0,2017,6,3,3,1,0,0


In [27]:
 # Inspect the last rows of the validation set.
 # NOTE(review): this cell's code line carries a stray leading space — presumably tolerated
 # by the notebook kernel, but it would not be valid in a plain .py script; confirm before exporting.
 pred.tail()

Unnamed: 0,temperature,var1,pressure,windspeed,electricity_consumption,year,month,day,hour,var2_A,var2_B,var2_C
26491,-0.7,-15.0,1009.0,51.685,225.0,2017,6,23,19,1,0,0
26492,-2.9,-11.4,1005.0,56.105,213.0,2017,6,23,20,1,0,0
26493,-1.4,-12.9,995.0,61.275,213.0,2017,6,23,21,1,0,0
26494,-2.9,-11.4,996.0,67.21,210.0,2017,6,23,22,1,0,0
26495,-2.1,-11.4,1009.0,71.88,210.0,2017,6,23,23,1,0,0


In [28]:
# Training target, selected by name instead of by column position.
y_train = train['electricity_consumption']

In [29]:
# Training features: every column except the target.
X_train = train.drop('electricity_consumption', axis=1)

In [30]:
# Test-set target, selected by name instead of by column position.
y_test = test1['electricity_consumption']

In [31]:
# Test-set features: every column except the target.
X_test = test1.drop('electricity_consumption', axis=1)

In [32]:
# Validation-set target (true values, despite the `y_pred` name), selected by name
# instead of by column position.
y_pred = pred['electricity_consumption']

In [33]:
# Validation-set features: every column except the target.
X_pred = pred.drop('electricity_consumption', axis=1)

In [34]:
import xgboost as xgb
# Gradient-boosted regressor trained on the training split.
# 'reg:squarederror' is the current name of the squared-error objective; the older
# alias 'reg:linear' used previously is deprecated in XGBoost.
# `alpha` is forwarded to the booster as the L1 regularisation weight.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                          max_depth=100, alpha=10, n_estimators=140)
xg_reg.fit(X_train, y_train)



In [35]:
from sklearn.metrics import mean_squared_error
# Score the XGBoost model on the test set.
predictions = xg_reg.predict(X_test)
mse = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(mse)
errors = np.abs(predictions - y_test)    # absolute error per row
mape = (errors / y_test).mean() * 100    # mean absolute percentage error, in percent
print("XGBOOST model")
print("mape value for test set", mape)
print("mse value for test set", mse)
print("RMSE value for test set", RMSE)

XGBOOST model
mape value for test set 20.395743624282215
mse value for test set 9949.946409810507
RMSE value for test set 99.74941809259093


In [36]:
# Score the XGBoost model on the validation set (X_pred / y_pred hold its features / true values).
predictions = xg_reg.predict(X_pred)
mse = mean_squared_error(y_pred, predictions)
RMSE = np.sqrt(mse)
errors = np.abs(predictions - y_pred)    # absolute error per row
mape = (errors / y_pred).mean() * 100    # mean absolute percentage error, in percent
print("XGBOOST model")
print("mape value for validation set", mape)
print("mse value for validation set", mse)
print("RMSE value for validation set", RMSE)

XGBOOST model
mape value for validation set 18.31413324392246
mse value for validation set 6692.219349248934
RMSE value for validation set 81.80598602332799


In [37]:
from lightgbm import LGBMRegressor
# LightGBM regressor with a fixed seed, trained on the same training split as the other models.
lgb_reg = LGBMRegressor(n_estimators=100, random_state=42)
lgb_reg.fit(X_train, y_train)

In [38]:
from sklearn.metrics import mean_squared_error
# Score the LightGBM model on the test set.
predictions = lgb_reg.predict(X_test)
mse = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(mse)
errors = np.abs(predictions - y_test)    # absolute error per row
mape = (errors / y_test).mean() * 100    # mean absolute percentage error, in percent
print("LIGHTGBM model")
print("mape value for test set", mape)
print("mse value for test set", mse)
print("RMSE value for test set", RMSE)

LIGHTGBM model
mape value for test set 17.8086387209238
mse value for test set 7448.058075387331
RMSE value for test set 86.30213250776212


In [39]:
# Score the LightGBM model on the validation set (X_pred / y_pred hold its features / true values).
predictions = lgb_reg.predict(X_pred)
mse = mean_squared_error(y_pred, predictions)
RMSE = np.sqrt(mse)
errors = np.abs(predictions - y_pred)    # absolute error per row
mape = (errors / y_pred).mean() * 100    # mean absolute percentage error, in percent
print("LIGHTGBM model")
print("mape value for validation set", mape)
print("mse value for validation set", mse)
print("RMSE value for validation set", RMSE)

LIGHTGBM model
mape value for validation set 14.524462046915062
mse value for validation set 4610.576774339071
RMSE value for validation set 67.90122807681074


In [40]:
from sklearn.ensemble import RandomForestRegressor
# Random-forest regressor with a fixed seed, trained on the same training split as the other models.
regr = RandomForestRegressor(n_estimators=100, random_state=42)
regr.fit(X_train, y_train)

In [41]:
from sklearn.metrics import mean_squared_error
# Score the random-forest model on the test set.
predictions = regr.predict(X_test)
mse = mean_squared_error(y_test, predictions)
RMSE = np.sqrt(mse)
errors = np.abs(predictions - y_test)    # absolute error per row
mape = (errors / y_test).mean() * 100    # mean absolute percentage error, in percent
print("RANDOM FOREST model")
print("mape value for test set", mape)
print("mse value for test set", mse)
print("RMSE value for test set", RMSE)

RANDOM FOREST model
mape value for test set 18.341864621229462
mse value for test set 7642.701889959678
RMSE value for test set 87.42254794936875


In [42]:
# Score the random-forest model on the validation set (X_pred / y_pred hold its features / true values).
predictions = regr.predict(X_pred)
mse = mean_squared_error(y_pred, predictions)
RMSE = np.sqrt(mse)
errors = np.abs(predictions - y_pred)    # absolute error per row
mape = (errors / y_pred).mean() * 100    # mean absolute percentage error, in percent
print("RANDOM FOREST model")
print("mape value for validation set", mape)
print("mse value for validation set", mse)
print("RMSE value for validation set", RMSE)

RANDOM FOREST model
mape value for validation set 16.41982254170068
mse value for validation set 5138.454886199999
RMSE value for validation set 71.68301672083841


In [43]:
def evaluate(model, test_features, test_labels):
    """Score a fitted regressor on a labelled set and print a short report.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(features)``.
    test_features : object
        Passed unchanged to ``model.predict``.
    test_labels : array-like
        True target values, one per prediction (compared positionally).

    Returns
    -------
    tuple
        ``(accuracy, predictions, RMSE)`` where ``accuracy`` is ``100 - MAPE``
        in percent, ``predictions`` is what ``model.predict`` returned and
        ``RMSE`` is the root-mean-squared error.

    Notes
    -----
    Rows whose true value is 0 are left out of the MAPE (the percentage error
    is undefined there and would otherwise turn the whole score into inf/NaN);
    they still count towards the average error and the RMSE. If every label is
    0 the MAPE, and therefore the accuracy, is NaN.
    """
    predictions = model.predict(test_features)

    actual = np.asarray(test_labels, dtype=float)
    errors = np.abs(np.asarray(predictions, dtype=float) - actual)

    # Percentage error is only defined where the true value is non-zero.
    nonzero = actual != 0
    if nonzero.any():
        mape = 100 * np.mean(errors[nonzero] / actual[nonzero])
    else:
        mape = float('nan')
    accuracy = 100 - mape

    # RMSE straight from the absolute errors computed above; this keeps the
    # function independent of imports made in other cells.
    RMSE = np.sqrt(np.mean(np.square(errors)))

    print('Model Performance')
    # The error is in the target's own units (electricity consumption), so no
    # unit label is printed.
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    print('RMSE = {:0.2f}'.format(RMSE))
    return accuracy,predictions,RMSE

In [44]:
# Rank the three fitted models by RMSE on the test set and remember the winner.
models = [xg_reg, lgb_reg, regr]
model_name = ['XGBoost', 'LightGBM', 'RandomForest']
model_RMSE, model_predictions = [], []
for item in models:
    # evaluate() returns (accuracy, predictions, RMSE); only the last two are kept.
    base_accuracy, predictions, RMSE = evaluate(item, X_test, y_test)
    model_RMSE += [RMSE]
    model_predictions += [predictions]
# Position of the lowest RMSE (the first one wins a tie).
r = min(range(len(model_RMSE)), key=model_RMSE.__getitem__)
best_model, best_model_name, best_model_predictions = models[r], model_name[r], model_predictions[r]

Model Performance
Average Error: 67.7595 degrees.
Accuracy = 79.60%.
RMSE = 99.75
Model Performance
Average Error: 58.4586 degrees.
Accuracy = 82.19%.
RMSE = 86.30
Model Performance
Average Error: 59.3871 degrees.
Accuracy = 81.66%.
RMSE = 87.42


In [45]:
# Report the model with the lowest test-set RMSE, one heading per line followed by its value.
print('Best Model:', best_model_name, sep='\n')
print('Model Object:', best_model, sep='\n')
print('Predictions:', best_model_predictions, sep='\n')

Best Model:
LightGBM
Model Object:
LGBMRegressor(random_state=42)
Predictions:
[216.59204245 232.93339549 224.6671183  ... 198.45734021 192.5073198
 191.79087945]


In [46]:
# Plot the test-set time series: actual consumption vs. the best model's predictions.
# The plotting data is assembled in its own frame so that X_test and y_test stay
# untouched; writing extra columns into them would break the earlier predict/evaluate
# cells when they are run again.
test_plot = pd.DataFrame(
    {
        # Rebuild the timestamp from the calendar feature columns.
        'datetime': pd.to_datetime(X_test[['year', 'month', 'day', 'hour']]),
        'electricity_consumption': np.asarray(y_test),
        'predictions': best_model_predictions,
    },
    index=X_test.index,
).sort_values(by='datetime')

timestamps = test_plot['datetime'].astype(str)
trace0 = go.Scatter(x=timestamps, y=test_plot['electricity_consumption'].values, opacity = 0.8, name='actual_value')
trace1 = go.Scatter(x=timestamps, y=test_plot['predictions'].values, opacity = 0.8, name='prediction')
layout = dict(
    title= "Prediction vs actual:",
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1, label='1m', step='month', stepmode='backward'),
                dict(count=6, label='6m', step='month', stepmode='backward'),
                dict(count=12, label='12m', step='month', stepmode='backward'),
                dict(step='all')
            ])
        ),
        rangeslider=dict(visible = True),
        type='date'
    )
)
fig = dict(data= [trace0,trace1], layout=layout)
iplot(fig)

In [47]:
# Rank the three fitted models by RMSE on the validation set and remember the winner.
models = [xg_reg, lgb_reg, regr]
model_name = ['XGBoost', 'LightGBM', 'RandomForest']
model_RMSE, model_predictions = [], []
for item in models:
    # evaluate() returns (accuracy, predictions, RMSE); only the last two are kept.
    base_accuracy, predictions, RMSE = evaluate(item, X_pred, y_pred)
    model_RMSE += [RMSE]
    model_predictions += [predictions]
# Position of the lowest RMSE (the first one wins a tie).
r = min(range(len(model_RMSE)), key=model_RMSE.__getitem__)
best_model, best_model_name, best_model_predictions = models[r], model_name[r], model_predictions[r]

Model Performance
Average Error: 54.9496 degrees.
Accuracy = 81.69%.
RMSE = 81.81
Model Performance
Average Error: 43.6196 degrees.
Accuracy = 85.48%.
RMSE = 67.90
Model Performance
Average Error: 46.8309 degrees.
Accuracy = 83.58%.
RMSE = 71.68


In [48]:
# Report the model with the lowest validation-set RMSE, one heading per line followed by its value.
print('Best Model:', best_model_name, sep='\n')
print('Model Object:', best_model, sep='\n')
print('Predictions:', best_model_predictions, sep='\n')

Best Model:
LightGBM
Model Object:
LGBMRegressor(random_state=42)
Predictions:
[192.02849511 193.2968421  237.88839221 221.5189054  212.80355811
 206.80779746 207.37546971 207.14007037 208.5919119  205.05943497
 202.49199157 205.05943497 206.73860635 203.37080023 207.00436673
 204.19965083 233.34242302 238.22601053 229.43033946 217.36532302
 209.62031574 210.11428889 204.99864413 202.54362727 235.22940937
 221.95106969 225.34880764 222.14923335 225.23069969 217.34363158
 214.27011354 211.82848886 213.89049168 211.31789582 207.13325938
 232.87577214 231.06615227 235.98911853 255.70579284 254.29933613
 296.3633898  352.75764798 269.2324668  263.67953229 355.5478214
 359.1433237  306.64453897 283.07961304 328.32357687 328.09345003
 272.7926993  271.04794768 262.34072163 243.41526875 223.60502308
 217.63879732 250.8424193  237.80932361 242.9721742  232.71441757
 239.03637335 250.66858403 248.95450287 238.74039742 252.07948862
 314.62270119 296.32141252 292.02354448 250.9503676  300.6073805

In [49]:
# Plot the validation-set time series: actual consumption vs. the best model's predictions.
# The plotting data is assembled in its own frame so that X_pred and y_pred (the
# validation features and true values) stay untouched; writing extra columns into them
# would break the earlier predict/evaluate cells when they are run again.
validation_plot = pd.DataFrame(
    {
        # Rebuild the timestamp from the calendar feature columns.
        'datetime': pd.to_datetime(X_pred[['year', 'month', 'day', 'hour']]),
        'electricity_consumption': np.asarray(y_pred),
        'predictions': best_model_predictions,
    },
    index=X_pred.index,
).sort_values(by='datetime')

timestamps = validation_plot['datetime'].astype(str)
trace0 = go.Scatter(x=timestamps, y=validation_plot['electricity_consumption'].values, opacity = 0.8, name='actual_value')
trace1 = go.Scatter(x=timestamps, y=validation_plot['predictions'].values, opacity = 0.8, name='prediction')
layout = dict(
    title= "Prediction vs actual:",
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1, label='1m', step='month', stepmode='backward'),
                dict(count=6, label='6m', step='month', stepmode='backward'),
                dict(count=12, label='12m', step='month', stepmode='backward'),
                dict(step='all')
            ])
        ),
        rangeslider=dict(visible = True),
        type='date'
    )
)
fig = dict(data= [trace0,trace1], layout=layout)
iplot(fig)