# Prediction Models

Importing Libraries

In [53]:
import pandas as pd
import datetime
import numpy as np
import sklearn
from sklearn.cross_validation import train_test_split 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.grid_search import GridSearchCV
from sklearn import linear_model
from sklearn.metrics import *
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from tkinter import filedialog
from tkinter import *

List of Customised functions used

In [54]:
def mean_absolute_percentage_error(y_true, y_pred): 
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
weekend = ['Saturday','Sunday']
def week_day_type(x):
    if x in weekend:
        return 'weekends'
    else:
        return 'weekdays'
morning=range(6,12)
afternoon=range(12,17)
evening=range(17,22)
def time_slot(x):
    if x in morning:
        return 'morning'
    elif x in afternoon:
        return 'afternoon'
    elif x in evening:
        return 'evening'
    else:
        return 'night'  

Reading the csv file and loading it into a dataframe

In [187]:
root = Tk()
root.filename =  filedialog.askopenfilename(initialdir = "/",title = "Select file",filetypes = (("csv files","*.csv"),("all files","*.*")))
print (root.filename)
df=pd.read_csv(root.filename)
root.after(30, lambda: root.destroy())
root.mainloop()

C:/Users/Akash/Documents/ADS/Assignment-2-master/energydata_complete.csv


Creating additional features and dummy columns in the data so that our models get more insights and make more accurate predictions

In [188]:
df['date']=pd.to_datetime(df['date'])
df['year']=df['date'].dt.year
df['month']=df['date'].dt.month
df['day']=df['date'].dt.day
df['day_of_week']=df['date'].dt.weekday_name
df['time_hr_24']=df['date'].dt.hour
df['time_min']=df['date'].dt.minute
df['week_day_type']=df['day_of_week'].map(week_day_type)
morning=range(6,12)
afternoon=range(12,17)
evening=range(17,22)  
df['time_slot']=df['time_hr_24'].map(time_slot)
df.drop(['date'],axis=1,inplace=True)
df=pd.get_dummies(df,prefix=['DOW','TS','WDT'],columns=['day_of_week','time_slot','week_day_type'])

Removing outliers

In [189]:
#Removing Out-liers
print(df.shape)
df=df[(df['Appliances']-df['Appliances'].mean()).abs() <= 3*df['Appliances'].std()]
df=df[(df['lights']-df['lights'].mean()).abs() <= 3*df['lights'].std()]
df=df[(df['T1']-df['T1'].mean()).abs() <= 3*df['T1'].std()]
df=df[(df['T2']-df['T2'].mean()).abs() <= 3*df['T2'].std()]
df=df[(df['T3']-df['T3'].mean()).abs() <= 3*df['T3'].std()]
df=df[(df['T4']-df['T4'].mean()).abs() <= 3*df['T4'].std()]
df=df[(df['T5']-df['T5'].mean()).abs() <= 3*df['T5'].std()]
df=df[(df['T6']-df['T6'].mean()).abs() <= 3*df['T6'].std()]
df=df[(df['T7']-df['T7'].mean()).abs() <= 3*df['T7'].std()]
df=df[(df['T8']-df['T8'].mean()).abs() <= 3*df['T8'].std()]
df=df[(df['T9']-df['T9'].mean()).abs() <= 3*df['T9'].std()]
df=df[(df['T_out']-df['T_out'].mean()).abs() <= 3*df['T_out'].std()]
df=df[(df['RH_1']-df['RH_1'].mean()).abs() <= 3*df['RH_1'].std()]
df=df[(df['RH_2']-df['RH_2'].mean()).abs() <= 3*df['RH_2'].std()]
df=df[(df['RH_3']-df['RH_3'].mean()).abs() <= 3*df['RH_3'].std()]
df=df[(df['RH_4']-df['RH_4'].mean()).abs() <= 3*df['RH_4'].std()]
df=df[(df['RH_5']-df['RH_5'].mean()).abs() <= 3*df['RH_5'].std()]
df=df[(df['RH_6']-df['RH_6'].mean()).abs() <= 3*df['RH_6'].std()]
df=df[(df['RH_7']-df['RH_7'].mean()).abs() <= 3*df['RH_7'].std()]
df=df[(df['RH_8']-df['RH_8'].mean()).abs() <= 3*df['RH_8'].std()]
df=df[(df['RH_9']-df['RH_9'].mean()).abs() <= 3*df['RH_9'].std()]
df=df[(df['RH_out']-df['RH_out'].mean()).abs() <= 3*df['RH_out'].std()]
df=df[(df['Press_mm_hg']-df['Press_mm_hg'].mean()).abs() <= 3*df['Press_mm_hg'].std()]
df=df[(df['Windspeed']-df['Windspeed'].mean()).abs() <= 3*df['Windspeed'].std()]
df=df[(df['Visibility']-df['Visibility'].mean()).abs() <= 3*df['Visibility'].std()]
df=df[(df['Tdewpoint']-df['Tdewpoint'].mean()).abs() <= 3*df['Tdewpoint'].std()]
df=df[(df['rv1']-df['rv1'].mean()).abs() <= 3*df['rv1'].std()]
df=df[(df['rv2']-df['rv2'].mean()).abs() <= 3*df['rv2'].std()]
print(df.shape)

(19735, 46)
(17118, 46)


Spliting data and normalization

In [190]:
df_train,df_test = train_test_split(df,train_size=0.7,random_state=42)
scaler = StandardScaler()
x_train=df_train.iloc[:,1:]
y_train=df_train['Appliances']
scaler.fit(x_train)
x_train_sc=scaler.transform(x_train)
x_test=df_test.iloc[:,1:]
y_test=df_test['Appliances']
scaler.fit(x_test)
x_test_sc=scaler.transform(x_test)

Linear Regression Model

In [191]:
lm=linear_model.LinearRegression()
lm.fit(x_train_sc,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

Linear Regression on Training dataset

In [192]:
y_train_pred=lm.predict(x_train_sc)
print("R2   :",r2_score(y_train,y_train_pred))
print("MAE  :",mean_absolute_error(y_train,y_train_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_train,y_train_pred)))
print("MAPE :",mean_absolute_percentage_error(y_train,y_train_pred))

R2   : 0.246677578831
MAE  : 34.8186933258
RMSE : 58.126235611
MAPE : 45.8231905339


Linear Regression on Testing dataset

In [193]:
y_test_pred=lm.predict(x_test_sc)
print("R2   :",r2_score(y_test,y_test_pred))
print("MAE  :",mean_absolute_error(y_test,y_test_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_test,y_test_pred)))
print("MAPE :",mean_absolute_percentage_error(y_test,y_test_pred))

R2   : 0.229768149054
MAE  : 34.5174622076
RMSE : 56.6320945638
MAPE : 47.0933502516


Random Forest Model

In [194]:
rf=RandomForestRegressor()
rf.fit(x_train_sc, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

Random Forest on Training dataset

In [195]:
y_train_pred=rf.predict(x_train_sc)
print("R2   :",r2_score(y_train,y_train_pred))
print("MAE  :",mean_absolute_error(y_train,y_train_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_train,y_train_pred)))
print("MAPE :",mean_absolute_percentage_error(y_train,y_train_pred))

R2   : 0.902155365581
MAE  : 10.07110666
RMSE : 20.9483710913
MAPE : 11.8698136903


Random Forest on Testing dataset

In [196]:
y_test_pred=rf.predict(x_test_sc)
print("R2   :",r2_score(y_test,y_test_pred))
print("MAE  :",mean_absolute_error(y_test,y_test_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_test,y_test_pred)))
print("MAPE :",mean_absolute_percentage_error(y_test,y_test_pred))

R2   : 0.465035848574
MAE  : 25.0737928349
RMSE : 47.1969819082
MAPE : 31.3659338504


Building Random Forest Models Based on Features selected by thier importance (refer Part3:feature engineering and rough work's folder)

In [197]:
drop_col_list=['year','DOW_Friday','DOW_Monday','DOW_Saturday','DOW_Sunday','DOW_Thursday','DOW_Tuesday','DOW_Wednesday','TS_afternoon',
              'WDT_weekdays','WDT_weekends','TS_morning','month','time_min' ,'TS_evening','day','rv1','rv2','Visibility','Windspeed',
               'T9','lights','T7','Tdewpoint']
x_train=df_train.iloc[:,1:]
y_train=df_train['Appliances']
x_train.drop(drop_col_list,axis=1,inplace=True)
scaler.fit(x_train)
x_train_sc=scaler.transform(x_train)
x_test=df_test.iloc[:,1:]
x_test.drop(drop_col_list,axis=1,inplace=True)
y_test=df_test['Appliances']
scaler.fit(x_test)
x_test_sc=scaler.transform(x_test)

Random Forest Model After using tuned  hyperparameters which we got using random grid search (rough work's folder)

In [198]:
rf=RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=35,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=258, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)
rf.fit(x_train_sc, y_train)

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=35,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=258, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

Random Forest on Training dataset

In [199]:
y_train_pred=rf.predict(x_train_sc)
print("R2   :",r2_score(y_train,y_train_pred))
print("MAE  :",mean_absolute_error(y_train,y_train_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_train,y_train_pred)))
print("MAPE :",mean_absolute_percentage_error(y_train,y_train_pred))
#Base Model's evaluating parameters for training dataset
# R2   : 0.902155365581
# MAE  : 10.07110666
# RMSE : 20.9483710913
# MAPE : 11.8698136903

R2   : 0.999997098311
MAE  : 0.0374766366221
RMSE : 0.11407937878
MAPE : 0.0670381095246


Random Forest on Testing dataset

In [68]:
y_test_pred=rf.predict(x_test_sc)
print("R2   :",r2_score(y_test,y_test_pred))
print("MAE  :",mean_absolute_error(y_test,y_test_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_test,y_test_pred)))
print("MAPE :",mean_absolute_percentage_error(y_test,y_test_pred))
#Base Model's evaluating parameters for testing dataset
# R2   : 0.465035848574
# MAE  : 25.0737928349
# RMSE : 47.1969819082
# MAPE : 31.3659338504

R2   : 0.562746622814
MAE  : 21.0750245205
RMSE : 42.6695903922
MAPE : 25.5898964878


There is significant improvement but it is comming at the cost of overfitting.

Building Linear Regression Model by drop least singnificant features.(refer Part3:feature engineering and rough work's folder)

In [203]:
drop_col_list=['year','rv1','rv2','Press_mm_hg','DOW_Wednesday','T1','RH_6','T5','WDT_weekends','RH_5','time_min'
#              ,'Visibility','DOW_Monday','WDT_weekdays','TS_afternoon','DOW_Sunday','day','T_out'
              ]
x_train=df_train.iloc[:,1:]
y_train=df_train['Appliances']
x_train.drop(drop_col_list,axis=1,inplace=True)
scaler.fit(x_train)
x_train_sc=scaler.transform(x_train)
x_test=df_test.iloc[:,1:]
x_test.drop(drop_col_list,axis=1,inplace=True)
y_test=df_test['Appliances']
scaler.fit(x_test)
x_test_sc=scaler.transform(x_test)

Linear Regression Model

In [204]:
lm=linear_model.LinearRegression()
lm.fit(x_train_sc,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

Linear Regression on training dataset

In [205]:
y_train_pred=lm.predict(x_train_sc)
print("R2   :",r2_score(y_train,y_train_pred))
print("MAE  :",mean_absolute_error(y_train,y_train_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_train,y_train_pred)))
print("MAPE :",mean_absolute_percentage_error(y_train,y_train_pred))
#Base Model's evaluating parameters for training dataset
# R2   : 0.246677578831
# MAE  : 34.8186933258
# RMSE : 58.126235611
# MAPE : 45.8231905339

R2   : 0.246387010104
MAE  : 34.8147378947
RMSE : 58.1374446481
MAPE : 45.8116274684


Linear Regression on Testing dataset

In [206]:
y_test_pred=lm.predict(x_test_sc)
print("R2   :",r2_score(y_test,y_test_pred))
print("MAE  :",mean_absolute_error(y_test,y_test_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_test,y_test_pred)))
print("MAPE :",mean_absolute_percentage_error(y_test,y_test_pred))
#Base Model's evaluating parameters for testing dataset
# R2   : 0.229768149054
# MAE  : 34.5174622076
# RMSE : 56.6320945638
# MAPE : 47.0933502516

R2   : 0.230087785103
MAE  : 34.4835361606
RMSE : 56.6203425587
MAPE : 47.0720474117


Important features based on coefficients of linear model.
By droping least singnificant feature our model can give better result.

In [207]:
feature_importances = [(x_train, abs((round(lm.coef_, 2)))) for x_train, lm.coef_ in zip(x_train.columns, lm.coef_)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: RH_2                 Importance: 32.0
Variable: T3                   Importance: 31.97
Variable: RH_1                 Importance: 28.58
Variable: T2                   Importance: 25.29
Variable: RH_3                 Importance: 21.84
Variable: T6                   Importance: 21.07
Variable: T8                   Importance: 20.46
Variable: T9                   Importance: 20.11
Variable: RH_8                 Importance: 16.11
Variable: TS_night             Importance: 11.51
Variable: RH_out               Importance: 10.21
Variable: month                Importance: 8.84
Variable: RH_4                 Importance: 8.33
Variable: lights               Importance: 7.91
Variable: TS_evening           Importance: 7.56
Variable: Tdewpoint            Importance: 6.38
Variable: TS_morning           Importance: 5.02
Variable: T7                   Importance: 4.75
Variable: RH_9                 Importance: 4.07
Variable: T4                   Importance: 3.63
Variable: Windspeed           

There is very slight improvement in linear model.

Neural Network

In [208]:
x_train=df_train.iloc[:,1:]
y_train=df_train['Appliances']
scaler.fit(x_train)
x_train_sc=scaler.transform(x_train)
x_test=df_test.iloc[:,1:]
y_test=df_test['Appliances']
scaler.fit(x_test)
x_test_sc=scaler.transform(x_test)

Neural Network Model

In [209]:
mlp = MLPRegressor(hidden_layer_sizes=(155),max_iter=500,alpha=1.00000000e-06,random_state=42)
#65>,105--3 layer  #155 --1 layer #115 --2
mlp.fit(x_train_sc,y_train)     

MLPRegressor(activation='relu', alpha=1e-06, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=155, learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=42, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

Neural Network on Training Dataset

In [210]:
y_train_pred=mlp.predict(x_train_sc)
print("R2   :",r2_score(y_train,y_train_pred))
print("MAE  :",mean_absolute_error(y_train,y_train_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_train,y_train_pred)))
print("MAPE :",mean_absolute_percentage_error(y_train,y_train_pred))

R2   : 0.534484272556
MAE  : 26.2079151933
RMSE : 45.6929218734
MAPE : 32.7787131344


Neural Network on Testing Dataset

In [211]:
y_test_pred=mlp.predict(x_test_sc)
print("R2   :",r2_score(y_test,y_test_pred))
print("MAE  :",mean_absolute_error(y_test,y_test_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_test,y_test_pred)))
print("MAPE :",mean_absolute_percentage_error(y_test,y_test_pred))

R2   : 0.394636874831
MAE  : 29.2237202142
RMSE : 50.2064910124
MAPE : 37.5725864014


Out of linear,RandomForest,Neural networks model's Random Forest model gave us the best results,

Base Model:

    For training dataset:-
    
        R2   : 0.902155365581
        MAE  : 10.07110666
        RMSE : 20.9483710913
        MAPE : 11.8698136903
    
    For testing dataset:-
    
        R2   : 0.465035848574
        MAE  : 25.0737928349
        RMSE : 47.1969819082
        MAPE : 31.3659338504

Optimized Model:

    For training dataset:-
    
        R2   : 0.999997098311
        MAE  : 0.0374766366221
        RMSE : 0.11407937878
        MAPE : 0.0670381095246
        
    For testing dataset:-
    
        R2   : 0.562746622814
        MAE  : 21.0750245205
        RMSE : 42.6695903922
        MAPE : 25.5898964878