# Prediction Models

Importing Libraries

In [6]:
import pandas as pd
import datetime
import numpy as np
import sklearn
from sklearn.cross_validation import train_test_split 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.grid_search import GridSearchCV
from sklearn import linear_model
from sklearn.metrics import *
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from tkinter import filedialog
from tkinter import *

List of Customised functions used

In [7]:
def mean_absolute_percentage_error(y_true, y_pred): 
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
weekend = ['Saturday','Sunday']
def week_day_type(x):
    if x in weekend:
        return 'weekends'
    else:
        return 'weekdays'
def time_slot(x):
    if x in morning:
        return 'morning'
    elif x in afternoon:
        return 'afternoon'
    elif x in evening:
        return 'evening'
    else:
        return 'night' 

Reading the csv file and loading it into a dataframe

In [8]:
root = Tk()
root.filename =  filedialog.askopenfilename(initialdir = "/",title = "Select file",filetypes = (("csv files","*.csv"),("all files","*.*")))
print (root.filename)
df=pd.read_csv(root.filename)
root.after(30, lambda: root.destroy())
root.mainloop()

C:/Users/Akash/Documents/ADS/Assignment-2-master/energydata_complete.csv


Creating additional features and dummy columns in the data so that our models get more insights and make more accurate predictions

In [9]:
df['date']=pd.to_datetime(df['date'])
df['year']=df['date'].dt.year
df['month']=df['date'].dt.month
df['day']=df['date'].dt.day
df['day_of_week']=df['date'].dt.weekday_name
df['time_hr_24']=df['date'].dt.hour
df['time_min']=df['date'].dt.minute
df['week_day_type']=df['day_of_week'].map(week_day_type)
morning=range(6,12)
afternoon=range(12,17)
evening=range(17,22)  
df['time_slot']=df['time_hr_24'].map(time_slot)
df.drop(['date'],axis=1,inplace=True)
df=pd.get_dummies(df,prefix=['DOW','TS','WDT'],columns=['day_of_week','time_slot','week_day_type'])
print(df.shape)

(19735, 46)


Spliting data and normalization

In [10]:
df_train,df_test = train_test_split(df,train_size=0.7,random_state=42)
scaler = StandardScaler()
x_train=df_train.iloc[:,1:]
y_train=df_train['Appliances']
scaler.fit(x_train)
x_train_sc=scaler.transform(x_train)
x_test=df_test.iloc[:,1:]
y_test=df_test['Appliances']
scaler.fit(x_test)
x_test_sc=scaler.transform(x_test)

Linear Regression Model

In [11]:
lm=linear_model.LinearRegression()
lm.fit(x_train_sc,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

Linear Regression on Training dataset

In [12]:
y_train_pred=lm.predict(x_train_sc)
print("R2   :",r2_score(y_train,y_train_pred))
print("MAE  :",mean_absolute_error(y_train,y_train_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_train,y_train_pred)))
print("MAPE :",mean_absolute_percentage_error(y_train,y_train_pred))

R2   : 0.20492495517
MAE  : 52.04917031
RMSE : 91.8032672546
MAPE : 59.5590692172


Linear Regression on Testing dataset

In [13]:
y_test_pred=lm.predict(x_test_sc)
print("R2   :",r2_score(y_test,y_test_pred))
print("MAE  :",mean_absolute_error(y_test,y_test_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_test,y_test_pred)))
print("MAPE :",mean_absolute_percentage_error(y_test,y_test_pred))

R2   : 0.202656060622
MAE  : 52.1348358691
RMSE : 90.6351639674
MAPE : 61.0346218181


Random Forest Model

In [14]:
rf=RandomForestRegressor()
rf.fit(x_train_sc, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

Random Forest on Training dataset

In [15]:
y_train_pred=rf.predict(x_train_sc)
print("R2   :",r2_score(y_train,y_train_pred))
print("MAE  :",mean_absolute_error(y_train,y_train_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_train,y_train_pred)))
print("MAPE :",mean_absolute_percentage_error(y_train,y_train_pred))

R2   : 0.903342360976
MAE  : 14.3318372665
RMSE : 32.0090045424
MAPE : 14.2999986786


Random Forest on Testing dataset

In [16]:
y_test_pred=rf.predict(x_test_sc)
print("R2   :",r2_score(y_test,y_test_pred))
print("MAE  :",mean_absolute_error(y_test,y_test_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_test,y_test_pred)))
print("MAPE :",mean_absolute_percentage_error(y_test,y_test_pred))

R2   : 0.455901439969
MAE  : 36.8596520858
RMSE : 74.8708328696
MAPE : 37.6541418787


Building Random Forest Models Based on Features selected by thier importance (refer Part3:feature engineering and rough work's folder)

In [17]:
drop_col_list=['year','DOW_Monday','DOW_Saturday','DOW_Sunday','DOW_Thursday','DOW_Tuesday','DOW_Wednesday','TS_afternoon','TS_morning',
              'WDT_weekdays','WDT_weekends','month','time_min','DOW_Friday','TS_evening','day','rv1','rv2','Visibility'
               ,'T9','T7','lights']
x_train=df_train.iloc[:,1:]
y_train=df_train['Appliances']
x_train.drop(drop_col_list,axis=1,inplace=True)
scaler.fit(x_train)
x_train_sc=scaler.transform(x_train)
x_test=df_test.iloc[:,1:]
x_test.drop(drop_col_list,axis=1,inplace=True)
y_test=df_test['Appliances']
scaler.fit(x_test)
x_test_sc=scaler.transform(x_test)

Random Forest Model After using tuned  hyperparameters which we got using random grid search (refer rough work's folder)

In [18]:
rf=RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=25,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=350, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)
rf.fit(x_train_sc, y_train)

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=25,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=350, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

Random Forest on Training dataset

In [19]:
y_train_pred=rf.predict(x_train_sc)
print("R2   :",r2_score(y_train,y_train_pred))
print("MAE  :",mean_absolute_error(y_train,y_train_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_train,y_train_pred)))
print("MAPE :",mean_absolute_percentage_error(y_train,y_train_pred))
#Base Model's evaluating parameters for training dataset
# R2   : 0.903342360976
# MAE  : 14.3318372665
# RMSE : 32.0090045424
# MAPE : 14.2999986786

R2   : 0.999360449368
MAE  : 1.36231948386
RMSE : 2.60370359672
MAPE : 1.80088335883


Random Forest on Testing dataset

In [20]:
y_test_pred=rf.predict(x_test_sc)
print("R2   :",r2_score(y_test,y_test_pred))
print("MAE  :",mean_absolute_error(y_test,y_test_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_test,y_test_pred)))
print("MAPE :",mean_absolute_percentage_error(y_test,y_test_pred))
#Base Model's evaluating parameters for testing dataset
# R2   : 0.455901439969
# MAE  : 36.8596520858
# RMSE : 74.8708328696
# MAPE : 37.6541418787

R2   : 0.615763350576
MAE  : 28.9173039919
RMSE : 62.9177624391
MAPE : 28.1695894996


There is significant improvement but it is comming at the cost of overfitting.

Building Linear Regression Model by drop least singnificant features.(refer Part3:feature engineering and rough work's folder)

In [112]:
drop_col_list=['year','RH_5','rv2','rv1' ,'RH_6','time_hr_24','WDT_weekends','DOW_Monday','WDT_weekdays','Press_mm_hg','day'
               ,'TS_afternoon','time_min'
#               ,'DOW_Sunday','DOW_Wednesday'
              ]
x_train=df_train.iloc[:,1:]
y_train=df_train['Appliances']
x_train.drop(drop_col_list,axis=1,inplace=True)
scaler.fit(x_train)
x_train_sc=scaler.transform(x_train)
x_test=df_test.iloc[:,1:]
x_test.drop(drop_col_list,axis=1,inplace=True)
y_test=df_test['Appliances']
scaler.fit(x_test)
x_test_sc=scaler.transform(x_test)

Linear Regression Model

In [113]:
lm=linear_model.LinearRegression()
lm.fit(x_train_sc,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

Linear Regression on training dataset

In [114]:
y_train_pred=lm.predict(x_train_sc)
print("R2   :",r2_score(y_train,y_train_pred))
print("MAE  :",mean_absolute_error(y_train,y_train_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_train,y_train_pred)))
print("MAPE :",mean_absolute_percentage_error(y_train,y_train_pred))
#Base Model's evaluating parameters for training dataset
# R2   : 0.20492495517
# MAE  : 52.04917031
# RMSE : 91.8032672546
# MAPE : 59.5590692172

R2   : 0.204335590571
MAE  : 52.0751956124
RMSE : 91.8372864162
MAPE : 59.6210484115


Linear Regression on Testing dataset

In [115]:
y_test_pred=lm.predict(x_test_sc)
print("R2   :",r2_score(y_test,y_test_pred))
print("MAE  :",mean_absolute_error(y_test,y_test_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_test,y_test_pred)))
print("MAPE :",mean_absolute_percentage_error(y_test,y_test_pred))
#Base Model's evaluating parameters for testing dataset
# R2   : 0.202656060622
# MAE  : 52.1348358691
# RMSE : 90.6351639674
# MAPE : 61.0346218181

R2   : 0.203865241079
MAE  : 52.0925576441
RMSE : 90.5664133033
MAPE : 60.9601606995


Important features based on coefficients of linear model. By droping least singnificant feature our model can give better result.

In [116]:
feature_importances = [(x_train, abs((round(lm.coef_, 2)))) for x_train, lm.coef_ in zip(x_train.columns, lm.coef_)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: RH_2                 Importance: 50.1
Variable: T3                   Importance: 49.85
Variable: RH_1                 Importance: 47.95
Variable: T_out                Importance: 41.13
Variable: T2                   Importance: 38.44
Variable: T6                   Importance: 28.94
Variable: T9                   Importance: 27.87
Variable: RH_3                 Importance: 23.22
Variable: Tdewpoint            Importance: 17.87
Variable: T8                   Importance: 17.79
Variable: TS_night             Importance: 16.3
Variable: RH_8                 Importance: 13.61
Variable: lights               Importance: 12.75
Variable: month                Importance: 9.43
Variable: TS_evening           Importance: 8.68
Variable: RH_7                 Importance: 7.57
Variable: RH_out               Importance: 7.06
Variable: RH_4                 Importance: 6.89
Variable: DOW_Tuesday          Importance: 4.78
Variable: DOW_Saturday         Importance: 4.44
Variable: T4                 

There is very slight improvement in linear model.

Neural Network

In [117]:
x_train=df_train.iloc[:,1:]
print(x_train.shape)
y_train=df_train['Appliances']
scaler.fit(x_train)
x_train_sc=scaler.transform(x_train)
x_test=df_test.iloc[:,1:]
y_test=df_test['Appliances']
scaler.fit(x_test)
x_test_sc=scaler.transform(x_test)

(13814, 45)


Neural Network Model

In [118]:
mlp = MLPRegressor(hidden_layer_sizes=(365,365,365),max_iter=500,alpha=1.00000000e-06,random_state=42)
mlp.fit(x_train_sc,y_train)     

MLPRegressor(activation='relu', alpha=1e-06, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(365, 365, 365), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=42, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

Neural Network on Training Dataset

In [120]:
y_train_pred=mlp.predict(x_train_sc)
print("R2   :",r2_score(y_train,y_train_pred))
print("MAE  :",mean_absolute_error(y_train,y_train_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_train,y_train_pred)))
print("MAPE :",mean_absolute_percentage_error(y_train,y_train_pred))

R2   : 0.75920061787
MAE  : 27.9396749876
RMSE : 50.5221426581
MAPE : 30.2356429216


Neural Network on Testing Dataset

In [121]:
y_test_pred=mlp.predict(x_test_sc)
print("R2   :",r2_score(y_test,y_test_pred))
print("MAE  :",mean_absolute_error(y_test,y_test_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_test,y_test_pred)))
print("MAPE :",mean_absolute_percentage_error(y_test,y_test_pred))

R2   : 0.405482606628
MAE  : 39.4471746019
RMSE : 78.262940013
MAPE : 39.1624511579


Out of linear,RandomForest,Neural networks model's Random Forest model gave us the best results,

Base Model:

    For training dataset:-
    
        R2   : 0.903342360976
        MAE  : 14.3318372665
        RMSE : 32.0090045424
        MAPE : 14.2999986786
        
    For testing dataset:-
    
         R2   : 0.455901439969
         MAE  : 36.8596520858
         RMSE : 74.8708328696
         MAPE : 37.6541418787
         
Optimized Model:

    For training dataset:-
    
        R2   : 0.999360449368
        MAE  : 1.36231948386
        RMSE : 2.60370359672
        MAPE : 1.80088335883
        
    For testing dataset:-
    
        R2   : 0.615763350576
        MAE  : 28.9173039919
        RMSE : 62.9177624391
        MAPE : 28.1695894996