In [55]:
import pandas as pd
import datetime
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
def mean_absolute_percentage_error(y_true, y_pred): 
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
df_train=pd.read_csv("C:/Users/Akash/Documents/ADS/Assignment2/Appliances-energy-prediction-data-master/training.csv")
df_train['date']=pd.to_datetime(df_train['date'])
df_train['year']=df_train['date'].dt.year
df_train['month']=df_train['date'].dt.month
df_train['day']=df_train['date'].dt.day
df_train['time_hr_24']=df_train['date'].dt.hour
df_train['time_min']=df_train['date'].dt.minute
morning=range(6,12)
afternoon=range(12,17)
evening=range(17,22)
def time_slot(x):
    if x in morning:
        return 'morning'
    elif x in afternoon:
        return 'afternoon'
    elif x in evening:
        return 'evening'
    else:
        return 'night'
df_train['time_slot']=df_train['time_hr_24'].map(time_slot)
df_train.drop(['date','year','time_hr_24'],axis=1,inplace=True)
df_train=pd.get_dummies(df_train,prefix=['WS','DOW','TS'],columns=['WeekStatus','Day_of_week','time_slot'])
#print(df_train.iloc[:,:].describe())

In [56]:
#for training dataset
x_train=df_train.iloc[:,1:]
y_train=df_train['Appliances']
col_list=['NSM','RH_1','RH_2','T3','RH_3','Press_mm_hg','TS_night','T1','T2','T4','RH_4','T5','RH_5','T6','RH_6',
                       'T7','RH_7','T8','RH_8','RH_9','T_out','RH_out','Windspeed','Tdewpoint']
x_train=x_train[col_list]
scaler.fit(x_train)
x_train=scaler.transform(x_train)
#x_train.head()

In [3]:
def evaluate(model,x_true,y_true):
    y_pred=model.predict(x_true)
    def mean_absolute_percentage_error(y_true, y_pred): 
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    print("R2   :",r2_score(y_train,y_train_pred))
    print("MAE  :",mean_absolute_error(y_train,y_train_pred))
    print("RMSE :",np.sqrt(mean_squared_error(y_train,y_train_pred)))
    print("MAPE :",mean_absolute_percentage_error(y_train,y_train_pred))

In [57]:
# rf = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
#            max_features='sqrt', max_leaf_nodes=None,
#            min_impurity_decrease=0.0, min_impurity_split=None,
#            min_samples_leaf=4, min_samples_split=2,
#            min_weight_fraction_leaf=0.0, n_estimators=822, n_jobs=1,
#            oob_score=False, random_state=None, verbose=0, warm_start=False)
rf=RandomForestRegressor(n_estimators=822,max_features='sqrt',random_state=42)
#'n_estimators': 822, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': True
rf.fit(x_train, y_train)
y_train_pred=rf.predict(x_train)
print("R2   :",r2_score(y_train,y_train_pred))
print("MAE  :",mean_absolute_error(y_train,y_train_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_train,y_train_pred)))
print("MAPE :",mean_absolute_percentage_error(y_train,y_train_pred))
#evaluate(rf,x_train,y_train)

R2   : 0.946400955853
MAE  : 10.9912972201
RMSE : 23.8053912194
MAPE : 10.6006135341


In [32]:
#Random Hyperparameter Grid
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 288, 377, 466, 555, 644, 733, 822, 911, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [33]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rdf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rdf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(x_train, y_train)
print(rf_random.best_params_)
model=rf_random.best_estimator_
#evaluate(model,x_train,y_train)
y_train_pred=model.predict(x_train)
print("R2   :",r2_score(y_train,y_train_pred))
print("MAE  :",mean_absolute_error(y_train,y_train_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_train,y_train_pred)))
print("MAPE :",mean_absolute_percentage_error(y_train,y_train_pred))

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 45.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 110.9min finished


{'n_estimators': 822, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': True}
R2   : 0.481883611503
MAE  : 38.9954241542
RMSE : 74.0134188113
MAPE : 42.7510797414


In [142]:
y_train_pred=model.predict(x_train)
print("R2   :",r2_score(y_train,y_train_pred))
print("MAE  :",mean_absolute_error(y_train,y_train_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_train,y_train_pred)))
print("MAPE :",mean_absolute_percentage_error(y_train,y_train_pred))

R2   : 0.500598856623
MAE  : 38.1238976778
RMSE : 72.6643790823
MAPE : 41.3946574802


In [143]:
# Get numerical feature importances
# feature_list = list(x_train.columns)
# importances = list(rf.feature_importances_)
# feature_importances = [(x_train, round(importance, 2)) for x_train, importance in zip(feature_list, importances)]
# feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# [print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

In [58]:
#for testing dataset
df_test=pd.read_csv("C:/Users/Akash/Documents/ADS/Assignment2/Appliances-energy-prediction-data-master/testing.csv")
df_test['date']=pd.to_datetime(df_test['date'])
df_test['year']=df_test['date'].dt.year
df_test['month']=df_test['date'].dt.month
df_test['day']=df_test['date'].dt.day
df_test['time_hr_24']=df_test['date'].dt.hour
df_test['time_min']=df_test['date'].dt.minute
df_test['time_slot']=df_test['time_hr_24'].map(time_slot)
df_test.drop(['date','year','time_hr_24'],axis=1,inplace=True)
#df_test.head()
df_test=pd.get_dummies(df_test,prefix=['WS','DOW','TS'],columns=['WeekStatus','Day_of_week','time_slot'])
#print(df_test.describe())

In [59]:
x_test=df_test.iloc[:,1:]
x_test=x_test[col_list]
x_test=scaler.transform(x_test)
y_test=df_test['Appliances']
baseline_errors = abs(96.745742 - y_test)
print('Average baseline error: ', round(np.mean(baseline_errors), 2))
#evaluate(rf,x_test,y_test)

Average baseline error:  59.08


In [60]:
y_test_pred=rf.predict(x_test)
print("R2   :",r2_score(y_test,y_test_pred))
print("MAE  :",mean_absolute_error(y_test,y_test_pred))
print("RMSE :",np.sqrt(mean_squared_error(y_test,y_test_pred)))
print("MAPE :",mean_absolute_percentage_error(y_test,y_test_pred))

R2   : 0.593271230472
MAE  : 29.4040976748
RMSE : 64.7981154834
MAPE : 28.7094120911


In [46]:
y_test_pred1=model.predict(x_test)
print("R2   :",r2_score(y_test,y_test_pred1))
print("MAE  :",mean_absolute_error(y_test,y_test_pred1))
print("RMSE :",np.sqrt(mean_squared_error(y_test,y_test_pred1)))
print("MAPE :",mean_absolute_percentage_error(y_test,y_test_pred1))

R2   : -0.388825498022
MAE  : 104.587902455
RMSE : 119.738487988
MAPE : 167.712353593


In [22]:
# Get numerical feature importances
feature_list = list(x_train.columns)
importances = list(rf.feature_importances_)
feature_importances = [(x_train, round(importance, 2)) for x_train, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: NSM                  Importance: 0.09
Variable: T3                   Importance: 0.05
Variable: RH_3                 Importance: 0.05
Variable: RH_1                 Importance: 0.04
Variable: RH_2                 Importance: 0.04
Variable: Press_mm_hg          Importance: 0.04
Variable: TS_night             Importance: 0.04
Variable: T2                   Importance: 0.04
Variable: RH_4                 Importance: 0.04
Variable: T5                   Importance: 0.04
Variable: RH_5                 Importance: 0.04
Variable: T6                   Importance: 0.04
Variable: RH_6                 Importance: 0.04
Variable: RH_7                 Importance: 0.04
Variable: T8                   Importance: 0.04
Variable: RH_8                 Importance: 0.04
Variable: RH_9                 Importance: 0.04
Variable: T_out                Importance: 0.04
Variable: RH_out               Importance: 0.04
Variable: Tdewpoint            Importance: 0.04
Variable: T1                   Importanc

In [147]:
evaluate(model,x_test,y_test)

R2   : 0.500598856623
MAE  : 38.1238976778
RMSE : 72.6643790823
MAPE : 41.3946574802


In [38]:
rf_random.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=4, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=822, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)