# Import Unit

In [69]:
import numpy as np
import pandas as pd
import datetime
from sklearn_pandas import CategoricalImputer
from sklearn.base import BaseEstimator ,TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation , Dropout  ,Dense
import joblib

# Loading Data Unit

In [19]:
dataframe = pd.read_csv("Chiller_1_17_19.csv")
dataframe.head()

Unnamed: 0,Date,Inlet,Outlet,Set_Point,Unit_Consumed,Operating_Hrs,Load_percentage
0,01-Apr-17,,,,300.0,0.0,
1,02-Apr-17,,,,100.0,0.0,
2,03-Apr-17,,,,100.0,0.0,
3,04-Apr-17,17.0,12.5,10.0,2000.0,11.09,97.0
4,05-Apr-17,12.4,13.7,10.0,500.0,2.84,95.0


In [20]:
def split_train_test(data , test_ratio):
    np.random.seed(42)
    shuffled_indices = np.random.permutation(len(data))
    test_indices = shuffled_indices[:int(len(data)* test_ratio)]
    train_indices = shuffled_indices[int(len(data)* test_ratio):]
    return data.iloc[train_indices] , data.iloc[test_indices]


In [21]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913 entries, 0 to 912
Data columns (total 7 columns):
Date               913 non-null object
 Inlet             473 non-null object
Outlet             473 non-null object
Set_Point          470 non-null float64
Unit_Consumed      913 non-null float64
Operating_Hrs      913 non-null float64
Load_percentage    473 non-null object
dtypes: float64(3), object(4)
memory usage: 50.1+ KB


In [57]:
dataframe.columns
dataframe.dropna(inplace=True)
dataframe.head()
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 470 entries, 3 to 912
Data columns (total 7 columns):
Date               470 non-null object
 Inlet             470 non-null object
Outlet             470 non-null object
Set_Point          470 non-null float64
Unit_Consumed      470 non-null float64
Operating_Hrs      470 non-null float64
Load_percentage    470 non-null object
dtypes: float64(3), object(4)
memory usage: 29.4+ KB


# Preprocessing unit

In [44]:
train_set , test_set = split_train_test(dataframe , 0.2)

In [45]:
train_set["Date"] = pd.to_datetime(train_set["Date"])
test_set["Date"] = pd.to_datetime(test_set["Date"])
train_set["month"] = train_set["Date"].dt.month
test_set["month"] = test_set["Date"].dt.month 
train_set["year"] = train_set["Date"].dt.year
test_set["year"] = test_set["Date"].dt.year 
train_set["day"] = train_set["Date"].dt.day
test_set["day"] = test_set["Date"].dt.day

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

In [46]:
train_set.head()

Unnamed: 0,Date,Inlet,Outlet,Set_Point,Unit_Consumed,Operating_Hrs,Load_percentage,month,year,day
856,2019-08-05,18.0,13.5,8.0,4800.0,24.02,98,8,2019,5
566,2018-10-19,16.4,12.3,10.0,1460.0,7.4,97,10,2018,19
34,2017-05-05,18.4,15.0,10.0,1900.0,8.87,97,5,2017,5
570,2018-10-23,16.9,12.8,10.0,1670.0,8.44,96,10,2018,23
63,2017-06-03,17.1,12.0,10.0,2554.0,11.81,96,6,2017,3


In [47]:
class DataFrameSelector(BaseEstimator , TransformerMixin):
    def __init__(self , attributes_name):
        self.attributes_name = attributes_name
    def fit(self , X ,y=None):
        return self
    def transform(self , X):
        return X[self.attributes_name].values

In [48]:
num_classes = ["month" , "year" , "day" , "Load_percentage"  ,"Operating_Hrs" ,"Set_Point" ,"Outlet" ," Inlet" ]
labels = ["Unit_Consumed"]

In [49]:
num_pipeline_new = Pipeline([("selector" , DataFrameSelector(num_classes)),
                             ("imputer" , Imputer(strategy= "median")),
                            ("std_scalar" , StandardScaler() )])

In [50]:
full_pipeline = FeatureUnion(transformer_list=[("num_pipeline" , num_pipeline_new)])

In [51]:
train_features = train_set.drop(["Unit_Consumed" , "Date"] , axis =1)
test_features = test_set.drop(["Unit_Consumed", "Date"] , axis =1)
train_features.head()

Unnamed: 0,Inlet,Outlet,Set_Point,Operating_Hrs,Load_percentage,month,year,day
856,18.0,13.5,8.0,24.02,98,8,2019,5
566,16.4,12.3,10.0,7.4,97,10,2018,19
34,18.4,15.0,10.0,8.87,97,5,2017,5
570,16.9,12.8,10.0,8.44,96,10,2018,23
63,17.1,12.0,10.0,11.81,96,6,2017,3


In [52]:
train_prepared  = full_pipeline.fit_transform(train_features)
test_prepared = full_pipeline.fit_transform(test_features)

In [55]:
train_prepared.shape

(376, 8)

In [58]:
train_labels = train_set["Unit_Consumed"].copy()
train_labels = np.array(train_labels)
test_labels = test_set["Unit_Consumed"].copy()
test_labels = np.array(test_labels)

# Model Fitting

In [59]:
lin_reg = LinearRegression()

In [60]:
lin_reg.fit(train_prepared , train_labels)
lin_predictions  = lin_reg.predict(test_prepared)
lin_mse = mean_squared_error(test_labels , lin_predictions)

lin_rmse = np.sqrt(lin_mse)
lin_rmse

340.77324809830475

# Neural net

In [61]:
model = Sequential()


In [62]:
model.add(Dense(train_prepared.shape[1]))
model.add(Dense(256))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(156))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(10))
model.add(Dense(1 , kernel_initializer='normal',activation='linear'))
model.compile(loss= "binary_crossentropy" ,
             optimizer= "adam",
             metrics= [tf.keras.metrics.RootMeanSquaredError(name='rmse')], 
             )

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [64]:
model.fit(train_prepared , train_labels ,validation_split=0.1 , epochs=3 )

Train on 338 samples, validate on 38 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x26d542ce308>

# Random Forest

In [66]:

new_model = RandomForestRegressor()
new_model.fit(train_prepared , train_labels)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [67]:
pred_random = new_model.predict(test_prepared)
random_mse = mean_squared_error(pred_random , test_labels)
rmse_ran = np.sqrt(random_mse)
rmse_ran

329.86668045671405

# Grid search cv


In [70]:
forest_reg_new = RandomForestRegressor()

param_grid = [{ "n_estimators" : [3,10,30]  , "max_features" : [2,4,6,8]} , 
               {"bootstrap" : [False] , "n_estimators" : [3,10] , "max_features" : [2,3,4]}]
grid_search = GridSearchCV(forest_reg_new , param_grid , cv=5 , scoring= "neg_mean_squared_error")

grid_search.fit(train_prepared , train_labels)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'max_features': [2, 4, 6, 8],


In [71]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [72]:
cvres = grid_search.cv_results_

for mean_score , parameters in zip(cvres["mean_test_score"] , cvres["params"]):
    print(np.sqrt(-mean_score) , parameters)

593.0531870151627 {'max_features': 2, 'n_estimators': 3}
469.53254265839263 {'max_features': 2, 'n_estimators': 10}
420.4956021144242 {'max_features': 2, 'n_estimators': 30}
422.03511559064316 {'max_features': 4, 'n_estimators': 3}
377.12558117407383 {'max_features': 4, 'n_estimators': 10}
345.2733910022432 {'max_features': 4, 'n_estimators': 30}
391.68960751692305 {'max_features': 6, 'n_estimators': 3}
348.1991906923056 {'max_features': 6, 'n_estimators': 10}
339.0492962187735 {'max_features': 6, 'n_estimators': 30}
388.1779296507813 {'max_features': 8, 'n_estimators': 3}
362.1996144343672 {'max_features': 8, 'n_estimators': 10}
351.2988391680109 {'max_features': 8, 'n_estimators': 30}
579.1003786196102 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
466.4665696307904 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
444.30271697537717 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
381.23732248596275 {'bootstrap': False, 'max_features': 3, 'n_estimator

In [73]:
feature_importance = grid_search.best_estimator_.feature_importances_
feature_importance.argmax()

4

In [74]:
final_model = grid_search.best_estimator_