In [5]:
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

df = pd.read_csv('myDataFrame.csv')
df.set_index('date', inplace=True)

df.columns= [ '% Iron Feed', '% Silica Feed','Starch Flow','Amina Flow','Ore Pulp Flow','Ore Pulp pH','Ore Pulp Density','p-Flotation Column 01 Air Flow','p-Flotation Column 02 Air Flow','p-Flotation Column 03 Air Flow','p-Flotation Column 04 Air Flow','p-Flotation Column 05 Air Flow','p-Flotation Column 06 Air Flow','p-Flotation Column 07 Air Flow','p-Flotation Column 01 Level','p-Flotation Column 02 Level','p-Flotation Column 03 Level','p-Flotation Column 04 Level','p-Flotation Column 05 Level','p-Flotation Column 06 Level','p-Flotation Column 07 Level','% Iron Concentrate','% Silica Concentrate']
df = df.drop(['% Iron Concentrate'], axis=1)
Y = df['% Silica Concentrate']
X = df.drop(['% Silica Concentrate'], axis=1)
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
X_scaled = pd.DataFrame(min_max_scaler.fit_transform(X), columns=X.columns)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=42)

In [6]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from math import sqrt

In [7]:

reg = DecisionTreeRegressor(random_state=0)
from pprint import pprint
print('Parameters currently in use:\n')
pprint(reg.get_params())

Parameters currently in use:

{'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': 0,
 'splitter': 'best'}


In [8]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(0, 200, num = 40)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Create the random grid
random_grid = {
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf
               }
pprint(random_grid)

{'max_depth': [0,
               5,
               10,
               15,
               20,
               25,
               30,
               35,
               41,
               46,
               51,
               56,
               61,
               66,
               71,
               76,
               82,
               87,
               92,
               97,
               102,
               107,
               112,
               117,
               123,
               128,
               133,
               138,
               143,
               148,
               153,
               158,
               164,
               169,
               174,
               179,
               184,
               189,
               194,
               200,
               None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10]}


In [9]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=7)
kf.get_n_splits(X_train)
print(kf)

KFold(n_splits=7, random_state=None, shuffle=False)


In [10]:
for train_index, test_index in kf.split(X_train):
...     print("TRAIN:", train_index, "TEST:", test_index)
...     X_train2, X_test2 = X_train.iloc[train_index], X_train.iloc[test_index]
...     Y_train2, Y_test2 = Y_train.iloc[train_index], Y_train.iloc[test_index]

TRAIN: [ 375  376  377 ... 2621 2622 2623] TEST: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 2

In [11]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
regressor = DecisionTreeRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = regressor, param_distributions = random_grid, n_iter = 50, cv = 7, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train2, Y_train2)

Fitting 7 folds for each of 50 candidates, totalling 350 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 327 out of 350 | elapsed:    3.9s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 350 out of 350 | elapsed:    4.0s finished


RandomizedSearchCV(cv=7, estimator=DecisionTreeRegressor(), n_iter=50,
                   n_jobs=-1,
                   param_distributions={'max_depth': [0, 5, 10, 15, 20, 25, 30,
                                                      35, 41, 46, 51, 56, 61,
                                                      66, 71, 76, 82, 87, 92,
                                                      97, 102, 107, 112, 117,
                                                      123, 128, 133, 138, 143,
                                                      148, ...],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10]},
                   random_state=42, verbose=2)

In [12]:
rf_random.best_params_

{'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 5}

In [13]:
reg = DecisionTreeRegressor(random_state=0).fit(X_train, Y_train)
Y_pred = reg.predict(X_test)

In [14]:
mse = mean_squared_error(Y_test, Y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(Y_test,Y_pred)
r_squared = r2_score (Y_test,Y_pred)

In [15]:
print(mse,rmse,mae,r_squared)

1.9292090840248652 1.3889597128876219 1.005987882251863 -0.5262914478280072


In [16]:
reg2 = DecisionTreeRegressor(min_samples_split=2,min_samples_leaf=4,max_features="sqrt",max_depth=5,random_state=0).fit(X_train, Y_train)
Y_pred2 = reg2.predict(X_test)

In [17]:
mse2 = mean_squared_error(Y_test, Y_pred2)
rmse2 = sqrt(mse2)
mae2 = mean_absolute_error(Y_test,Y_pred2)
r_squared2 = r2_score (Y_test,Y_pred2)

In [18]:
print(mse2,rmse2,mae2,r_squared2)

1.100164895916968 1.048887456268292 0.8389927932498579 0.12960586504421845


In [90]:
data = {'MSE': [mse, mse2],
        'RMSE': [rmse, rmse2],
        'MAE': [mae, mae2],
        'R²': [r_squared, r_squared2]}
df = pd.DataFrame(data,columns=['MSE', 'RMSE','MAE','R²'],index=['with default hyper parameter', 'with 3/5/7-cross valdiation'])

In [91]:
df

Unnamed: 0,MSE,RMSE,MAE,R²
with default hyper parameter,1.583895,1.258529,0.871806,-0.222468
with 3/5/7-cross valdiation,1.122785,1.059616,0.821678,0.133422
