# HYPER PARAMETER OPTIMIZATION FOR 3 DATASETS

### <font color='black'>Import the necessary libraries</font>

In [32]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import train_test_split
import optuna
import sklearn.svm
from sklearn.model_selection import cross_val_score

# Dataset No. 1

- Dependent Parameters(Outputs): Tool Wear(Tw), Surface Roughness (Ra)
- Independent Parameters(Inputs): Speed, Feed, Depth

In [33]:
df = pd.read_csv('Table1.csv')
df

Unnamed: 0,Speed m/min,Feed –fmm/rev\n,Depth -mm,Ra -um,Tw -mm,MRR -mm3/min
0,40,0.3,0.15,2.46,0.292,7690
1,40,0.2,0.60,2.52,0.288,8229
2,40,0.1,0.15,2.41,0.291,9230
3,40,0.2,0.15,2.48,0.291,5132
4,40,0.3,0.30,2.47,0.293,8229
...,...,...,...,...,...,...
814,100,0.1,0.45,2.37,0.291,8207
815,100,0.4,0.60,2.44,0.278,4628
816,100,0.3,0.15,2.47,0.281,4628
817,100,0.4,0.45,2.47,0.291,3077


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 819 entries, 0 to 818
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Speed m/min     819 non-null    int64  
 1   Feed –fmm/rev
  819 non-null    float64
 2   Depth -mm       819 non-null    float64
 3   Ra -um          819 non-null    float64
 4   Tw -mm          819 non-null    float64
 5   MRR -mm3/min    819 non-null    int64  
dtypes: float64(4), int64(2)
memory usage: 38.5 KB


In [35]:
df.describe()

Unnamed: 0,Speed m/min,Feed –fmm/rev\n,Depth -mm,Ra -um,Tw -mm,MRR -mm3/min
count,819.0,819.0,819.0,819.0,819.0,819.0
mean,70.18315,0.249328,0.374908,2.5016,0.285932,5458.857143
std,22.318886,0.111047,0.161373,0.165374,0.005221,3089.406208
min,40.0,0.1,0.15,2.17,0.278,515.0
25%,60.0,0.2,0.3,2.41,0.281,2057.0
50%,80.0,0.2,0.45,2.47,0.286,5132.0
75%,100.0,0.3,0.45,2.52,0.291,8207.0
max,100.0,0.4,0.6,2.88,0.293,9230.0


### Initialising the Variables as Dependent or Independent features from the Datasets

- X refers to Independent Features

- Y referes to Dependent Features

In [36]:
X = np.array(df.iloc[:,:-3])
print(X)

[[ 40.     0.3    0.15]
 [ 40.     0.2    0.6 ]
 [ 40.     0.1    0.15]
 ...
 [100.     0.3    0.15]
 [100.     0.4    0.45]
 [100.     0.2    0.15]]


In [37]:
Y = np.array(df.iloc[:,[-2,-3]])
print(Y)

[[0.292 2.46 ]
 [0.288 2.52 ]
 [0.291 2.41 ]
 ...
 [0.281 2.47 ]
 [0.291 2.47 ]
 [0.278 2.48 ]]


## <font color='blue'>Regression without Optimization for Dataset 1 </font> 

In [46]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.20,random_state=0)

In [57]:
model = RandomForestRegressor(bootstrap='True', max_depth=1659, max_features='log2',
                      max_leaf_nodes=3, n_estimators=322, n_jobs=2)
model.fit(X_train, Y_train)

RandomForestRegressor(bootstrap='True', max_depth=1659, max_features='log2',
                      max_leaf_nodes=3, n_estimators=322, n_jobs=2)

In [58]:
model = RandomForestRegressor(n_estimators=100).fit(X_train,Y_train)
prediction = model.predict(X_test)

In [59]:
Y_pred = model.predict(X_test)

In [60]:
df=pd.DataFrame({'Actual':[Y_test], 'Predicted':[Y_pred]})
df

Unnamed: 0,Actual,Predicted
0,"[[0.29100000000000004, 2.47], [0.284, 2.44], [...","[[0.2829793202703729, 2.475091968178021], [0.2..."


In [61]:
MAE1 = metrics.mean_absolute_error(Y_test, Y_pred)
MSE1 = metrics.mean_squared_error(Y_test, Y_pred)
RMSE1 = np.sqrt(metrics.mean_squared_error(Y_test, Y_pred))

### Accuracy and Mean absolute Error (without Optimization)

In [62]:
errors1 = abs(Y_pred - Y_test)
print('Mean Absolute Error:', round(np.mean(errors1), 4))
mape1 = 100 * (errors1 / Y_test)
accuracy1 = 100 - np.mean(mape1)
print('Accuracy:', round(accuracy1, 2))

Mean Absolute Error: 0.0635
Accuracy: 96.77


## <font color='blue'>Hyperparameter Optimization using Optuna Algorithm for Dataset 1</font>

Testing for best parameters using 1000 different combination of parameters such as 
1. Estimators 
2. Categorical features (MSE or MAE) 
3. Bootstrap value
4. Maximum Features 
5. Maximum Leaf Nodes

In [87]:
def objective(trial):
    criterion = trial.suggest_categorical('criterion', ['mse', 'mae'])
    bootstrap = trial.suggest_categorical('bootstrap',['True','False'])
    max_depth = trial.suggest_int('max_depth', 1, 10000)
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt','log2'])
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 1, 10000)
    n_estimators =  trial.suggest_int('n_estimators', 30, 1000)
    model = RandomForestRegressor(bootstrap = bootstrap, criterion = criterion,
                                 max_depth = max_depth, max_features = max_features,
                                 max_leaf_nodes = max_leaf_nodes,n_estimators = n_estimators,n_jobs=2)
    score = cross_val_score(model, X_train, Y_train, cv=5, scoring="r2")
    r2_mean = score.mean()
    return r2_mean

### Splitting of Dataset into Train and Test Dataset

- Proportion of Test:Train data = 20:80

- Initialising the Variables such as X_test, X_train, Y_test, Y_train

- These Values are useful later for making predictions and giving accuracy

In [88]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.20,random_state=0)

### Actual Hyperparameter Optimization cell
- Based on Number of Trials (1000 here), using several different combination of parameters, the below Optimization
  algorithm tests and predicts the best possible parameters for the used Regression Model in order to yield Maximum
  accuracy.
- Takes 1000 iterations/ trials to figure out best set of parameters for our Regression Model
- n_trials is directly proportional to the accuracy of the model

In [89]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)
optimised_model1 = RandomForestRegressor(bootstrap = study.best_params['bootstrap'], criterion = study.best_params['criterion'],
                                     max_depth = study.best_params['max_depth'], max_features = study.best_params['max_features'],
                                     max_leaf_nodes = study.best_params['max_leaf_nodes'],n_estimators = study.best_params['n_estimators'],
                                     n_jobs=2)
optimised_model1.fit(X_train ,Y_train)

[32m[I 2021-05-08 21:47:35,274][0m A new study created in memory with name: no-name-e08aeced-cf4c-4ceb-ab9d-efc4a08fd703[0m
[32m[I 2021-05-08 21:47:37,988][0m Trial 0 finished with value: -0.1429302646778886 and parameters: {'criterion': 'mse', 'bootstrap': 'True', 'max_depth': 2530, 'max_features': 'log2', 'max_leaf_nodes': 4465, 'n_estimators': 255}. Best is trial 0 with value: -0.1429302646778886.[0m
[32m[I 2021-05-08 21:47:40,301][0m Trial 1 finished with value: -0.143324234638077 and parameters: {'criterion': 'mse', 'bootstrap': 'False', 'max_depth': 7318, 'max_features': 'sqrt', 'max_leaf_nodes': 7055, 'n_estimators': 440}. Best is trial 0 with value: -0.1429302646778886.[0m
[32m[I 2021-05-08 21:47:46,394][0m Trial 2 finished with value: -0.0991657888339514 and parameters: {'criterion': 'mae', 'bootstrap': 'False', 'max_depth': 4446, 'max_features': 'sqrt', 'max_leaf_nodes': 4131, 'n_estimators': 605}. Best is trial 2 with value: -0.0991657888339514.[0m
[32m[I 2021-0

RandomForestRegressor(bootstrap='False', criterion='mae', max_depth=4446,
                      max_features='sqrt', max_leaf_nodes=4131,
                      n_estimators=605, n_jobs=2)

### Prediction for Model with Optimized Parameters
After Optimization, the model predicts the Dependent Features values based on Testing and Training data

In [131]:
Y_pred = optimised_model1.predict(X_test)

### Plotting the Data in statistical format (Actual v/s Predicted)
The Data gets plotted in an array format, with each coloumn representing the (Actual v/s Predicted) values made by the regression model itself

In [132]:
d=pd.DataFrame({'Actual':[Y_test], 'Predicted':[Y_pred]})
d

Unnamed: 0,Actual,Predicted
0,"[[0.29100000000000004, 2.47], [0.284, 2.44], [...","[[0.28344628099173597, 2.4698429752066264], [0..."


In [133]:
MAE_opt1 = metrics.mean_absolute_error(Y_test, Y_pred)
MSE_opt1 = metrics.mean_squared_error(Y_test, Y_pred)
RMSE_opt1 = np.sqrt(metrics.mean_squared_error(Y_test, Y_pred))

### Prediction of Accuracy and Mean Absolute Percentage Error
Using the statistical Data, (Actual v/s Predicted), Mean Absolute Error (MAE) and Mean Absolute Percentage Error (MAPE) are figured, on basis of which, Accuracy is predicted.

In [134]:
errors_opt1 = abs(Y_pred - Y_test)
print('Mean Absolute Error:', round(np.mean(errors_opt1), 4))
mape_opt1 = 100 * (errors_opt1 / Y_test)
accuracy_opt1 = 100 - np.mean(mape_opt1)
print('Accuracy:', round(accuracy_opt1, 2))

Mean Absolute Error: 0.0592
Accuracy: 96.95


# Dataset No. 2

Independent Features (A): 
1. Speed
2. Feed
3. Depth
4. Tangential Force (Ft)
5. Axial Force (Fa)

Dependent Features (B): 
1. Beta 
2. Roughness of Surface (Ra) 
3. Metal Removal Rate (MRR)


In [15]:
d = pd.read_csv('Table2.csv', header= 0,encoding= 'unicode_escape')
print(d)

     Speed m/min   Feed f mm/rev\n  Depth -mm  Ft - N  Fa - N     Beta  \
0              50              0.10       0.15     252     147  6822.22   
1              50              0.25       0.45     269     179  4483.33   
2              50              0.20       0.60     318     154  3166.67   
3              50              0.15       0.15     314     184  4433.33   
4              50              0.25       0.30     306     179  8200.00   
..            ...               ...        ...     ...     ...      ...   
814           110              0.25       0.30     317     147  3166.67   
815           110              0.20       0.45     307     147  3166.67   
816           110              0.15       0.60     273     149  8200.00   
817           110              0.15       0.45     281     157  4044.44   
818           110              0.25       0.60     306     179  2826.67   

        Ra         MRR  
0    1.310  2121.42857  
1    2.000  3535.71429  
2    3.260  6600.00000  

In [16]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 819 entries, 0 to 818
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Speed m/min      819 non-null    int64  
 1   Feed f mm/rev
  819 non-null    float64
 2   Depth -mm        819 non-null    float64
 3   Ft - N           819 non-null    int64  
 4   Fa - N           819 non-null    int64  
 5   Beta             819 non-null    float64
 6   Ra               819 non-null    float64
 7   MRR              819 non-null    float64
dtypes: float64(5), int64(3)
memory usage: 51.3 KB


In [17]:
d.describe()

Unnamed: 0,Speed m/min,Feed f mm/rev\n,Depth -mm,Ft - N,Fa - N,Beta,Ra,MRR
count,819.0,819.0,819.0,819.0,819.0,819.0,819.0,819.0
mean,80.18315,0.176618,0.37033,283.070818,161.487179,6106.536691,1.759739,3929.722659
std,22.318886,0.056321,0.172512,23.286614,14.771885,3684.475698,0.581777,2007.096535
min,50.0,0.1,0.15,246.0,136.0,2113.33,1.02,589.285714
25%,70.0,0.1,0.15,266.0,147.0,3211.11,1.41,1767.85714
50%,90.0,0.2,0.3,274.0,157.0,4483.33,1.73,3535.71429
75%,110.0,0.25,0.6,307.0,173.0,7493.33,1.88,5833.92857
max,110.0,0.25,0.6,318.0,185.0,16800.0,3.26,6600.0


### Scaling of Data
Since some features (Beta, Metal Removal Rate (MRR)) are notably huge as compared to other parameters(Roughness of Surface(Ra)), scaling of data is required in order to reduce the scope of errors and maintain uniformity in scale of Data

The Scaling of data has been done using log function

In [18]:
d['Beta_log'] = np.log(d.Beta + 0.01)
d['MRR_log'] = np.log(d.MRR + 0.01)

### Declaration of Variables as Dependent (B) or Independent Features (A)

Since the Model uses Multi-Input Multi-Output System, declaration of the features in array format is neccessary, hence all the features (Dependent or Independent) have been defined in array format before being operated on.

In [19]:
A = np.array(d.iloc[:,:-3])
print(A)

[[5.00000e+01 1.00000e-01 1.50000e-01 ... 1.47000e+02 6.82222e+03
  1.31000e+00]
 [5.00000e+01 2.50000e-01 4.50000e-01 ... 1.79000e+02 4.48333e+03
  2.00000e+00]
 [5.00000e+01 2.00000e-01 6.00000e-01 ... 1.54000e+02 3.16667e+03
  3.26000e+00]
 ...
 [1.10000e+02 1.50000e-01 6.00000e-01 ... 1.49000e+02 8.20000e+03
  2.00000e+00]
 [1.10000e+02 1.50000e-01 4.50000e-01 ... 1.57000e+02 4.04444e+03
  1.51800e+00]
 [1.10000e+02 2.50000e-01 6.00000e-01 ... 1.79000e+02 2.82667e+03
  1.80000e+00]]


In [20]:
print(d['Beta_log'])
print("\n Beta_Log")
print("\n---------------------------------------------------------------------------------------------------------------\n")
print(d['MRR_log'])
print("\n MRR_log")

0      8.827942
1      8.408124
2      8.060439
3      8.396909
4      9.011891
         ...   
814    8.060439
815    8.060439
816    9.011891
817    8.305101
818    7.946858
Name: Beta_log, Length: 819, dtype: float64

 Beta_Log

---------------------------------------------------------------------------------------------------------------

0      7.659850
1      8.170673
2      8.794826
3      8.553665
4      7.477529
         ...   
814    7.659850
815    8.083662
816    7.477529
817    7.408537
818    8.758459
Name: MRR_log, Length: 819, dtype: float64

 MRR_log


### Representing scaled data as actual data for operation in dependent variable (B)
Since the data is scaled, it is ready to be operated on, hence it is declared in scaled for in dependent variable for the purpose of achieving greater accuracy.

In [21]:
B = np.array(d.iloc[:,[-4,-2,-1]])
print(B)

[[1.31       8.82794168 7.65984971]
 [2.         8.40812358 8.17067345]
 [3.26       8.060439   8.79482644]
 ...
 [2.         9.01189065 7.47752909]
 [1.518      8.30510085 7.40853663]
 [1.8        7.94685816 8.75845885]]


## <font color='blue'>Regression without Optimization for Dataset 2</font>

In [22]:
A_train,A_test,B_train,B_test=train_test_split(A,B,test_size=0.20,random_state=0)

In [27]:
model2 =  RandomForestRegressor(criterion = 'mae' ,bootstrap = 'True', max_depth = 3931, max_features = 'auto', max_leaf_nodes = 3819, n_estimators = 371)
model2.fit(A_train,B_train)

RandomForestRegressor(bootstrap='True', criterion='mae', max_depth=3931,
                      max_leaf_nodes=3819, n_estimators=371)

In [28]:
B_pred = model2.predict(A_test)

In [29]:
df=pd.DataFrame({'Actual':[B_test], 'Predicted':[B_pred]})
df

Unnamed: 0,Actual,Predicted
0,"[[1.518, 8.684590601751784, 7.477529093315083]...","[[1.54306738544474, 8.765751396969138, 7.99460..."


In [30]:
MAE2 = metrics.mean_absolute_error(B_test, B_pred)
MSE2 = metrics.mean_squared_error(B_test, B_pred)
RMSE2 = np.sqrt(metrics.mean_squared_error(B_test, B_pred))

### Accuracy and Mean absolute Error (without Optimization)

In [31]:
errors2 = abs(B_pred - B_test)
mape2 = 100 * (errors2 / B_test)
accuracy2 = 100 - np.mean(mape2)
print(accuracy2)

96.90096334702658


## <font color='blue'>Hyperparameter Optimization Using Optuna Algorithm for Dataset 2</font>


In [148]:
def objective(trial):
    criterion = trial.suggest_categorical('criterion', ['mse', 'mae'])
    bootstrap = trial.suggest_categorical('bootstrap',['True','False'])
    max_depth = trial.suggest_int('max_depth', 1, 10000)
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt','log2'])
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 1, 10000)
    n_estimators =  trial.suggest_int('n_estimators', 30, 1000)
    model2 = RandomForestRegressor(bootstrap = bootstrap, criterion = criterion,
                                 max_depth = max_depth, max_features = max_features,
                                 max_leaf_nodes = max_leaf_nodes,n_estimators = n_estimators,n_jobs=2)
    score = cross_val_score(model2, A_train, B_train, cv=5, scoring="r2")
    r2_mean = score.mean()
    return r2_mean

### Splitting of Test and Train Dataset
- Proportion of Test:Train data = 20:80

- Initialising the Variables such as A_test, A_train, B_test, B_train

- These Values are useful later for making predictions and giving accuracy

In [149]:
A_train,A_test,B_train,B_test=train_test_split(A,B,test_size=0.20,random_state=0)

### Actual Hyperparameter Optimization Cell
- Based on Number of Trials (1000 here), using several different combination of parameters, the below Optimization
  algorithm tests and predicts the best possible parameters for the used Regression Model in order to yield Maximum
  accuracy.
- Takes 1000 iterations/ trials to figure out best set of parameters for our Regression Model
- n_trials is directly proportional to the accuracy of the model.

In [150]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)
optimised_model2 = RandomForestRegressor(bootstrap = study.best_params['bootstrap'], criterion = study.best_params['criterion'],
                                     max_depth = study.best_params['max_depth'], max_features = study.best_params['max_features'],
                                     max_leaf_nodes = study.best_params['max_leaf_nodes'],n_estimators = study.best_params['n_estimators'],
                                     n_jobs=2)
optimised_model2.fit(A_train ,B_train)

[32m[I 2021-05-08 21:58:27,864][0m A new study created in memory with name: no-name-3b6c8c13-b489-4f4f-a398-dc18a6a530e5[0m
[32m[I 2021-05-08 21:58:35,079][0m Trial 0 finished with value: 0.6044723719886813 and parameters: {'criterion': 'mse', 'bootstrap': 'True', 'max_depth': 8135, 'max_features': 'sqrt', 'max_leaf_nodes': 4105, 'n_estimators': 895}. Best is trial 0 with value: 0.6044723719886813.[0m
[32m[I 2021-05-08 21:58:37,545][0m Trial 1 finished with value: 0.6144715145937363 and parameters: {'criterion': 'mae', 'bootstrap': 'False', 'max_depth': 3947, 'max_features': 'auto', 'max_leaf_nodes': 6188, 'n_estimators': 49}. Best is trial 1 with value: 0.6144715145937363.[0m
[32m[I 2021-05-08 21:58:39,438][0m Trial 2 finished with value: 0.6014333199161054 and parameters: {'criterion': 'mse', 'bootstrap': 'True', 'max_depth': 2607, 'max_features': 'sqrt', 'max_leaf_nodes': 2643, 'n_estimators': 272}. Best is trial 1 with value: 0.6144715145937363.[0m
[32m[I 2021-05-08 21

RandomForestRegressor(bootstrap='False', criterion='mae', max_depth=1835,
                      max_leaf_nodes=1114, n_estimators=592, n_jobs=2)

### Prediction for Model with Optimized Parameters
Initialising the Regression model with tuned set of parameters as predicted by Optuna optimization algorithm

In [151]:
B_pred = optimised_model2.predict(A_test)

### Plotting the Data in Statistical Format (Actual v/s Predicted)

In [152]:
d=pd.DataFrame({'Actual':[B_test], 'Predicted':[B_pred]})
d

Unnamed: 0,Actual,Predicted
0,"[[1.518, 8.684590601751784, 7.477529093315083]...","[[1.545712837837839, 8.768620172159634, 7.9216..."


### Prediction of Optimized MAE, MSE, RMSE

In [153]:
MAE_opt2 = metrics.mean_absolute_error(B_test, B_pred)
MSE_opt2 = metrics.mean_squared_error(B_test, B_pred)
RMSE_opt2 = np.sqrt(metrics.mean_squared_error(B_test, B_pred))

### Prediction of Percentage errors(Mean Absolute Percentage Error) and Accuracy

In [155]:
errors_opt2 = abs(B_pred - B_test)
print('Mean Absolute Error:', round(np.mean(errors_opt2), 2))
mape_opt2 = 100 * (errors_opt2 / B_test)
accuracy_opt2 = 100 - np.mean(mape_opt2)
print('Accuracy:', round(accuracy_opt2, 2))

Mean Absolute Error: 0.21
Accuracy: 96.92


# Dataset Number 3

In [2]:
b = pd.read_csv('Table9.csv', header= 0,encoding= 'unicode_escape')
print(b)

     Speed   Feed   Depth -mm    Tl  Tl650  Tl900     Tw  Tw650  Tw900  Temp0  \
0        50   0.25       0.30  25.3  43.34  41.56  0.388  0.299  0.297    213   
1        50   0.15       0.15  26.0  40.80  26.88  0.394  0.299  0.292    184   
2        50   0.10       0.30  31.0  23.65  34.23  0.288  0.287  0.286    229   
3        50   0.25       0.45  26.0  40.18  34.20  0.371  0.288  0.315    221   
4        50   0.20       0.60  25.3  31.30  27.30  0.292  0.324  0.261    222   
..      ...    ...        ...   ...    ...    ...    ...    ...    ...    ...   
814     110   0.20       0.15  20.0  40.20  39.20  0.352  0.272  0.278    187   
815     110   0.15       0.15  29.0  26.87  45.40  0.296  0.272  0.281    273   
816     110   0.20       0.15  21.5  26.87  35.06  0.302  0.267  0.282    273   
817     110   0.20       0.30  24.0  37.40  34.20  0.390  0.290  0.315    238   
818     110   0.10       0.30  26.0  37.40  35.87  0.394  0.269  0.282    279   

     Temp650  Temp900  
0  

In [3]:
b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 819 entries, 0 to 818
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Speed      819 non-null    int64  
 1   Feed       819 non-null    float64
 2   Depth -mm  819 non-null    float64
 3   Tl         819 non-null    float64
 4   Tl650      819 non-null    float64
 5   Tl900      819 non-null    float64
 6   Tw         819 non-null    float64
 7   Tw650      819 non-null    float64
 8   Tw900      819 non-null    float64
 9   Temp0      819 non-null    int64  
 10  Temp650    819 non-null    float64
 11  Temp900    819 non-null    int64  
dtypes: float64(9), int64(3)
memory usage: 76.9 KB


In [4]:
b.describe()

Unnamed: 0,Speed,Feed,Depth -mm,Tl,Tl650,Tl900,Tw,Tw650,Tw900,Temp0,Temp650,Temp900
count,819.0,819.0,819.0,819.0,819.0,819.0,819.0,819.0,819.0,819.0,819.0,819.0
mean,80.18315,0.173443,0.377289,25.445629,31.531306,35.306325,0.34777,0.293523,0.283875,225.655678,190.882173,185.197802
std,22.318886,0.055448,0.169342,6.008294,6.975741,6.880134,0.040289,0.020667,0.018459,28.426263,26.79884,25.047143
min,50.0,0.1,0.15,17.9,22.56,26.88,0.288,0.267,0.259,184.0,154.0,151.0
25%,70.0,0.1,0.15,19.34,25.0,28.3,0.296,0.272,0.264,200.5,162.0,156.0
50%,90.0,0.15,0.45,25.3,31.0,34.23,0.359,0.29,0.284,224.0,191.0,184.0
75%,110.0,0.2,0.6,31.0,40.18,41.56,0.388,0.305,0.297,251.0,205.0,210.0
max,110.0,0.25,0.6,36.0,43.34,48.0,0.408,0.335,0.315,279.0,240.0,232.0


### Scalling of features:
1. Temperature (Temp)
2. Tool Life (Tl)

In [5]:
b['Temp0_log'] = np.log(b.Temp0 + 0.01)
b['Tl_log'] = np.log(b.Tl + 0.01)
print(b['Temp0_log'])
print('\n---------------------------------------------------------------------------------------------------------\n')
print(b['Tl_log'])

0      5.361339
1      5.214990
2      5.433766
3      5.398208
4      5.402722
         ...   
814    5.231162
815    5.609508
816    5.609508
817    5.472313
818    5.631248
Name: Temp0_log, Length: 819, dtype: float64

---------------------------------------------------------------------------------------------------------

0      3.231200
1      3.258481
2      3.434310
3      3.258481
4      3.231200
         ...   
814    2.996232
815    3.367641
816    3.068518
817    3.178470
818    3.258481
Name: Tl_log, Length: 819, dtype: float64


### Features: P: Independent and Q: Dependent
P(Independent):
1. Speed
2. Feed
3. Depth

Q (Dependent)
1. Tool Life
2. Tool Wear
3. Temperature

In [6]:
P = np.array(b.iloc[:,:-11])
print(P)

[[5.0e+01 2.5e-01 3.0e-01]
 [5.0e+01 1.5e-01 1.5e-01]
 [5.0e+01 1.0e-01 3.0e-01]
 ...
 [1.1e+02 2.0e-01 1.5e-01]
 [1.1e+02 2.0e-01 3.0e-01]
 [1.1e+02 1.0e-01 3.0e-01]]


In [7]:
Q = np.array(b.iloc[:,[-8,-2,-1]])
print(Q)

[[0.388      5.36133911 3.23119957]
 [0.394      5.2149901  3.25848108]
 [0.288      5.43376567 3.43430973]
 ...
 [0.302      5.60950842 3.06851794]
 [0.39       5.47231269 3.17847041]
 [0.394      5.63124762 3.25848108]]


## <font color='blue'>Regression without Optimization for Dataset 3</font>

In [8]:
P_train,P_test,Q_train,Q_test=train_test_split(P,Q,test_size=0.20,random_state=0)

In [9]:
model3 =  RandomForestRegressor(n_estimators = 100, random_state = 42)
model3.fit(P_train,Q_train)

RandomForestRegressor(random_state=42)

In [10]:
Q_pred = model3.predict(P_test)

In [11]:
b=pd.DataFrame({'Actual':[Q_test], 'Predicted':[Q_pred]})
b

Unnamed: 0,Actual,Predicted
0,"[[0.33399999999999996, 5.214990103958283, 3.58...","[[0.3475415872534329, 5.400290187056377, 3.170..."


In [12]:
MAE3 = metrics.mean_absolute_error(Q_test, Q_pred)
MSE3 = metrics.mean_squared_error(Q_test, Q_pred)
RMSE3 = np.sqrt(metrics.mean_squared_error(Q_test, Q_pred))


### Accuracy and Mean absolute Error (without Optimization)

In [13]:
errors3 = abs(Q_pred - Q_test)
mape3 = 100 * (errors3 / Q_test)
accuracy3 = 100 - np.mean(mape3)
print(accuracy3)

93.29502861232007


## <font color='blue'> Hyperparameter Optimization Tuning Using Optuna Algorithm for Dataset 3</font>

In [168]:
def objective(trial):
    criterion = trial.suggest_categorical('criterion', ['mse', 'mae'])
    bootstrap = trial.suggest_categorical('bootstrap',['True','False'])
    max_depth = trial.suggest_int('max_depth', 1, 10000)
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt','log2'])
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 1, 10000)
    n_estimators =  trial.suggest_int('n_estimators', 30, 1000)
    model3 = RandomForestRegressor(bootstrap = bootstrap, criterion = criterion,
                                 max_depth = max_depth, max_features = max_features,
                                 max_leaf_nodes = max_leaf_nodes,n_estimators = n_estimators,n_jobs=2)
    score = cross_val_score(model3, P_train, Q_train, cv=5, scoring="r2")
    r2_mean = score.mean()
    return r2_mean

### Splitting of Test and Train Dataset
- Proportion of Test:Train data = 20:80

- Initialising the Variables such as P_test, P_train, Q_test, Q_train

- These Values are useful later for making predictions and giving accuracy

In [169]:
P_train,P_test,Q_train,Q_test=train_test_split(P,Q,test_size=0.20,random_state=0)

### Actual Hyperparameter Optimization Cell
- Based on Number of Trials (1000 here), using several different combination of parameters, the below Optimization
  algorithm tests and predicts the best possible parameters for the used Regression Model in order to yield Maximum
  accuracy.
- Takes 1000 iterations/ trials to figure out best set of parameters for our Regression Model
- n_trials is directly proportional to the accuracy of the model.

In [170]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)
optimised_model3 = RandomForestRegressor(bootstrap = study.best_params['bootstrap'], criterion = study.best_params['criterion'],
                                     max_depth = study.best_params['max_depth'], max_features = study.best_params['max_features'],
                                     max_leaf_nodes = study.best_params['max_leaf_nodes'],n_estimators = study.best_params['n_estimators'],
                                     n_jobs=2)
optimised_model3.fit(P_train ,Q_train)

[32m[I 2021-05-08 22:01:04,254][0m A new study created in memory with name: no-name-d1f67778-a851-46c5-a4b5-88b05cfcfb4e[0m
[32m[I 2021-05-08 22:01:12,981][0m Trial 0 finished with value: -0.14111222044176772 and parameters: {'criterion': 'mae', 'bootstrap': 'False', 'max_depth': 8479, 'max_features': 'sqrt', 'max_leaf_nodes': 6746, 'n_estimators': 726}. Best is trial 0 with value: -0.14111222044176772.[0m
[32m[I 2021-05-08 22:01:14,295][0m Trial 1 finished with value: -0.16012427105947893 and parameters: {'criterion': 'mse', 'bootstrap': 'False', 'max_depth': 8642, 'max_features': 'log2', 'max_leaf_nodes': 8357, 'n_estimators': 249}. Best is trial 0 with value: -0.14111222044176772.[0m
[32m[I 2021-05-08 22:01:28,535][0m Trial 2 finished with value: -0.18684657406847238 and parameters: {'criterion': 'mae', 'bootstrap': 'False', 'max_depth': 2707, 'max_features': 'auto', 'max_leaf_nodes': 1177, 'n_estimators': 783}. Best is trial 0 with value: -0.14111222044176772.[0m
[32m[

RandomForestRegressor(bootstrap='False', criterion='mae', max_depth=8479,
                      max_features='sqrt', max_leaf_nodes=6746,
                      n_estimators=726, n_jobs=2)

### Setting up Model with the best set of parameters returned by Optimization algorithm

Further Steps:
1. Making Predictions for Dependent Feature
2. Statistically represent Data and calculate errors 
3. Based on errors/ MAE/ MAPE predict the maximum possible accuracy

In [171]:
Q_pred = optimised_model3.predict(P_test)

### Plotting data in statistical format (Actual v/s Predicted)

In [172]:
b=pd.DataFrame({'Actual':[Q_test], 'Predicted':[Q_pred]})
b

Unnamed: 0,Actual,Predicted
0,"[[0.33399999999999996, 5.214990103958283, 3.58...","[[0.3532190082644665, 5.415575090982122, 3.160..."


### Prediction of Optimized MAE, MSE, RMS

In [173]:
MAE_opt3 = metrics.mean_absolute_error(Q_test, Q_pred)
MSE_opt3 = metrics.mean_squared_error(Q_test, Q_pred)
RMSE_opt3 = np.sqrt(metrics.mean_squared_error(Q_test, Q_pred))

### Prediction of Mean Absolute Error, Mean Absolute Percentage Error and Accuracy 

In [175]:
ti

Mean Absolute Error: 0.12
Accuracy: 93.3


# <font color="green">Final Report</font>

In [176]:
import csv
with open('Report.csv', 'w', newline='') as fp:
    a = csv.writer(fp,delimiter=',')
    data = [['Dataset No.','MSE','MAE','RMSE','Accuracy','Opt. MSE','Opt. MAE','Opt. RMSE','Opt Accuracy'],
            ['1',MSE1,MAE1,RMSE1,accuracy1,MSE_opt1,MAE_opt1,RMSE_opt1,accuracy_opt1],
            ['2',MSE2,MAE2,RMSE2,accuracy2,MSE_opt2,MAE_opt2,RMSE_opt2, accuracy_opt2],
            ['3',MSE3,MAE3,RMSE3,accuracy3,MSE_opt3,MAE_opt3,RMSE_opt3,accuracy_opt3]]
    a.writerows(data)

In [177]:
rp = pd.read_csv('Report.csv')
rp

Unnamed: 0,Dataset No.,MSE,MAE,RMSE,Accuracy,Opt. MSE,Opt. MAE,Opt. RMSE,Opt Accuracy
0,1,0.015224,0.063246,0.123385,96.779054,0.014896,0.059247,0.122047,96.950957
1,2,0.155466,0.217454,0.394292,96.619924,0.160289,0.209959,0.400361,96.919808
2,3,0.028907,0.124611,0.170019,93.295029,0.02898,0.1241,0.170234,93.302569
