# Notebook_05: Model Experimentation

The objective of this notebook is to experiment with different modelling techniques and data processing steps to shortlist a couple of good options to take forward to optimisation.

## Imports & Setup

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import sys
sys.path.append('../src')
from model_metrics import score_model, auto_mlflow

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import mlflow
import mlflow.sklearn

mlflow.set_experiment("msc_model_shortlisting_2")




INFO: 'msc_model_shortlisting_2' does not exist. Creating a new experiment


In [2]:
mlflow.get_tracking_uri()

'file:///Users/tomfleet/Documents/Python/Projects/MScProject/Notebooks/mlruns'

In [3]:
project_root = Path().resolve().parent
data_path = project_root / 'Data' / 'Final' / 'al_data_final.csv'
data = pd.read_csv(data_path)

In [4]:
data

Unnamed: 0,x,temp,nf_hz,amp_mm,tc_act
0,5,22,40.970000,12.010000,0.00
1,5,22,38.460000,12.290000,0.50
2,5,22,38.360000,10.470000,0.51
3,5,22,38.380000,9.880000,0.52
4,5,22,37.480877,12.786667,0.75
...,...,...,...,...,...
175,25,200,32.472500,17.736250,1.50
176,25,200,30.274583,18.053958,1.75
177,25,200,28.076667,18.371667,2.00
178,25,200,25.878750,18.689375,2.25


### Train Test Split

In [5]:
X = data.drop('tc_act', axis = 1)
y = data['tc_act']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, stratify = data['x'])

### Polynomial Feature Generation

Generation of polynomial features from the original data frame

In [6]:
# First try polynomialising the whole thing (minus the target obviously)
poly_features_3 = PolynomialFeatures(degree = 3)
poly_features_2 = PolynomialFeatures(degree = 2)

poly_2_pipeline = Pipeline([
    ("poly_features_2", poly_features_2)
])

poly_3_pipeline = Pipeline([
    ("poly_features_3", poly_features_3)
])

X_train_poly_2 = poly_2_pipeline.fit_transform(X_train)
X_train_poly_3 = poly_3_pipeline.fit_transform(X_train)

# Data As Is

Let's try a few things on the data as it stands to get a baseline.

## Linear Regression

In [7]:
from sklearn.linear_model import LinearRegression

In [8]:
lin_reg = LinearRegression()

auto_mlflow(run_name = 'Linear Regression', model_name=lin_reg, data_params={"scaled": False, "polynomial_features": False}, X = X_train, y = y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.171
R^2 (No Val): 0.948

Validation Scores
-----------
RMSE's: [0.166 0.205 0.213 0.154 0.142]
Mean: 0.176
StDev: 0.028
CoV: 0.159


#### Observations

* Not bad, pretty good RMSE, great $R^2$ value

* No sign of overfitting, but it's basically the simplest model ever so that's no surprise

* Because it's not overfitting I won't use any regularisation (Lasso or Ridge etc.)

## Random Forest Regressor

In [9]:
from sklearn.ensemble import RandomForestRegressor

In [10]:
forest_reg = RandomForestRegressor()

auto_mlflow(run_name = 'Random Forest', model_name = forest_reg, data_params={"scaled": False, "polynomial_features": False}, X = X_train, y = y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.054
R^2 (No Val): 0.995

Validation Scores
-----------
RMSE's: [0.149 0.164 0.123 0.169 0.166]
Mean: 0.154
StDev: 0.017
CoV: 0.111


Looks similar to the Linear Regression but it's overfitting a bit, let's try a quick grid search

In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
from sklearn.model_selection import GridSearchCV

forest_reg_gridsearch = RandomForestRegressor()

param_grid = [{"n_estimators": [150, 200, 300, 400], "max_depth":[6, 8, 10, 12], "max_features": [2, 4, 6, 8], "min_samples_split": [1, 2, 4]}]

grid_search = GridSearchCV(forest_reg_gridsearch, param_grid, scoring = 'neg_mean_squared_error', n_jobs = -1, verbose = 1)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 192 candidates, totalling 960 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   19.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   36.2s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   58.7s
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:  1.1min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n

In [13]:
grid_search.best_params_

{'max_depth': 10,
 'max_features': 4,
 'min_samples_split': 2,
 'n_estimators': 300}

In [14]:
forest_grid_best = grid_search.best_estimator_

auto_mlflow(run_name = 'Random Forest (Optimised)', model_name = forest_grid_best, data_params={"scaled": False, "polynomial_features": False}, X = X_train, y = y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.054
R^2 (No Val): 0.995

Validation Scores
-----------
RMSE's: [0.149 0.16  0.127 0.161 0.164]
Mean: 0.152
StDev: 0.014
CoV: 0.09


Not made much difference but a bit better. It's logged anyway.

## SVM Regressor

In [15]:
from sklearn.svm import SVR

In [16]:
svr = SVR()

auto_mlflow(run_name = 'SVR', model_name = svr, data_params={"scaled": False, "polynomial_features": False}, X = X_train, y = y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.439
R^2 (No Val): 0.654

Validation Scores
-----------
RMSE's: [0.518 0.564 0.557 0.491 0.545]
Mean: 0.535
StDev: 0.027
CoV: 0.051


In [17]:
param_grid = [{"C": [10000, 100000, 1000000], "epsilon": [0.001, 0.0001, 0.00001]}]

svr = SVR()

grid_search = GridSearchCV(svr, param_grid, scoring = 'neg_mean_squared_error', n_jobs = -1)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid=[{'C': [10000, 100000, 1000000],
                          'epsilon': [0.001, 0.0001, 1e-05]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [18]:
grid_search.best_params_

{'C': 100000, 'epsilon': 1e-05}

In [19]:
grid_search.best_score_

-0.007464042743695004

In [20]:
score_model(grid_search.best_estimator_, X_train, y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.053
R^2 (No Val): 0.995

Validation Scores
-----------
RMSE's: [0.071 0.095 0.096 0.102 0.062]
Mean: 0.085
StDev: 0.016
CoV: 0.183


(0.052981679228176086,
 0.9949662658845404,
 array([0.07130055, 0.09456528, 0.09569276, 0.10157221, 0.06180481]),
 0.08498712206797818,
 0.015531639523810143,
 0.18275285885533266)

In [21]:
svr_grid_best = grid_search.best_estimator_

auto_mlflow(run_name = 'SVR (Optimised)', model_name = svr_grid_best, data_params={"scaled": False, "polynomial_features": False}, X = X_train, y = y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.053
R^2 (No Val): 0.995

Validation Scores
-----------
RMSE's: [0.071 0.095 0.096 0.102 0.062]
Mean: 0.085
StDev: 0.016
CoV: 0.183


#### Observations

* SVR seems okay but they really perform best with scaled data, thats why C is so large in this example

* Random forest seems similar to linear regression but more with more overfitting

# Using Polynomial Features

Let's try polynomialising the features and see what that does to improve any models.

## Polynomial Regression

### 2nd Degree Polynomial

In [22]:
poly_reg = LinearRegression()

auto_mlflow(run_name = 'Polynomial Regression', model_name = poly_reg, data_params={"scaled": False, "polynomial_features": True, "polynomial_degree": 2, "polynomial_applied_to_all_feats": True}, X = X_train_poly_2, y = y_train)


Non-validation Scores
-----------
RMSE (No Val): 0.079
R^2 (No Val): 0.989

Validation Scores
-----------
RMSE's: [0.091 0.088 0.105 0.085 0.083]
Mean: 0.09
StDev: 0.008
CoV: 0.086


### 3rd Degree

In [23]:
poly_reg = LinearRegression()

auto_mlflow(run_name = 'Polynomial Regression', model_name = poly_reg, data_params={"scaled": False, "polynomial_features": True, "polynomial_degree": 3, "polynomial_applied_to_all_feats": True}, X = X_train_poly_3, y = y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.076
R^2 (No Val): 0.99

Validation Scores
-----------
RMSE's: [0.093 0.078 0.098 0.15  0.071]
Mean: 0.098
StDev: 0.028
CoV: 0.285


## Polynomial Ridge Regression

### 2nd Degree

In [24]:
# Use the build in CV first to get good params
from sklearn.linear_model import RidgeCV, Ridge

poly_ridge = RidgeCV(alphas = (0.01, 0.1, 1.0, 2.0, 5.0, 8.0, 10.0))

poly_ridge.fit(X_train_poly_2, y_train)

RidgeCV(alphas=array([ 0.01,  0.1 ,  1.  ,  2.  ,  5.  ,  8.  , 10.  ]),
        cv=None, fit_intercept=True, gcv_mode=None, normalize=False,
        scoring=None, store_cv_values=False)

In [25]:
poly_ridge.alpha_

0.01

In [26]:
# Now we know the optimum alpha we can log it with auto_mlflow

poly_ridge = Ridge(alpha = 0.1)

auto_mlflow(run_name = 'Poly Ridge', model_name = poly_ridge, data_params={"scaled": False, "polynomial_features": True, "polynomial_degree": 2, "polynomial_applied_to_all_feats": True}, X = X_train_poly_2, y = y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.08
R^2 (No Val): 0.989

Validation Scores
-----------
RMSE's: [0.089 0.089 0.103 0.085 0.082]
Mean: 0.089
StDev: 0.007
CoV: 0.08


### 3rd Degree

In [27]:
# Same process, get RidgeCV to get best params

poly_ridge = RidgeCV(alphas = (0.01, 0.1, 1.0, 2.0, 5.0, 8.0, 10.0))

poly_ridge.fit(X_train_poly_3, y_train)

RidgeCV(alphas=array([ 0.01,  0.1 ,  1.  ,  2.  ,  5.  ,  8.  , 10.  ]),
        cv=None, fit_intercept=True, gcv_mode=None, normalize=False,
        scoring=None, store_cv_values=False)

In [28]:
poly_ridge.alpha_

2.0

In [29]:
poly_ridge = Ridge(alpha = 1.0)

auto_mlflow(run_name = 'Poly Ridge', model_name = poly_ridge, data_params={"scaled": False, "polynomial_features": True, "polynomial_degree": 3, "polynomial_applied_to_all_feats": True}, X = X_train_poly_3, y = y_train)


Non-validation Scores
-----------
RMSE (No Val): 0.053
R^2 (No Val): 0.995

Validation Scores
-----------
RMSE's: [0.065 0.075 0.071 0.087 0.064]
Mean: 0.072
StDev: 0.008
CoV: 0.115


## Polynomial Lasso

In [30]:
from sklearn.linear_model import LassoCV, Lasso

### 2nd Degree

In [31]:
# Same process, do built in CV first
lasso_reg = LassoCV()

lasso_reg.fit(X_train_poly_2, y_train)

LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=None,
        selection='cyclic', tol=0.0001, verbose=False)

In [32]:
lasso_reg.alpha_

0.4040096665243699

In [33]:
lasso_reg = Lasso(alpha = 0.42106172839506184)

auto_mlflow(run_name = 'Poly Lasso', model_name = lasso_reg, data_params={"scaled": False, "polynomial_features": True, "polynomial_degree": 2, "polynomial_applied_to_all_feats": True}, X = X_train_poly_2, y = y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.123
R^2 (No Val): 0.973

Validation Scores
-----------
RMSE's: [0.136 0.147 0.156 0.113 0.106]
Mean: 0.131
StDev: 0.019
CoV: 0.148


### 3rd Degree

In [34]:
lasso_reg = LassoCV()

lasso_reg.fit(X_train_poly_3, y_train)

LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=None,
        selection='cyclic', tol=0.0001, verbose=False)

In [35]:
lasso_reg.alpha_

59.62699566715469

In [36]:
lasso_reg = Lasso(alpha = 89.44020987654322)

auto_mlflow(run_name = 'Poly Lasso', model_name = lasso_reg, data_params={"scaled": False, "polynomial_features": True, "polynomial_degree": 3, "polynomial_applied_to_all_feats": True}, X = X_train_poly_3, y = y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.099
R^2 (No Val): 0.983

Validation Scores
-----------
RMSE's: [0.109 0.108 0.114 0.114 0.086]
Mean: 0.106
StDev: 0.011
CoV: 0.1


## Polynomial Elastic Net

### 2nd Degree

In [37]:
from sklearn.linear_model import ElasticNetCV, ElasticNet

In [38]:
net_reg = ElasticNetCV(l1_ratio = [.1, .5, .7, .9, .95, .99, 1])

net_reg.fit(X_train_poly_2, y_train)

ElasticNetCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
             l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1], max_iter=1000,
             n_alphas=100, n_jobs=None, normalize=False, positive=False,
             precompute='auto', random_state=None, selection='cyclic',
             tol=0.0001, verbose=0)

In [39]:
net_reg.l1_ratio_

1.0

In [40]:
net_reg.alpha_

0.4040096665243699

In [41]:
net_reg = ElasticNet(l1_ratio = 1.0, alpha = 0.42106172839506184)

auto_mlflow(run_name = 'Poly Elastic Net', model_name = net_reg, data_params={"scaled": False, "polynomial_features": True, "polynomial_degree": 2, "polynomial_applied_to_all_feats": True}, X = X_train_poly_2, y = y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.123
R^2 (No Val): 0.973

Validation Scores
-----------
RMSE's: [0.136 0.147 0.156 0.113 0.106]
Mean: 0.131
StDev: 0.019
CoV: 0.148


### 3rd Degree

In [42]:
net_reg = ElasticNetCV(l1_ratio = [.1, .5, .7, .9, .95, .99, 1])

net_reg.fit(X_train_poly_3, y_train)

ElasticNetCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
             l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1], max_iter=1000,
             n_alphas=100, n_jobs=None, normalize=False, positive=False,
             precompute='auto', random_state=None, selection='cyclic',
             tol=0.0001, verbose=0)

In [43]:
net_reg.l1_ratio_

1.0

In [44]:
net_reg.alpha_

59.62699566715469

In [45]:
net_reg = ElasticNet(l1_ratio = 1.0, alpha = 89.44020987654322)

auto_mlflow(run_name = 'Poly Elastic Net', model_name = net_reg, data_params={"scaled": False, "polynomial_features": True, "polynomial_degree": 3, "polynomial_applied_to_all_feats": True}, X = X_train_poly_3, y = y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.099
R^2 (No Val): 0.983

Validation Scores
-----------
RMSE's: [0.109 0.108 0.114 0.114 0.086]
Mean: 0.106
StDev: 0.011
CoV: 0.1


## Random Forest Polynomial

### 2nd Degree

In [46]:
forest_poly_reg = RandomForestRegressor()

auto_mlflow(run_name = 'Poly Random Forest', model_name = forest_poly_reg, data_params={"scaled": False, "polynomial_features": True, "polynomial_degree": 2, "polynomial_applied_to_all_feats": True}, X = X_train_poly_2, y = y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.05
R^2 (No Val): 0.995

Validation Scores
-----------
RMSE's: [0.133 0.164 0.114 0.145 0.16 ]
Mean: 0.143
StDev: 0.018
CoV: 0.127


In [47]:
forest_poly_gridsearch = RandomForestRegressor()

param_grid = [{"n_estimators": [150, 200, 300, 400], "max_depth":[6, 8, 10, 12], "max_features": [2, 4, 6, 8], "min_samples_split": [1, 2, 4]}]

grid_search = GridSearchCV(forest_poly_gridsearch, param_grid, scoring = 'neg_mean_squared_error', n_jobs = -1, verbose = 1)

grid_search.fit(X_train_poly_2, y_train)

Fitting 5 folds for each of 192 candidates, totalling 960 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  51 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done 458 tasks      | elapsed:   46.7s
[Parallel(n_jobs=-1)]: Done 808 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:  1.6min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n

In [48]:
poly_forest_grid_best = grid_search.best_estimator_

auto_mlflow(run_name = 'Poly Random Forest (Optimised)', model_name = poly_forest_grid_best, data_params={"scaled": False, "polynomial_features": True, "polynomial_degree": 2, "polynomial_applied_to_all_feats": True}, X = X_train_poly_2, y = y_train)


Non-validation Scores
-----------
RMSE (No Val): 0.05
R^2 (No Val): 0.996

Validation Scores
-----------
RMSE's: [0.135 0.162 0.126 0.144 0.143]
Mean: 0.142
StDev: 0.012
CoV: 0.084


### 3rd Degree

In [49]:
forest_poly_reg = RandomForestRegressor()

auto_mlflow(run_name = 'Poly Random Forest', model_name = forest_poly_reg, data_params={"scaled": False, "polynomial_features": True, "polynomial_degree": 3, "polynomial_applied_to_all_feats": True}, X = X_train_poly_3, y = y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.055
R^2 (No Val): 0.995

Validation Scores
-----------
RMSE's: [0.142 0.168 0.114 0.133 0.134]
Mean: 0.138
StDev: 0.018
CoV: 0.128


In [50]:
forest_poly_gridsearch = RandomForestRegressor()

param_grid = [{"n_estimators": [150, 200, 300, 400], "max_depth":[6, 8, 10, 12], "max_features": [2, 4, 6, 8], "min_samples_split": [1, 2, 4]}]

grid_search = GridSearchCV(forest_poly_gridsearch, param_grid, scoring = 'neg_mean_squared_error', n_jobs = -1, verbose = 1)

grid_search.fit(X_train_poly_3, y_train)

Fitting 5 folds for each of 192 candidates, totalling 960 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  51 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:   20.5s
[Parallel(n_jobs=-1)]: Done 458 tasks      | elapsed:   44.5s
[Parallel(n_jobs=-1)]: Done 808 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:  1.5min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n

In [51]:
poly_forest_grid_best = grid_search.best_estimator_

auto_mlflow(run_name = 'Poly Random Forest (Optimised)', model_name = poly_forest_grid_best, data_params={"scaled": False, "polynomial_features": True, "polynomial_degree": 3, "polynomial_applied_to_all_feats": True}, X = X_train_poly_3, y = y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.05
R^2 (No Val): 0.996

Validation Scores
-----------
RMSE's: [0.138 0.169 0.113 0.145 0.131]
Mean: 0.139
StDev: 0.018
CoV: 0.131


## SVM Polynomial

### 2nd Degree

In [52]:
poly_svr = SVR()

auto_mlflow(run_name = 'Poly SVR', model_name = poly_svr, data_params={"scaled": False, "polynomial_features": True, "polynomial_degree": 2, "polynomial_applied_to_all_feats": True}, X = X_train_poly_2, y = y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.636
R^2 (No Val): 0.274

Validation Scores
-----------
RMSE's: [0.652 0.74  0.729 0.624 0.681]
Mean: 0.685
StDev: 0.044
CoV: 0.065


In [53]:
poly_svr = SVR()

param_grid = [
    {"C": [0.1, 1, 10, 100, 1000], "epsilon": [0.1, 0.01, 0.001, 0.0001]}
]

grid_search = GridSearchCV(poly_svr, param_grid, scoring = 'neg_mean_squared_error', n_jobs = -1)

grid_search.fit(X_train_poly_2, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid=[{'C': [0.1, 1, 10, 100, 1000],
                          'epsilon': [0.1, 0.01, 0.001, 0.0001]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [54]:
poly_svr_grid_best = grid_search.best_estimator_

auto_mlflow(run_name = 'Poly SVR (Optimised)', model_name = poly_svr, data_params={"scaled": False, "polynomial_features": True, "polynomial_degree": 2, "polynomial_applied_to_all_feats": True}, X = X_train_poly_2, y = y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.636
R^2 (No Val): 0.274

Validation Scores
-----------
RMSE's: [0.652 0.74  0.729 0.624 0.681]
Mean: 0.685
StDev: 0.044
CoV: 0.065


### 3rd Degree

In [55]:
poly_svr = SVR()

auto_mlflow(run_name = 'Poly SVR', model_name = poly_svr, data_params={"scaled": False, "polynomial_features": True, "polynomial_degree": 3, "polynomial_applied_to_all_feats": True}, X = X_train_poly_3, y = y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.689
R^2 (No Val): 0.148

Validation Scores
-----------
RMSE's: [0.694 0.789 0.747 0.656 0.722]
Mean: 0.721
StDev: 0.045
CoV: 0.063


In [56]:
poly_svr = SVR()

param_grid = [
    {"C": [0.1, 1, 10, 100, 1000], "epsilon": [0.1, 0.01, 0.001, 0.0001]}
]

grid_search = GridSearchCV(poly_svr, param_grid, scoring = 'neg_mean_squared_error', n_jobs = -1)

grid_search.fit(X_train_poly_3, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid=[{'C': [0.1, 1, 10, 100, 1000],
                          'epsilon': [0.1, 0.01, 0.001, 0.0001]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [57]:
poly_svr_grid_best = grid_search.best_estimator_

auto_mlflow(run_name = 'Poly SVR (Optimised)', model_name = poly_svr, data_params={"scaled": False, "polynomial_features": True, "polynomial_degree": 3, "polynomial_applied_to_all_feats": True}, X = X_train_poly_3, y = y_train)

Non-validation Scores
-----------
RMSE (No Val): 0.689
R^2 (No Val): 0.148

Validation Scores
-----------
RMSE's: [0.694 0.789 0.747 0.656 0.722]
Mean: 0.721
StDev: 0.045
CoV: 0.063
