# Model 3: Random Search Cross Validation
In this notebook, we will be using the refined data from Model 2 and our insights gained from our regression function to fine-tune our hyperparameters.

In [1]:
# Import pandas/numpy
import pandas as pd
import numpy as np

# Import our modeling.py module
from src.modules.modeling import *

# Import warnings
import warnings


In [2]:
# Set ignore for warnings
warnings.filterwarnings('ignore')

In [3]:
# Import our saved X and y data from Model 2
X_train_M3 = pd.read_csv('data/X_train_M2.csv')
X_test_M3 = pd.read_csv('data/X_test_M2.csv')
y_train_M3 = pd.read_csv('data/y_train_M2.csv')
y_test_M3 = pd.read_csv('data/y_test_M2.csv')

In [4]:
X_train_M3.columns

Index(['crs_dep_time', 'crs_arr_time', 'crs_elapsed_time', 'flights',
       'distance', 'orig_weather_categ', 'dest_weather_categ', 'year', 'month',
       'day', 'dep_time_of_day', 'arr_time_of_day', 'orig_region',
       'dest_region', 'mean_arr_delay_orig_airport',
       'mean_arr_delay_dest_airport', 'mean_arr_delay_carrier'],
      dtype='object')

In [5]:
X_test_M3.columns

Index(['crs_dep_time', 'crs_arr_time', 'crs_elapsed_time', 'flights',
       'distance', 'orig_weather_categ', 'dest_weather_categ', 'year', 'month',
       'day', 'dep_time_of_day', 'arr_time_of_day', 'orig_region',
       'dest_region', 'mean_arr_delay_orig_airport',
       'mean_arr_delay_dest_airport', 'mean_arr_delay_carrier'],
      dtype='object')

Before running our Random Search, we are going to define some models and parameter ranges for the function


### Linear Regression

In [6]:
lr_model = LinearRegression()

In [115]:
lr_parameters = {'fit_intercept' : [True, False],
                 'normalize' : [True, False]
                 }

In [119]:
random_search(X_train_M3, X_test_M3, y_train_M3, y_test_M3, lr_model, lr_parameters, scale=False)

 Results from Random Search 

 The best estimator across ALL searched params:
 LinearRegression(fit_intercept=False, normalize=True)

 The best score across ALL searched params:
 0.057461315283664605

 The best parameters across ALL searched params:
 {'normalize': True, 'fit_intercept': False}


#### BEST Results from Random Search
 The best estimator across ALL searched params:
 LinearRegression(fit_intercept=False, normalize=True)
<br>
<br>
 The best score across ALL searched params:
 0.057461315283664605
<br>
<br>
 The best parameters across ALL searched params:
 {'normalize': True, 'fit_intercept': False}

### Random Forest

In [8]:
rf_model = RandomForestRegressor()

In [113]:
rf_parameters = {'n_estimators' : np.arange(700, 800, 25),
                 'max_depth' : np.arange(5,9),
                 'max_features' : ['auto', 'sqrt', 'log2', '0.2'],
                 'min_samples_leaf' : np.arange(1,5),
                 'min_samples_split' : np.arange(2,10),
                 'bootstrap' : [True, False]
               }

In [114]:
random_search(X_train_M3, X_test_M3, y_train_M3, y_test_M3, rf_model, rf_parameters)

 Results from Random Search 

 The best estimator across ALL searched params:
 RandomForestRegressor(max_depth=6, max_features='sqrt', min_samples_leaf=4,
                      min_samples_split=9, n_estimators=750)

 The best score across ALL searched params:
 0.04408263923508082

 The best parameters across ALL searched params:
 {'n_estimators': 750, 'min_samples_split': 9, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 6, 'bootstrap': True}


#### BEST Results from Random Search 

The best estimator across ALL searched params:
 RandomForestRegressor(max_depth=6, max_features='sqrt', min_samples_leaf=4,
                      min_samples_split=9, n_estimators=750)
<br>
<br>
 The best score across ALL searched params:
 0.04408263923508082
<br>
<br>
 The best parameters across ALL searched params:
 {'n_estimators': 750, 'min_samples_split': 9, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 6, 'bootstrap': True}

### Epsilon-Support Vector Regression (SVR)

In [10]:
svr_model = SVR()

In [140]:
svr_parameters = {
              'C' : np.arange(70, 72, 0.1),
              'gamma' : np.arange(0.005, 0.01, 0.0005),
              'epsilon' : np.arange(25.0,26.0,0.2)
             }

In [141]:
random_search(X_train_M3, X_test_M3, y_train_M3, y_test_M3, svr_model, svr_parameters, scale=True)

 Results from Random Search 

 The best estimator across ALL searched params:
 SVR(C=70.0, epsilon=25.799999999999997, gamma=0.006999999999999998)

 The best score across ALL searched params:
 0.03628544134123792

 The best parameters across ALL searched params:
 {'gamma': 0.006999999999999998, 'epsilon': 25.799999999999997, 'C': 70.0}


#### BEST Results from Random Search 
<br>
 The best estimator across ALL searched params:
 SVR(C=70.0, epsilon=25.799999999999997, gamma=0.006999999999999998)
<br>
<br>
 The best score across ALL searched params:
 0.03628544134123792
<br>
<br>
 The best parameters across ALL searched params:
 {'gamma': 0.006999999999999998, 'epsilon': 25.799999999999997, 'C': 70.0}

### Trying Other Regressors (SGD)

In [142]:
from sklearn.linear_model import SGDRegressor
SGD_model = SGDRegressor()
SGD_parameters = {'alpha' : np.arange(1e-10, 1e-7, 1e-9),
                  'l1_ratio': np.arange(0.8, 1, 0.01),
                  'max_iter': np.arange(50000,70000,50),
                  'shuffle' : [True,False]
}

In [143]:
random_search(X_train_M3, X_test_M3, y_train_M3, y_test_M3, SGD_model, SGD_parameters, scale=True)

 Results from Random Search 

 The best estimator across ALL searched params:
 SGDRegressor(alpha=3.11e-08, l1_ratio=0.8900000000000001, max_iter=54100)

 The best score across ALL searched params:
 0.06564722160166406

 The best parameters across ALL searched params:
 {'shuffle': True, 'max_iter': 54100, 'l1_ratio': 0.8900000000000001, 'alpha': 3.11e-08}


#### BEST Results from Random Search 
<br>
 The best estimator across ALL searched params:
 SGDRegressor(alpha=3.11e-08, l1_ratio=0.8900000000000001, max_iter=54100)
<br>
<br>
 The best score across ALL searched params:
 0.06564722160166406
<br>
<br>
 The best parameters across ALL searched params:
 {'shuffle': True, 'max_iter': 54100, 'l1_ratio': 0.8900000000000001, 'alpha': 3.11e-08}