# Model 3: Random Search Cross Validation
In this notebook, we will be using the refined data from Model 2 and our insights gained from our regression function to fine-tune our hyperparameters.

In [1]:
# Import sys so we can import custom packages without error
import sys
sys.path.append('../')

In [2]:
# Import pandas/numpy
import pandas as pd
import numpy as np

# Import our modeling.py module
from src.modules.modeling import *

# Import warnings
import warnings


In [3]:
# Set ignore for warnings
warnings.filterwarnings('ignore')

In [4]:
# Import our saved X and y data from Model 2
X_train_M3 = pd.read_csv('../data/X_train_M2.csv')
X_test_M3 = pd.read_csv('../data/X_test_M2.csv')
y_train_M3 = pd.read_csv('../data/y_train_M2.csv')
y_test_M3 = pd.read_csv('../data/y_test_M2.csv')

In [5]:
X_train_M3.columns

Index(['crs_dep_time', 'crs_arr_time', 'crs_elapsed_time', 'flights',
       'distance', 'orig_weather_categ', 'dest_weather_categ', 'year', 'month',
       'day', 'dep_time_of_day', 'arr_time_of_day', 'orig_region',
       'dest_region', 'mean_arr_delay_orig_airport',
       'mean_arr_delay_dest_airport', 'mean_arr_delay_carrier'],
      dtype='object')

In [6]:
X_test_M3.columns

Index(['crs_dep_time', 'crs_arr_time', 'crs_elapsed_time', 'flights',
       'distance', 'orig_weather_categ', 'dest_weather_categ', 'year', 'month',
       'day', 'dep_time_of_day', 'arr_time_of_day', 'orig_region',
       'dest_region', 'mean_arr_delay_orig_airport',
       'mean_arr_delay_dest_airport', 'mean_arr_delay_carrier'],
      dtype='object')

Before running our Random Search, we are going to define some models and parameter ranges for the function


In [7]:
X = pd.concat([X_train_M3, X_test_M3], axis=0)

In [8]:
X

Unnamed: 0,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,distance,orig_weather_categ,dest_weather_categ,year,month,day,dep_time_of_day,arr_time_of_day,orig_region,dest_region,mean_arr_delay_orig_airport,mean_arr_delay_dest_airport,mean_arr_delay_carrier
0,801,850,109.0,1.0,533.0,0,0,2018,12,16,0,0,0,0,-1.652778,7.986150,8.249349
1,610,847,97.0,1.0,350.0,0,1,2019,7,9,0,0,1,1,-3.320000,0.844595,5.078223
2,855,1049,114.0,1.0,489.0,1,0,2019,5,5,0,0,2,2,7.851852,5.985507,3.853175
3,1300,1650,170.0,1.0,1120.0,2,2,2019,1,31,2,2,3,1,8.521739,3.136364,5.078223
4,1210,1320,70.0,1.0,207.0,3,1,2019,11,26,2,2,3,2,11.366071,4.200000,3.853175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1833,2049,196.0,1.0,1107.0,2,1,2019,12,31,3,3,0,2,3.374359,4.643725,5.078223
1996,1640,1807,87.0,1.0,370.0,2,0,2018,6,24,2,3,3,0,-0.763158,9.221277,5.078223
1997,1559,2110,191.0,1.0,1371.0,0,1,2018,6,23,2,1,3,6,-0.763158,64.533333,6.519685
1998,1030,1320,170.0,1.0,1068.0,0,1,2018,3,27,0,2,2,5,4.410256,11.220096,8.249349


In [9]:
y = pd.concat([y_train_M3, y_test_M3], axis=0)

In [10]:
y

Unnamed: 0,arr_delay
0,-17.0
1,-12.0
2,-16.0
3,0.0
4,-11.0
...,...
1995,10.0
1996,-9.0
1997,40.0
1998,-1.0


In [11]:
import sklearn
sklearn.metrics.get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_

### Linear Regression

In [16]:
scoring = {'R2 Score': 'r2', 'Accuracy' : 'accuracy', 'MSE' : 'neg_mean_squared_error'}

In [17]:
lr_model = LinearRegression()

In [18]:
lr_parameters = {'fit_intercept' : [True, False],
                 'normalize' : [True, False]
                 }

In [19]:
random_search(X, y, lr_model, lr_parameters, lr_scoring, 0.3, scale=False)

 Results from Random Search 

 The best estimator across ALL searched params:
 LinearRegression(fit_intercept=False, normalize=True)

 R2 score:
 0.07654439701490101

 MSE scroe:
 2348.293111945376

 The best score across ALL searched params:
 0.06846094063762856

 The best parameters across ALL searched params:
 {'normalize': True, 'fit_intercept': False}


#### BEST Results from Random Search
 The best estimator across ALL searched params:
 LinearRegression(fit_intercept=False, normalize=True)
<br>
<br>
 The best score across ALL searched params:
 0.057461315283664605
<br>
<br>
 The best parameters across ALL searched params:
 {'normalize': True, 'fit_intercept': False}

### Random Forest

In [20]:
rf_model = RandomForestRegressor()

In [21]:
rf_parameters = {'n_estimators' : np.arange(700, 800, 25),
                 'max_depth' : np.arange(5,9),
                 'max_features' : ['auto', 'sqrt', 'log2', '0.2'],
                 'min_samples_leaf' : np.arange(1,5),
                 'min_samples_split' : np.arange(2,10),
                 'bootstrap' : [True, False]
               }

In [22]:
random_search(X, y, rf_model, rf_parameters, scoring, 0.3, scale=False)

 Results from Random Search 

 The best estimator across ALL searched params:
 RandomForestRegressor(max_depth=6, max_features='log2', min_samples_leaf=3,
                      min_samples_split=9, n_estimators=700)

 R2 score:
 0.08008398397804528

 MSE scroe:
 2650.889661099115

 The best score across ALL searched params:
 0.04880429640822428

 The best parameters across ALL searched params:
 {'n_estimators': 700, 'min_samples_split': 9, 'min_samples_leaf': 3, 'max_features': 'log2', 'max_depth': 6, 'bootstrap': True}


#### BEST Results from Random Search 

The best estimator across ALL searched params:
 RandomForestRegressor(max_depth=6, max_features='sqrt', min_samples_leaf=4,
                      min_samples_split=9, n_estimators=750)
<br>
<br>
 The best score across ALL searched params:
 0.04408263923508082
<br>
<br>
 The best parameters across ALL searched params:
 {'n_estimators': 750, 'min_samples_split': 9, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 6, 'bootstrap': True}

### Epsilon-Support Vector Regression (SVR)

In [24]:
svr_model = SVR()

In [25]:
svr_parameters = {
              'C' : np.arange(70, 72, 0.1),
              'gamma' : np.arange(0.005, 0.01, 0.0005),
              'epsilon' : np.arange(25.0,26.0,0.2)
             }

In [27]:
random_search(X, y, svr_model, svr_parameters, scoring, 0.3, scale=True)

 Results from Random Search 

 The best estimator across ALL searched params:
 SVR(C=70.79999999999995, epsilon=25.599999999999998, gamma=0.007499999999999998)

 R2 score:
 0.031003367343941712

 MSE scroe:
 1707.8927857197664

 The best score across ALL searched params:
 0.03909787647322407

 The best parameters across ALL searched params:
 {'gamma': 0.007499999999999998, 'epsilon': 25.599999999999998, 'C': 70.79999999999995}


### Trying Other Regressors (SGD)

In [31]:
from sklearn.linear_model import SGDRegressor
SGD_model = SGDRegressor()
SGD_parameters = {'alpha' : np.arange(1e-9, 1e-7, 1e-9),
                  'l1_ratio': np.arange(1, 5, 0.5),
                  'max_iter': np.arange(50000,70000,50),
                  'shuffle' : [True,False]
}

In [32]:
random_search(X, y, SGD_model, SGD_parameters, scoring, 0.3, scale=True)

 Results from Random Search 

 The best estimator across ALL searched params:
 SGDRegressor(alpha=2e-08, l1_ratio=1.0, max_iter=58950)

 R2 score:
 0.09803396678167775

 MSE scroe:
 2675.5888971863315

 The best score across ALL searched params:
 0.04372727845458226

 The best parameters across ALL searched params:
 {'shuffle': True, 'max_iter': 58950, 'l1_ratio': 1.0, 'alpha': 2e-08}
