In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

# Load the pickled train/test features (X_*) and labels (y_*) from ./data.
# The paths are listed in the same order as the names they are unpacked into.
X_train, X_test, y_train, y_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        './data/X_train_pickle.pkl',
        './data/X_test_pickle.pkl',
        './data/y_train_pickle.pkl',
        './data/y_test_pickle.pkl',
    )
)

# <span style="color:green">Beginning of Logistic Regression GridSearch</span>

#### Adjust the variables to your liking.

In [2]:
# Candidate values for C, spaced one decade apart from 1e-3 to 1e3.
c_scale = [
    0.001, 0.01, 0.1,
    1,
    10, 100, 1000,
]

# Number of cross-validation folds used by the grid search.
cv = 5

### The cell below takes <span style="color:green">3 seconds</span> to run 
The results are saved to disk, so you can explore them further later without re-running this cell.

In [3]:
lr = LogisticRegression(solver='lbfgs', random_state=0)
param_grid = [{'C': c_scale}]                         # Set grid search parameters based on variables assigned above.

lr_grid_search = GridSearchCV(lr, param_grid, cv=cv,  # Just passing in the variables declared above
                              scoring='f1',           # We want to refit based on better f1 scores.
                              n_jobs=-2,              # Uses all but one of machine's processors.
                              verbose=50)             # verbose > 0 gives us a progress bar to check on.

lr_grid_search.fit(X_train, y_train) 
%store lr_grid_search

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   1 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-2)]: Done   2 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-2)]: Done   3 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-2)]: Done   4 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-2)]: Done   5 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-2)]: Done   6 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-2)]: Done   7 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-2)]: Done   8 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-2)]: Done   9 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-2)]: Done  10 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-2)]: Done  11 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-2)]: Done  12 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-2)]: Done  13 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-2)]: Done  14 tasks      | elapsed:    2



# <span style="color:orange">Review GridSearch Results.</span>
#### Determine if there are any tunings we want to make before we test our model against the `test set`. 

In [4]:
%store -r lr_grid_search          # restores the results from our previous run of the above cell.
lr_grid_search.best_estimator_    # outputs the best parameters to use based on our GridSearch

no stored variable # restores the results from our previous run of the above cell.


LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

# <span style="color:red">Final Test Calculation</span>

Now that we're comfortable with the tuned hyperparameters, we can finally run our model on the `test set` to truly challenge it and obtain an unbiased result.

In [5]:
y_pred = lr_grid_search.predict(X_test)                 # predicting y hat
lr_acc = round(accuracy_score(y_test, y_pred)*100, 2)   # calculating accuracy
lr_f1 = round(lr_grid_search.best_score_*100, 2)        # calculating F1 Score
print("After hyper tuning the Logistic Regression model, the best accuracy we could compute was {} with a f1 score of {}".format(lr_acc, lr_f1))

lr = {'accuracy': lr_acc,
      'f1': lr_f1 }
%store lr

After hyper tuning the Logistic Regression model, the best accuracy we could compute was 84.89 with a f1 score of 65.75
Stored 'lr' (dict)
