In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

In [2]:
# Load the pre-split train/test features and labels that were pickled under
# ./data. Unpickling can execute arbitrary code, so only use files you trust.
X_train, X_test, y_train, y_test = map(pd.read_pickle, (
    './data/X_train_pickle.pkl',
    './data/X_test_pickle.pkl',
    './data/y_train_pickle.pkl',
    './data/y_test_pickle.pkl',
))

# Logistic Regression

In [3]:
# Shared grid-search settings for the logistic regression model:
#   cv        - number of cross-validation folds
#   log_scale - candidate values, one per decade from 1e-3 to 1e3, searched
#               for the `C` hyperparameter (inverse regularization strength)
cv = 5
log_scale = [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]

In [4]:
# Run this cell once to save results to your disk.
# Will take about <3 seconds to run. 
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state=0)
param_grid = [{'C': log_scale}]

lr_grid_search = GridSearchCV(lr, param_grid, cv=cv,  # Just passing in the variables declared above
                              scoring='f1',
                              n_jobs=-2,                               # Uses all but one of machine's processors.
                              verbose=50)                              # verbose > 0 gives us a progress bar to check on.
lr_grid_search.fit(X_train, y_train) 
lr_grid_search.best_estimator_
%store lr_grid_search

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   1 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-2)]: Done   2 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-2)]: Done   3 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-2)]: Done   4 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-2)]: Done   5 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-2)]: Done   6 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-2)]: Done   7 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-2)]: Done   8 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-2)]: Done   9 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-2)]: Done  10 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-2)]: Done  11 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-2)]: Done  12 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-2)]: Done  13 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-2)]: Done  14 tasks      | elapsed:    3

# End of GridSearch -> Calculate Results

In [5]:
%store -r lr_grid_search

y_pred = lr_grid_search.predict(X_test)
lr_acc = round(accuracy_score(y_test, y_pred)*100, 2)
lr_f1 = round(lr_grid_search.best_score_*100, 2)
print("Logistic Regression correctly identified {}% of the True Values".format(lr_acc))
print('The F1 score determined that the LR correctly identified {}% of the True Positives'.format(lr_f1))

lr = {'accuracy': lr_acc,
      'f1': lr_f1 }
%store lr

Logistic Regression correctly identified 84.58% of the True Values
The F1 score determined that the LR correctly identified 65.56% of the True Positives
Stored 'lr' (dict)
