# 04 Create Models

### Purpose of Notebook
- Build a bunch of models:
    - Logistic Regression
    - K Nearest Neighbors
    - Random Forest
    - AdaBoost with GridSearch for CV
    
    
- Put results of each model into a table and review

## Imports

In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import random
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC

random.seed(42)

In [2]:
def get_metrics(y_true, y_predict):
    matrix_def = [['tn','fp'], ['fn','tp']]
    matrix = confusion_matrix(y_true, y_predict)
    tn, fp, fn, tp = matrix.ravel()
    accuracy = (tp+tn)/(tn+fp+fn+tp)
    misclass = 1-accuracy
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    precision = tp/(tp+fp)
    print('Matrix Definition')
    print(np.array(matrix_def))
    print('')
    print('Confusion Matrix')
    print(matrix)
    print('')
    print('METRICS')
    print(f'accuracy: {accuracy}')
    print(f'misclass: {misclass}')
    print(f'sensitivity: {sensitivity}')
    print(f'specificity: {specificity}')
    #return accuracy, misclass, sensitivity, specificity, precision

## Pull in train and test data

In [4]:
with open('../Data/X_train_clean.pkl', 'rb') as f:
    X_train = pickle.load(f)
    
with open('../Data/X_test_clean.pkl', 'rb') as f:
    X_test = pickle.load(f)
    
with open('../Data/y_train.pkl', 'rb') as f:
    y_train = pickle.load(f)
    
with open('../Data/y_test.pkl', 'rb') as f:
    y_test = pickle.load(f)

## Create DataFrame to hold results

In [8]:
results = pd.DataFrame(columns=['model','train_score', 'test_score'])

In [13]:
results.at('1','model')

TypeError: __call__() takes from 1 to 2 positional arguments but 3 were given

## Logistic Regression

In [5]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
print('Train:', lr.score(X_train,y_train))
print('Test:', lr.score(X_test,y_test))

Train: 0.9567765567765568
Test: 0.8769230769230769


In [16]:
coefficients = pd.DataFrame(lr.coef_[0], index=X_train.columns)

In [18]:
coefficients.sort_values(0)

Unnamed: 0,0
feel,-2.551461
like,-2.016074
love,-1.964066
year,-1.955400
num_comments,-1.900468
just,-1.660887
life,-1.465191
want,-1.447719
don,-1.417637
relationship,-1.384231


Old Score
Train: 0.9582417582417583
Test: 0.8769230769230769

In [7]:
get_metrics(y_test, lr.predict(X_test))

Matrix Definition
[['tn' 'fp']
 ['fn' 'tp']]

Confusion Matrix
[[251  49]
 [ 23 262]]

METRICS
accuracy: 0.8769230769230769
misclass: 0.12307692307692308
sensitivity: 0.9192982456140351
specificity: 0.8366666666666667


## K Nearest Neighbors

In [50]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print('Train:', knn.score(X_train,y_train))
print('Test:', knn.score(X_test,y_test))

Train: 0.8344322344322345
Test: 0.7658119658119659


Old Score:
Train: 0.8424908424908425
Test: 0.7692307692307693

In [9]:
get_metrics(y_test, knn.predict(X_test))

Matrix Definition
[['tn' 'fp']
 ['fn' 'tp']]

Confusion Matrix
[[200 100]
 [ 35 250]]

METRICS
accuracy: 0.7692307692307693
misclass: 0.23076923076923073
sensitivity: 0.8771929824561403
specificity: 0.6666666666666666


## Random Forest

In [51]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
print('Train:', rf.score(X_train,y_train))
print('Test:',rf.score(X_test,y_test))

Train: 0.9948717948717949
Test: 0.8615384615384616


Old Score:
Train: 0.9978021978021978
Test: 0.864957264957265

In [11]:
get_metrics(y_test, rf.predict(X_test))

Matrix Definition
[['tn' 'fp']
 ['fn' 'tp']]

Confusion Matrix
[[264  36]
 [ 43 242]]

METRICS
accuracy: 0.864957264957265
misclass: 0.135042735042735
sensitivity: 0.8491228070175438
specificity: 0.88


## AdaBoost with GridSearch for CV

In [21]:
ada = AdaBoostClassifier(random_state=42)
ada_params = {}
gs = GridSearchCV(ada, param_grid=ada_params)
gs.fit(X_train, y_train)
print('Best Score:', gs.best_score_)
print('Best Parameters:', gs.best_params_)
print('Test:',gs.score(X_test,y_test))

Best Score: 0.8930402930402931
Best Parameters: {}
Test: 0.9213675213675213


In [23]:
coefficients = pd.DataFrame(gs.best_estimator_.feature_importances_, index=X_train.columns)

In [26]:
coefficients.sort_values(0, ascending=False)

Unnamed: 0,0
text_len,0.14
num_comments,0.12
score,0.08
title_len,0.06
year,0.04
son,0.02
walk,0.02
suppose,0.02
com,0.02
week,0.02


In [22]:
gs.best_estimator_.feature_importances_

array([0.  , 0.  , 0.  , ..., 0.08, 0.06, 0.14])

Old Score: <br>
Best Score: 0.9003663003663004<br>
Best Parameters: {}<br>
Test: 0.9128205128205128<br>

In [20]:
np.mean(y_test) 

0.48717948717948717

In [54]:
get_metrics(y_test, gs.predict(X_test))

Matrix Definition
[['tn' 'fp']
 ['fn' 'tp']]

Confusion Matrix
[[276  24]
 [ 22 263]]

METRICS
accuracy: 0.9213675213675213
misclass: 0.07863247863247869
sensitivity: 0.9228070175438596
specificity: 0.92


## Try SVM

In [14]:
%%time
sv = SVC()
param_grid = {
    'gamma': np.logspace(-5, 2, 20)
}

gs_sv = GridSearchCV(sv, param_grid,scoring='accuracy', verbose=1, n_jobs=3)
gs_sv.fit(X_train, y_train)
print('Best Score:', gs_sv.best_score_)
print('Best Parameters:', gs_sv.best_params_)
print('Test:',gs_sv.score(X_test,y_test))

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  5.4min
[Parallel(n_jobs=3)]: Done  60 out of  60 | elapsed:  7.3min finished


Best Score: 0.8278388278388278
Best Parameters: {'gamma': 0.6158482110660255}
Test: 0.8444444444444444
CPU times: user 56.7 s, sys: 943 ms, total: 57.6 s
Wall time: 7min 39s


In [15]:
%%time
sv = SVC()
sv.fit(X_train, y_train)
print('Train:', sv.score(X_train,y_train))
print('Test:', sv.score(X_test,y_test))

Train: 0.5062271062271062
Test: 0.5128205128205128
CPU times: user 38.8 s, sys: 306 ms, total: 39.1 s
Wall time: 39.4 s
