# 05 Create Baseline Models

### Purpose of Notebook
- Build several quick-to-train baseline models:
    - Logistic Regression
    - K Nearest Neighbors
    - Random Forest
    - AdaBoost with GridSearch for CV
    
    
- Collect each model's train and test scores in a results table and review them

## Imports

In [47]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import random
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC

# Seed both Python's and NumPy's global random number generators.
# scikit-learn estimators created without an explicit random_state draw from
# NumPy's global generator, which random.seed() alone does not affect.
random.seed(42)
np.random.seed(42)

In [48]:
def get_metrics(y_true, y_predict, print_scores = True):
    """Summarise binary-classification performance from a confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_predict : array-like
        Predicted labels, aligned with ``y_true``.
    print_scores : bool, default True
        If True, print the confusion matrix and every metric and return
        ``None``. If False, print nothing and return the metrics instead.

    Returns
    -------
    tuple or None
        ``(accuracy, misclass, sensitivity, specificity, precision)`` when
        ``print_scores`` is False, otherwise ``None``.

    Notes
    -----
    The confusion matrix is unpacked into exactly four cells, so it must be
    2x2; any other shape raises ``ValueError`` at the unpacking step.
    A ratio whose denominator is zero (for example precision when nothing
    was predicted positive) is reported as NaN.
    """
    def _ratio(numerator, denominator):
        # Zero denominator -> NaN, without triggering a divide warning.
        return numerator / denominator if denominator else float('nan')

    matrix_def = [['tn','fp'], ['fn','tp']]
    matrix = confusion_matrix(y_true, y_predict)
    tn, fp, fn, tp = matrix.ravel()

    accuracy = _ratio(tp + tn, tn + fp + fn + tp)
    misclass = 1-accuracy
    sensitivity = _ratio(tp, tp + fn)
    specificity = _ratio(tn, tn + fp)
    precision = _ratio(tp, tp + fp)

    if print_scores:
        print('Matrix Definition')
        print(np.array(matrix_def))
        print('')
        print('Confusion Matrix')
        print(matrix)
        print('')
        print('METRICS')
        print(f'accuracy: {accuracy}')
        print(f'misclass: {misclass}')
        print(f'sensitivity: {sensitivity}')
        print(f'specificity: {specificity}')
        # Precision was previously computed but left out of the printed report.
        print(f'precision: {precision}')
    else:
        return accuracy, misclass, sensitivity, specificity, precision

## Pull in train and test data

In [49]:
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code from the file, so
    # only trusted files should be loaded here — presumably these were written
    # by an earlier step of this project; confirm.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Feature matrices and targets for the train and test splits.
X_train = _load_pickle('../Data/X_train_clean.pkl')
X_test = _load_pickle('../Data/X_test_clean.pkl')
y_train = _load_pickle('../Data/y_train.pkl')
y_test = _load_pickle('../Data/y_test.pkl')

## Create DataFrame to hold results

In [28]:
# Empty score table; each model section below adds one row to it.
result_columns = ['model','train_score', 'test_score']
results = pd.DataFrame(columns=result_columns)

## Logistic Regression

In [50]:
# Baseline 1: logistic regression with scikit-learn's default settings.
# fit() returns the estimator itself, so construction and training are chained.
lr = LogisticRegression().fit(X_train, y_train)

# Score the fitted model on the training split and on the held-out test split.
lr_train_score = lr.score(X_train, y_train)
lr_test_score = lr.score(X_test, y_test)
print('Train:', lr_train_score)
print('Test:', lr_test_score)

Train: 0.9604395604395605
Test: 0.8888888888888888


In [51]:
# Confusion matrix and derived metrics for the logistic regression on the test split.
lr_test_pred = lr.predict(X_test)
get_metrics(y_test, lr_test_pred)

Matrix Definition
[['tn' 'fp']
 ['fn' 'tp']]

Confusion Matrix
[[255  45]
 [ 20 265]]

METRICS
accuracy: 0.8888888888888888
misclass: 0.11111111111111116
sensitivity: 0.9298245614035088
specificity: 0.85


In [31]:
# Record the logistic-regression scores in the shared results table.
# If a row for this model already exists it is reused, so re-running the cell
# overwrites that row's scores instead of appending a duplicate row.
model_name = 'Logistic Regression'
existing_rows = results.index[results['model'] == model_name]
counter = existing_rows[0] if len(existing_rows) else len(results) + 1
results.loc[counter, 'model'] = model_name
results.loc[counter, 'train_score'] = lr.score(X_train,y_train)
results.loc[counter, 'test_score'] = lr.score(X_test,y_test)

In [69]:
# One row per feature column of X_train, holding the first row of the fitted
# logistic regression's coef_ array.
coef_frame = pd.DataFrame(
    lr.coef_[0],
    index=X_train.columns,
    columns=['coefficients'],
)
# Largest coefficient first.
coefficients = coef_frame.sort_values(by='coefficients', ascending=False)

In [70]:
# Ten largest coefficients (the frame was sorted in descending order when built).
coefficients.head(10)

Unnamed: 0,coefficients
score,5.620922
say,2.000529
man,1.718731
yuri,1.41887
did,1.160459
doctor,1.039385
wife,0.984503
asks,0.949465
joke,0.91847
yellow,0.893752


In [79]:
# Re-sort in ascending order to list the ten smallest coefficients.
coefficients.sort_values('coefficients').head(10)

Unnamed: 0,coefficients
feel,-2.585227
like,-2.214176
year,-1.96312
love,-1.871212
num_comments,-1.84777
just,-1.834649
don,-1.605106
want,-1.407912
ve,-1.384648
life,-1.322288


## K Nearest Neighbors

In [34]:
# Baseline 2: k-nearest-neighbours classifier with scikit-learn's default settings.
# fit() returns the estimator itself, so construction and training are chained.
knn = KNeighborsClassifier().fit(X_train, y_train)

# Score the fitted model on the training split and on the held-out test split.
knn_train_score = knn.score(X_train, y_train)
knn_test_score = knn.score(X_test, y_test)
print('Train:', knn_train_score)
print('Test:', knn_test_score)

Train: 0.8344322344322345
Test: 0.7658119658119659


In [35]:
# Confusion matrix and derived metrics for k-nearest neighbours on the test split.
knn_test_pred = knn.predict(X_test)
get_metrics(y_test, knn_test_pred)

Matrix Definition
[['tn' 'fp']
 ['fn' 'tp']]

Confusion Matrix
[[200 100]
 [ 37 248]]

METRICS
accuracy: 0.7658119658119659
misclass: 0.23418803418803413
sensitivity: 0.8701754385964913
specificity: 0.6666666666666666


In [36]:
# Record the k-nearest-neighbours scores in the shared results table.
# If a row for this model already exists it is reused, so re-running the cell
# overwrites that row's scores instead of appending a duplicate row.
model_name = 'K Nearest Neighbors'
existing_rows = results.index[results['model'] == model_name]
counter = existing_rows[0] if len(existing_rows) else len(results) + 1
results.loc[counter, 'model'] = model_name
results.loc[counter, 'train_score'] = knn.score(X_train,y_train)
results.loc[counter, 'test_score'] = knn.score(X_test,y_test)

## Random Forest

In [52]:
# Baseline 3: random forest with default hyper-parameters.
# random_state is fixed so the forest, and therefore the scores below, are the
# same on every run; 42 matches the seed used for the AdaBoost model.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
print('Train:', rf.score(X_train,y_train))
print('Test:',rf.score(X_test,y_test))

Train: 0.9956043956043956
Test: 0.8683760683760684


In [53]:
# Confusion matrix and derived metrics for the random forest on the test split.
rf_test_pred = rf.predict(X_test)
get_metrics(y_test, rf_test_pred)

Matrix Definition
[['tn' 'fp']
 ['fn' 'tp']]

Confusion Matrix
[[251  49]
 [ 28 257]]

METRICS
accuracy: 0.8683760683760684
misclass: 0.1316239316239316
sensitivity: 0.9017543859649123
specificity: 0.8366666666666667


In [39]:
# Record the random-forest scores in the shared results table.
# If a row for this model already exists it is reused, so re-running the cell
# overwrites that row's scores instead of appending a duplicate row.
model_name = 'Random Forest'
existing_rows = results.index[results['model'] == model_name]
counter = existing_rows[0] if len(existing_rows) else len(results) + 1
results.loc[counter, 'model'] = model_name
results.loc[counter, 'train_score'] = rf.score(X_train,y_train)
results.loc[counter, 'test_score'] = rf.score(X_test,y_test)

## AdaBoost with GridSearch for CV

In [54]:
# AdaBoost evaluated through GridSearchCV. The parameter grid is empty, so the
# search has a single candidate (the estimator exactly as configured) and is
# used here for its cross-validated score.
ada = AdaBoostClassifier(random_state=42)
ada_params = {}
# fit() returns the fitted search object, so it can be chained.
gs = GridSearchCV(estimator=ada, param_grid=ada_params).fit(X_train, y_train)

ada_test_score = gs.score(X_test, y_test)
print('Best Score:', gs.best_score_)
print('Best Parameters:', gs.best_params_)
print('Test:', ada_test_score)

Best Score: 0.8923076923076924
Best Parameters: {}
Test: 0.9213675213675213


In [75]:
# One row per feature column of X_train, holding the feature importance from
# the best estimator found by the grid search.
# NOTE(review): the column is labelled 'coefficients' although it holds
# feature importances — consider renaming once any other uses are checked.
importance_frame = pd.DataFrame(
    gs.best_estimator_.feature_importances_,
    index=X_train.columns,
    columns=['coefficients'],
)
# Largest importance first.
feature_importance = importance_frame.sort_values(by='coefficients', ascending=False)

In [80]:
# Ten largest feature importances (the frame was sorted in descending order when built).
feature_importance.head(10)

Unnamed: 0,coefficients
text_len,0.14
num_comments,0.12
score,0.08
title_len,0.06
year,0.04
com,0.02
people,0.02
say,0.02
love,0.02
husband,0.02


In [44]:
# Confusion matrix and derived metrics for the grid-searched AdaBoost model on the test split.
ada_test_pred = gs.predict(X_test)
get_metrics(y_test, ada_test_pred)

Matrix Definition
[['tn' 'fp']
 ['fn' 'tp']]

Confusion Matrix
[[276  24]
 [ 22 263]]

METRICS
accuracy: 0.9213675213675213
misclass: 0.07863247863247869
sensitivity: 0.9228070175438596
specificity: 0.92


In [55]:
# Record the AdaBoost scores in the shared results table.
# If a row for this model already exists it is reused, so re-running the cell
# overwrites that row's scores instead of appending a duplicate row.
model_name = 'AdaBoost'
existing_rows = results.index[results['model'] == model_name]
counter = existing_rows[0] if len(existing_rows) else len(results) + 1
results.loc[counter, 'model'] = model_name
results.loc[counter, 'train_score'] = gs.score(X_train,y_train)
results.loc[counter, 'test_score'] = gs.score(X_test,y_test)

## View Baseline Model Results

In [83]:
# Display the accumulated train/test scores, one row per recorded model.
# NOTE(review): each row holds the scores from the last time its recording
# cell was run, which may differ from the scores printed in the fit cells above.
results

Unnamed: 0,model,train_score,test_score
1,Logistic Regression,0.956777,0.876923
2,K Nearest Neighbors,0.834432,0.765812
3,Random Forest,0.997802,0.852991
4,AdaBoost,0.950183,0.921368
