In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Review of Logistic Regression
This is yet another classifier model, and is different from SVM in that it tries to fit a logarithmic  
curve through the data where the data is represented with 1 and 0 on the y axis and dots on the x axis

In [3]:
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### 1. We are going to train our model with only 4 of these features

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

feature_cols = ['Pregnancies', 'Insulin', 'BMI', 'Age']
feature_df = df[feature_cols]

X = feature_df.to_numpy()
y = df['Outcome'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=0)

log_reg = LogisticRegression(solver='lbfgs')
log_reg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### 2. Make predictions

With logistic regression we can just do the normal predict

In [14]:
y_pred = log_reg.predict(X_test)

BUT! Because logistic regression fits a logorithmic curve through the data between 0 and 1, we can ask it to tell us the probability of the condition being 1 or 0 based on the feature value(s). This is the true power of logistic regression, because we can manually set thresholds for what probability we consider to the outcome to be 1 or 0.

In [19]:
y_pred_prob = log_reg.predict_proba(X_test)
y_pred_prob[:4]

array([[0.61405867, 0.38594133],
       [0.7505398 , 0.2494602 ],
       [0.74167648, 0.25832352],
       [0.60291327, 0.39708673]])

In [17]:
log_reg.classes_

array([0, 1])

So we can see that for the first row for example, the probability of the outcome being 0 is .61, and for 1 it is .38. If we want to find out what the cols are, we can say log_reg.classes_ to get the col names

### 3. Build a confusion matrix, which is used to measure how good the model is
We can use Sklearn again to get a confusion matrix. A confusion matrix is basically this: 

In [20]:
# Predicted:    0   1  
# Actual:    0 TN  FP    
# Actual:    1 FN  TP  

Where:  
TN - True Negatives  
TP - True Positives  
FN - False Negatives  
FP - False Positives   

In [21]:
from sklearn import metrics

c_matrix = metrics.confusion_matrix(y_pred, y_test)
c_matrix

array([[114,  46],
       [ 16,  16]])

### 4. Find f1 score of the model
We can again use sklearn for this, or use our own functions!

In [22]:
def get_accuracy(confusion_matrix):
    # TP + TN
    total = confusion_matrix.sum().sum()
    tp_plus_tn = confusion_matrix[1][1] + confusion_matrix[0][0]
    return tp_plus_tn/total

def get_error(confusion_matrix):
    # FP + FN
    total = confusion_matrix.sum().sum()
    fp_plus_fn = confusion_matrix[0][1] + confusion_matrix[0][0]
    return fp_plus_fn/total

def get_recall(confusion_matrix):
    # TP / (TP + FN)
    tp_plus_tn = confusion_matrix[1][1] + confusion_matrix[0][1]
    tp = confusion_matrix[1][1]
    return tp/tp_plus_tn

def get_specificity(confusion_matrix):
    # TN / (TN + FN)
    tn_plus_fn = confusion_matrix[0][0] + confusion_matrix[0][1]
    tn = confusion_matrix[0][0]
    return tn/tn_plus_fn

def get_precision(confusion_matrix):
    # TP / (TP + FP)
    tp_plus_fp = confusion_matrix[1][1] + confusion_matrix[1][0]
    tp = confusion_matrix[1][1]
    return tp/tp_plus_fp

def get_f1_score(confusion_matrix):
    numerator = 2*get_precision(confusion_matrix) * get_recall(confusion_matrix)
    denominator = get_precision(confusion_matrix) + get_recall(confusion_matrix)
    return numerator/denominator

In [26]:
print(f'F1 score of our model: {get_f1_score(c_matrix)}')
print(f'With confusion matrix: \n{c_matrix}')

F1 score of our model: 0.3404255319148936
With confusion matrix: 
[[114  46]
 [ 16  16]]


Not that good of a model! We will learn how to improve it in the next lesson on model evaluation!