In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
df = pd.DataFrame(data=np.array([[1, 2], [3, 4]]))

In [100]:
def generate_confusion_matrix(actual_list, predicted_list):
    out = [[0., 0.],[0., 0.]]
    # Predicted     0   1
    # Actual:    0 TN  FP
    # Actual:    1 FN  TP 

    assert(len(actual_list) == len(predicted_list))
    
    for actual,predicted in zip(actual_list, predicted_list):
        # True Negatives/positives
        if actual == predicted: # True {something}
            if actual == 0: # True Negatices
                out[0][0] += 1
            else: # True Positives
                out[1][1] += 1
        else: # False {something}
            if predicted == 1: # False Positives
                out[0][1] += 1
            else: # False negatives
                out[1][0] += 1
    
    return pd.DataFrame(data=out, columns=['0', '1'], index=['0', '1'])

In [33]:
def get_accuracy(confusion_matrix):
    # TP + TN
    total = confusion_matrix.sum().sum()
    tp_plus_tn = confusion_matrix[1][1] + confusion_matrix[0][0]
    return tp_plus_tn/total

def get_error(confusion_matrix):
    # FP + FN
    total = confusion_matrix.sum().sum()
    fp_plus_fn = confusion_matrix[0][1] + confusion_matrix[0][0]
    return fp_plus_fn/total

def get_recall(confusion_matrix):
    # TP / (TP + FN)
    tp_plus_tn = confusion_matrix[1][1] + confusion_matrix[0][1]
    tp = confusion_matrix[1][1]
    return tp/tp_plus_tn

def get_specificity(confusion_matrix):
    # TN / (TN + FN)
    tn_plus_fn = confusion_matrix[0][0] + confusion_matrix[0][1]
    tn = confusion_matrix[0][0]
    return tn/tn_plus_fn

def get_precision(confusion_matrix):
    # TP / (TP + FP)
    tp_plus_fp = confusion_matrix[1][1] + confusion_matrix[1][0]
    tp = confusion_matrix[1][1]
    return tp/tp_plus_fp

def get_f1_score(confusion_matrix):
    numerator = 2*get_precision(confusion_matrix) * get_recall(confusion_matrix)
    denominator = get_precision(confusion_matrix) + get_recall(confusion_matrix)
    return numerator/denominator

In [34]:
c_matrix = pd.DataFrame(data=np.array([[118., 12.], [ 47., 15.]]))

In [35]:
c_matrix

Unnamed: 0,0,1
0,118.0,12.0
1,47.0,15.0


In [36]:
print(f'Accuracy: {get_accuracy(c_matrix)}')
print(f'Error: {get_error(c_matrix)}')
print(f'Recall: {get_recall(c_matrix)}')
print(f'Speceficity: {get_specificity(c_matrix)}')
print(f'Precision: {get_precision(c_matrix)}')
print(f'F1_score: {get_f1_score(c_matrix)}')

Accuracy: 0.6927083333333334
Error: 0.859375
Recall: 0.24193548387096775
Speceficity: 0.7151515151515152
Precision: 0.5555555555555556
F1_score: 0.3370786516853933


In [41]:
df = pd.read_csv('diabetes.csv')

feature_cols = ['Pregnancies', 'Insulin', 'BMI', 'Age']
feature_df = df[feature_cols]

X = feature_df.to_numpy()
y = df['Outcome'].to_numpy()

In [42]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=0)

In [43]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [46]:
y_pred = log_reg.predict_proba(X_test)

In [67]:
def round_y_prob(input_matrix, threshold=.5):
    out = []
    for i in input_matrix:
        if i[1] < threshold:
            out.append(0)
        else:
            out.append(1)
    return out 

In [86]:
y_actual_pred = round_y_prob(y_pred, .357)

In [87]:
print(y_actual_pred)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [99]:
print(y_test)

[1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
 0 0 0 0 0 0 1 1 0 0 1 1 1 0 0 1 0 0 0 0 1 1 1 1 0 0 1 1 1 1 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 1 0
 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 1 0 0 0
 0 1 0 1 0 0 1 0 0 0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0 1 1
 0 1 1 1 0 0 0]


In [71]:
# Dataset is very unbalanced, so a threshold of .5 might not be a good idea! 

In [85]:
vc = pd.DataFrame(data=y_train, columns=['output'])['output'].value_counts()
vc[1]/(vc.sum())

0.3576388888888889

In [101]:
cf_matrix = generate_confusion_matrix(y_test, y_actual_pred)

In [102]:
c_matrix

Unnamed: 0,0,1
0,118.0,12.0
1,47.0,15.0


In [103]:
cf_matrix

Unnamed: 0,0,1
0,126.0,4.0
1,54.0,8.0


In [97]:
cf_matrix = pd.DataFrame(data=np.array([[126., 4.], [54., 8.]]))
cf_matrix

Unnamed: 0,0,1
0,126.0,4.0
1,54.0,8.0


In [98]:
print(f'Accuracy: {get_accuracy(cf_matrix)}')
print(f'Error: {get_error(cf_matrix)}')
print(f'Recall: {get_recall(cf_matrix)}')
print(f'Speceficity: {get_specificity(cf_matrix)}')
print(f'Precision: {get_precision(cf_matrix)}')
print(f'F1_score: {get_f1_score(cf_matrix)}')

Accuracy: 0.6979166666666666
Error: 0.9375
Recall: 0.12903225806451613
Speceficity: 0.7
Precision: 0.6666666666666666
F1_score: 0.2162162162162162
