In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
df = pd.DataFrame(data=np.array([[1, 2], [3, 4]]))

In [148]:
def generate_confusion_matrix(actual_list, predicted_list):
    out = [[0., 0.],[0., 0.]]
    # Predicted     0   1
    # Actual:    0 TN  FP
    # Actual:    1 FN  TP 

    assert(len(actual_list) == len(predicted_list))
    
    for actual,predicted in zip(actual_list, predicted_list):
        # True Negatives/positives
        if actual == predicted: # True {something}
            if actual == 0: # True Negatices
                out[0][0] += 1
            else: # True Positives
                out[1][1] += 1
        else: # False {something}
            if predicted == 1: # False Positives
                out[0][1] += 1
            else: # False negatives
                out[1][0] += 1
    
    return pd.DataFrame(data=out, columns=[0, 1], index=[0, 1])

In [149]:
def get_accuracy(confusion_matrix):
    # TP + TN
    total = confusion_matrix.sum().sum()
    tp_plus_tn = confusion_matrix[1][1] + confusion_matrix[0][0]
    return tp_plus_tn/total

def get_error(confusion_matrix):
    # FP + FN
    total = confusion_matrix.sum().sum()
    fp_plus_fn = confusion_matrix[0][1] + confusion_matrix[0][0]
    return fp_plus_fn/total

def get_recall(confusion_matrix):
    # TP / (TP + FN)
    tp_plus_tn = confusion_matrix[1][1] + confusion_matrix[0][1]
    tp = confusion_matrix[1][1]
    return tp/tp_plus_tn

def get_specificity(confusion_matrix):
    # TN / (TN + FN)
    tn_plus_fn = confusion_matrix[0][0] + confusion_matrix[0][1]
    tn = confusion_matrix[0][0]
    return tn/tn_plus_fn

def get_precision(confusion_matrix):
    # TP / (TP + FP)
    tp_plus_fp = confusion_matrix[1][1] + confusion_matrix[1][0]
    tp = confusion_matrix[1][1]
    return tp/tp_plus_fp

def get_f1_score(confusion_matrix):
    numerator = 2*get_precision(confusion_matrix) * get_recall(confusion_matrix)
    denominator = get_precision(confusion_matrix) + get_recall(confusion_matrix)
    return numerator/denominator

In [34]:
c_matrix = pd.DataFrame(data=np.array([[118., 12.], [ 47., 15.]]))

In [35]:
c_matrix

Unnamed: 0,0,1
0,118.0,12.0
1,47.0,15.0


In [36]:
print(f'Accuracy: {get_accuracy(c_matrix)}')
print(f'Error: {get_error(c_matrix)}')
print(f'Recall: {get_recall(c_matrix)}')
print(f'Speceficity: {get_specificity(c_matrix)}')
print(f'Precision: {get_precision(c_matrix)}')
print(f'F1_score: {get_f1_score(c_matrix)}')

Accuracy: 0.6927083333333334
Error: 0.859375
Recall: 0.24193548387096775
Speceficity: 0.7151515151515152
Precision: 0.5555555555555556
F1_score: 0.3370786516853933


In [41]:
df = pd.read_csv('diabetes.csv')

feature_cols = ['Pregnancies', 'Insulin', 'BMI', 'Age']
feature_df = df[feature_cols]

X = feature_df.to_numpy()
y = df['Outcome'].to_numpy()

In [42]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=0)

In [43]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [46]:
y_pred = log_reg.predict_proba(X_test)

In [104]:
def round_y_prob(input_matrix, threshold=.5):
    out = []
    for i in input_matrix:
        if i[1] < threshold:
            out.append(0)
        else:
            out.append(1)
    return out 

In [125]:
# Dataset is very unbalanced, so a threshold of .5 might not be a good idea! 
vc = pd.DataFrame(data=y_train, columns=['output'])['output'].value_counts()
threshold = vc[1]/(vc.sum())

In [155]:
y_actual_pred = round_y_prob(y_pred, .335195)

In [156]:
cf_matrix = generate_confusion_matrix(y_test, y_actual_pred)
cf_matrix

Unnamed: 0,0,1
0,84.0,46.0
1,17.0,45.0


In [157]:
print(f'Accuracy: {get_accuracy(cf_matrix)}')
print(f'Error: {get_error(cf_matrix)}')
print(f'Recall: {get_recall(cf_matrix)}')
print(f'Speceficity: {get_specificity(cf_matrix)}')
print(f'Precision: {get_precision(cf_matrix)}')
print(f'F1_score: {get_f1_score(cf_matrix)}')

Accuracy: 0.671875
Error: 0.5260416666666666
Recall: 0.7258064516129032
Speceficity: 0.8316831683168316
Precision: 0.4945054945054945
F1_score: 0.5882352941176471


In [158]:
from sklearn.model_selection import cross_val_score 
import pandas as pd
from sklearn.linear_model import LogisticRegression

pima = pd.read_csv('diabetes.csv')

feature_cols = ['Pregnancies', 'Insulin', 'BMI', 'Age']

# X is a matrix,access the features we want in feature_cols
X = pima[feature_cols]

# y is a vector, hence we use dot to access 'label'
y = pima['Outcome']

print(y.value_counts()[0])
print(y.value_counts()[1])

logreg = LogisticRegression(class_weight={1: 500/268})
#logreg = LogisticRegression(class_weight={1: y.value_counts()[0]/y.value_counts()[1]})
# logreg = LogisticRegression()

all_accuracies = cross_val_score(estimator=logreg, X=X, y=y, cv=5, scoring='accuracy')
print(all_accuracies)
print(all_accuracies.mean())

all_f1 = cross_val_score(estimator=logreg, X=X, y=y, cv=5, scoring='f1')
print(all_f1)
print(all_f1.mean())

500
268
[0.64935065 0.65584416 0.64935065 0.69281046 0.65359477]
0.6601901366607248
[0.578125   0.55462185 0.54237288 0.624      0.576     ]
0.5750239460190857




In [None]:
# When we want to choose between SVM and logistic regression, choose the one that has less 
# variance in accuracy(5-fold cv). If they are both low, choose the model that has the highest mean 
# for accuracy. 