In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the Pima diabetes data from the local CSV file
pima = pd.read_csv('Datasets/diabetes.csv')

# Columns used as model inputs
feature_cols = ['Pregnancies', 'Insulin', 'BMI', 'Age']

# X: the feature matrix, i.e. only the columns listed in feature_cols
X = pima.loc[:, feature_cols]

# y: the target vector, i.e. the 'Outcome' column selected by label
y = pima.loc[:, 'Outcome']

# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# How many test rows fall into each Outcome value
y_test.value_counts()

0    130
1     62
Name: Outcome, dtype: int64

## Activity: Check the size of y_train and show that it is 75% of the whole dataset

In [2]:
# For each split, print its actual length followed by the length expected
# from its share of the whole dataset (75% train, 25% test)
for labels, share in ((y_train, 0.75), (y_test, 0.25)):
    print(len(labels))
    print(share*len(pima))

576
576.0
192
192.0


## Activity: Build the classifier model with Logistic Regression and produce y_pred from X_test (features from test part)

- Instantiate logistic regression model
- Train the model with X_train and y_train
- Pass X_test into predict method -> call the result as y_pred
- print y_pred

hint : from sklearn.linear_model import LogisticRegression

In [3]:
from sklearn.linear_model import LogisticRegression

# Create the logistic regression model and train it on the training split
# in one step (fit returns the fitted estimator itself)
logreg = LogisticRegression().fit(X_train, y_train)

# Predicted class label for every row of the test features
y_pred = logreg.predict(X_test)

# Show the predictions
print(y_pred)

[0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0]


## Intro to Confusion Matrix
A confusion matrix is a table that is used to describe the performance of a classifier on a set of test data where we know the true values. Essentially, we use it to check how well our classifier's predicted values match the known values of the same data.

### Activity: Write a function that calculates the confusion matrix for the Pima Diabetes dataset

The confusion matrix itself is a simple 2x2 matrix, but it's important we go over the terminology of each row/column in the matrix:

**True Positives (TP)**: we correctly predicted a positive outcome (i.e. someone has diabetes, and we correctly predicted it)
- How many 1s (diabetes) in y_test are predicted correctly as 1 (diabetes) in y_pred?

**True Negatives (TN)**: we correctly predicted a negative outcome (i.e. someone does not have diabetes, and we correctly predicted it)
- How many 0s (no diabetes) in y_test are predicted correctly as 0 (no diabetes) in y_pred?

**False Positives (FP)**: we incorrectly predicted a positive outcome (i.e. someone does not have diabetes, and we incorrectly said that they did)
- How many 0s (no diabetes) in y_test are predicted incorrectly as 1 (diabetes) in y_pred?

**False Negatives (FN)**: we incorrectly predicted a negative outcome (i.e. someone has diabetes, and we incorrectly said that they do not)
- How many 1s (diabetes) in y_test are predicted incorrectly as 0 (no diabetes) in y_pred?

In [4]:
import numpy as np

def comp_yt_yp(y_test, y_predict):
    """Build a 2x2 confusion matrix for binary (0/1) labels.

    Rows are indexed by the true label and columns by the predicted label,
    so the cells are TN (0, 0), FP (0, 1), FN (1, 0) and TP (1, 1).

    Parameters
    ----------
    y_test : iterable
        True labels; each element is compared against 0 and 1.
    y_predict : iterable
        Predicted labels, paired positionally with ``y_test``.

    Returns
    -------
    numpy.ndarray
        Float array of shape (2, 2) holding the count for each cell.

    Notes
    -----
    The label pairs are consumed in a single pass, so one-shot iterators
    and generators are counted correctly. Pairs in which either label is
    neither 0 nor 1 are ignored. If the inputs differ in length, the extra
    elements of the longer one are ignored (``zip`` semantics).
    """
    # create a blank 2x2 confusion matrix (all 0s)
    conf_matrix = np.zeros((2, 2))
    valid_labels = (0, 1)
    for true_label, predicted_label in zip(y_test, y_predict):
        # Only 0/1 labels map to a cell; anything else is skipped.
        if true_label in valid_labels and predicted_label in valid_labels:
            # int() turns equal-valued labels (e.g. 1.0 or True) into indices
            conf_matrix[int(true_label), int(predicted_label)] += 1
    return conf_matrix

# Print the hand-computed confusion matrix for the test split
# (rows: true label, columns: predicted label)
print(comp_yt_yp(y_test, y_pred))

[[114.  16.]
 [ 46.  16.]]


In [18]:
# Tally the four confusion-matrix cells by hand, treating 1 (diabetes) as the
# positive class and 0 (no diabetes) as the negative class.
num_TP = num_TN = num_FP = num_FN = 0
for (actual, predicted) in zip(y_test, y_pred):
    if actual == 1 and predicted == 1:
        # True positive: a 1 (diabetes) in y_test correctly predicted as 1
        num_TP += 1
    elif actual == 0 and predicted == 0:
        # True negative: a 0 (no diabetes) in y_test correctly predicted as 0
        num_TN += 1
    elif actual == 0 and predicted == 1:
        # False positive: a 0 (no diabetes) in y_test incorrectly predicted as 1
        num_FP += 1
    elif actual == 1 and predicted == 0:
        # False negative: a 1 (diabetes) in y_test incorrectly predicted as 0
        num_FN += 1

# Print the counters computed in this cell, so the cell does not depend on
# names defined in any other cell
print(num_TP, num_TN, num_FP, num_FN)

16 114 16 46


In [6]:
def compare_ys(y_te, y_pre, value_of_y_te, value_of_y_pre):
    """Count positions where the true and predicted labels match given values.

    Parameters
    ----------
    y_te : iterable
        True labels.
    y_pre : iterable
        Predicted labels, paired positionally with ``y_te``.
    value_of_y_te
        Value the true label must equal for a pair to be counted.
    value_of_y_pre
        Value the predicted label must equal for a pair to be counted.

    Returns
    -------
    int
        Number of pairs whose true label equals ``value_of_y_te`` and whose
        predicted label equals ``value_of_y_pre``.
    """
    return sum(
        1
        for actual, predicted in zip(y_te, y_pre)
        if (actual == value_of_y_te) and (predicted == value_of_y_pre)
    )

# Print the count for every (true label, predicted label) combination,
# in the order (0, 0), (0, 1), (1, 0), (1, 1)
for true_value, predicted_value in ((0, 0), (0, 1), (1, 0), (1, 1)):
    print(compare_ys(y_test, y_pred, true_value, predicted_value))

114
16
46
16


In [7]:
def comp_yt_yp(y_test, y_predict):
    """Return a 2x2 matrix of counts of (true label, predicted label) pairs.

    Cell ``[m, n]`` holds the number of positions where the true label equals
    ``m`` and the predicted label equals ``n``, for ``m`` and ``n`` in {0, 1}.
    The result is a float NumPy array.

    The inputs are zipped once per cell (four times in total), so they must
    be re-iterable sequences rather than one-shot iterators.

    NOTE: this redefines the function of the same name declared earlier in
    the notebook.
    """
    conf_matrix = np.zeros((2, 2))
    for true_value in (0, 1):
        for predicted_value in (0, 1):
            # Count the pairs that land in this cell
            conf_matrix[true_value, predicted_value] = sum(
                1
                for actual, predicted in zip(y_test, y_predict)
                if (actual == true_value) & (predicted == predicted_value)
            )
    return conf_matrix

# Print the 2x2 count matrix for the test split (rows: true label, columns: predicted label)
print(comp_yt_yp(y_test, y_pred))

[[114.  16.]
 [ 46.  16.]]


## Easier way to compute elements of Confusion Matrix using sklearn

In [8]:
from sklearn import metrics

# Let scikit-learn build the confusion matrix
# (rows: true label, columns: predicted label)
confusion = metrics.confusion_matrix(y_test, y_pred)
print(confusion)

# Unpack the two rows: row 0 holds the true-0 counts, row 1 the true-1 counts
(TN, FP), (FN, TP) = confusion

[[114  16]
 [ 46  16]]


In [21]:
# First 10 rows of the test features
X_test.head(10)

Unnamed: 0,Pregnancies,Insulin,BMI,Age
661,1,0,42.9,22
122,2,100,33.6,23
113,4,0,34.0,25
14,5,175,25.8,51
529,0,0,24.6,31
103,1,40,26.6,24
338,9,171,34.2,33
588,3,156,33.3,52
395,2,275,27.7,25
204,6,190,37.7,55


In [19]:
# First 10 predicted class labels for X_test
logreg.predict(X_test)[:10]

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1])

In [20]:
# First 10 rows of the predicted probabilities for X_test (one column per class).
# NOTE(review): column order presumably follows logreg.classes_ (0 then 1) — confirm
logreg.predict_proba(X_test)[:10]

array([[0.61405867, 0.38594133],
       [0.7505398 , 0.2494602 ],
       [0.74167648, 0.25832352],
       [0.60291327, 0.39708673],
       [0.88426611, 0.11573389],
       [0.87695895, 0.12304105],
       [0.50819992, 0.49180008],
       [0.44582289, 0.55417711],
       [0.77950769, 0.22049231],
       [0.25853303, 0.74146697]])