In [4]:
import pandas as pd
# location of the raw Pima Indians diabetes CSV
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
# column names are supplied explicitly because the file is read with header=None
col_names = [
    "pregnant",
    "glucose",
    "bp",
    "skin",
    "insulin",
    "bmi",
    "pedigree",
    "age",
    "label",
]
pima = pd.read_csv(url, names=col_names, header=None)

In [5]:
# preview the first five rows to check that the column names were applied
pima.head(n=5)

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# feature matrix X: the four predictor columns used by the model
feature_cols = ["pregnant", "insulin", "bmi", "age"]
X = pima.loc[:, feature_cols]
# response vector y: the 0/1 outcome column
y = pima["label"]

In [7]:
# hold out part of the data for evaluation; random_state makes the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,  # scikit-learn's default hold-out share, spelled out for the reader
    random_state=0,
)

In [9]:
# train a logistic regression model
from sklearn.linear_model import LogisticRegression
# Pin the solver explicitly: the recorded fit used liblinear, but newer
# scikit-learn releases default to lbfgs, which can change the fitted
# coefficients and therefore the metrics computed from this model.
logreg = LogisticRegression(solver="liblinear")
# left as the last expression so the notebook displays the fitted estimator
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
# predicted class labels for the held-out rows
y_pred_class = logreg.predict(X=X_test)

In [11]:
# accuracy: fraction of test rows whose predicted label matches the true label
from sklearn import metrics
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class))

0.692708333333


In [12]:
# examine the class distribution of the testing set: number of rows per label value
# (left as the cell's last expression so the notebook displays the resulting Series)
y_test.value_counts()

0    130
1     62
Name: label, dtype: int64

In [13]:
# calculate the fraction of 1s (a proportion between 0 and 1, not a percentage):
# the mean of the labels equals the share of 1s because the labels are coded 0/1
y_test.mean()

0.32291666666666669

In [14]:
# calculate the fraction of 0s (a proportion between 0 and 1, not a percentage):
# with 0/1 labels this is the complement of the share of 1s
1 - y_test.mean()

0.67708333333333326

In [15]:
# calculate null accuracy for binary classification: the larger of the two class
# shares, i.e. the accuracy of always predicting the most frequent class
# (only valid when the labels are coded as 0/1)
max(y_test.mean(), 1 - y_test.mean())

0.67708333333333326

In [17]:
# null accuracy for multi-class classification: share of the most frequent class
# (value_counts() sorts by count in descending order, so the first row is the majority class)
y_test.value_counts().iloc[:1] / len(y_test)

0    0.677083
Name: label, dtype: float64

In [18]:
# eyeball the first 25 true labels against the first 25 predicted labels
print("true: ", y_test.values[:25])
print("pred: ", y_pred_class[:25])

true:  [1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0]
pred:  [0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [19]:
# Confusion matrix: rows are the actual classes, columns are the predicted classes
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class))

[[118  12]
 [ 47  15]]


In [20]:
# save the confusion matrix and slice it into its four cells
# labels=[0, 1] fixes the row/column order and keeps the matrix 2x2 even when one
# class is absent from both y_test and y_pred_class, so the unpacking cannot fail
confusion = metrics.confusion_matrix(y_test, y_pred_class, labels=[0, 1])
# row = actual class, column = predicted class; ravel() flattens row by row,
# giving [0,0], [0,1], [1,0], [1,1] in that order
TN, FP, FN, TP = confusion.ravel()

In [21]:
# Classification Accuracy - what fraction of all predictions is correct?
# computed by hand from the confusion-matrix cells, then cross-checked against scikit-learn
print((TN + TP) / (TN + FP + FN + TP))
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class))

0.692708333333
0.692708333333


In [23]:
# Classification Error (misclassification rate) - what fraction of all predictions is wrong?
# computed by hand from the confusion-matrix cells, then cross-checked as 1 - accuracy
print((FN + FP) / (TN + FP + FN + TP))
print(1 - metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class))

0.307291666667
0.307291666667


In [24]:
# Sensitivity (recall) - when the actual value is positive, how often is the prediction correct?
# computed by hand from the confusion-matrix cells, then cross-checked against scikit-learn
print(TP / (FN + TP))
print(metrics.recall_score(y_true=y_test, y_pred=y_pred_class))

0.241935483871
0.241935483871


In [25]:
# Specificity - when the actual value is negative, how often is the prediction correct?
# denominator is every actual negative: correctly rejected (TN) plus falsely flagged (FP)
print(TN / (FP + TN))

0.907692307692


In [26]:
# False Positive Rate - when the actual value is negative, how often is the prediction incorrect?
# shares its denominator with specificity, so the two rates add up to 1
print(FP / (FP + TN))

0.0923076923077


In [27]:
# Precision - when a positive value is predicted, how often is the prediction correct?
# computed by hand from the confusion-matrix cells, then cross-checked against scikit-learn
print(TP / (FP + TP))
print(metrics.precision_score(y_true=y_test, y_pred=y_pred_class))

0.555555555556
0.555555555556
