In [41]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [45]:
# Problem 1
# Reads diabetes file for data
df = pd.read_csv('C:/Users/brans/Downloads/diabetes.csv')
x = df.values[:, :7]
y = df.values[:, 8]

# Standardization
ss = StandardScaler()
x = ss.fit_transform(x)
mms = MinMaxScaler()
x = mms.fit_transform(x)


# 80 and 20 split
np.random.seed(0)
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8,
                                                    test_size=0.2,
                                                    random_state=np.random)

# Regression
lr = LogisticRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
print('Precision:', metrics.precision_score(y_test, y_pred))
print('Recall:', metrics.recall_score(y_test, y_pred))
print('Accuracy:', metrics.accuracy_score(y_test, y_pred))
print('\n')
cnf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix')
print(cnf_matrix)

Precision: 0.8
Recall: 0.5957446808510638
Accuracy: 0.8311688311688312


Confusion Matrix
[[100   7]
 [ 19  28]]


In [46]:
# Problem 2
# Gaussian Naive Bays model
gnb = GaussianNB()
gnb.fit(x_train, y_train)
y_pred = gnb.predict(x_test)

# Precision, Recall, Accuracy
print('Precision:', metrics.precision_score(y_test, y_pred))
print('Recall:', metrics.recall_score(y_test, y_pred))
print('Accuracy:', metrics.accuracy_score(y_test, y_pred))
print('\n')
cnf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix')
print(cnf_matrix)

Precision: 0.6829268292682927
Recall: 0.5957446808510638
Accuracy: 0.7922077922077922


Confusion Matrix
[[94 13]
 [19 28]]


In [34]:
# Problem 3
# K = 5 folds
metrics = ['accuracy', 'precision', 'recall']
kf5 = KFold(n_splits=5, random_state=1, shuffle=True)
scores = cross_validate(lr, x, y, scoring=metrics, cv=kf5, n_jobs=-1)
print('Precision:', scores['test_precision'].mean())
print('Recall:', scores['test_recall'].mean())
print('Accuracy: ', scores['test_accuracy'].mean())
print('\n')

# K = 10 folds
kf10 = KFold(n_splits=10, random_state=1, shuffle=True)
scores = cross_validate(lr, x, y, scoring=metrics, cv=kf10, n_jobs=-1)
print('Precision:', scores['test_precision'].mean())
print('Recall:', scores['test_recall'].mean())
print('Accuracy:', scores['test_accuracy'].mean())


Precision: 0.775323132227157
Recall: 0.5124971820632198
Accuracy:  0.777361853832442


Precision: 0.7427290946889816
Recall: 0.512147494628754
Accuracy: 0.7707792207792208


In [39]:
# Problem 4
# K = 5 folds
scores = cross_validate(gnb, x, y, scoring=metrics, cv=kf5, n_jobs=-1)
print('Precision:', scores['test_precision'].mean())
print('Recall:', scores['test_recall'].mean())
print('Accuracy:', scores['test_accuracy'].mean())
print('\n')

# K = 10 folds
scores = cross_validate(gnb, x, y, scoring=metrics, cv=kf10, n_jobs=-1)
print('Precision:', scores['test_precision'].mean())
print('Recall:', scores['test_recall'].mean())
print('Accuracy:', scores['test_accuracy'].mean())

Precision: 0.6819369457759837
Recall: 0.5828957227259114
Accuracy: 0.759137594431712


Precision: 0.6863522347218
Recall: 0.5936918506558686
Accuracy: 0.7655673274094327
