%load_ext watermark
%watermark -a "Chibuzor Enyioko" -d -v -p numpy,pandas,matplotlib,seaborn,sklearn

# Project 2: Supervised Classification

This project uses python packages to perform different unsupervised learning methods on a given breast cancer and diabetes dataset.

## Part 2: Diabetes Dataset
### Importing the Libraries

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import seaborn as sns

### Problems
1. Identify which column(s) (both train and test) has/have missing values? Identify the ‘row id’s.
“Impute” them with “Average/Most Frequent” values.

In [55]:
from sklearn.impute import KNNImputer


# importing data sets

dbt_training_data = pd.read_csv("diabetes_training.csv")
dbt_test_data = pd.read_csv("diabetes_testing.csv")

x_train = dbt_training_data.drop(columns=['id', 'class'])
y_train = dbt_training_data['class']
x_test = dbt_test_data.drop(columns=['id', 'class'])
y_test = dbt_test_data['class']

#imputer strategy to k nearest neighbors

knn_columns = ['plas','pres','skin','insu','mass','pedi','age']
imputer_knn = KNNImputer(n_neighbors=3)

x_train_knn = x_train[knn_columns].copy()
x_train_knn.replace(0, np.nan, inplace=True)
x_test_knn = x_test[knn_columns].copy()
x_test_knn.replace(0, np.nan, inplace=True)

train_imputed_knn = pd.DataFrame(imputer_knn.fit_transform(x_train_knn), columns=knn_columns)
test_imputed_knn = pd.DataFrame(imputer_knn.transform(x_test_knn), columns=knn_columns)

2. Calculate accuracy using each of these classifiers (up to 3 decimal places):

3. Now tweak the parameters of the above models, what is the best result you can get? Write the answer and upload the workbook as proof. Name this classifier widget as “<classifier>-best”. Example (if the tree widget is the best performer)

#### Logistic Regression

In [56]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

x_train = train_imputed_knn
x_test = test_imputed_knn


clf = LogisticRegression(penalty='l2', C=0.5, max_iter=1000)
clf.fit(x_train, y_train)

# Metrics
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred, digits=3))


                 precision    recall  f1-score   support

tested_negative      0.779     0.870     0.822       146
tested_positive      0.721     0.576     0.641        85

       accuracy                          0.762       231
      macro avg      0.750     0.723     0.731       231
   weighted avg      0.758     0.762     0.755       231



In [57]:
# tweaking parameters for Logistic Regression
clf_best = LogisticRegression(penalty='l1', solver='saga', C=7, max_iter=100)
clf_best.fit(x_train, y_train)
y_pred_best = clf_best.predict(x_test)
print(classification_report(y_test, y_pred_best, digits=3))

                 precision    recall  f1-score   support

tested_negative      0.708     0.863     0.778       146
tested_positive      0.623     0.388     0.478        85

       accuracy                          0.688       231
      macro avg      0.665     0.626     0.628       231
   weighted avg      0.677     0.688     0.668       231





#### Naive Bayes

In [58]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
y_pred_gnb = gnb.fit(x_train, y_train).predict(x_test)
print(classification_report(y_test, y_pred_gnb, digits=3))

                 precision    recall  f1-score   support

tested_negative      0.787     0.836     0.811       146
tested_positive      0.684     0.612     0.646        85

       accuracy                          0.753       231
      macro avg      0.736     0.724     0.728       231
   weighted avg      0.749     0.753     0.750       231



#### SVM

In [59]:
# SVM
from sklearn import svm

svm_clf = svm.SVC(C=1.0, kernel='rbf', gamma='auto', max_iter=100)
svm_clf.fit(x_train, y_train)
y_pred_svm = svm_clf.predict(x_test)
print(classification_report(y_test, y_pred_svm, zero_division=0, digits=3))


                 precision    recall  f1-score   support

tested_negative      0.734     0.699     0.716       146
tested_positive      0.522     0.565     0.542        85

       accuracy                          0.649       231
      macro avg      0.628     0.632     0.629       231
   weighted avg      0.656     0.649     0.652       231





In [60]:

# tweaking parameters for SVM
svm_clf_best = svm.SVC(C=1.2, kernel='poly', gamma='scale', max_iter=-1)
svm_clf_best.fit(x_train, y_train)
y_pred_svm_best = svm_clf_best.predict(x_test)
print(classification_report(y_test, y_pred_svm_best, zero_division=0, digits=3))


                 precision    recall  f1-score   support

tested_negative      0.759     0.904     0.825       146
tested_positive      0.754     0.506     0.606        85

       accuracy                          0.758       231
      macro avg      0.757     0.705     0.715       231
   weighted avg      0.757     0.758     0.744       231



#### Random Forest

In [61]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=10, min_samples_split=5)
rf_clf.fit(x_train, y_train)
y_pred_rf = rf_clf.predict(x_test)
print(classification_report(y_test, y_pred_rf, digits=3))

                 precision    recall  f1-score   support

tested_negative      0.771     0.829     0.799       146
tested_positive      0.662     0.576     0.616        85

       accuracy                          0.736       231
      macro avg      0.716     0.703     0.708       231
   weighted avg      0.731     0.736     0.732       231



In [62]:
# tweaking random forest parameters
rf_clf_best = RandomForestClassifier(n_estimators=50, min_samples_split=10)
rf_clf_best.fit(x_train, y_train)
y_pred_rf_best = rf_clf_best.predict(x_test)
print(classification_report(y_test, y_pred_rf_best, digits=3))

                 precision    recall  f1-score   support

tested_negative      0.796     0.801     0.799       146
tested_positive      0.655     0.647     0.651        85

       accuracy                          0.745       231
      macro avg      0.725     0.724     0.725       231
   weighted avg      0.744     0.745     0.744       231



#### k-Nearest Neighbors

In [63]:
# k-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=5, metric='euclidean', weights='uniform')
knn_clf.fit(x_train, y_train)
y_pred_knn = knn_clf.predict(x_test)
print(classification_report(y_test, y_pred_knn, digits=3))


                 precision    recall  f1-score   support

tested_negative      0.755     0.801     0.777       146
tested_positive      0.618     0.553     0.584        85

       accuracy                          0.710       231
      macro avg      0.687     0.677     0.681       231
   weighted avg      0.705     0.710     0.706       231



#### Tree

In [64]:
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(max_depth=5, min_samples_split=5, min_samples_leaf=2)
tree_clf.fit(x_train, y_train)
y_pred_tree = tree_clf.predict(x_test)
print(classification_report(y_test, y_pred_tree, digits=3))

                 precision    recall  f1-score   support

tested_negative      0.763     0.815     0.788       146
tested_positive      0.640     0.565     0.600        85

       accuracy                          0.723       231
      macro avg      0.701     0.690     0.694       231
   weighted avg      0.718     0.723     0.719       231

