In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

column_names = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "Outcome"]
df = pd.read_csv('./diabetes.csv', names=column_names)
df.drop(index=df.index[0], axis=0, inplace=True)

X = df.drop("Outcome", axis=1)
y = df["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Dataset shape:", df.shape)
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

Dataset shape: (768, 9)
Training data shape: (614, 8)
Testing data shape: (154, 8)


# Naive Bayes
GaussianNB model is used. No need of Laplacian correction here as the model assumes all data to follow normal distribution

In [3]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Training
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# predictions
y_pred_nb = nb_model.predict(X_test)

# Evaluation
print("Naïve Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))

Naïve Bayes Accuracy: 0.7662337662337663
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.80      0.81        99
           1       0.66      0.71      0.68        55

    accuracy                           0.77       154
   macro avg       0.75      0.75      0.75       154
weighted avg       0.77      0.77      0.77       154



--------------------------------------------------------------------------------------------------------------------

# Linear Regression with Stochastic Gradient Descent



In [4]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report

# Training
sgd_model = SGDClassifier(loss="log_loss", random_state=42)
sgd_model.fit(X_train, y_train)

# predictions
y_pred_sgd = sgd_model.predict(X_test)

# evaluation
print("SGD Linear Regression Accuracy:", accuracy_score(y_test, y_pred_sgd))
print("Classification Report:\n", classification_report(y_test, y_pred_sgd))

SGD Linear Regression Accuracy: 0.6688311688311688
Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.88      0.77        99
           1       0.57      0.29      0.39        55

    accuracy                           0.67       154
   macro avg       0.63      0.58      0.58       154
weighted avg       0.65      0.67      0.63       154



--------------------------------------------------------------------------------------------------------------------

# Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Training
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)

# predictions
y_pred_lr = lr_model.predict(X_test)

# Evaluation
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.7467532467532467
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.79      0.80        99
           1       0.64      0.67      0.65        55

    accuracy                           0.75       154
   macro avg       0.73      0.73      0.73       154
weighted avg       0.75      0.75      0.75       154



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


--------------------------------------------------------------------------------------------------------------------

# K-Nearest Neighbour

Using:
 - Euclidean Distance
 - Manhattan Distance

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# KNN with Euclidean distance
knn_euclidean = KNeighborsClassifier(n_neighbors=5, metric="euclidean")
knn_euclidean.fit(X_train, y_train)
y_pred_knn_euclidean = knn_euclidean.predict(X_test)

# KNN with Manhattan distance
knn_manhattan = KNeighborsClassifier(n_neighbors=5, metric="manhattan")
knn_manhattan.fit(X_train, y_train)
y_pred_knn_manhattan = knn_manhattan.predict(X_test)

# Evaluate the models
print("KNN (Euclidean) Accuracy:", accuracy_score(y_test, y_pred_knn_euclidean))
print("KNN (Manhattan) Accuracy:", accuracy_score(y_test, y_pred_knn_manhattan))
print("Classification Report (Euclidean):\n", classification_report(y_test, y_pred_knn_euclidean))
print("Classification Report (Manhattan):\n", classification_report(y_test, y_pred_knn_manhattan))

KNN (Euclidean) Accuracy: 0.6623376623376623
KNN (Manhattan) Accuracy: 0.6688311688311688
Classification Report (Euclidean):
               precision    recall  f1-score   support

           0       0.75      0.71      0.73        99
           1       0.52      0.58      0.55        55

    accuracy                           0.66       154
   macro avg       0.64      0.64      0.64       154
weighted avg       0.67      0.66      0.67       154

Classification Report (Manhattan):
               precision    recall  f1-score   support

           0       0.76      0.72      0.74        99
           1       0.53      0.58      0.56        55

    accuracy                           0.67       154
   macro avg       0.64      0.65      0.65       154
weighted avg       0.68      0.67      0.67       154



--------------------------------------------------------------------------------------------------------------------

# Decision Tree using Information Gain

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Decision Tree with Information Gain
dt_entropy = DecisionTreeClassifier(criterion="entropy", random_state=42)
dt_entropy.fit(X_train, y_train)
y_pred_dt_entropy = dt_entropy.predict(X_test)

# Evaluation
print("Decision Tree (Entropy) Accuracy:", accuracy_score(y_test, y_pred_dt_entropy))
print("Classification Report (Entropy):\n", classification_report(y_test, y_pred_dt_entropy))

Decision Tree (Entropy) Accuracy: 0.7207792207792207
Classification Report (Entropy):
               precision    recall  f1-score   support

           0       0.79      0.77      0.78        99
           1       0.60      0.64      0.62        55

    accuracy                           0.72       154
   macro avg       0.70      0.70      0.70       154
weighted avg       0.72      0.72      0.72       154

