## Loading and Visualizing the Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the diabetes data from the CSV file in the working directory.
raw_dataset = pd.read_csv('diabetes.csv')

# Report the (rows, columns) dimensions of the loaded frame.
print(f"Dataset shape: {raw_dataset.shape}")

# Preview the first five records; as the last expression in the cell this is
# rendered as the cell output.
raw_dataset.head(n=5)

In [None]:
# Column dtypes and non-null counts.
# DataFrame.info() writes its report directly to stdout and returns None, so
# it is called on its own; wrapping it in print() would emit a stray "None".
print("Dataset info:")
raw_dataset.info()
print()

# Number of missing (null) entries in each column.
print("Missing values in each column:")
print(raw_dataset.isnull().sum(), "\n")

# Descriptive statistics of the dataset's columns.
print("Statistical summary of the dataset:")
print(raw_dataset.describe(), "\n")

In [None]:
# Class counts of the target column, printed as text.
outcome_counts = raw_dataset['Outcome'].value_counts()
print("Distribution of target variable (Outcome):")
print(outcome_counts, "\n")

# Bar chart of the same class balance, with descriptive tick labels.
sns.countplot(data=raw_dataset, x='Outcome')
plt.xticks(ticks=[0, 1], labels=['0 (No Diabetes)', '1 (Diabetes)'])
plt.title('Distribution of Outcome Variable')
plt.show()

In [None]:
# Descriptive statistics of every feature, computed separately for each
# value of the Outcome column.
print("\nSummary statistics by outcome:")
rows_by_outcome = raw_dataset.groupby('Outcome')
rows_by_outcome.describe()

In [None]:
# Pairwise correlations between all columns of the dataset.
correlation_matrix = raw_dataset.corr()

# Annotated heatmap of the correlation matrix (two-decimal cell labels).
heatmap_ax = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    annot_kws={"size": 10},
)
heatmap_ax.set_title('Correlation Heatmap of Features')
plt.show()

From the heatmap, glucose appears to have the strongest effect on the outcome in this data, while a few other features also affect the outcome to a lesser degree.

## Splitting and Standardizing the data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Feature matrix: every column except the target and the two features judged
# less relevant from the correlation analysis.
dropped_columns = ['SkinThickness', 'BloodPressure', 'Outcome']
X = raw_dataset.drop(columns=dropped_columns)

# Target vector.
y = raw_dataset['Outcome']

print(X.head())

# Standardize the feature columns.
# NOTE: the scaler is fit on every row here, i.e. before any train/test split.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Preview the first five standardized rows.
print("\nFirst 5 rows of scaled features:")
print(X_scaled[:5])

In [None]:
# Sanity check: print the dimensions of the raw frame, the feature matrix
# and the target vector, in that order.
for shape_label, shaped_object in (
    ("Raw dataset shape:", raw_dataset),
    ("Features shape:", X),
    ("Target shape:", y),
):
    print(shape_label, shaped_object.shape)

In [None]:
# Split the *unscaled* features so that the scaler never sees test rows.
# A fixed random_state keeps the split reproducible, and stratify=y keeps the
# class proportions of the target the same in both splits.
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply that same transform to
# both splits. Fitting on all rows would leak test-set statistics (mean and
# standard deviation) into preprocessing and make the test scores optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)

# print the shape of the train and test sets
print("\nTrain and Test set shapes:")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

## Training model

We start with a logistic regression model.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Fit a logistic regression classifier on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
logReg = LogisticRegression(random_state=42, max_iter=200).fit(X_train, y_train)

# Predict on the held-out split and score the predictions.
y_pred_logReg = logReg.predict(X_test)
accuracy_logReg = accuracy_score(y_test, y_pred_logReg)

# Overall accuracy followed by the per-class precision/recall/F1 report.
print(f"\nLogistic Regression Accuracy: {accuracy_logReg:.4f}")
logReg_report = classification_report(y_test, y_pred_logReg)
print("Classification Report:\n", logReg_report)

In [None]:
# Confusion matrix of the logistic regression predictions
# (rows: actual class, columns: predicted class).
confusion_logReg = confusion_matrix(y_test, y_pred_logReg)
print("Confusion Matrix:\n", confusion_logReg)

# Draw the matrix as an annotated heatmap with integer cell labels.
confusion_ax = sns.heatmap(confusion_logReg, annot=True, fmt='d', cmap='Blues')
confusion_ax.set_title('Confusion Matrix - Logistic Regression')
confusion_ax.set_xlabel('Predicted')
confusion_ax.set_ylabel('Actual')
plt.show()

Next, we train SVM models with linear and RBF kernels.

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, fitted on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
svm_linear = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Predict on the held-out split and score the predictions.
y_pred_svm_linear = svm_linear.predict(X_test)
accuracy_svm_linear = accuracy_score(y_test, y_pred_svm_linear)

# Overall accuracy followed by the per-class precision/recall/F1 report.
print(f"\nSVM (Linear Kernel) Accuracy: {accuracy_svm_linear:.4f}")
svm_linear_report = classification_report(y_test, y_pred_svm_linear)
print("Classification Report:\n", svm_linear_report)

In [None]:
# Confusion matrix of the linear-kernel SVM predictions
# (rows: actual class, columns: predicted class).
confusion_svm_linear = confusion_matrix(y_test, y_pred_svm_linear)
print("Confusion Matrix:\n", confusion_svm_linear)

# Draw the matrix as an annotated heatmap with integer cell labels.
confusion_ax = sns.heatmap(confusion_svm_linear, annot=True, fmt='d', cmap='Blues')
confusion_ax.set_title('Confusion Matrix - SVM (Linear Kernel)')
confusion_ax.set_xlabel('Predicted')
confusion_ax.set_ylabel('Actual')
plt.show()

In [None]:
# Support vector classifier with an RBF kernel, fitted on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
svm_rbf = SVC(kernel='rbf', random_state=42).fit(X_train, y_train)

# Predict on the held-out split and score the predictions.
y_pred_svm_rbf = svm_rbf.predict(X_test)
accuracy_svm_rbf = accuracy_score(y_test, y_pred_svm_rbf)

# Overall accuracy followed by the per-class precision/recall/F1 report.
print(f"\nSVM (RBF Kernel) Accuracy: {accuracy_svm_rbf:.4f}")
svm_rbf_report = classification_report(y_test, y_pred_svm_rbf)
print("Classification Report:\n", svm_rbf_report)

In [None]:
# Confusion matrix of the RBF-kernel SVM predictions
# (rows: actual class, columns: predicted class).
confusion_svm_rbf = confusion_matrix(y_test, y_pred_svm_rbf)
print("Confusion Matrix:\n", confusion_svm_rbf)

# Draw the matrix as an annotated heatmap with integer cell labels.
confusion_ax = sns.heatmap(confusion_svm_rbf, annot=True, fmt='d', cmap='Blues')
confusion_ax.set_title('Confusion Matrix - SVM (RBF Kernel)')
confusion_ax.set_xlabel('Predicted')
confusion_ax.set_ylabel('Actual')
plt.show()

#### The SVM model with the linear kernel gives the best accuracy.

This model is saved to a pickle file so that it can be used in the Streamlit app.

In [None]:
import pickle

# Persist the linear-kernel SVM: it is the model the notebook identifies as
# the best performer, so it is the one that must be written to disk.
filename = 'diabetes_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(svm_linear, file)

print(f"Model saved as {filename}")

# The models are trained on standardized features, so any consumer of the
# pickled model has to apply the same fitted scaler to raw inputs before
# calling predict(). Save it next to the model in a separate file.
scaler_filename = 'diabetes_scaler.pkl'
with open(scaler_filename, 'wb') as file:
    pickle.dump(scaler, file)

print(f"Scaler saved as {scaler_filename}")