<a href="https://colab.research.google.com/github/Alif416/Diabetes-Detection-using-Machine-Learning/blob/main/Diabetes_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split



In [None]:
# Read the diabetes CSV into a DataFrame (the path points at the Colab /content directory).
dataset = pd.read_csv("/content/diabetes_dataset.csv")
# Preview the first 200 rows.
dataset.head(200)


In [None]:
# Keep only the numeric columns of the dataset.
numerical_data = dataset.select_dtypes(include='number')

# Column names of the numeric subset, as a plain list for reporting.
numerical_features = list(numerical_data.columns)

print(f'There are {len(numerical_features)} numerical features:', '\n')
print(numerical_features)

In [None]:
# Select the categorical (object-dtype) columns.
# Take an explicit copy: later cells assign imputed/encoded values into this
# frame, and it should be independent of `dataset`.
categorical_data = dataset.select_dtypes(include='object').copy()

# Column names of the categorical subset, as a plain list for reporting.
categorical_features = categorical_data.columns.tolist()

# The message now says "categorical" (it previously reported these columns
# as numerical features).
print(f'There are {len(categorical_features)} categorical features:', '\n')
print(categorical_features)

# **Correlation Analysis And HeatMap**

In [None]:
# Pairwise correlation between all numeric columns, using DataFrame.corr()
# with its default settings.
correlation_matrix = numerical_data.corr()
# Display the matrix as the cell output.
correlation_matrix

In [None]:
# Annotated heatmap of the correlation matrix; cell values shown to 3 decimals.
plt.figure(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    linewidths=0.3,
)
plt.show()

# **Check for Class Imbalance**

In [None]:
# Check class imbalance in the target column.

# Number of rows in each class of the `diabetes` target.
class_counts = dataset.groupby("diabetes").size()

# Total number of rows, taken from the data itself rather than a hard-coded
# dataset size, so the percentages stay correct if the CSV changes.
total_rows = len(dataset)

columns = ['outcome', 'count', 'percentage']
outcome = [0, 1]

# Per-class row count and its share of the whole dataset (in percent).
count = [class_counts[val] for val in outcome]
percentage = [(class_counts[val] / total_rows) * 100 for val in outcome]

# Collect the per-class figures into a summary table and display it.
imbalance_df = pd.DataFrame(list(zip(outcome, count, percentage)), columns=columns)
imbalance_df




In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Target column whose class balance is visualised.
y = dataset['diabetes']

# Rows per class, ordered by class label.
class_counts = y.value_counts().sort_index()

# Bar chart: one bar per class, positioned by the class label as a string.
plt.figure(figsize=(8, 5))
bar_labels = class_counts.index.astype(str)
bar_heights = class_counts.values
plt.bar(bar_labels, bar_heights, color=['blue', 'red'])
plt.xlabel("Class (Diabetes: 0 = No, 1 = Yes)")
plt.ylabel("Count")
plt.title("Class Distribution in Dataset")
# Replace the raw tick labels with readable class names.
plt.xticks([0, 1], ['No Diabetes (0)', 'Diabetes (1)'])
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


In [None]:
# Missing-value count for every column of the raw dataset.
dataset.isna().sum()

# **EDA**

In [None]:
# One histogram per numeric column, 20 bins each.
numerical_data.hist(bins=20, figsize=(12, 12))
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Columns stored as int64/float64 (the target is included when it has one
# of these dtypes).
numeric_cols = dataset.select_dtypes(include=['int64', 'float64']).columns

# One tall figure holding a row per numeric column.
plt.figure(figsize=(20, 30))

# Draw a horizontal boxplot for each numeric column, stacked vertically.
for i, col in enumerate(numeric_cols, 1):
    plt.subplot(len(numeric_cols), 1, i)
    sns.boxplot(x=dataset[col], color='skyblue')
    plt.title(f'Boxplot of {col}', fontsize=12)

# Lay the figure out once, after every subplot exists, instead of
# recomputing the layout on each loop iteration.
plt.tight_layout()

plt.show()


# **Data Pre-Processing**

In [None]:
# Missing-value count per numeric column, before imputation.
numerical_data.isna().sum()

In [None]:
# Missing-value count per categorical column, before imputation.
categorical_data.isna().sum()

In [None]:
# Missing values exist in both the numerical and the categorical data.
# Con: You may lose important features that are strongly related to the target (like bmi, gender, or HbA1c_level).
# Con: You lose valuable data, especially if the dataset is not very large.

# If many rows have missing values, dropping them reduces your dataset size and may hurt model performance.

# NOTE(review): the string below is a bare expression, so it is displayed as
# this cell's output; it appears to list the benefits of imputing rather than
# dropping — confirm intent.
"""Keeps all rows and columns — preserves your dataset size and structure.

If done correctly (median for numbers, mode for categories), it minimizes distortion.

Prepares your data for ML without losing important signals."""

In [None]:
from sklearn.impute import SimpleImputer

# Binary columns whose gaps are filled by mode imputation.
binary_cols = ['diabetes', 'hypertension', 'heart_disease']

# Each missing entry takes the most frequent value of its column.
imputer = SimpleImputer(strategy='most_frequent')
binary_filled = imputer.fit_transform(numerical_data[binary_cols])
numerical_data[binary_cols] = binary_filled




In [None]:
from sklearn.impute import SimpleImputer

# Continuous columns to impute.
num_cols = ['bmi', 'HbA1c_level', 'blood_glucose_level', 'age']

# Median imputation (recommended for skewed data). The strategy is set to
# 'median' to match the stated approach; it was previously 'mean', which
# contradicted this comment.
imputer = SimpleImputer(strategy='median')
numerical_data[num_cols] = imputer.fit_transform(numerical_data[num_cols])

print(numerical_data[num_cols].isnull().sum())  # Verify missing values are removed

In [None]:
# Re-check the missing-value counts of the numeric columns after imputation.
numerical_data.isna().sum()

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing gender entries with the most frequent gender value.
imputer = SimpleImputer(strategy='most_frequent')
gender_filled = imputer.fit_transform(categorical_data[['gender']])
categorical_data[['gender']] = gender_filled

# Sanity check: the number of remaining missing gender values (should be 0).
remaining_missing = categorical_data['gender'].isnull().sum()
print(remaining_missing)



In [None]:
# Display the categorical frame after the gender imputation.
categorical_data

In [None]:
# Fill missing smoking_history entries with the column's most frequent value.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selection: the in-place form acts on an
# intermediate object, triggers pandas' chained-assignment warning, and does
# not update categorical_data when copy-on-write is enabled.
smoking_mode = categorical_data['smoking_history'].mode()[0]
categorical_data['smoking_history'] = categorical_data['smoking_history'].fillna(smoking_mode)

In [None]:
# Re-check the missing-value counts of the categorical columns after imputation.
categorical_data.isna().sum()

**Encoding**

In [None]:
# Encode gender as 0 (Male) / 1 (Female).
gender_codes = categorical_data['gender'].map({'Male': 0, 'Female': 1})

# Series.map() turns every label outside the mapping into NaN, which would
# silently re-introduce missing values after imputation. Fill such entries
# with the most frequent code so the encoded column stays complete.
if gender_codes.isna().any() and gender_codes.notna().any():
    gender_codes = gender_codes.fillna(gender_codes.mode()[0])

categorical_data['gender'] = gender_codes

In [None]:
# Display the categorical frame after encoding gender.
categorical_data

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each smoking_history category with an integer code.
encoder = LabelEncoder()
smoking_codes = encoder.fit_transform(categorical_data['smoking_history'])
categorical_data['smoking_history'] = smoking_codes

# Show the distinct codes produced by the encoder.
print(categorical_data['smoking_history'].unique())

In [None]:
# List the column names of the categorical frame.
categorical_data.columns

In [None]:
# Combine the numeric and the encoded categorical columns side by side
# (axis=1 aligns rows on the index).
final_df = pd.concat([numerical_data, categorical_data], axis=1)
# Plain-text dump of the combined frame.
print(final_df)

In [None]:
# Rich display of the combined frame.
final_df


In [None]:
# List the column names of the combined frame.
final_df.columns

**Feature Scaling**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Continuous columns to rescale.
num_cols = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']

# Min-max scale those columns, fitting on the full frame, and write the
# scaled values back in place.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(final_df[num_cols])
final_df[num_cols] = scaled_values


In [None]:
# Display the combined frame after min-max scaling.
final_df

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the input features.
target_col = 'diabetes'
y = final_df[target_col]
X = final_df.drop(columns=[target_col])

# Stratified 70/30 split so both parts keep the class ratio of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Report the class proportions in each part to confirm they are similar.
for heading, labels in (
    ("Training set class distribution:\n", y_train),
    ("Test set class distribution:\n", y_test),
):
    print(heading, labels.value_counts(normalize=True))

In [None]:
from sklearn.impute import SimpleImputer

# Fill any missing gender codes in the combined frame with the most frequent code.
imputer = SimpleImputer(strategy='most_frequent')
gender_imputed = imputer.fit_transform(final_df[['gender']])
final_df[['gender']] = gender_imputed


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Separate the target column from the input features.
target_col = 'diabetes'
y = final_df[target_col]
X = final_df.drop(columns=[target_col])

# Stratified 70/30 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [None]:
# Confirm that no missing values remain in the combined frame.
final_df.isna().sum()

In [None]:
# Standardize the features: learn the scaling parameters from the training
# split only, then apply that same transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# **Logistic Regression**

In [None]:
# Fit a default logistic regression on the training split and predict the test split.
log_reg = LogisticRegression().fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

# Overall accuracy followed by the per-class precision/recall/F1 report.
log_accuracy = accuracy_score(y_test, y_pred_log)
log_report = classification_report(y_test, y_pred_log)
print("Logistic Regression Accuracy:", log_accuracy)
print("Classification Report:\n", log_report)

# **KNN**

In [None]:
# k-nearest-neighbours classifier with k=5 (n_neighbors can be tuned later).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Overall accuracy followed by the per-class precision/recall/F1 report.
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_report = classification_report(y_test, y_pred_knn)
print("KNN Accuracy:", knn_accuracy)
print("Classification Report:\n", knn_report)

# **Neural Network**

In [None]:
# Multi-layer perceptron with two hidden layers (64 and 32 units), at most
# 500 training iterations, and a fixed seed.
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    max_iter=500,
    random_state=42,
)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)

# Overall accuracy followed by the per-class precision/recall/F1 report.
mlp_accuracy = accuracy_score(y_test, y_pred_mlp)
mlp_report = classification_report(y_test, y_pred_mlp)
print("Neural Network Accuracy:", mlp_accuracy)
print("Classification Report:\n", mlp_report)

# **Bar Chart: Prediction Accuracy of All Models**

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Model names and their test-set accuracies, in matching order.
models = ['Logistic Regression', 'KNN', 'Neural Network']
accuracies = [
    accuracy_score(y_test, model_pred)
    for model_pred in (y_pred_log, y_pred_knn, y_pred_mlp)
]

# Bar chart comparing the three accuracies on a fixed 0-1 axis.
plt.figure(figsize=(8, 5))
plt.bar(models, accuracies, color=['blue', 'red', 'green'])
plt.title("Model Accuracy Comparison")
plt.xlabel("Models")
plt.ylabel("Accuracy Score")
plt.ylim(0, 1)
plt.show()

# **Precision and Recall Comparison of Each Model**

In [None]:
from sklearn.metrics import precision_score, recall_score

# Precision and recall of each model on the test split.
precision_log, recall_log = precision_score(y_test, y_pred_log), recall_score(y_test, y_pred_log)
precision_knn, recall_knn = precision_score(y_test, y_pred_knn), recall_score(y_test, y_pred_knn)
precision_mlp, recall_mlp = precision_score(y_test, y_pred_mlp), recall_score(y_test, y_pred_mlp)

# One summary line per model.
for model_label, model_precision, model_recall in (
    ("Logistic Regression", precision_log, recall_log),
    ("KNN", precision_knn, recall_knn),
    ("Neural Network", precision_mlp, recall_mlp),
):
    print(f"{model_label} - Precision: {model_precision}, Recall: {model_recall}")

# **Confusion Matrix for Each Model**

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Test-set predictions keyed by model name.
model_preds = {
    'Logistic Regression': y_pred_log,
    'KNN': y_pred_knn,
    'Neural Network': y_pred_mlp,
}

# One annotated confusion-matrix heatmap per model, each in its own figure.
for model_name in model_preds:
    cm = confusion_matrix(y_test, model_preds[model_name])
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.show()

# **AUC Score & ROC Curve**

In [None]:
from sklearn.metrics import roc_curve, auc

# A ROC curve needs a continuous score per sample. Hard 0/1 predictions
# collapse each curve to a single operating point and understate the AUC,
# so use each fitted model's predicted probability of the positive class
# (the second column of predict_proba).
models = {
    'Logistic Regression': log_reg.predict_proba(X_test)[:, 1],
    'KNN': knn.predict_proba(X_test)[:, 1],
    'Neural Network': mlp.predict_proba(X_test)[:, 1],
}

# Plot every model's ROC curve on shared axes, with its AUC in the legend.
plt.figure(figsize=(8, 6))
for model_name, scores in models.items():
    fpr, tpr, _ = roc_curve(y_test, scores)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.2f})')

plt.plot([0, 1], [0, 1], 'r--')  # Diagonal reference line (chance level)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend()
plt.show()