**Diabetes Prediction Using Multiple Machine Learning Algorithms**

**Step 1. Importing the Dependencies**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

**Step 2. Data Collection and Initial Analysis**

In [None]:
# ✅ Step 2: Load Dataset (already uploaded to Colab)
# The CSV location is kept in one name so it is easy to spot and change
# when the notebook is run outside of Colab.
dataset_path = '/content/diabetes_prediction_dataset.csv'
df = pd.read_csv(dataset_path)

In [None]:
# ✅ Step 3: Explore Dataset
# Quick sanity checks before modelling: size of the frame, missing values
# per column, and how (im)balanced the target column is.
missing_per_column = df.isnull().sum()
class_counts = df['diabetes'].value_counts()

print("Data Shape:", df.shape)
print("\nMissing Values:\n", missing_per_column)
print("\nClass Distribution:\n", class_counts)

# Last expression of the cell: rich display of the first rows.
df.head()

Data Shape: (27528, 9)

Missing Values:
 gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

Class Distribution:
 diabetes
0    25168
1     2360
Name: count, dtype: int64


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [None]:
# Step 4: Encode Categorical Features with separate encoders
# Each text column gets its own LabelEncoder, kept under its own name so the
# learned category -> integer mapping stays available after this cell.
# NOTE: the columns are overwritten in place, so this cell should be run
# once per freshly loaded `df`.
gender_encoder = LabelEncoder().fit(df['gender'])
df['gender'] = gender_encoder.transform(df['gender'])

smoking_encoder = LabelEncoder().fit(df['smoking_history'])
df['smoking_history'] = smoking_encoder.transform(df['smoking_history'])

In [None]:
# ✅ Step 5: Split Features and Target
# `y` is the label column; `X` is every remaining column of the frame.
y = df['diabetes']
X = df.drop(columns=['diabetes'])

In [None]:
# ✅ Step 6: Handle Imbalanced Data using SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

In [None]:
# ✅ Step 7: Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_smote)

**Step 4. Train-Test Split**

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_smote, test_size=0.2, stratify=y_smote, random_state=42)

**Step 5. Model Training and Evaluation - Multiple Algorithms**

In [None]:
# ✅ Step 9: Train Multiple Models and Compare Accuracy
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Candidate classifiers, keyed by the display name used in the report below.
# Every estimator with internal randomness gets a fixed `random_state` so the
# accuracies (and therefore the "top 2" ranking derived from them) are the
# same on every Restart & Run All.
models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(kernel='linear'),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Test-set accuracy per model name; consumed by the model-selection cell.
accuracies = {}

for name, model in models.items():
    # Fit on the training split, score on the held-out split.
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    acc = accuracy_score(y_test, pred)
    accuracies[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")

Logistic Regression Accuracy: 0.9001
SVM Accuracy: 0.9015
KNN Accuracy: 0.9348
Decision Tree Accuracy: 0.9702
Random Forest Accuracy: 0.9774
XGBoost Accuracy: 0.9779


In [None]:
# ✅ Step 10: Select Top 2 Models Based on Accuracy
# Rank the (name, accuracy) pairs from best to worst; the first two entries
# are the ones used for the ensemble.
sorted_models = list(accuracies.items())
sorted_models.sort(key=lambda name_and_score: name_and_score[1], reverse=True)
print("\nTop 2 Models:", sorted_models[:2])


Top 2 Models: [('XGBoost', 0.9778506158124751), ('Random Forest', 0.9773539928486293)]


In [None]:
# ✅ Step 11: Ensemble Top 2 Models
# Look up the two best-ranked estimators by name.
top_name = sorted_models[0][0]
runner_up_name = sorted_models[1][0]
best_model_1 = models[top_name]
best_model_2 = models[runner_up_name]

from sklearn.ensemble import VotingClassifier

# Soft voting averages the predicted class probabilities of both members,
# so both selected estimators must expose `predict_proba`.
voters = [('model1', best_model_1), ('model2', best_model_2)]
ensemble_model = VotingClassifier(estimators=voters, voting='soft')
ensemble_model.fit(X_train, y_train)

# Evaluate the combined model on the held-out split.
ensemble_pred = ensemble_model.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, ensemble_pred)
ensemble_report = classification_report(y_test, ensemble_pred)

print("\nEnsemble Model Accuracy:", ensemble_accuracy)
print("\nClassification Report:\n", ensemble_report)


Ensemble Model Accuracy: 0.9813269765593962

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      5034
           1       0.98      0.98      0.98      5034

    accuracy                           0.98     10068
   macro avg       0.98      0.98      0.98     10068
weighted avg       0.98      0.98      0.98     10068



In [None]:
# ✅ Step 12: Predict on New Data

# Define sample input (raw values before encoding)
sample_data = {
    'gender': ['Female'],
    'age': [44],
    'hypertension': [0],
    'heart_disease': [0],
    'smoking_history': ['never'],
    'bmi': [19.31],
    'HbA1c_level': [6.5],
    'blood_glucose_level': [200]
}

# Create DataFrame
sample_input = pd.DataFrame(sample_data)

# Encode the categorical columns with the SAME encoders that were fitted on
# the training data in the encoding cell. Re-fitting fresh encoders on a
# hand-written category list can assign different integer codes whenever that
# list differs from the categories actually present in the dataset, which
# would silently feed wrongly encoded values to the model.
fitted_encoders = {
    'gender': gender_encoder,
    'smoking_history': smoking_encoder,
}
for column, encoder in fitted_encoders.items():
    # Fail early, with a readable message, on categories the model never saw.
    unseen = set(sample_input[column]) - set(encoder.classes_)
    if unseen:
        raise ValueError(
            f"Unknown value(s) {sorted(map(str, unseen))} for '{column}'; "
            f"known categories: {list(encoder.classes_)}"
        )
    sample_input[column] = encoder.transform(sample_input[column])

# Ensure column order matches training data
sample_input = sample_input[X.columns]

# Scale features with the scaler fitted during training
sample_scaled = scaler.transform(sample_input)

# Make prediction
prediction = ensemble_model.predict(sample_scaled)
print("\nPrediction for sample input:", 'Diabetic' if prediction[0] == 1 else 'Not Diabetic')


Prediction for sample input: Diabetic
