In [None]:
import pickle

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
# Creates a ColumnTransformer object from the sklearn.compose module. The ColumnTransformer is used to apply different preprocessing steps to different subsets of columns in the dataset.
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# BMI categories: Underweight: < 18.5, Normal weight: 18.5 - 24.9, Overweight: 25 - 29.9, Obesity: 30 or higher

# HbA1c (glycated haemoglobin) levels are a measure of your average blood glucose (sugar) levels over the past two to three months. They are used to diagnose diabetes, monitor blood sugar control, and assess the risk of developing diabetes.
# Here's a breakdown of HbA1c levels and what they mean:
# Normal: HbA1c below 5.7%.
# Prediabetes: HbA1c between 5.7% and 6.4%.
# Diabetes: HbA1c of 6.5% or higher.
#  
#Blood glucose, or blood sugar, is the concentration of glucose (a type of sugar) in the blood, serving as the body's primary source of energy. It's measured in milligrams per deciliter (mg/dL). Normal fasting blood glucose levels for non-diabetics are typically 70-100 mg/dL. For individuals with diabetes, the target range is usually 70-130 mg/dL

# A voting classifier is an ensemble model that trains several different models and combines their predictions into a single output class. With hard voting, the class predicted by the majority of the models is chosen; with soft voting, the predicted class probabilities of the models are averaged and the class with the highest average probability is chosen.





In [None]:
# Read the raw dataset; the path is relative to the current working directory.
dataset_csv = "diabetes_prediction_dataset.csv"
df = pd.read_csv(dataset_csv)


In [None]:
# Keep only the first occurrence of each fully identical row. The result is
# stored in df_new; df itself is left untouched.
repeated_rows = df.duplicated()
df_new = df[~repeated_rows]

In [None]:
# Non-numeric columns of the dataset.
categorical_features = ['gender', 'smoking_history']

# Columns that already hold numeric values.
numerical_features = [
    'age',
    'hypertension',
    'heart_disease',
    'bmi',
    'HbA1c_level',
    'blood_glucose_level',
]

# The 'diabetes' column is the prediction target; every other column of the
# de-duplicated frame becomes part of the feature frame X.
target_column = 'diabetes'
y = df_new[target_column]
X = df_new.drop(columns=[target_column])



In [None]:
# Preprocessing: one-hot encode the categorical columns and forward the
# numerical columns unchanged.
# NOTE: with drop='first' together with handle_unknown='ignore', a category that
# was not seen during fit is encoded as all zeros, which is the same encoding as
# the dropped first category.
category_encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', category_encoder, categorical_features),
        ('num', 'passthrough', numerical_features),
    ]
)

# Base learners, all created with the same fixed random_state:
#   gbc - GradientBoostingClassifier, builds trees sequentially to minimize errors.
#   abc - AdaBoostClassifier, boosts weak learners by focusing on errors.
#   rfc - RandomForestClassifier, aggregates predictions from multiple trees.
gbc, abc, rfc = (
    GradientBoostingClassifier(random_state=123456),
    AdaBoostClassifier(random_state=123456),
    RandomForestClassifier(random_state=123456),
)



In [None]:
# Ensemble of the three base learners. A VotingClassifier combines several
# classifiers and predicts either by majority vote ('hard') or by averaged
# class probabilities ('soft'); hard voting is used here, so the class that
# receives the most votes becomes the final prediction.
base_learners = [('gbc', gbc), ('abc', abc), ('rfc', rfc)]
voting_clf = VotingClassifier(estimators=base_learners, voting='hard')

In [None]:
# Chain the steps into one workflow: the preprocessing step runs first and its
# output is passed to the voting classifier.
ensemble_steps = [
    ('preprocessor', preprocessor),
    ('classifier', voting_clf),
]
pipeline = Pipeline(steps=ensemble_steps)


In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%; the
# fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=123456)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Fit the ensemble pipeline on the training split: this fits the preprocessing
# step and then trains the voting classifier on the transformed features.
pipeline.fit(X=X_train, y=y_train)

In [None]:
def build_model_pipeline(estimator):
    """Return an unfitted preprocessing + classifier pipeline for one model.

    Pipeline does not copy the steps it is given, so passing the shared
    `preprocessor` (also used by the ensemble `pipeline`) or the estimator
    objects registered in `voting_clf` would make every `.fit` call refit the
    same objects in place. Cloning gives each pipeline its own unfitted copies
    with identical parameters (including random_state).

    Parameters
    ----------
    estimator : scikit-learn classifier
        Model whose configuration is copied into the pipeline.

    Returns
    -------
    Pipeline
        Steps 'preprocessor' and 'classifier', both independent clones.
    """
    return Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('classifier', clone(estimator)),
    ])


# One stand-alone pipeline per base learner, used to compare each model with
# the voting ensemble.
gbc_pipeline = build_model_pipeline(gbc)
abc_pipeline = build_model_pipeline(abc)
rfc_pipeline = build_model_pipeline(rfc)

# Train every individual pipeline on the same training split as the ensemble.
gbc_pipeline.fit(X_train, y_train)
abc_pipeline.fit(X_train, y_train)
rfc_pipeline.fit(X_train, y_train)

In [None]:
# Score each fitted pipeline on the held-out test split.
gbc_accuracy = accuracy_score(y_test, gbc_pipeline.predict(X_test))
abc_accuracy = accuracy_score(y_test, abc_pipeline.predict(X_test))
rfc_accuracy = accuracy_score(y_test, rfc_pipeline.predict(X_test))
ensemble_accuracy = accuracy_score(y_test, pipeline.predict(X_test))

# Report the accuracies rounded to four decimal places.
print("Individual Model Accuracies:")
print(f"GradientBoostingClassifier: {gbc_accuracy:.4f}")
print(f"AdaBoostClassifier: {abc_accuracy:.4f}")
print(f"RandomForestClassifier: {rfc_accuracy:.4f}")
print(f"Voting Classifier (Ensemble): {ensemble_accuracy:.4f}")


Individual Model Accuracies:
GradientBoostingClassifier: 0.9704
AdaBoostClassifier: 0.9704
RandomForestClassifier: 0.9683
Voting Classifier (Ensemble): 0.9705


In [None]:
# Save each trained pipeline to its own .pkl file in the working directory.
# (pickle is imported in the notebook's top import cell.)
# NOTE: loading a pickle executes code from the file, so only unpickle files
# from a trusted source.
fitted_pipelines = {
    "gbc_pipeline.pkl": gbc_pipeline,
    "abc_pipeline.pkl": abc_pipeline,
    "rfc_pipeline.pkl": rfc_pipeline,
    "ensemble_pipeline.pkl": pipeline,
}

for pkl_name, fitted_pipeline in fitted_pipelines.items():
    with open(pkl_name, "wb") as f:
        pickle.dump(fitted_pipeline, f)


In [None]:
# Interactive single-patient prediction: read the values from stdin, validate
# them, then ask every fitted pipeline for a prediction. Any invalid entry
# (a failed int()/float() conversion or a failed range/choice check) raises
# ValueError and is reported by the handler at the bottom.

# Lower-cased answer -> label spelling shown in the prompt. The answer is
# lower-cased so matching is case-insensitive, but the model must receive the
# label itself: the string 'no info' is a different category than 'No Info'.
# NOTE(review): assumes the training CSV spells the label 'No Info' exactly as
# the prompt does - confirm with df_new['smoking_history'].unique().
smoking_history_labels = {
    'never': 'never',
    'former': 'former',
    'current': 'current',
    'no info': 'No Info',
}

try:
    # Collect user inputs
    print("\nEnter the following details for diabetes prediction:")
    gender = input("Gender (Female/Male/Other): ").strip().capitalize()
    age = float(input("Age (0.08 to 80): "))
    hypertension = int(input("Hypertension (0 for No, 1 for Yes): "))
    heart_disease = int(input("Heart Disease (0 for No, 1 for Yes): "))
    smoking_answer = input("Smoking History (never/former/current/No Info): ").strip().lower()
    bmi = float(input("BMI (10.01 to 95.69): "))
    hba1c_level = float(input("HbA1c Level (3.5 to 9.0): "))
    blood_glucose = float(input("Blood Glucose Level (80 to 300): "))

    # Validate inputs
    if gender not in ['Female', 'Male', 'Other']:
        raise ValueError("Gender must be 'Female', 'Male', or 'Other'")
    if not (0.08 <= age <= 80):
        raise ValueError("Age must be between 0.08 and 80")
    if hypertension not in [0, 1]:
        raise ValueError("Hypertension must be 0 or 1")
    if heart_disease not in [0, 1]:
        raise ValueError("Heart Disease must be 0 or 1")
    if smoking_answer not in smoking_history_labels:
        raise ValueError("Smoking History must be 'never', 'former', 'current', or 'no info'")
    if not (10.01 <= bmi <= 95.69):
        raise ValueError("BMI must be between 10.01 and 95.69")
    if not (3.5 <= hba1c_level <= 9.0):
        raise ValueError("HbA1c Level must be between 3.5 and 9.0")
    if not (80 <= blood_glucose <= 300):
        raise ValueError("Blood Glucose Level must be between 80 and 300")

    # Restore the label spelling before the value is handed to the model.
    smoking_history = smoking_history_labels[smoking_answer]

    # Create a one-row input DataFrame using the training column names
    input_data = pd.DataFrame({
        'gender': [gender],
        'age': [age],
        'hypertension': [hypertension],
        'heart_disease': [heart_disease],
        'smoking_history': [smoking_history],
        'bmi': [bmi],
        'HbA1c_level': [hba1c_level],
        'blood_glucose_level': [blood_glucose]
    })

    # Make predictions with individual models and with the ensemble
    gbc_pred = gbc_pipeline.predict(input_data)[0]
    abc_pred = abc_pipeline.predict(input_data)[0]
    rfc_pred = rfc_pipeline.predict(input_data)[0]
    ensemble_pred = pipeline.predict(input_data)[0]

    # Display individual model predictions (class 1 is reported as diabetes)
    print("\nIndividual Model Predictions:")
    print(f"GradientBoostingClassifier: {'Diabetes' if gbc_pred == 1 else 'No Diabetes'}")
    print(f"AdaBoostClassifier: {'Diabetes' if abc_pred == 1 else 'No Diabetes'}")
    print(f"RandomForestClassifier: {'Diabetes' if rfc_pred == 1 else 'No Diabetes'}")

    # Display ensemble prediction
    print("\nFinal Ensemble Prediction (Majority Vote):")
    if ensemble_pred == 0:
        print("The model predicts: No Diabetes")
    else:
        print("The model predicts: Diabetes")

except ValueError as e:
    print(f"Error: {e}. Please enter valid values.")


Enter the following details for diabetes prediction:
