In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.under_sampling import RandomUnderSampler


In [None]:
# Read the CDC diabetes health-indicators CSV into `df` and preview its first rows
print("Loading dataset...")
csv_path = "cdc_diabetes_health_indicators.csv"
df = pd.read_csv(csv_path)
preview = df.head()
print(preview)


In [None]:
# Data Summary
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line
# after the report.
print("Dataset Info:")
df.info()



In [None]:
# Summary statistics as reported by DataFrame.describe()
print("\nDataset Description:")
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Remove repeated rows: keep the first occurrence of each row and drop later copies
is_repeat = df.duplicated()
df = df[~is_repeat]
print(f"\nDuplicates Removed, New Shape: {df.shape}")

In [None]:
# Columns treated as categorical for plotting and encoding.
# The list order is also the order in which later cells iterate over them.
categorical_columns = [
    'Sex',
    'Education',
    'Income',
    'HighBP',
    'HighChol',
    'CholCheck',
    'Smoker',
    'Stroke',
    'HeartDiseaseorAttack',
    'PhysActivity',
    'Fruits',
    'Veggies',
    'HvyAlcoholConsump',
    'AnyHealthcare',
    'NoDocbcCost',
    'DiffWalk',
]
# Convert every listed column to pandas' 'category' dtype in one call
df = df.astype(dict.fromkeys(categorical_columns, 'category'))

In [None]:
# One boxplot per numeric column, to eyeball outliers before filtering them
numeric_columns = ['BMI', 'MentHlth', 'PhysHlth']
for col in numeric_columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f"Outliers in {col}")
    plt.show()

In [None]:
# Outlier removal with the 1.5 * IQR rule, applied one column at a time.
# Note: `df` is reassigned inside the loop, so the quartiles of each column are
# computed on the rows that survived the filters of the preceding columns.
numeric_columns = ['BMI', 'MentHlth', 'PhysHlth']
for col in numeric_columns:
    first_quartile, third_quartile = df[col].quantile([0.25, 0.75])
    spread = third_quartile - first_quartile
    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread
    # Series.between is inclusive on both ends by default (>= lower, <= upper)
    df = df[df[col].between(lower_bound, upper_bound)]

In [None]:
# Count plot of every categorical feature, one figure per column
for col in categorical_columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x=col, data=df, ax=ax)
    ax.set_title(f"Distribution of {col}")
    plt.show()



In [None]:
# Replace each categorical column with its integer category codes.
# DataFrame.assign returns a new frame, so `df` itself is left untouched.
category_codes = {name: df[name].cat.codes for name in categorical_columns}
df_encoded = df.assign(**category_codes)

In [None]:
# Annotated heatmap of the pairwise correlations between the encoded columns
correlation_matrix = df_encoded.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Separate the label column (y) from the remaining columns, which become the features (X)
target_column = "Diabetes_binary"
y = df_encoded[target_column]
X = df_encoded.drop(columns=[target_column])



In [None]:
# Handling Class Imbalance
# Randomly under-sample so that classes 0 and 1 end up with the same number of
# rows. 20099 is the per-class size this notebook was written with; it is capped
# at the size of the smallest class actually present in `y`, because
# RandomUnderSampler raises a ValueError when asked for more rows than a class
# contains (which would happen if the input file or the cleaning steps change).
samples_per_class = min(20099, int(y.value_counts().min()))
rus = RandomUnderSampler(
    sampling_strategy={0: samples_per_class, 1: samples_per_class},
    random_state=42,
)
X_resampled, y_resampled = rus.fit_resample(X, y)


In [None]:
# Hold out 20% of the resampled rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardize the features. The scaler is fitted on the training split only and
# the same fitted scaler is then applied to both the training and the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Model Training and Evaluation
# Candidate classifiers keyed by display name. The estimators that draw random
# numbers while fitting (the tree-based ones) are seeded with the same value
# used for resampling and splitting, so accuracies -- and therefore the model
# later picked as "best" -- are repeatable across kernel restarts.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier()
}

# Tick labels for the confusion matrices, in the order of the class labels 0, 1
class_names = ["Non-Diabetic", "Diabetic"]

# results maps model name -> accuracy on the test split; later cells read it
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Text metrics
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"Accuracy: {acc:.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred))

    # Confusion matrix: rows are the actual classes, columns the predicted ones
    matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(5,4))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title(f"{name} - Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()


In [None]:
# Pick the name of the model with the highest test accuracy
# (on a tie, the one that was inserted into `results` first wins)
best_model = max(results.items(), key=lambda pair: pair[1])[0]
print(f"Best Performing Model: {best_model} with Accuracy: {results[best_model]:.4f}")

In [None]:
# Bar chart comparing the test accuracy of every trained model
model_names = list(results)
accuracy_scores = list(results.values())
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=model_names, y=accuracy_scores, palette='viridis', ax=ax)
ax.set_ylabel("Accuracy Score")
ax.set_title("Model Comparison")
plt.xticks(rotation=45)  # tilt the model names so they do not overlap
plt.show()



In [None]:
# Show ten randomly chosen test rows: true label next to the best model's prediction
best_predictions = models[best_model].predict(X_test)
comparison_df = pd.DataFrame({'Actual': y_test.values, 'Predicted': best_predictions})
print("Sample Predictions:")
print(comparison_df.sample(10))
