# In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
# NOTE(review): machine-specific absolute path; change DATA_PATH when running
# on another machine.
# NOTE(review): read_csv's default NA markers include the literal text "None".
# If any categorical column uses "None" as a real level it will be read as
# missing and then mode-filled below -- TODO confirm against the dataset and
# pass keep_default_na/na_values explicitly if so.
DATA_PATH = "C:/Users/Asus/Downloads/diabetes_dataset.csv"
df = pd.read_csv(DATA_PATH)

# Drop unnecessary columns
# errors='ignore' makes this a no-op when the column is absent.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Standardize string columns: lower-case and trim surrounding whitespace.
# 'string' is selected alongside 'object' so that columns backed by pandas'
# dedicated string dtype are normalized too, not only object-dtype ones.
cat_cols = df.select_dtypes(include=['object', 'string']).columns
for col in cat_cols:
    df[col] = df[col].str.lower().str.strip()

# Fill missing values: median for numeric columns, most frequent value for
# everything else. Branching on "is numeric" (rather than "is object") keeps
# string/categorical columns away from median(), which is undefined for them.
for col in df.columns:
    if not df[col].isna().any():
        continue
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(df[col].median())
    else:
        modes = df[col].mode()
        # mode() is empty when every value is missing; nothing to impute with,
        # so leave the column as-is instead of failing on modes[0].
        if modes.empty:
            continue
        df[col] = df[col].fillna(modes.iloc[0])


# Add binary outcome if not present
# A row is labelled 1 when either threshold is met, 0 otherwise.
# NOTE(review): 6.5 / 126 look like the usual diagnostic cut-offs for HbA1c (%)
# and fasting glucose (mg/dL) -- confirm the dataset uses those units.
if 'Diabetes_Outcome' not in df.columns:
    df['Diabetes_Outcome'] = (
        (df['HbA1c'] >= 6.5) | (df['Fasting_Blood_Glucose'] >= 126)
    ).astype(int)

# --- EDA ---

# # 1. Class distribution
# sns.countplot(x='Diabetes_Outcome', data=df)
# plt.title("Distribution of Diabetes Outcome")
# plt.show()

# # 2. Correlation heatmap
# plt.figure(figsize=(14, 10))
# sns.heatmap(df.select_dtypes(include=[np.number]).corr(), annot=True, cmap='coolwarm', fmt=".2f")
# plt.title("Correlation Heatmap")
# plt.show()

# # 3. Boxplots
# fig, axes = plt.subplots(1, 3, figsize=(18, 5))
# sns.boxplot(data=df, x='Diabetes_Outcome', y='Fasting_Blood_Glucose', ax=axes[0])
# axes[0].set_title('Glucose vs Outcome')
# sns.boxplot(data=df, x='Diabetes_Outcome', y='BMI', ax=axes[1])
# axes[1].set_title('BMI vs Outcome')
# sns.boxplot(data=df, x='Diabetes_Outcome', y='Age', ax=axes[2])
# axes[2].set_title('Age vs Outcome')
# plt.tight_layout()
# plt.show()

# # 4. Categorical features
# cat_vars = ['Sex', 'Ethnicity', 'Smoking_Status', 'Physical_Activity_Level', 'Family_History_of_Diabetes']
# for var in cat_vars:
#     plt.figure(figsize=(6, 4))
#     sns.countplot(x=var, hue='Diabetes_Outcome', data=df)
#     plt.title(f'{var} vs Diabetes Outcome')
#     plt.xticks(rotation=45)
#     plt.tight_layout()
#     plt.show()

################################################################  OBJECTIVE - 1 ####################################################################
###############################################Predict Diabetes Status (Classification Model)#######################################################

# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import classification_report

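# # Features and Target
# # NOTE: when Diabetes_Outcome is derived above from HbA1c and Fasting_Blood_Glucose,
# # those two columns remain in X and leak the label; drop them from X for an honest evaluation.
# X = df.drop(columns=['Diabetes_Outcome'])
# y = df['Diabetes_Outcome']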

# # One-hot encode categorical variables
# X = pd.get_dummies(X, drop_first=True)

# # Train/test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Model
# model = RandomForestClassifier()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)

# print(classification_report(y_test, y_pred))



################################################################  OBJECTIVE - 2 #####################################################################
###############################################Feature Importance – What factors matter most?########################################################

# # Re-create X exactly as it was during training
# X = df.drop(columns=['Diabetes_Outcome'])
# X = pd.get_dummies(X, drop_first=True)

# # Now get feature importances
# importances = pd.Series(model.feature_importances_, index=X.columns)
# importances.sort_values(ascending=False).plot(kind='bar', figsize=(10, 4), title="Feature Importances")
# plt.tight_layout()
# plt.show()



################################################################  OBJECTIVE - 3 #####################################################################
#######################################################Clustering Patients into Risk Groups##########################################################

# from sklearn.cluster import KMeans
# from sklearn.preprocessing import StandardScaler

# features = df[['Fasting_Blood_Glucose', 'BMI', 'Age']]
# scaler = StandardScaler()
# scaled = scaler.fit_transform(features)

# kmeans = KMeans(n_clusters=3, random_state=42)
# df['Risk_Cluster'] = kmeans.fit_predict(scaled)

# sns.scatterplot(data=df, x='Fasting_Blood_Glucose', y='BMI', hue='Risk_Cluster')
# plt.title("Patient Risk Clusters")
# plt.show()



################################################################  OBJECTIVE - 4 #####################################################################
#####################################################Analyze Lifestyle Impact on Diabetes##########################################################

# lifestyle_vars = ['Physical_Activity_Level', 'Smoking_Status', 'Family_History_of_Diabetes']

# for var in lifestyle_vars:
#     sns.countplot(x=var, hue='Diabetes_Outcome', data=df)
#     plt.title(f'{var} vs Diabetes Outcome')
#     plt.xticks(rotation=45)
#     plt.tight_layout()
#     plt.show()



################################################################  OBJECTIVE - 5 #####################################################################
#####################################################Predict HbA1c (Regression Task)#################################################################

# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error

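# # Predict HbA1c
# # NOTE: train_test_split is only imported in the Objective 1 section; import it here if this
# # section is run on its own. X also still contains Diabetes_Outcome (derived from HbA1c when
# # it is not in the CSV) and Risk_Cluster if Objective 3 was run; drop them to avoid leakage.
# X = df.drop(columns=['HbA1c'])
# X = pd.get_dummies(X, drop_first=True)
# y = df['HbA1c']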

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# reg = LinearRegression()
# reg.fit(X_train, y_train)
# y_pred = reg.predict(X_test)

# print("MSE:", mean_squared_error(y_test, y_pred))


# Dataset source (Kaggle): https://www.kaggle.com/datasets/marshalpatel3558/diabetes-prediction-dataset?resource=download