# In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFECV
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    average_precision_score, confusion_matrix, matthews_corrcoef
)
import time

# Read the BRFSS 2015 diabetes health-indicators CSV into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="/mnt/data/diabetes_012_health_indicators_BRFSS2015.csv"
)

# Feature Engineering
def categorize_bmi(bmi):
    """Return the weight-class label for a single BMI value.

    Bands are half-open on the right: below 18.5 is "Underweight",
    [18.5, 25) is "Normal", [25, 30) is "Overweight", and everything else
    is "Obese". A value that compares false against every bound (such as
    NaN) therefore also lands in "Obese".
    """
    # Exclusive upper bounds, in ascending order; the first one the value
    # falls under decides the label.
    upper_bounds = (
        (18.5, "Underweight"),
        (25, "Normal"),
        (30, "Overweight"),
    )
    for limit, label in upper_bounds:
        if bmi < limit:
            return label
    return "Obese"

# Label each row with its BMI band and expand the labels into indicator
# columns; drop_first=True leaves out the first category's column.
df = pd.get_dummies(
    df.assign(BMI_Category=df["BMI"].apply(categorize_bmi)),
    columns=["BMI_Category"],
    drop_first=True,
)

# Interaction terms: element-wise products of existing columns.
df["BMI_Age"] = df["BMI"].mul(df["Age"])
df["HighBP_HighChol"] = df["HighBP"].mul(df["HighChol"])
df["PhysActivity_BMI"] = df["PhysActivity"].mul(df["BMI"])

# Feature Selection
# Separate the target column from the predictors.
y = df["Diabetes_012"]
X = df.drop(columns=["Diabetes_012"])

# Score every predictor against the target with two univariate tests;
# k='all' keeps all columns so the selectors are used only for their scores.
chi2_selector = SelectKBest(score_func=chi2, k='all').fit(X, y)
chi2_scores = chi2_selector.scores_

anova_selector = SelectKBest(score_func=f_classif, k='all').fit(X, y)
anova_scores = anova_selector.scores_

# One row per feature with both scores, strongest ANOVA F first.
feature_scores = pd.DataFrame(
    {
        'Feature': X.columns,
        'Chi2 Score': chi2_scores,
        'ANOVA F Score': anova_scores,
    }
)
feature_scores = feature_scores.sort_values(by='ANOVA F Score', ascending=False)

# Drop the six features with the smallest ANOVA F score.
weakest_rows = feature_scores.nsmallest(6, ['ANOVA F Score'])
lowest_features = weakest_rows['Feature'].tolist()
X_reduced = X.drop(columns=lowest_features)

# Standardize numerical features
# The feature-selection step removes the six lowest-scoring columns, and any
# of the continuous columns below may be among them. Restrict the list to the
# columns still present in X_reduced so the indexing cannot raise KeyError.
scaler = StandardScaler()
numerical_features = [
    col
    for col in ["BMI", "Age", "BMI_Age", "PhysActivity_BMI"]
    if col in X_reduced.columns
]
# Skip scaling entirely if none of the continuous columns survived selection;
# StandardScaler rejects an input with zero features.
if numerical_features:
    X_reduced[numerical_features] = scaler.fit_transform(X_reduced[numerical_features])

# PCA
# NOTE(review): only BMI, Age, BMI_Age and PhysActivity_BMI are standardized
# earlier in this script; the remaining columns enter PCA on their raw scale.
# Confirm that is intended, since PCA is sensitive to feature variance.
#
# Cap the component count at what the data can support so PCA does not raise
# when fewer than 10 columns remain after feature selection, and pin
# random_state so the randomized SVD solver (when scikit-learn selects it)
# produces repeatable components.
pca = PCA(n_components=min(10, *X_reduced.shape), random_state=42)
X_pca = pca.fit_transform(X_reduced)
print("Explained Variance Ratio (PCA):", pca.explained_variance_ratio_)

# RFECV
# Recursive feature elimination with 5-fold cross-validation: features are
# removed one at a time (step=1) and each candidate subset is scored with a
# logistic-regression model. The per-fold fits are independent, so n_jobs=-1
# runs them on all available cores without changing the selected features.
# NOTE(review): scoring='accuracy' can be misleading if the Diabetes_012
# classes are imbalanced; consider a macro-averaged metric. TODO confirm.
rfecv_selector = RFECV(
    estimator=LogisticRegression(max_iter=1000),
    step=1,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
X_rfecv = rfecv_selector.fit_transform(X_reduced, y)
# support_ is a boolean mask over X_reduced's columns marking the kept features.
print("Selected features by RFECV:", X_reduced.columns[rfecv_selector.support_].tolist())
