In [None]:
# -------------------------------------------------------------
# Core Imports & Configuration
# Purpose: Centralize ALL library imports & global styles 
# -------------------------------------------------------------

# Standard Library
import warnings
import time
from collections import namedtuple

# Data Handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage

# Statistics / Distributions
from scipy.stats import randint, uniform

# Machine Learning & Preprocessing
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures, LabelEncoder, OneHotEncoder
)
from sklearn.model_selection import (
    train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
)
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, adjusted_rand_score, silhouette_score
)
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor, VotingClassifier, StackingClassifier
)
from sklearn.svm import SVC
from sklearn.feature_selection import RFE, SelectKBest, chi2, f_classif
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer

# Imbalanced Learning
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import Pipeline  # unified Pipeline (works for normal + sampling steps)

# Gradient Boosting (XGBoost)
from xgboost import XGBClassifier, XGBRegressor

# Global presentation settings.
# Every warning category is silenced for the whole session; note that this also
# hides deprecation notices emitted by the libraries imported above.
warnings.simplefilter('ignore')
plt.style.use('seaborn-v0_8')

print('All imports consolidated. Environment ready.')

In [None]:
# ===============================
# DATA LOADING & INITIAL AUDIT
# ===============================
# The raw CSV is read once into `df` and left untouched; all later
# transformations are applied to the independent deep copy `df_model`.
df = pd.read_csv(filepath_or_buffer="../data/heart_disease_uci.csv")
df_model = df.copy(deep=True)

In [None]:
# ===============================
# PREPROCESSING STEP: COLUMN PRUNING
# ===============================
# Remove identifier / source columns that carry no predictive signal.
# (Prevents data leakage and reduces noise.)
# The result is rebound instead of using inplace=True, and errors='ignore'
# makes the cell idempotent: re-running it after the columns are already gone
# is a no-op rather than a KeyError.
df_model = df_model.drop(columns=['id', 'dataset'], errors='ignore')

In [None]:
# ===============================
# FEATURE TYPE IDENTIFICATION
# ===============================
# Partition the current columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numeric. Both results are column
# Indexes captured at this point in the notebook; they are not refreshed when
# columns are added later.
categorical_cols, numerical_cols = (
    df_model.select_dtypes(include=dtype_group).columns
    for dtype_group in (['object'], ['int64', 'float64'])
)

In [None]:
# ===============================
# FEATURE ENGINEERING
# ===============================
# Three derived columns are added to the working frame:
#   chol_per_age       - chol divided by age (1e-5 is added to age so the
#                        denominator can never be exactly zero).
#   heart_rate_reserve - thalch minus trestbps.
#                        NOTE(review): trestbps is presumably resting blood
#                        pressure rather than resting heart rate, in which case
#                        this is not the textbook heart-rate reserve - confirm.
#   risk_score         - count (0-3) of the flags age > 50, chol > 240, fbs == 1.
#                        A missing value compares as False, so it contributes 0.
df_model['chol_per_age'] = df_model['chol'].div(df_model['age'].add(1e-5))
df_model['heart_rate_reserve'] = df_model['thalch'].sub(df_model['trestbps'])
df_model['risk_score'] = sum(
    flag.astype(int)
    for flag in (
        df_model['age'].gt(50),
        df_model['chol'].gt(240),
        df_model['fbs'].eq(1),
    )
)

In [None]:
# Structural check of the working frame (column names, dtypes, non-null counts)
# as it stands once the engineered features have been added. In notebook order
# this runs before the imputation pipeline, so missing values still show up here.
df_model.info()

In [None]:
# ===============================
# ADVANCED IMPUTATION PIPELINE
# ===============================
# Strategy Overview:
# 1. Median impute robust numeric columns individually.
# 2. Mode impute low-missing categorical columns.
# 3. KNN-impute 'ca' leveraging correlated numeric context.
# 4. Model-based (RandomForest) imputation for high-missing categorical targets: slope & thal.
# 5. Preserve schema & restore into main working frame.

# 1. Work on a copy so df_model is only replaced once the whole pipeline has run.
df_temp = df_model.copy()

# 2. Median imputation for specified numeric columns (robust to outliers compared to mean).
#    The filled Series is assigned back to the frame. Calling
#    `df_temp[col].fillna(..., inplace=True)` is chained assignment: under pandas
#    Copy-on-Write it only updates a temporary object and fills nothing.
numeric_median_cols = ['chol_per_age','heart_rate_reserve','trestbps','chol','thalch','oldpeak']
for col in numeric_median_cols:
    if col in df_temp.columns:
        as_float = df_temp[col].astype(float)
        df_temp[col] = as_float.fillna(as_float.median())

# 3. Mode imputation for specified categorical columns (preserves most frequent valid category).
#    Missing cells are written through .loc so the update lands in df_temp itself.
categorical_mode_cols = ['restecg','fbs','exang']
for col in categorical_mode_cols:
    if col not in df_temp.columns:
        continue
    missing_rows = df_temp[col].isna()
    if missing_rows.any():
        df_temp.loc[missing_rows, col] = df_temp[col].mode(dropna=True).iloc[0]

# 4. KNN imputation for 'ca' (captures multivariate structure vs single-column fill)
if 'ca' in df_temp.columns:
    # Normalise textual missing-value markers to NaN before imputing.
    df_temp['ca'] = df_temp['ca'].replace(['?', 'NA', 'N/A', 'na', ''], np.nan)
    knn_features = [c for c in numeric_median_cols if c in df_temp.columns]
    # 'ca' is placed first so its imputed values come back as column 0. When no
    # helper feature exists the frame is just ['ca'], which covers the former
    # single-column fallback branch.
    # NOTE(review): neighbours are found on unscaled features, so columns with
    # larger magnitudes weigh more in the distance - consider scaling first.
    knn_df = df_temp[['ca'] + knn_features].copy()
    imputer = KNNImputer(n_neighbors=5)
    imputed = imputer.fit_transform(knn_df)
    df_temp['ca'] = imputed[:, 0]
    # Neighbour averages are fractional; snap back to whole-number codes.
    df_temp['ca'] = df_temp['ca'].round().astype(int)

# 5. Model-based imputation for high-missing categorical: slope & thal
high_missing_targets = ['slope','thal']

def detect_categorical(df):
    """Return the sorted names of the columns that should be one-hot encoded.

    A column is selected when it is either

    * text-like: object, pandas string, or category dtype, or
    * an integer column with at most 10 distinct values.

    The label column 'num' and every name listed in the module-level
    ``high_missing_targets`` are never returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected; it is not modified.

    Returns
    -------
    list of str
        Selected column names in ascending order, without duplicates.
    """
    excluded = ['num'] + high_missing_targets
    cats = []
    for name in df.columns:
        if name in excluded:
            continue
        column = df[name]
        # Comparing only against 'object' misses string/category columns, which
        # would then be passed to the model as raw text.
        is_text_like = (
            isinstance(column.dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )
        is_small_integer = (
            pd.api.types.is_integer_dtype(column) and column.nunique() <= 10
        )
        if is_text_like or is_small_integer:
            cats.append(name)
    return sorted(set(cats))

# Predictors shared by every target: all columns except the final label 'num'
# and the imputation targets themselves. The loop below only writes to target
# columns, so the predictor frame and its column grouping are prepared once.
feature_cols = [c for c in df_temp.columns if c not in ['num'] + high_missing_targets]
X_full = df_temp[feature_cols].copy()
categorical_feats = detect_categorical(X_full)
numeric_feats = [c for c in feature_cols if c not in categorical_feats]

for target_col in high_missing_targets:
    if target_col not in df_temp.columns:
        continue
    missing_mask = df_temp[target_col].isna()
    n_missing = missing_mask.sum()
    if n_missing == 0:
        continue
    print(f"Imputing {n_missing} missing values in '{target_col}' via model-based approach...")

    # Rows with an observed target train the model; rows without one are predicted.
    y_train = df_temp.loc[~missing_mask, target_col]
    X_train = X_full[~missing_mask]
    X_pred = X_full[missing_mask]

    # A fresh, unfitted preprocessor per target: categorical predictors are
    # one-hot encoded, everything else is passed through unchanged.
    preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_feats),
        ('num', 'passthrough', numeric_feats)
    ])
    # Non-float targets with at most 15 distinct observed values are treated as
    # class labels; anything else is regressed.
    if y_train.nunique() <= 15 and y_train.dtype != float:
        model = RandomForestClassifier(n_estimators=300, random_state=42)
    else:
        model = RandomForestRegressor(n_estimators=300, random_state=42)
    pipe = Pipeline([('prep', preprocessor), ('model', model)])
    pipe.fit(X_train, y_train)
    df_temp.loc[missing_mask, target_col] = pipe.predict(X_pred)

    # Discrete numeric targets are snapped back to integer codes. Text labels
    # are left exactly as predicted: rounding them can only fail, and the old
    # blanket `except Exception: pass` around that cast hid genuine errors too.
    is_discrete = pd.api.types.is_integer_dtype(y_train) or y_train.nunique() <= 15
    if is_discrete and pd.api.types.is_numeric_dtype(df_temp[target_col]):
        df_temp[target_col] = df_temp[target_col].round().astype(int)

# 6. Finalize imputed frame
df_model = df_temp.copy()
remaining_missing = df_model.isna().sum()
print("Remaining missing values after preprocessing:")
print(remaining_missing[remaining_missing > 0])
print("Done.")

In [None]:
# ===============================
# TARGET BINARIZATION
# ===============================
# Collapse the multi-level 'num' target into a 0/1 flag:
# any value above zero becomes 1, everything else becomes 0.
df_model['num'] = df_model['num'].gt(0).astype('int64')

In [None]:
# ===============================
# ONE-HOT ENCODING
# ===============================
# Expand categorical features into dummy variables (drop_first to avoid linear dependence).
# Only columns still present are encoded: after a first run the original
# categorical columns have been replaced by their dummies, so re-running the
# cell becomes a no-op instead of raising KeyError.
encode_cols = [col for col in categorical_cols if col in df_model.columns]
if encode_cols:
    df_model = pd.get_dummies(df_model, columns=encode_cols, drop_first=True)

In [None]:
# Display the column index of the working frame; in notebook order this follows
# the one-hot encoding cell, so the generated dummy column names can be inspected.
df_model.columns

In [None]:
# ===============================
# FEATURE / TARGET SPLIT + SCALING
# ===============================
# Separate predictors (X) and binary target (y), then standardise every
# predictor column with StandardScaler.
# NOTE(review): the scaler is fitted on all rows; if a train/test split happens
# downstream, fit it on the training portion only - confirm against later cells.
X = df_model.drop(columns='num')
y = df_model['num']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# fit_transform returns a bare ndarray, so both the column names and the
# original row index are re-attached; keeping the index guarantees X_clean and
# y stay aligned even if rows are ever filtered before this cell.
X_clean = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

In [None]:
# Persist the working frame without its row index. This writes df_model itself;
# the scaled matrix held in X_clean is not what gets saved.
df_model.to_csv("../data/cleaned_data.csv", index=False)