In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# Libraries for handling imbalance (might need to install: pip install imbalanced-learn)
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline # For combining sampling with modeling

import matplotlib.pyplot as plt
import seaborn as sns

# --- 1. Load the Dataset ---

In [3]:
# Read the data that is inside of the CSV
df = pd.read_csv("./Health_Data/cleaned_health.csv")
df

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,cp_0,cp_1,cp_2,cp_3
0,52,1,125,212,0,1,168,0,1.0,2,2,3,0,1,0,0,0
1,53,1,140,203,1,0,155,1,3.1,0,0,3,0,1,0,0,0
2,70,1,145,174,0,1,125,1,2.6,0,0,3,0,1,0,0,0
3,61,1,148,203,0,1,161,0,0.0,2,1,3,0,1,0,0,0
4,62,0,138,294,1,1,106,0,1.9,1,3,2,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,68,0,120,211,0,0,115,0,1.5,1,0,2,1,0,0,1,0
298,44,0,108,141,0,1,175,0,0.6,1,0,2,1,0,0,1,0
299,52,1,128,255,0,1,161,1,0.0,2,1,3,0,1,0,0,0
300,59,1,160,273,0,0,125,0,0.0,2,0,2,0,0,0,0,1


In [4]:
# Define original features and target columns
ORIGINAL_FEATURES = [
    'age', 'sex', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
    'exang', 'oldpeak', 'slope', 'ca', 'thal',
    'cp_0', 'cp_1', 'cp_2', 'cp_3'
]
TARGET_COL = 'target' # The column indicating disease presence (0 or 1)

# Define numerical columns for imputation and potential engineering
NUMERICAL_COLS_FOR_PROCESSING = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# --- 2. Data Preprocessing (Ensuring Cleanliness and Correct Types) ---

In [5]:
print("\n--- Data Preprocessing for Imbalance Handling ---")

# Handle '?' or other non-numeric values if they exist, converting to NaN first
df.replace('?', np.nan, inplace=True)
df.replace('N/A', np.nan, inplace=True)

# Convert all relevant columns to numeric, coercing errors
all_relevant_cols_initial = ORIGINAL_FEATURES + [TARGET_COL]
for col in all_relevant_cols_initial:
    if col in df.columns:
        if col in NUMERICAL_COLS_FOR_PROCESSING:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        else: # Categorical/binary features including target
            df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64') # Use nullable integer

# Impute missing values after type conversion
print("Missing values before imputation:")
print(df[all_relevant_cols_initial].isnull().sum()[df[all_relevant_cols_initial].isnull().sum() > 0])

for col in all_relevant_cols_initial:
    if col in df.columns and df[col].isnull().any():
        if col in NUMERICAL_COLS_FOR_PROCESSING:
            median_val = df[col].median()
            df[col].fillna(median_val, inplace=True)
            print(f"Filled missing values in '{col}' with its median ({median_val}).")
        else: # Categorical/binary features including target
            mode_val = df[col].mode()[0]
            df[col].fillna(mode_val, inplace=True)
            print(f"Filled missing values in '{col}' with its mode ({mode_val}).")

print("\nMissing values after imputation:")
print(df[all_relevant_cols_initial].isnull().sum())


--- Data Preprocessing for Imbalance Handling ---
Missing values before imputation:
Series([], dtype: int64)

Missing values after imputation:
age         0
sex         0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
cp_0        0
cp_1        0
cp_2        0
cp_3        0
target      0
dtype: int64


# --- 3. Advanced Feature Engineering (Re-applying from previous step) ---

In [6]:
print("\n--- Re-applying Advanced Feature Engineering ---")

# Create a copy of the DataFrame to add new features
df_engineered = df.copy()


--- Re-applying Advanced Feature Engineering ---


# 3.1. Polynomial Features

In [7]:
poly = PolynomialFeatures(degree=2, include_bias=False)
cols_for_poly = ['age', 'trestbps', 'chol']
actual_cols_for_poly = [col for col in cols_for_poly if col in df_engineered.columns]

if actual_cols_for_poly:
    poly_features = poly.fit_transform(df_engineered[actual_cols_for_poly])
    poly_feature_names = poly.get_feature_names_out(actual_cols_for_poly)
    poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=df_engineered.index)

    # Drop original columns from poly_df to avoid duplication
    cols_to_drop_from_poly_df = [col for col in actual_cols_for_poly if col in poly_df.columns]
    poly_df = poly_df.drop(columns=cols_to_drop_from_poly_df, errors='ignore')

    df_engineered = pd.concat([df_engineered, poly_df], axis=1)
    print(f"Added polynomial features for: {actual_cols_for_poly}")
else:
    print("Skipping polynomial features: None of the specified columns found.")

Added polynomial features for: ['age', 'trestbps', 'chol']


# 3.2. Interaction Features (Manual)

In [8]:
if 'age' in df_engineered.columns and 'chol' in df_engineered.columns:
    df_engineered['age_x_chol'] = (df_engineered['age'] * df_engineered['chol']).squeeze()
    print("Added interaction feature: 'age_x_chol'")
if 'thalach' in df_engineered.columns and 'exang' in df_engineered.columns:
    df_engineered['thalach_x_exang'] = (df_engineered['thalach'] * df_engineered['exang']).squeeze()
    print("Added interaction feature: 'thalach_x_exang'")

Added interaction feature: 'age_x_chol'
Added interaction feature: 'thalach_x_exang'


# 3.3. Binning / Discretization

In [9]:
if 'age' in df_engineered.columns:
    age_bins = [0, 40, 50, 60, 70, df_engineered['age'].max() + 1]
    age_labels = ['<40', '40-49', '50-59', '60-69', '70+']
    df_engineered['age_group'] = pd.cut(df_engineered['age'], bins=age_bins, labels=age_labels, right=False)
    print("Added binned feature: 'age_group'")

Added binned feature: 'age_group'


# 3.4. Ratio Features

In [10]:
if 'chol' in df_engineered.columns and 'trestbps' in df_engineered.columns:
    df_engineered['chol_to_trestbps_ratio'] = np.where(
        df_engineered['trestbps'] != 0,
        df_engineered['chol'] / df_engineered['trestbps'],
        0
    ).squeeze()
    print("Added ratio feature: 'chol_to_trestbps_ratio'")

Added ratio feature: 'chol_to_trestbps_ratio'


# 3.5. Combining One-Hot Encoded 'cp' into a single 'cp_type' categorical feature

In [11]:
cp_cols = ['cp_0', 'cp_1', 'cp_2', 'cp_3']
actual_cp_cols = [col for col in cp_cols if col in df_engineered.columns]
if len(actual_cp_cols) == 4:
    df_engineered['cp_type'] = df_engineered[actual_cp_cols].idxmax(axis=1)
    df_engineered['cp_type'] = df_engineered['cp_type'].str.replace('cp_', '').astype(int)
    print("Combined one-hot encoded 'cp' into 'cp_type' categorical feature.")
else:
    print("Skipping 'cp_type' combination: Not all one-hot encoded 'cp' columns found.")

# Update FEATURES list to include new engineered features for modeling
ENGINEERED_FEATURES = [
    col for col in df_engineered.columns
    if col not in df.columns and col != TARGET_COL # Exclude target and original columns
]
if 'age_group' in df_engineered.columns:
    df_engineered = pd.get_dummies(df_engineered, columns=['age_group'], prefix='age_group')
    ENGINEERED_FEATURES = [col for col in df_engineered.columns if col.startswith('age_group_')] + \
                          [f for f in ENGINEERED_FEATURES if not f.startswith('age_group')]

FINAL_FEATURES = [col for col in ORIGINAL_FEATURES if col in df_engineered.columns] + ENGINEERED_FEATURES
if 'cp_type' in FINAL_FEATURES:
    FINAL_FEATURES = [f for f in FINAL_FEATURES if f not in cp_cols]

# Separate features (X) and target (y)
X = df_engineered[FINAL_FEATURES]
y = df_engineered[TARGET_COL]

if y.nunique() != 2:
    print(f"Error: The target column '{TARGET_COL}' is not binary. It has {y.nunique()} unique values: {y.unique()}")
    print("Please ensure your 'target' column is binary (e.g., 0 and 1) for classification.")
    exit()

print(f"\nFinal Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print(f"Original Target distribution:\n{y.value_counts(normalize=True).round(2)}")
print("\nFinal features used for modeling:")
print(X.columns.tolist())

Combined one-hot encoded 'cp' into 'cp_type' categorical feature.

Final Features (X) shape: (302, 27)
Target (y) shape: (302,)
Original Target distribution:
1    0.54
0    0.46
Name: target, dtype: Float64

Final features used for modeling:
['age', 'sex', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'age_group_<40', 'age_group_40-49', 'age_group_50-59', 'age_group_60-69', 'age_group_70+', 'age^2', 'age trestbps', 'age chol', 'trestbps^2', 'trestbps chol', 'chol^2', 'age_x_chol', 'thalach_x_exang', 'chol_to_trestbps_ratio', 'cp_type']


# --- 4. Split Data into Training and Testing Sets ---

In [12]:
# Stratify to maintain original class distribution in train/test splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)
print(f"\nTraining set shape: {X_train.shape}, {y_train.shape}")
print(f"Testing set shape: {X_test.shape}, {y_test.shape}")


Training set shape: (226, 27), (226,)
Testing set shape: (76, 27), (76,)


# --- 5. Feature Scaling ---

In [14]:
scaler = StandardScaler()
numerical_features_to_scale = [col for col in NUMERICAL_COLS_FOR_PROCESSING if col in X_train.columns] + \
                              ['age_x_chol', 'thalach_x_exang', 'chol_to_trestbps_ratio']
numerical_features_to_scale = [f for f in numerical_features_to_scale if f in X_train.columns]

if numerical_features_to_scale:
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[numerical_features_to_scale] = scaler.fit_transform(X_train[numerical_features_to_scale])
    X_test_scaled[numerical_features_to_scale] = scaler.transform(X_test[numerical_features_to_scale])
    print(f"Scaled numerical features: {numerical_features_to_scale}")
else:
    print("No numerical features found for scaling. Using original X_train/X_test.")
    X_train_scaled = X_train
    X_test_scaled = X_test

print("\nFirst 5 rows of scaled training features:")
print(X_train_scaled.head())

# --- Function to evaluate model performance ---
def evaluate_model(model, X_test, y_test, model_name="Model"):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

    print(f"\n--- {model_name} Performance ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1-Score: {f1_score(y_test, y_pred):.4f}")
    if y_proba is not None:
        print(f"ROC AUC Score: {roc_auc_score(y_test, y_proba):.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

Scaled numerical features: ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'age_x_chol', 'thalach_x_exang', 'chol_to_trestbps_ratio']

First 5 rows of scaled training features:
         age  sex  trestbps      chol  fbs  restecg   thalach  exang  \
18  0.410557    1  0.515901 -0.676764    1        0  0.611286      0   
90 -0.246722    1 -0.804933 -1.153240    0        0  1.717907      0   
75 -1.123095    1 -0.684857 -0.505233    0        1  0.832610      0   
8  -0.904002    1 -0.684857  0.047479    0        0 -0.318276      0   
28  0.081918    0  2.917417  1.534084    0        2 -1.513427      1   

     oldpeak  slope  ...   age^2  age trestbps  age chol  trestbps^2  \
18 -0.851400      2  ...  3364.0        8120.0   12238.0     19600.0   
90 -0.851400      1  ...  2704.0        6136.0    9672.0     13924.0   
75 -0.851400      2  ...  1936.0        5280.0    9680.0     14400.0   
8  -0.149150      2  ...  2116.0        5520.0   11454.0     14400.0   
28  2.133161      1  ...  30

# --- 6. Addressing Class Imbalance ---

# --- 6.1. Baseline Model (without imbalance handling) ---

In [15]:
print("\n--- Training Baseline Logistic Regression Model (No Imbalance Handling) ---")
lr_baseline = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)
lr_baseline.fit(X_train_scaled, y_train)
evaluate_model(lr_baseline, X_test_scaled, y_test, "Baseline Logistic Regression")

print("\n--- Training Baseline Random Forest Model (No Imbalance Handling) ---")
rf_baseline = RandomForestClassifier(random_state=42, n_estimators=100)
rf_baseline.fit(X_train_scaled, y_train)
evaluate_model(rf_baseline, X_test_scaled, y_test, "Baseline Random Forest")


--- Training Baseline Logistic Regression Model (No Imbalance Handling) ---

--- Baseline Logistic Regression Performance ---
Accuracy: 0.7763
Precision: 0.8000
Recall: 0.7805
F1-Score: 0.7901
ROC AUC Score: 0.8760

Confusion Matrix:
[[27  8]
 [ 9 32]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.75      0.77      0.76        35
         1.0       0.80      0.78      0.79        41

    accuracy                           0.78        76
   macro avg       0.78      0.78      0.78        76
weighted avg       0.78      0.78      0.78        76


--- Training Baseline Random Forest Model (No Imbalance Handling) ---

--- Baseline Random Forest Performance ---
Accuracy: 0.7763
Precision: 0.7857
Recall: 0.8049
F1-Score: 0.7952
ROC AUC Score: 0.8523

Confusion Matrix:
[[26  9]
 [ 8 33]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.76      0.74      0.75        35
         1.0       0

# --- 6.2. Strategy 1: Oversampling with SMOTE ---

In [16]:
print("\n--- Strategy 1: Oversampling with SMOTE ---")
# SMOTE only applies to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

print(f"Original training set shape: {X_train_scaled.shape}")
print(f"SMOTE-resampled training set shape: {X_train_smote.shape}")
print(f"SMOTE-resampled target distribution:\n{y_train_smote.value_counts(normalize=True).round(2)}")

# Train models on SMOTE-resampled data
print("\n--- Training Logistic Regression with SMOTE ---")
lr_smote = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)
lr_smote.fit(X_train_smote, y_train_smote)
evaluate_model(lr_smote, X_test_scaled, y_test, "Logistic Regression (SMOTE)")

print("\n--- Training Random Forest with SMOTE ---")
rf_smote = RandomForestClassifier(random_state=42, n_estimators=100)
rf_smote.fit(X_train_smote, y_train_smote)
evaluate_model(rf_smote, X_test_scaled, y_test, "Random Forest (SMOTE)")


--- Strategy 1: Oversampling with SMOTE ---


TypeError: cannot safely cast non-equivalent float64 to int64

# --- 6.3. Strategy 2: Undersampling with RandomUnderSampler ---

In [17]:
print("\n--- Strategy 2: Undersampling with RandomUnderSampler ---")
# Undersampling only applies to the training data
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train_scaled, y_train)

print(f"Original training set shape: {X_train_scaled.shape}")
print(f"RUS-resampled training set shape: {X_train_rus.shape}")
print(f"RUS-resampled target distribution:\n{y_train_rus.value_counts(normalize=True).round(2)}")

# Train models on RUS-resampled data
print("\n--- Training Logistic Regression with RandomUnderSampler ---")
lr_rus = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)
lr_rus.fit(X_train_rus, y_train_rus)
evaluate_model(lr_rus, X_test_scaled, y_test, "Logistic Regression (RandomUnderSampler)")

print("\n--- Training Random Forest with RandomUnderSampler ---")
rf_rus = RandomForestClassifier(random_state=42, n_estimators=100)
rf_rus.fit(X_train_rus, y_train_rus)
evaluate_model(rf_rus, X_test_scaled, y_test, "Random Forest (RandomUnderSampler)")


--- Strategy 2: Undersampling with RandomUnderSampler ---
Original training set shape: (226, 27)
RUS-resampled training set shape: (206, 27)
RUS-resampled target distribution:
0    0.5
1    0.5
Name: target, dtype: Float64

--- Training Logistic Regression with RandomUnderSampler ---

--- Logistic Regression (RandomUnderSampler) Performance ---
Accuracy: 0.7895
Precision: 0.8205
Recall: 0.7805
F1-Score: 0.8000
ROC AUC Score: 0.8850

Confusion Matrix:
[[28  7]
 [ 9 32]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.76      0.80      0.78        35
         1.0       0.82      0.78      0.80        41

    accuracy                           0.79        76
   macro avg       0.79      0.79      0.79        76
weighted avg       0.79      0.79      0.79        76


--- Training Random Forest with RandomUnderSampler ---

--- Random Forest (RandomUnderSampler) Performance ---
Accuracy: 0.7368
Precision: 0.7838
Recall: 0.7073
F1-Score: 0.

# --- 6.4. Strategy 3: Using class_weight in Models ---

In [18]:
print("\n--- Strategy 3: Using `class_weight` parameter in Models ---")

# 'balanced' mode automatically adjusts weights inversely proportional to class frequencies
print("\n--- Training Logistic Regression with `class_weight='balanced'` ---")
lr_weighted = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000, class_weight='balanced')
lr_weighted.fit(X_train_scaled, y_train)
evaluate_model(lr_weighted, X_test_scaled, y_test, "Logistic Regression (Class Weighted)")

print("\n--- Training Random Forest with `class_weight='balanced'` ---")
rf_weighted = RandomForestClassifier(random_state=42, n_estimators=100, class_weight='balanced')
rf_weighted.fit(X_train_scaled, y_train)
evaluate_model(rf_weighted, X_test_scaled, y_test, "Random Forest (Class Weighted)")


--- Strategy 3: Using `class_weight` parameter in Models ---

--- Training Logistic Regression with `class_weight='balanced'` ---

--- Logistic Regression (Class Weighted) Performance ---
Accuracy: 0.7895
Precision: 0.8205
Recall: 0.7805
F1-Score: 0.8000
ROC AUC Score: 0.8774

Confusion Matrix:
[[28  7]
 [ 9 32]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.76      0.80      0.78        35
         1.0       0.82      0.78      0.80        41

    accuracy                           0.79        76
   macro avg       0.79      0.79      0.79        76
weighted avg       0.79      0.79      0.79        76


--- Training Random Forest with `class_weight='balanced'` ---

--- Random Forest (Class Weighted) Performance ---
Accuracy: 0.8026
Precision: 0.8095
Recall: 0.8293
F1-Score: 0.8193
ROC AUC Score: 0.8578

Confusion Matrix:
[[27  8]
 [ 7 34]]

Classification Report:
              precision    recall  f1-score   support

         0.0

In [19]:
print("\nClass imbalance handling strategies applied and evaluated.")


Class imbalance handling strategies applied and evaluated.
