In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures # For creating polynomial features

# --- 1. Load the Dataset ---

In [2]:
# Read the data that is inside of the CSV
df = pd.read_csv("./Health_Data/cleaned_health.csv")
df

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,cp_0,cp_1,cp_2,cp_3
0,52,1,125,212,0,1,168,0,1.0,2,2,3,0,1,0,0,0
1,53,1,140,203,1,0,155,1,3.1,0,0,3,0,1,0,0,0
2,70,1,145,174,0,1,125,1,2.6,0,0,3,0,1,0,0,0
3,61,1,148,203,0,1,161,0,0.0,2,1,3,0,1,0,0,0
4,62,0,138,294,1,1,106,0,1.9,1,3,2,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,68,0,120,211,0,0,115,0,1.5,1,0,2,1,0,0,1,0
298,44,0,108,141,0,1,175,0,0.6,1,0,2,1,0,0,1,0
299,52,1,128,255,0,1,161,1,0.0,2,1,3,0,1,0,0,0
300,59,1,160,273,0,0,125,0,0.0,2,0,2,0,0,0,0,1


In [3]:
# Define original features and target columns
ORIGINAL_FEATURES = [
    'age', 'sex', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
    'exang', 'oldpeak', 'slope', 'ca', 'thal',
    'cp_0', 'cp_1', 'cp_2', 'cp_3'
]
TARGET_COL = 'target' # The column indicating disease presence (0 or 1)

# Define numerical columns for imputation and potential engineering
NUMERICAL_COLS_FOR_PROCESSING = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# --- 2. Data Preprocessing (Ensuring Cleanliness and Correct Types) ---

In [4]:
print("\n--- Data Preprocessing for Feature Engineering ---")

# Handle '?' or other non-numeric values if they exist, converting to NaN first
df.replace('?', np.nan, inplace=True)
df.replace('N/A', np.nan, inplace=True)

# Convert all relevant columns to numeric, coercing errors
all_relevant_cols = ORIGINAL_FEATURES + [TARGET_COL]
for col in all_relevant_cols:
    if col in df.columns:
        if col in NUMERICAL_COLS_FOR_PROCESSING:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        else: # Categorical/binary features including target
            df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64') # Use nullable integer

# Impute missing values after type conversion
print("Missing values before imputation:")
print(df[all_relevant_cols].isnull().sum()[df[all_relevant_cols].isnull().sum() > 0])

for col in all_relevant_cols:
    if col in df.columns and df[col].isnull().any():
        if col in NUMERICAL_COLS_FOR_PROCESSING:
            median_val = df[col].median()
            df[col].fillna(median_val, inplace=True)
            print(f"Filled missing values in '{col}' with its median ({median_val}).")
        else: # Categorical/binary features including target
            mode_val = df[col].mode()[0]
            df[col].fillna(mode_val, inplace=True)
            print(f"Filled missing values in '{col}' with its mode ({mode_val}).")

print("\nMissing values after imputation:")
print(df[all_relevant_cols].isnull().sum())


--- Data Preprocessing for Feature Engineering ---
Missing values before imputation:
Series([], dtype: int64)

Missing values after imputation:
age         0
sex         0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
cp_0        0
cp_1        0
cp_2        0
cp_3        0
target      0
dtype: int64


# --- 3. Advanced Feature Engineering ---

In [5]:
print("\n--- Performing Advanced Feature Engineering ---")

# Create a copy of the DataFrame to add new features
df_engineered = df.copy()


--- Performing Advanced Feature Engineering ---


# --- 3.1. Polynomial Features ---

In [6]:
# Create polynomial features for selected numerical columns
# degree=2 means it will create feature^2 and interaction terms (feature1 * feature2)
# include_bias=False ensures it doesn't add a column of all ones
poly = PolynomialFeatures(degree=2, include_bias=False)
cols_for_poly = ['age', 'trestbps', 'chol'] # Select features to create polynomials from

# Ensure selected columns exist before processing
actual_cols_for_poly = [col for col in cols_for_poly if col in df_engineered.columns]

if actual_cols_for_poly:
    # Fit and transform the selected columns
    poly_features = poly.fit_transform(df_engineered[actual_cols_for_poly])
    # Get the names of the new polynomial features
    poly_feature_names = poly.get_feature_names_out(actual_cols_for_poly)
    # Create a DataFrame for the new polynomial features
    poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=df_engineered.index)
    # Concatenate the new polynomial features to the main DataFrame
    df_engineered = pd.concat([df_engineered, poly_df], axis=1)
    print(f"Added polynomial features for: {actual_cols_for_poly}")
    print(f"New polynomial features created: {poly_feature_names}")
else:
    print("Skipping polynomial features: None of the specified columns found.")

Added polynomial features for: ['age', 'trestbps', 'chol']
New polynomial features created: ['age' 'trestbps' 'chol' 'age^2' 'age trestbps' 'age chol' 'trestbps^2'
 'trestbps chol' 'chol^2']


# --- 3.2. Interaction Features (Manual) ---

In [7]:
# Create specific interaction terms that might be clinically relevant
# Ensure both interacting columns exist
if 'age' in df_engineered.columns and 'chol' in df_engineered.columns:
    df_engineered['age_x_chol'] = df_engineered['age'] * df_engineered['chol']
    print("Added interaction feature: 'age_x_chol'")
else:
    print("Skipping interaction feature 'age_x_chol': One or both columns not found.")

if 'thalach' in df_engineered.columns and 'exang' in df_engineered.columns:
    df_engineered['thalach_x_exang'] = df_engineered['thalach'] * df_engineered['exang']
    print("Added interaction feature: 'thalach_x_exang'")
else:
    print("Skipping interaction feature 'thalach_x_exang': One or both columns not found.")

ValueError: Cannot set a DataFrame with multiple columns to the single column age_x_chol

# --- 3.3. Binning / Discretization ---

In [None]:
# Convert a continuous numerical feature into categorical bins
if 'age' in df_engineered.columns:
    # Define age bins and labels
    age_bins = [0, 40, 50, 60, 70, df_engineered['age'].max() + 1]
    age_labels = ['<40', '40-49', '50-59', '60-69', '70+']
    df_engineered['age_group'] = pd.cut(df_engineered['age'], bins=age_bins, labels=age_labels, right=False)
    print("Added binned feature: 'age_group'")
    print("Distribution of 'age_group':")
    print(df_engineered['age_group'].value_counts().sort_index())
else:
    print("Skipping binning for 'age': Column not found.")

# --- 3.4. Ratio Features ---

In [None]:
# Create new features as ratios of existing numerical features
if 'chol' in df_engineered.columns and 'trestbps' in df_engineered.columns:
    # Avoid division by zero
    df_engineered['chol_to_trestbps_ratio'] = np.where(
        df_engineered['trestbps'] != 0,
        df_engineered['chol'] / df_engineered['trestbps'],
        0 # Or np.nan, depending on how you want to handle zero blood pressure
    )
    print("Added ratio feature: 'chol_to_trestbps_ratio'")
else:
    print("Skipping ratio feature 'chol_to_trestbps_ratio': One or both columns not found.")

# --- 3.5. Combining One-Hot Encoded 'cp' into a single 'cp_type' categorical feature ---

In [None]:
# This is useful if you want to treat cp as a single categorical feature again
# assuming cp_0, cp_1, cp_2, cp_3 represent different types (e.g., 0, 1, 2, 3)
cp_cols = ['cp_0', 'cp_1', 'cp_2', 'cp_3']
actual_cp_cols = [col for col in cp_cols if col in df_engineered.columns]

if len(actual_cp_cols) == 4: # Ensure all one-hot columns are present
    # Find the column with value 1 for each row
    df_engineered['cp_type'] = df_engineered[actual_cp_cols].idxmax(axis=1)
    # Convert 'cp_type' from 'cp_X' string to integer X
    df_engineered['cp_type'] = df_engineered['cp_type'].str.replace('cp_', '').astype(int)
    print("Combined one-hot encoded 'cp' into 'cp_type' categorical feature.")
    print("Distribution of 'cp_type':")
    print(df_engineered['cp_type'].value_counts().sort_index())
else:
    print("Skipping 'cp_type' combination: Not all one-hot encoded 'cp' columns found.")

# --- Final Check and Display ---

In [None]:
# --- Final Check and Display ---
print("\n--- Advanced Feature Engineering Complete ---")
print(f"New dataset shape: {df_engineered.shape}")
print("\nFirst 10 rows of the DataFrame with engineered features:")
print(df_engineered.head(10))

print("\nNew columns added:")
# Identify new columns by comparing with original columns
new_cols = [col for col in df_engineered.columns if col not in df.columns]
print(new_cols)

print("\nData types of new columns:")
if new_cols:
    print(df_engineered[new_cols].dtypes)
else:
    print("No new columns were added.")
# You can now use df_engineered for further modeling or analysis