# 1. Feature Engineering (Data Prep)
This notebook transforms the raw `cardio_train.csv` into a cleaned, feature-rich dataset ready for training.

**Output**: `../data/processed/final_dataset.csv`

In [1]:
import pandas as pd
import numpy as np
import os
import sys

# Add source folder to path to import cleaning script
sys.path.append(os.path.abspath('..'))
import src.data.make_dataset as md

In [2]:
# 1. Load and clean the raw data.
# Cleaning is delegated to src.data.make_dataset.data_cleaning; per the original
# note it fixes the semicolon delimiters and removes impossible values
# (not verifiable from this notebook -- see that module).
#
# Path resolution: prefer the location that works when the kernel runs from
# notebooks/; otherwise fall back to the bare filename, which resolves when the
# kernel was started from the repository root.
raw_path = "../cardio_train.csv"
raw_path = raw_path if os.path.exists(raw_path) else "cardio_train.csv"

df = md.data_cleaning(raw_path)
print(f"Raw Data Shape: {df.shape}")

Raw Data Shape: (69837, 12)


In [3]:
# 2. Feature Engineering: The "Standard" Set (Champion)
# Combines BMI, MAP, Categories, and Lifestyle Risk.

def apply_standard_features(base_df):
    """Build the "standard" engineered feature set from a cleaned DataFrame.

    The input is copied, so ``base_df`` is never modified.

    Parameters
    ----------
    base_df : pandas.DataFrame
        Must contain the columns ``weight``, ``height``, ``ap_hi``, ``ap_lo``,
        ``smoke``, ``alco``, ``gluc`` and ``cholesterol``. Any other columns
        are passed through untouched.
        NOTE(review): the ``height / 100`` conversion implies height is in
        centimetres (and weight presumably in kilograms) -- confirm against
        the cleaning script.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``weight``, ``height``, ``ap_hi`` and ``ap_lo``
        removed and these columns added:

        * ``BMI``             -- weight / (height / 100) ** 2
        * ``map``             -- (ap_hi + 2 * ap_lo) / 3
        * ``bmi_cat``         -- 0..5, bins split at 18.5 / 25 / 30 / 35 / 40
        * ``bp_cat``          -- 0..4, blood-pressure band (thresholds below)
        * ``lifestyle_risk``  -- smoke + alco
        * ``metabolic_score`` -- count (0..4) of the four risk flags below

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``base_df``.
    """
    df = base_df.copy()

    # --- Base Calculations ---
    df["BMI"] = df["weight"] / ((df["height"] / 100) ** 2)
    df["map"] = (df["ap_hi"] + 2 * df["ap_lo"]) / 3

    # --- Categorical Binning ---
    # np.select returns the choice of the FIRST condition that holds, which
    # mirrors an if/elif chain without a Python-level call per row.
    # Comparisons against NaN are all False, so a NaN BMI falls through to the
    # default bucket (5), exactly as the former if/elif chain did.
    bmi = df["BMI"].to_numpy()
    df["bmi_cat"] = np.select(
        [bmi < 18.5, bmi < 25, bmi < 30, bmi < 35, bmi < 40],
        [0, 1, 2, 3, 4],
        default=5,
    ).astype(np.int64)  # pin dtype; the platform default int may be 32-bit

    systolic = df["ap_hi"].to_numpy()
    diastolic = df["ap_lo"].to_numpy()
    df["bp_cat"] = np.select(
        [
            (systolic > 180) | (diastolic > 120),
            (systolic >= 140) | (diastolic >= 90),
            (systolic >= 130) | (diastolic >= 80),
            # The diastolic test is redundant for finite values (the previous
            # band already caught >= 80) but keeps NaN rows in the default 0.
            (systolic >= 120) & (diastolic < 80),
        ],
        [4, 3, 2, 1],
        default=0,
    ).astype(np.int64)

    df["lifestyle_risk"] = df["smoke"] + df["alco"]

    # --- Risk Scores ---
    # Each flag is 0/1; the score is simply how many of the four are raised.
    # Note the diastolic cut-off here (85) deliberately differs from the
    # bp_cat bands above (80 / 90); it is kept as originally specified.
    risk_bmi = (df["BMI"] >= 30).astype(int)
    risk_bp = ((df["ap_hi"] >= 130) | (df["ap_lo"] >= 85)).astype(int)
    risk_gluc = (df["gluc"] > 1).astype(int)
    risk_chol = (df["cholesterol"] > 1).astype(int)
    df["metabolic_score"] = risk_bmi + risk_bp + risk_gluc + risk_chol

    # Drop the raw columns that are now represented by derived features.
    return df.drop(columns=["weight", "height", "ap_hi", "ap_lo"])

# Build the features, then discard any row holding a non-finite or missing value:
# +/-inf is first mapped to NaN so that a single dropna() removes both kinds.
df_final = (
    apply_standard_features(df)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
print(f"Processed Data Shape: {df_final.shape}")

Processed Data Shape: (69837, 14)


In [4]:
# 3. Save to Disk
# Writes the engineered dataset as CSV (without the index column) under
# ../data/processed, creating the directory tree first if needed.
output_dir = "../data/processed"
# exist_ok=True makes creation a single call: no check-then-create window in
# which another process could create the directory and make makedirs raise.
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, "final_dataset.csv")
df_final.to_csv(output_path, index=False)
print(f"Dataset saved to: {output_path}")

Dataset saved to: ../data/processed/final_dataset.csv
