# Heart Stroke Risk Prediction
## Notebook 2: Preprocessing & Feature Engineering
**Author:** Dev Kapania | IIT Roorkee Research Intern

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import joblib
import warnings
# Ignore every warning for the rest of the session: no category, message or
# module filter is given, so nothing is exempt.
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

print('Libraries loaded!')

## 1. Load Raw Data

In [None]:
# Read the raw CSV into the working DataFrame `df` and report its dimensions.
# The trailing expression previews the first rows when run as a notebook cell.
raw_csv_path = '../data/raw/heart.csv'
df = pd.read_csv(raw_csv_path)
print(f'Raw data shape: {df.shape}')
df.head()

## 2. Handle Missing Values

In [None]:
# Impute missing numeric values with the column median (robust to outliers).
#
# The filled column is assigned back to `df` rather than calling
# `fillna(..., inplace=True)` on `df[col]`: that chained in-place form operates
# on an intermediate object, triggers a FutureWarning in pandas 2.x and does
# not update `df` at all once Copy-on-Write is enabled.
#
# NOTE(review): the median is computed over all rows, i.e. before the
# train/test split performed later in this notebook — confirm that is intended.
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
for col in numerical_cols:
    if df[col].isnull().any():
        # Compute the fill value once so the value reported is the value used.
        col_median = df[col].median()
        df[col] = df[col].fillna(col_median)
        print(f'Filled {col} with median: {col_median:.2f}')

print(f'\nMissing values after handling: {df.isnull().sum().sum()}')

## 3. Encode Categorical Features

In [None]:
# Integer-encode the categorical columns that are stored as strings.
# Columns from the list that are absent from `df`, or that are already
# non-object (e.g. numeric), are left untouched.
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
existing_cats = [col for col in categorical_cols if col in df.columns]

# Keep one fitted encoder per column. Re-fitting a single shared encoder would
# overwrite the previous column's class mapping, leaving no way to invert the
# codes or apply the same mapping to new data.
label_encoders = {}
le = LabelEncoder()  # stays defined even when no column needs encoding
for col in existing_cats:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le
        print(f'Encoded: {col}')

print(f'\nData shape after encoding: {df.shape}')

## 4. Feature Engineering

In [None]:
# Derive three features from fixed cut-offs on existing columns.

# Age bucket code 0-3. pd.cut intervals are right-closed by default, so an age
# equal to an edge (e.g. 40) lands in the lower bucket.
# NOTE(review): ages outside (0, 100] or missing would be binned as NaN, which
# the integer cast cannot represent — confirm the data never contains them.
age_edges = [0, 40, 55, 70, 100]
age_codes = [0, 1, 2, 3]
age_binned = pd.cut(df['age'], bins=age_edges, labels=age_codes)
df['age_group'] = age_binned.astype(int)

# 0/1 indicator: cholesterol strictly above 200.
df['high_chol'] = df['chol'].gt(200).astype(int)

# 0/1 indicator: resting blood pressure strictly above 140.
df['high_bp'] = df['trestbps'].gt(140).astype(int)

summary_lines = (
    'New features added:',
    '- age_group: binned age (0=<40, 1=40-55, 2=55-70, 3=70+)',
    '- high_chol: cholesterol > 200 mg/dl',
    '- high_bp: resting BP > 140 mmHg',
    f'\nFinal shape: {df.shape}',
)
for summary_line in summary_lines:
    print(summary_line)

## 5. Train/Test Split

In [None]:
# Separate the label column from the predictors, then hold out 20% of the rows
# for testing. Stratifying on the label keeps the class proportions the same
# in both partitions; the fixed seed makes the split reproducible.
X = df.drop(columns='target')
y = df['target']

split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

train_class_counts = dict(y_train.value_counts())
print(f'Train size: {X_train.shape}')
print(f'Test size: {X_test.shape}')
print(f'Train class balance: {train_class_counts}')

## 6. Handle Class Imbalance with SMOTE

In [None]:
# Oversample the training split only, so the test split keeps its original
# class distribution. Class counts are printed before and after resampling.
# NOTE(review): plain SMOTE interpolates between neighbouring samples, so
# integer-coded categorical columns may receive fractional values — confirm
# this is acceptable or consider a categorical-aware variant.
counts_before = dict(y_train.value_counts())
print(f'Before SMOTE: {counts_before}')

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

counts_after = dict(pd.Series(y_train_balanced).value_counts())
print(f'After SMOTE:  {counts_after}')
print(f'New train size: {X_train_balanced.shape}')

## 7. Feature Scaling

In [None]:
from pathlib import Path

# Standardise the features. The scaler is fitted on the (balanced) training
# data only and then reused on the test data, so test-set statistics never
# influence the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled  = scaler.transform(X_test)

# Create the output folders up front: neither joblib.dump nor np.save creates
# missing parent directories, so a fresh checkout would otherwise fail with
# FileNotFoundError after all the preprocessing work has been done.
for output_dir in ('../models', '../data/processed'):
    Path(output_dir).mkdir(parents=True, exist_ok=True)

# Save scaler for later use
joblib.dump(scaler, '../models/scaler.pkl')
print('Scaler saved to ../models/scaler.pkl')

# Save processed data
np.save('../data/processed/X_train.npy', X_train_scaled)
np.save('../data/processed/X_test.npy',  X_test_scaled)
np.save('../data/processed/y_train.npy', y_train_balanced)
np.save('../data/processed/y_test.npy',  y_test.values)

print('Processed data saved!')
print(f'\nFinal X_train shape: {X_train_scaled.shape}')
print(f'Final X_test shape:  {X_test_scaled.shape}')