In [4]:
# 📌 Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib

In [5]:
# 📌 Load the raw dataset (the path is resolved relative to the notebook's working directory)
raw_csv_path = '../../data/raw/heart_attack_prediction_dataset.csv'
df = pd.read_csv(raw_csv_path)

In [6]:
# 📌 Drop unneeded column
# Rebind instead of mutating in place, and tolerate the column already being
# gone, so that re-running this cell on the same kernel does not raise KeyError.
df = df.drop(columns='Patient ID', errors='ignore')

In [7]:
# 📌 Convert Blood Pressure to numeric
# The 'Blood Pressure' strings are split on '/' into two integer columns and
# the original text column is then removed. The guard makes the cell safe to
# re-run: once the source column has been dropped there is nothing left to do.
if 'Blood Pressure' in df.columns:
    bp_parts = df['Blood Pressure'].str.split('/', expand=True).astype(int)
    df[['Systolic BP', 'Diastolic BP']] = bp_parts
    df = df.drop(columns='Blood Pressure')


In [8]:
# 📌 Label encode categorical features
# Each column gets its own LabelEncoder. Refitting a single shared encoder
# would overwrite its `classes_` on every iteration, leaving no way to apply
# the same category -> integer mapping to new data later.
label_cols = ['Sex', 'Diet', 'Hemisphere']
label_encoders = {}
for col in label_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Persist the fitted encoders alongside the other preprocessing artifacts.
joblib.dump(label_encoders, 'label_encoders.pkl');

In [9]:
# 📌 Expand the listed categorical columns into indicator columns
# (drop_first=True omits the first level of each encoded column)
onehot_cols = ['Country', 'Continent']
df = pd.get_dummies(df, columns=onehot_cols, drop_first=True)


In [10]:
# 📌 Separate the target column (y) from the remaining feature columns (X)
target_col = 'Heart Attack Risk'
y = df[target_col]
X = df.drop(columns=[target_col])


In [11]:
# 📌 Persist the feature column names, in their current order, for later use
feature_names = X.columns.tolist()
joblib.dump(feature_names, 'feature_names.pkl')

['feature_names.pkl']

In [None]:
# 📌 Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# 📌 Standardize the features: the scaler is fitted on the training split only,
# then that same fitted scaler transforms both the training and the test split
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# 📌 Write the fitted scaler to disk
joblib.dump(value=scaler, filename='scaler.pkl')

['scaler.pkl']

In [None]:
# 📌 Report that the preprocessing artifacts have been written
completion_message = "Preprocessing complete — scaler and feature names saved."
print(completion_message)

Preprocessing complete — scaler and feature names saved.
