In [6]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the raw dataset into a DataFrame.
# heart.csv is expected under ../data relative to the current working directory.
data_path = os.path.join('..', 'data', 'heart.csv')
df = pd.read_csv(filepath_or_buffer=data_path)

# Quick sanity check on the dimensions that were loaded.
print(f"✅ Dataset loaded: {df.shape[0]} rows and {df.shape[1]} columns.")

# Preview of the first five rows (five is head()'s default).
# NOTE(review): a notebook normally renders a bare expression only when it is
# the last statement of the cell, so this line likely displays nothing here.
df.head(n=5)

# Split the data FIRST, then fit the scaler on the training rows only.
# Fitting the scaler on the full dataset (before splitting) leaks the mean and
# standard deviation of the validation/test rows into the training features and
# into the persisted scaler, which makes the evaluation scores optimistic.
# NOTE: `df` itself is left unscaled; only the X_train / X_val / X_test frames
# carry standardized values.

# 1. Define the 5 numerical columns you chose to scale
num_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# 2. Separate features and target
X = df.drop('target', axis=1)
y = df['target']

# 3. Split into Training (80%) and Temporary (20%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the 20% into Validation and Test (10% each)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Work on explicit copies so the column assignments below modify independent
# frames and do not trigger pandas chained-assignment warnings.
X_train = X_train.copy()
X_val = X_val.copy()
X_test = X_test.copy()

# 4. FIT the scaler on the training rows only, then apply the SAME fitted
# statistics to validation and test (transform only, never re-fit).
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_val[num_cols] = scaler.transform(X_val[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# 5. Save the FITTED scaler to your /models folder so inference can reuse
# exactly the training-set mean and standard deviation.
os.makedirs('../models', exist_ok=True)
joblib.dump(scaler, '../models/scaler.pkl')

print("✅ Scaler fitted to numerical columns and saved to models/scaler.pkl")

print("✅ Data Split Complete:")
print(f"Train: {X_train.shape[0]} | Val: {X_val.shape[0]} | Test: {X_test.shape[0]}")

# Make sure the output directory is present before writing anything.
os.makedirs('../data', exist_ok=True)

# Persist every split for Phase 2. Each entry pairs the object to write with
# its destination; the row index is dropped from all of the CSV files.
split_outputs = (
    (X_train, '../data/X_train.csv'),
    (X_val, '../data/X_val.csv'),
    (X_test, '../data/X_test.csv'),
    (y_train, '../data/y_train.csv'),
    (y_val, '../data/y_val.csv'),
    (y_test, '../data/y_test.csv'),
)
for split_data, csv_path in split_outputs:
    split_data.to_csv(csv_path, index=False)

print("✅ All data splits saved to the /data folder.")

✅ Dataset loaded: 1025 rows and 14 columns.
✅ Scaler fitted to numerical columns and saved to models/scaler.pkl
✅ Data Split Complete:
Train: 820 | Val: 102 | Test: 103
✅ All data splits saved to the /data folder.
