In [None]:
# 1. Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 2. Load cleaned data
# The path is relative to the current working directory.
df = pd.read_csv('../data/processed/diabetes_prediction_clean.csv')
print(df.shape)
# A bare `df.head()` is only rendered when it is the last expression of a
# cell; more statements follow here, so print the preview explicitly.
print(df.head())

# 3. Separate features and target
# 'diabetes' is the label column; every other column is kept as a feature.
X = df.drop(columns='diabetes')
y = df['diabetes']

# 4. One-Hot Encoding for categoricals
# drop_first=True drops the first level of each encoded feature.
# dtype=int pins the dummy columns to 0/1 integers: the get_dummies default
# dtype changed from uint8 to bool in pandas 2.0, so without it the saved CSVs
# would contain 0/1 or True/False depending on the installed pandas version.
categorical_cols = ['gender', 'smoking_history']
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True, dtype=int)

# 5. Hold out 20% of the rows for testing.
# stratify=y keeps the class proportions of y in both partitions, and the
# fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# 6. Put the test frame on the training column layout.
# Any column absent from the test frame would be created and filled with 0.
X_test = X_test.reindex(X_train.columns, axis="columns", fill_value=0)

# 7. Standardize the numeric features
# The scaler is fitted on the training rows only, then the same fitted
# scaler is applied to both the training and the test partition.
numeric_cols = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
scaler = StandardScaler().fit(X_train[numeric_cols])
X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# 8. Write the four splits to CSV for the modeling step.
# index=False leaves the row index out of the files.
for _frame, _csv_path in (
    (X_train, '../data/processed/X_train_ml.csv'),
    (X_test, '../data/processed/X_test_ml.csv'),
    (y_train, '../data/processed/y_train_ml.csv'),
    (y_test, '../data/processed/y_test_ml.csv'),
):
    _frame.to_csv(_csv_path, index=False)

print("Done! Data ready for modeling 🚀")


In [None]:
import os

import joblib

# Persist the fitted scaler to disk.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first (a no-op when it is already there).
os.makedirs('../models', exist_ok=True)
joblib.dump(scaler, '../models/scaler.pkl')
print("Scaler salvat!")
