In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split


In [2]:
# Load the raw dataset into `df`; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv('StudentsPerformance.csv')

In [3]:
# Per-row mean of the three exam columns, stored as a new 'average_score' column.
df['average_score'] = (
    df['math score']
    .add(df['reading score'])
    .add(df['writing score'])
    .div(3)
)

In [4]:
# Bucket the average score into four ordered performance bands:
#   0 -> [0, 40], 1 -> (40, 60], 2 -> (60, 80], 3 -> (80, 100]
# pd.cut bins are right-closed and, by default, open on the far left, so an
# average of exactly 0 would match no bin and become NaN. include_lowest=True
# closes the first bin on the left so that row is labelled 0 instead.
df['performance'] = pd.cut(
    df['average_score'],
    bins=[0, 40, 60, 80, 100],
    labels=[0, 1, 2, 3],
    include_lowest=True,
)

In [5]:
# Binary indicator columns derived from two categorical fields.
# A vectorized equality check replaces the row-wise `.apply(lambda ...)`:
# values equal to the target string map to 1, everything else (including
# missing values) maps to 0, exactly as before.
df['test_prep_completed'] = df['test preparation course'].eq('completed').astype('int64')
df['standard_lunch'] = df['lunch'].eq('standard').astype('int64')

In [6]:
# Integer-encode each listed column in place and keep the fitted encoder,
# keyed by the original column name, in `label_encoders`.
# NOTE(review): the columns of `df` are overwritten, so re-running this cell
# re-fits the encoders on the already-encoded integers rather than on the
# original string categories.
categorical_cols = ['gender', 'race/ethnicity', 'parental level of education']
label_encoders = {}

for column_name in categorical_cols:
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(df[column_name])
    df[column_name] = encoded_values
    label_encoders[column_name] = encoder

In [7]:
# Model inputs: the three label-encoded categorical columns plus the two
# binary indicator columns.
features = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'test_prep_completed',
    'standard_lunch',
]

# Target: the binned performance label. Inputs: the feature columns only.
y = df['performance']
X = df[features]

In [8]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions similar in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [9]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test rows play no part in the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
import joblib
import os

In [11]:
# Create the shared data folders one level above the working directory;
# exist_ok=True makes this a no-op when they are already present.
# NOTE(review): the save cells further down write to 'interim/' and
# 'processed/' relative to the working directory, not to these paths --
# confirm which location is intended.
for data_dir in ('../data/processed', '../data/interim'):
    os.makedirs(data_dir, exist_ok=True)

In [19]:
import os

# Make sure the local output folders are present; exist_ok=True avoids an
# error when the cell is re-run.
for output_dir in ('interim', 'processed'):
    os.makedirs(output_dir, exist_ok=True)

In [21]:
# Persist the fitted scaler to disk. The call's return value (the list of
# written file names) is the last expression and is shown as the cell output.
joblib.dump(scaler, 'interim/scaler.pkl')

['interim/scaler.pkl']

In [24]:
# Save every fitted encoder to its own pickle file. '/' and ' ' in the column
# name are replaced with '_' so the name can be used as a file name.
filename_safe = str.maketrans({'/': '_', ' ': '_'})
for column_name, encoder in label_encoders.items():
    joblib.dump(encoder, f'interim/{column_name.translate(filename_safe)}_encoder.pkl')

In [25]:
# Write the four split artifacts as CSV without the index column. The scaled
# arrays are wrapped in DataFrames first so the files carry the feature names
# as headers.
X_train_out = pd.DataFrame(X_train_scaled, columns=features)
X_test_out = pd.DataFrame(X_test_scaled, columns=features)

split_outputs = [
    (X_train_out, 'processed/X_train.csv'),
    (X_test_out, 'processed/X_test.csv'),
    (y_train, 'processed/y_train.csv'),
    (y_test, 'processed/y_test.csv'),
]
for split_data, csv_path in split_outputs:
    split_data.to_csv(csv_path, index=False)