In [11]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load cleaned data from EDA
FEATURES_PATH = Path("../data/processed/student_data_eda_cleaned.csv")
df = pd.read_csv(FEATURES_PATH)

# Convert blank spaces to NaN
df = df.replace(r'^\s*$', pd.NA, regex=True)

# Fill missing values
df = df.fillna({
    'attendance_percent': df['attendance_percent'].mean(),
    'test_score': df['test_score'].mean(),
    'discipline_count': 0, 
    'parental_involvement': df['parental_involvement'].mode()[0]
})

# Quick check
print(df.isna().sum()) 


attendance_percent      0
test_score              0
discipline_count        0
parental_involvement    0
dropout_risk            0
dtype: int64


In [12]:

target_col = 'dropout_risk'
X = df.drop(columns=[target_col])
y = df[target_col]


In [13]:
X = pd.get_dummies(X, drop_first=True)
print("Feature matrix shape after encoding:", X.shape)


Feature matrix shape after encoding: (600, 4)


In [14]:
scaler = StandardScaler()
numeric_cols = X.select_dtypes(include='number').columns
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])


In [15]:
PROCESSED_PATH = Path("../data/processed/student_features.csv")
PROCESSED_PATH.parent.mkdir(parents=True, exist_ok=True)

processed_df = pd.concat([X, y], axis=1)
processed_df.to_csv(PROCESSED_PATH, index=False)

print(f"✅ Processed dataset saved to {PROCESSED_PATH}")


✅ Processed dataset saved to ..\data\processed\student_features.csv


In [16]:
# -------------------------------
# STEP 8: Train/Test split
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

train_df = pd.concat([X_train, y_train], axis=1)
test_df  = pd.concat([X_test, y_test], axis=1)

train_path = Path("../data/processed/train.csv")
test_path  = Path("../data/processed/test.csv")

train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

print(f"✅ Train data saved to {train_path}")
print(f"✅ Test data saved to {test_path}")


✅ Train data saved to ..\data\processed\train.csv
✅ Test data saved to ..\data\processed\test.csv


In [17]:

print("Train set:", train_df.shape)
print("Test set:", test_df.shape)

train_df.head()


Train set: (480, 5)
Test set: (120, 5)


Unnamed: 0,attendance_percent,test_score,discipline_count,parental_involvement,dropout_risk
280,0.195251,2.351161,0.099109,1.342883,0
340,-0.86329,-1.411878,-0.5335,0.623481,1
571,1.763691,0.214176,-0.5335,1.342883,0
198,0.132684,-0.049152,0.731717,1.342883,1
518,0.179204,0.130772,-0.5335,-1.534724,0
