In [1]:
# --------------------------------------------------
# Import required libraries for data manipulation,
# numerical computation, and machine learning
# preprocessing tasks
# --------------------------------------------------
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [2]:
# --------------------------------------------------
# Read the processed student CSV into `df` (the
# path is relative, so it resolves against the
# notebook's working directory), then preview the
# top five rows as a quick look at the columns
# --------------------------------------------------
df = pd.read_csv(
    filepath_or_buffer="../data/processed/processed_student_data.csv",
)
df.head(n=5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,pass
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,1,1,3,6,5,6,6,0
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,3,1,1,3,4,5,5,6,0
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,3,2,2,3,3,10,7,8,10,1
3,GP,F,15,U,GT3,T,4,2,health,services,...,2,2,1,1,5,2,15,14,15,1
4,GP,F,16,U,GT3,T,3,3,other,other,...,3,2,1,2,5,4,6,10,10,1


In [3]:
# --------------------------------------------------
# Examine the dataset dimensions and inspect
# column data types and non-null counts
# --------------------------------------------------
# A bare `df.shape` is only rendered when it is the last expression of a
# cell; here `df.info()` follows it, so the shape has to be printed
# explicitly or it is silently discarded.
print(df.shape)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 34 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

In [5]:
# --------------------------------------------------
# Build the modelling inputs: `y` is the "pass"
# column, and `X` is every other column of `df`
# except "pass" and "G3"
# NOTE(review): "G3" is presumably removed because
# "pass" is derived from it -- confirm upstream
# --------------------------------------------------
y = df["pass"]
X = df.drop(["pass", "G3"], axis=1)

In [None]:
# --------------------------------------------------
# Partition the feature columns of X by dtype:
# int64/float64 columns are treated as numerical,
# object columns as categorical. The final line
# displays both column indexes for inspection
# --------------------------------------------------
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

(categorical_features, numerical_features)


(Index(['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob',
        'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities',
        'nursery', 'higher', 'internet', 'romantic'],
       dtype='object'),
 Index(['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel',
        'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2'],
       dtype='object'))

In [7]:
# --------------------------------------------------
# One-hot encode the categorical columns of X.
# drop_first=True omits the first level of every
# encoded column. The result is stored in
# `X_encoded` and its first five rows are shown
# --------------------------------------------------
X_encoded = pd.get_dummies(
    data=X,
    columns=categorical_features,
    drop_first=True,
)
X_encoded.head(n=5)


Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,guardian_mother,guardian_other,schoolsup_yes,famsup_yes,paid_yes,activities_yes,nursery_yes,higher_yes,internet_yes,romantic_yes
0,18,4,4,2,2,0,4,3,4,1,...,True,False,True,False,False,False,True,True,False,False
1,17,1,1,1,2,0,5,3,3,1,...,False,False,False,True,False,False,False,True,True,False
2,15,1,1,1,2,3,4,3,2,2,...,True,False,True,False,True,False,True,True,True,False
3,15,4,2,1,3,0,3,2,2,1,...,True,False,False,True,True,True,True,True,True,True
4,16,3,3,1,2,0,4,3,2,1,...,False,False,False,True,True,False,True,True,False,False


In [8]:
# --------------------------------------------------
# Scale numerical features to have zero mean
# and unit variance using standardization
# --------------------------------------------------
scaler = StandardScaler()
X_encoded[numerical_features] = scaler.fit_transform(
    X_encoded[numerical_features]
)

In [9]:
# --------------------------------------------------
# Split the dataset into training and testing
# sets while preserving class distribution
# --------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [10]:
# --------------------------------------------------
# Sanity-check the split: display the shapes of the
# train/test feature frames and target series, in
# the order X_train, X_test, y_train, y_test
# --------------------------------------------------
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


((316, 41), (79, 41), (316,), (79,))

In [11]:
# --------------------------------------------------
# Persist the four split objects as CSV files in
# ../data/processed/ so later steps can reload
# them. index=False leaves the row index out of
# each file
# --------------------------------------------------
for _split, _csv_path in (
    (X_train, "../data/processed/X_train.csv"),
    (X_test, "../data/processed/X_test.csv"),
    (y_train, "../data/processed/y_train.csv"),
    (y_test, "../data/processed/y_test.csv"),
):
    _split.to_csv(_csv_path, index=False)

In [12]:
# --------------------------------------------------
# Report what this notebook did: dataset size, the
# target definition, how features were processed,
# the train/test split, and the files written
# --------------------------------------------------
print("PREPROCESSING SUMMARY")
print("-" * 50)

# Row count of the loaded frame, and its column count minus one.
print(f"Total samples           : {len(df)}")
print(f"Total features (raw)    : {len(df.columns) - 1}")

print("\nTarget Variable:")
print(" - Name                : pass")
print(" - Type                : Binary classification (0 = Fail, 1 = Pass)")

# Counts come from the column indexes built when the dtypes were inspected.
print("\nFeature Processing:")
print(f" - Categorical features encoded : {len(categorical_features)}")
print(f" - Numerical features scaled    : {len(numerical_features)}")
print(" - Encoding method              : One-Hot Encoding")
print(" - Scaling method               : StandardScaler")

# Row counts of the two feature frames produced by the split.
print("\nTrain-Test Split:")
print(f" - Training samples : {len(X_train)}")
print(f" - Testing samples  : {len(X_test)}")
print(" - Test size        : 20%")
print(" - Stratified split: Yes")

print("\nOutput Files Saved:")
for _saved_line in (
    " - X_train.csv",
    " - X_test.csv",
    " - y_train.csv",
    " - y_test.csv",
):
    print(_saved_line)

print("\nStatus: Dataset is fully preprocessed and ready for model training.")


PREPROCESSING SUMMARY
--------------------------------------------------
Total samples           : 395
Total features (raw)    : 33

Target Variable:
 - Name                : pass
 - Type                : Binary classification (0 = Fail, 1 = Pass)

Feature Processing:
 - Categorical features encoded : 17
 - Numerical features scaled    : 15
 - Encoding method              : One-Hot Encoding
 - Scaling method               : StandardScaler

Train-Test Split:
 - Training samples : 316
 - Testing samples  : 79
 - Test size        : 20%
 - Stratified split: Yes

Output Files Saved:
 - X_train.csv
 - X_test.csv
 - y_train.csv
 - y_test.csv

Status: Dataset is fully preprocessed and ready for model training.
