# Data Preprocessing & Baseline Models

## Objective
- Clean and preprocess the dataset
- Encode categorical features
- Build baseline ML models for performance and risk prediction


In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [4]:
df = pd.read_csv("/content/student_data.csv")
df.head()


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,part_time_job
0,MS,F,20,R,LE3,A,1,2,at_home,teacher,...,2,1,4,5,2,10,14,15,16,yes
1,MS,M,20,R,LE3,A,0,2,services,health,...,3,5,2,3,1,10,8,6,6,no
2,MS,M,17,R,LE3,T,2,1,teacher,at_home,...,2,4,1,2,3,25,11,7,3,yes
3,GP,F,18,R,LE3,T,3,3,at_home,teacher,...,2,5,4,5,4,6,18,18,17,yes
4,MS,F,18,R,LE3,A,1,0,teacher,teacher,...,3,3,2,3,4,6,5,1,2,no


In [5]:
df.isnull().sum()


Unnamed: 0,0
school,0
sex,0
age,0
address,0
famsize,0
Pstatus,0
Medu,0
Fedu,0
Mjob,0
Fjob,0


In [6]:
df_model = df.drop(columns=["G1", "G2"])


In [7]:
cat_cols = df_model.select_dtypes(include="object").columns
cat_cols


Index(['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob',
       'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities',
       'nursery', 'higher', 'internet', 'romantic', 'part_time_job'],
      dtype='object')

In [8]:
le = LabelEncoder()

for col in cat_cols:
    df_model[col] = le.fit_transform(df_model[col])


In [9]:
def risk_label(g3):
    if g3 < 10:
        return 0  # High Risk
    elif g3 < 14:
        return 1  # Medium Risk
    else:
        return 2  # Low Risk

df_model["risk"] = df_model["G3"].apply(risk_label)


In [10]:
X = df_model.drop(columns=["G3", "risk"])
y = df_model["risk"]


In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [12]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.40500390930414387
              precision    recall  f1-score   support

           0       0.41      0.72      0.52       508
           1       0.35      0.02      0.04       307
           2       0.39      0.32      0.35       464

    accuracy                           0.41      1279
   macro avg       0.39      0.35      0.30      1279
weighted avg       0.39      0.41      0.34      1279



In [13]:
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.40265832681782643
              precision    recall  f1-score   support

           0       0.41      0.65      0.50       508
           1       0.29      0.05      0.08       307
           2       0.40      0.37      0.39       464

    accuracy                           0.40      1279
   macro avg       0.37      0.35      0.32      1279
weighted avg       0.38      0.40      0.36      1279



## Initial Model Observations

- Random Forest outperforms Logistic Regression
- Risk-based classification is interpretable for teachers
- Model is suitable for explainable ML techniques
- Baseline models provide a strong foundation for improvements
