In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the raw student-performance dataset into a DataFrame. The path is
# relative, so it is resolved against the notebook's working directory.
RAW_DATA_CSV = "../data/raw/student_performance_500.csv"
df = pd.read_csv(RAW_DATA_CSV)


In [7]:
# Encode the two categorical feature columns as integer codes. Each column
# gets its own fitted LabelEncoder, kept in `feature_encoders`, so the
# category -> code mapping of every feature stays available (e.g. to encode
# new raw inputs the same way). Refitting a single shared encoder would
# silently discard the mappings of the earlier columns.
#
# NOTE: the columns of `df` are overwritten in place; re-running this cell
# without reloading `df` refits the encoders on the already-encoded integers.
feature_encoders = {}
for column in ("extracurricular", "internet"):
    feature_encoders[column] = LabelEncoder()
    df[column] = feature_encoders[column].fit_transform(df[column])

# `encoder` is fitted on the target column only, so
# `encoder.inverse_transform` maps predicted class codes back to the
# original performance labels.
encoder = LabelEncoder()
df["performance"] = encoder.fit_transform(df["performance"])


In [8]:
# Target vector: the "performance" column.
y = df["performance"]
# Feature matrix: every remaining column of df.
X = df.drop(columns=["performance"])


In [9]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both subsets, and the fixed seed makes the split
# reproducible.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [10]:
# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transformation to both splits so that no
# test-set statistics leak into the scaling.
# NOTE: X_train / X_test are overwritten with the scaled arrays.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a logistic-regression classifier on the training split (solver capped
# at 500 iterations) and predict the held-out test split.
log_reg = LogisticRegression(max_iter=500).fit(X_train, y_train)
log_pred = log_reg.predict(X_test)

# Report overall accuracy followed by per-class precision / recall / F1.
lr_accuracy = accuracy_score(y_test, log_pred)
lr_report = classification_report(y_test, log_pred)
print("LR Accuracy:", lr_accuracy)
print(lr_report)


LR Accuracy: 0.95
              precision    recall  f1-score   support

           0       0.97      0.91      0.94        34
           1       1.00      0.97      0.98        33
           2       0.89      0.97      0.93        33

    accuracy                           0.95       100
   macro avg       0.95      0.95      0.95       100
weighted avg       0.95      0.95      0.95       100



In [12]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest hyperparameters, gathered in one place; the fixed
# random_state makes the fitted forest reproducible.
rf_params = {
    "n_estimators": 120,
    "max_depth": 7,
    "min_samples_leaf": 2,
    "random_state": 42,
}

# Fit on the training split and predict the held-out test split.
rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# Report overall accuracy followed by per-class precision / recall / F1.
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_report = classification_report(y_test, rf_pred)
print("RF Accuracy:", rf_accuracy)
print(rf_report)


RF Accuracy: 0.77
              precision    recall  f1-score   support

           0       0.90      0.79      0.84        34
           1       0.76      0.88      0.82        33
           2       0.66      0.64      0.65        33

    accuracy                           0.77       100
   macro avg       0.77      0.77      0.77       100
weighted avg       0.77      0.77      0.77       100



In [13]:
from pathlib import Path

import joblib

# Make sure the output directory exists before writing: joblib.dump does not
# create missing parent directories and would otherwise fail with
# FileNotFoundError (e.g. on a fresh checkout without a models/ folder).
Path("../models").mkdir(parents=True, exist_ok=True)

# Save model (the logistic-regression classifier)
joblib.dump(log_reg, "../models/best_model.joblib")

# Save scaler so new inputs can be transformed the same way as the training data
joblib.dump(scaler, "../models/scaler.joblib")

# Save label encoder for decoding prediction later
joblib.dump(encoder, "../models/encoder.joblib")

print("Model, scaler, and encoder saved successfully!")


Model, scaler, and encoder saved successfully!
