In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error, r2_score
import joblib

In [2]:
DATA_PATH = "student_performance_dataset_with_names_500.csv" 
df = pd.read_csv(DATA_PATH)

In [4]:
df.head()

Unnamed: 0,student_id,name,gender,age,attendance_percentage,math_marks,science_marks,english_marks,study_hours_per_week,homework_completion_rate,parent_education_level,final_score,performance_category
0,1,Aarohi Shetty,Female,14,98.42,64,62,70,9,98.12,Graduate,65.33,Medium
1,2,Sahil Gavande,Male,17,94.69,68,63,90,19,60.43,Graduate,73.67,Medium
2,3,Manish Khan,Female,20,76.76,98,41,44,9,84.0,Primary,61.0,Medium
3,4,Aanya Gavande,Male,19,62.32,82,23,81,13,61.29,Post-Graduate,62.0,Medium
4,5,Krishna Joshi,Female,18,67.62,58,23,25,8,50.89,Secondary,35.33,Low


In [3]:
drop_cols = ["student_id", "name"]
features = df.drop(columns=drop_cols + ["final_score", "performance_category"])
target_clf = df["performance_category"]
target_reg = df["final_score"]

In [5]:
numeric_features = ["age", "attendance_percentage", "math_marks", "science_marks", "english_marks", "study_hours_per_week", "homework_completion_rate"]

In [6]:
categorical_features = [c for c in features.columns if c not in numeric_features]

In [7]:
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

In [8]:
X = df.drop(columns=drop_cols + ["final_score", "performance_category"])
X_numeric = X[numeric_features].copy()
scaler = StandardScaler().fit(X_numeric)
X_numeric_scaled = pd.DataFrame(scaler.transform(X_numeric), columns=numeric_features)

In [9]:
X_categorical = pd.get_dummies(X[categorical_features].astype(str), drop_first=True)

In [10]:
X_prepared = pd.concat([X_numeric_scaled.reset_index(drop=True), X_categorical.reset_index(drop=True)], axis=1)

In [11]:
X_train, X_test, y_train_clf, y_test_clf, y_train_reg, y_test_reg = train_test_split(
    X_prepared, target_clf, target_reg, test_size=0.2, random_state=42, stratify=target_clf
)

In [12]:
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train_clf)

In [14]:
y_pred_clf = clf.predict(X_test)
print("=== Classification Results ===")
print("Accuracy:", accuracy_score(y_test_clf, y_pred_clf))
print("\nClassification report:\n", classification_report(y_test_clf, y_pred_clf))
print("Confusion matrix:\n", confusion_matrix(y_test_clf, y_pred_clf))

=== Classification Results ===
Accuracy: 0.93

Classification report:
               precision    recall  f1-score   support

        High       0.00      0.00      0.00         2
         Low       0.96      0.94      0.95        49
      Medium       0.90      0.96      0.93        49

    accuracy                           0.93       100
   macro avg       0.62      0.63      0.63       100
weighted avg       0.91      0.93      0.92       100

Confusion matrix:
 [[ 0  0  2]
 [ 0 46  3]
 [ 0  2 47]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
reg = RandomForestRegressor(n_estimators=200, random_state=42)
reg.fit(X_train, y_train_reg)

In [16]:
y_pred_reg = reg.predict(X_test)
print("\n=== Regression Results ===")
print("MSE:", mean_squared_error(y_test_reg, y_pred_reg))
print("RMSE:", mean_squared_error(y_test_reg, y_pred_reg, squared=False))
print("R^2:", r2_score(y_test_reg, y_pred_reg))


=== Regression Results ===
MSE: 6.299911679599992
RMSE: 2.509962485695751
R^2: 0.9682292501972731




In [17]:
cv_scores = cross_val_score(clf, X_prepared, target_clf, cv=5, scoring="accuracy")
print("\nClassification CV accuracy (5-fold):", np.round(cv_scores, 3), " mean:", np.round(cv_scores.mean(), 3))


Classification CV accuracy (5-fold): [0.93 0.94 0.89 0.92 0.9 ]  mean: 0.916


In [20]:
cv_scores_reg = cross_val_score(reg, X_prepared, target_reg, cv=5, scoring="r2")
print("Regression CV R^2 (5-fold):", np.round(cv_scores_reg, 3), " mean:", np.round(cv_scores_reg.mean(), 3))

Regression CV R^2 (5-fold): [0.97  0.966 0.944 0.968 0.967]  mean: 0.963


In [21]:
joblib.dump(clf, "student_perf_classifier_rf.joblib")
joblib.dump(reg, "student_perf_regressor_rf.joblib")
joblib.dump(scaler, "scaler.joblib") 
joblib.dump(X_categorical.columns.tolist(), "categorical_dummies_columns.joblib")

['categorical_dummies_columns.joblib']

In [22]:
def prepare_single_sample(sample_dict):
    """
    sample_dict example:
    {
       "age": 16,
       "attendance_percentage": 92.5,
       "math_marks": 78,
       "science_marks": 85,
       "english_marks": 72,
       "study_hours_per_week": 8,
       "homework_completion_rate": 90.0,
       "gender": "Male",
       "parent_education_level": "Graduate"
    }
    """
    # numeric
    num = pd.DataFrame([ {k: sample_dict[k] for k in numeric_features} ])
    num_scaled = pd.DataFrame(scaler.transform(num), columns=numeric_features)
    # categorical
    cat = pd.get_dummies(pd.DataFrame([sample_dict]), drop_first=True)
    # align cat columns to training columns
    cat_cols = joblib.load("categorical_dummies_columns.joblib")
    for c in cat_cols:
        if c not in cat.columns:
            cat[c] = 0
    cat = cat[cat_cols]
    Xs = pd.concat([num_scaled.reset_index(drop=True), cat.reset_index(drop=True)], axis=1)
    return Xs

In [23]:
example = {
   "age": 16,
   "attendance_percentage": 92.5,
   "math_marks": 78,
   "science_marks": 85,
   "english_marks": 72,
   "study_hours_per_week": 8,
   "homework_completion_rate": 90.0,
   "gender": "Male",
   "parent_education_level": "Graduate"
}
Xs = prepare_single_sample(example)
print("\nPrediction example (class):", clf.predict(Xs)[0])
print("Prediction example (final_score estimate):", reg.predict(Xs)[0])


Prediction example (class): Medium
Prediction example (final_score estimate): 81.19225000000002
