In [153]:

import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier  


In [154]:
df = pd.read_csv("StudentsPerformance.csv")
df.head()


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [155]:
df['avg_score'] = df[['math score', 'reading score', 'writing score']].mean(axis=1)
df['result'] = (df['avg_score'] >= 50).astype(int)   # 1 => Pass, 0 => Fail

print(df['result'].value_counts(normalize=True))
df[['math score','reading score','writing score','avg_score','result']].head()


result
1    0.897
0    0.103
Name: proportion, dtype: float64


Unnamed: 0,math score,reading score,writing score,avg_score,result
0,72,72,74,72.666667,1
1,69,90,88,82.333333,1
2,90,95,93,92.666667,1
3,47,57,44,49.333333,0
4,76,78,75,76.333333,1


In [156]:
X = df[[
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course'
]]

y = df['result']

print("X shape:", X.shape, "y shape:", y.shape)
X.head()


X shape: (1000, 5) y shape: (1000,)


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course
0,female,group B,bachelor's degree,standard,none
1,female,group C,some college,standard,completed
2,female,group B,master's degree,standard,none
3,male,group A,associate's degree,free/reduced,none
4,male,group C,some college,standard,none


In [157]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("Train:", X_train.shape, "Test:", X_test.shape)


Train: (800, 5) Test: (200, 5)


In [158]:

categorical_features = X_train.columns.tolist()

cat_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)


preprocessor = ColumnTransformer(
    transformers=[
        ("cat", cat_transformer, categorical_features)
    ],
    remainder='drop',  
    sparse_threshold=0
)


In [159]:

def evaluate_model(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    return acc, prec, rec, f1


In [160]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(max_depth=6, criterion='entropy', random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=150, max_depth=8, random_state=42),
    "K Nearest Neighbors": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42)
}


In [161]:

results = []
pipeline_store = {}

for name, estimator in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', estimator)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    acc, prec, rec, f1 = evaluate_model(y_test, y_pred)
    print(f"\n{name}")
    print("-"*40)
    print(f"Accuracy: {acc:.4f}  Precision: {prec:.4f}  Recall: {rec:.4f}  F1: {f1:.4f}")
    print(classification_report(y_test, y_pred, zero_division=0))

    results.append((name, acc, prec, rec, f1))
    pipeline_store[name] = pipeline



Logistic Regression
----------------------------------------
Accuracy: 0.8950  Precision: 0.8950  Recall: 1.0000  F1: 0.9446
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        21
           1       0.90      1.00      0.94       179

    accuracy                           0.90       200
   macro avg       0.45      0.50      0.47       200
weighted avg       0.80      0.90      0.85       200


Decision Tree
----------------------------------------
Accuracy: 0.8700  Precision: 0.8964  Recall: 0.9665  F1: 0.9301
              precision    recall  f1-score   support

           0       0.14      0.05      0.07        21
           1       0.90      0.97      0.93       179

    accuracy                           0.87       200
   macro avg       0.52      0.51      0.50       200
weighted avg       0.82      0.87      0.84       200


Random Forest
----------------------------------------
Accuracy: 0.8800  Precision: 0.8974  Recall:

In [162]:

results_df = pd.DataFrame(results, columns=['model','accuracy','precision','recall','f1'])
results_df = results_df.sort_values('accuracy', ascending=False).reset_index(drop=True)
results_df


Unnamed: 0,model,accuracy,precision,recall,f1
0,Logistic Regression,0.895,0.895,1.0,0.944591
1,AdaBoost,0.89,0.894472,0.994413,0.941799
2,XGBoost,0.88,0.905759,0.96648,0.935135
3,Random Forest,0.88,0.897436,0.977654,0.935829
4,Decision Tree,0.87,0.896373,0.96648,0.930108
5,K Nearest Neighbors,0.865,0.9,0.955307,0.926829


In [163]:

best_model_name = results_df.loc[0,'model']
best_pipeline = pipeline_store[best_model_name]

print("Best model:", best_model_name)
with open('student_model.pkl', 'wb') as f:
    pickle.dump(best_pipeline, f)
print("Saved full pipeline as 'student_model.pkl'.")


Best model: Logistic Regression
Saved full pipeline as 'student_model.pkl'.


In [164]:
with open('student_model.pkl','rb') as f:
    loaded_pipeline = pickle.load(f)


sample = pd.DataFrame([{
    'gender': 'female',
    'race/ethnicity': 'group B',
    'parental level of education': "bachelor's degree",
    'lunch': 'standard',
    'test preparation course': 'completed'
}])

print("Model input shape after transformer:", loaded_pipeline.named_steps['preprocessor'].transform(sample).shape)
print("Predicted label:", loaded_pipeline.predict(sample)[0])


Model input shape after transformer: (1, 17)
Predicted label: 1
