In [95]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report
)

In [77]:
#Global Variables
csv_path = "StudentPerformanceFactors.csv"
target = "Pass"

In [8]:
#Load data

if not os.path.exists(csv_path):
    raise FileNotFoundError("Path/File not found")
else: 
    df = pd.read_csv(csv_path)


In [13]:
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [17]:
#Cleaning data

df = df.drop_duplicates().copy()
for col in df.select_dtypes(include = ["object"]).columns:
        df[col] = df[col].astype(str).str.strip()
df = df.dropna(subset = ["Exam_Score"])

In [53]:
#Creating target column

df[target] = [0 if c<60 else 1 for c in df["Exam_Score"]]

In [55]:
#Proportion of passing students
passed = df[target].sum()
print("% of Students Passing:", (passed/6606)*100)

% of Students Passing: 98.98577051165607


In [82]:
#Split data
X = df.drop(columns = [target])
y = df[target]

features = X.columns

cat_cols = [c for c in features if df[c].dtype == "object"]
num_cols = [c for c in features if df[c].dtype == "int64"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

In [91]:


def evaluate_clf(y_true, y_pred, y_proba = None, label = "Model"):

    print(f"\n=== {label} ===")
    print(f"Accuracy : {accuracy_score(y_true, y_pred):.4f}")          # fraction of correct predictions
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")         # of predicted 1s, how many are actually 1
    print(f"Recall   : {recall_score(y_true, y_pred):.4f}")            # of actual 1s, how many we caught
    print(f"F1       : {f1_score(y_true, y_pred):.4f}")                # harmonic mean of precision & recall
    if y_proba is not None:
        print(f"ROC-AUC : {roc_auc_score(y_true, y_proba):.4f}")       # probability threshold-independent metric
    print("\nConfusion Matrix:\n", confusion_matrix(y_true, y_pred))   # [[TN, FP],[FN, TP]]
    print("\nClassification Report:\n", classification_report(y_true, y_pred, digits=4))




In [83]:
#Pre processor

cat_pipe = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown = "ignore", sparse_output = False))
])

num_pipe = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("scalar", StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers = [
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols)
    ],
    remainder = "drop",
    verbose_feature_names_out = False
)

In [92]:
pipe = Pipeline([
    ("preprocess", preprocessor),
    ("selectkbest", SelectKBest(score_func=f_classif)),
    ("clf", LogisticRegression(class_weight="balanced"))
])

param_grid = {
    "selectkbest__k": [5, 10, 15, 20],  # must match step name
    "clf__C": [0.01, 0.1, 1, 10]
}

clf_model = GridSearchCV(pipe, param_grid, cv=5, scoring="f1")
clf_model.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best score:", grid.best_score_)


Best parameters: {'clf__C': 10, 'selectkbest__k': 10}
Best score: 0.9987552930775371


In [100]:
def majority_baseline(y_train, n_pred):
    majority = pd.Series(y_train).mode()[0]

    return np.full(shape = n_pred, fill_value = majority, dtype = int)

y_pred_base = majority_baseline(y_train, n_pred = len(y_test))
evaluate_clf(y_test, y_pred_base, label = "Baseline (Majority Class)")



=== Baseline (Majority Class) ===
Accuracy : 0.9894
Precision: 0.9894
Recall   : 1.0000
F1       : 0.9947

Confusion Matrix:
 [[   0   14]
 [   0 1308]]

Classification Report:
               precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000        14
           1     0.9894    1.0000    0.9947      1308

    accuracy                         0.9894      1322
   macro avg     0.4947    0.5000    0.4973      1322
weighted avg     0.9789    0.9894    0.9841      1322



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [98]:
y_pred = clf_model.predict(X_test)
evaluate_clf(y_test, y_pred, label = "Evaluate Logistic Regression")


=== Evaluate Logistic Regression ===
Accuracy : 0.9955
Precision: 1.0000
Recall   : 0.9954
F1       : 0.9977

Confusion Matrix:
 [[  14    0]
 [   6 1302]]

Classification Report:
               precision    recall  f1-score   support

           0     0.7000    1.0000    0.8235        14
           1     1.0000    0.9954    0.9977      1308

    accuracy                         0.9955      1322
   macro avg     0.8500    0.9977    0.9106      1322
weighted avg     0.9968    0.9955    0.9959      1322

