In [1]:
# ======================================================
# ACADEMIC EARLY WARNING SYSTEM
# Predicting Student Risk Levels (Low / Medium / High)
# ======================================================


In [2]:

# -------------------------------
# 1️ LIBRARIES
# -------------------------------
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.calibration import CalibratedClassifierCV

from xgboost import XGBClassifier

In [3]:

# ---------------------------------------------------------------
# Step 2 - Load data
# Reads the student records CSV into `df`, the raw frame every later
# step works from.
# NOTE(review): the path is absolute (looks like a Colab `/content`
# location) — adjust it when running elsewhere.
# ---------------------------------------------------------------
df = pd.read_csv(
    filepath_or_buffer="/content/sample_data/student_records.csv",
)

In [4]:

# ---------------------------------------------------------------
# Step 3 - Feature selection
# FEATURES fixes both the set and the order of the model's input
# columns. It must stay a list: it is used directly as a column
# selector and as the column labels of the scaled frames.
# ---------------------------------------------------------------
FEATURES = [
    "mid1_exam_30", "mid2_exam_30",
    "attendance_pct_100",
    "prev_year_sgpa_10",
    "backlogs",
]

# Model inputs: the FEATURES columns of the raw frame, in list order.
X = df[FEATURES]


In [5]:

# ---------------------------------------------------------------
# Step 4 - Risk score engineering (target)
#
# Four rule-based flags are combined into a weighted score:
#   attendance_pct_100 below 65          -> 35 points
#   backlogs of 2 or more                -> 30 points
#   prev_year_sgpa_10 below 6.5          -> 20 points
#   mean of the two mid exams below 15   -> 15 points
#
# The weights are held as integer points (summing to 100) so that the
# threshold test is exact. With float weights the boundary cases
# 0.35 + 0.15 and 0.30 + 0.20 only equal 0.5 because binary rounding
# happens to land there; a small change to the weights could silently
# flip those rows.
#
# NOTE(review): the label is a deterministic function of the columns in
# FEATURES, so a model trained on those columns can reproduce the rule
# almost perfectly. Evaluation scores then measure how well the rule was
# recovered, not real student outcomes — confirm this is intended.
# NOTE(review): comparisons against NaN evaluate to False, so a missing
# value never raises a flag — verify the data has no gaps.
# ---------------------------------------------------------------
mid_exam_mean = (df["mid1_exam_30"] + df["mid2_exam_30"]) / 2

risk_points = (
    35 * (df["attendance_pct_100"] < 65).astype(int)
    + 30 * (df["backlogs"] >= 2).astype(int)
    + 20 * (df["prev_year_sgpa_10"] < 6.5).astype(int)
    + 15 * (mid_exam_mean < 15).astype(int)
)

# Risk score between 0 and 1
risk_score = risk_points / 100

# Convert to binary HIGH RISK label: at least 50 of the 100 points.
df["high_risk"] = (risk_points >= 50).astype(int)

y = df["high_risk"]


In [6]:

# ---------------------------------------------------------------
# Step 5 - Train / test split
# Holds out 20% of the rows for testing. Stratifying on y keeps the
# class proportions approximately equal in both parts, and the fixed
# random_state makes the split repeatable.
# ---------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [7]:

# ---------------------------------------------------------------
# Step 6 - Scaling
# The scaler is fitted on the training rows only and then applied to
# both splits, so no test-set statistics enter the transform.
# ---------------------------------------------------------------
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
# ---------------------------------------------------------------
# Step 6.1 - Save normalized dataset
# ---------------------------------------------------------------

# Re-wrap the scaled arrays as DataFrames, restoring the feature names
# and the original row labels so the target series aligns on concat.
X_train_scaled_df = pd.DataFrame(data=X_train_scaled, index=X_train.index, columns=FEATURES)
X_test_scaled_df = pd.DataFrame(data=X_test_scaled, index=X_test.index, columns=FEATURES)

# Append the target as the last column of each split.
train_normalized = pd.concat((X_train_scaled_df, y_train), axis="columns")
test_normalized = pd.concat((X_test_scaled_df, y_test), axis="columns")

# Write both splits to the working directory; row labels are not written.
for frame, csv_name in (
    (train_normalized, "train_normalized.csv"),
    (test_normalized, "test_normalized.csv"),
):
    frame.to_csv(csv_name, index=False)

print(" Normalized train & test datasets saved")


 Normalized train & test datasets saved


In [9]:


# ---------------------------------------------------------------
# Step 7 - XGBoost classifier
# scale_pos_weight is set to (count of label 0) / (count of label 1)
# in the training labels, which up-weights the rarer positive class.
# NOTE(review): the calibration step wraps this estimator with cv=5,
# which presumably refits its own clones per fold; confirm whether the
# standalone fit below is still needed.
# ---------------------------------------------------------------
label_counts = y_train.value_counts()
neg_to_pos_ratio = label_counts[0] / label_counts[1]

xgb_params = dict(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=neg_to_pos_ratio,
    eval_metric="logloss",
    random_state=42,
)
base_model = XGBClassifier(**xgb_params)

base_model.fit(X_train_scaled, y_train)

In [10]:



# ---------------------------------------------------------------
# Step 8 - Probability calibration
# Wraps the XGBoost estimator in sigmoid calibration with 5-fold
# cross-validation on the scaled training data.
# ---------------------------------------------------------------
calibrated_model = CalibratedClassifierCV(base_model, cv=5, method="sigmoid")

calibrated_model.fit(X_train_scaled, y_train)

In [11]:



# ---------------------------------------------------------------
# Step 9 - Evaluation
# Scores the calibrated model on the held-out split: hard labels feed
# the confusion matrix and report, positive-class probabilities feed
# ROC-AUC.
# ---------------------------------------------------------------
y_prob = calibrated_model.predict_proba(X_test_scaled)[:, 1]
y_pred = calibrated_model.predict(X_test_scaled)

test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_prob)

print("Confusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)
print("ROC-AUC:", test_auc)

Confusion Matrix:
 [[582   1]
 [  2 215]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       583
           1       1.00      0.99      0.99       217

    accuracy                           1.00       800
   macro avg       1.00      0.99      1.00       800
weighted avg       1.00      1.00      1.00       800

ROC-AUC: 0.99991305103904


In [12]:


# ---------------------------------------------------------------
# Step 10 - Save model
# Persists the three objects needed at inference time: the calibrated
# classifier, the fitted scaler, and the ordered feature list.
# ---------------------------------------------------------------
artifacts = {
    "academic_risk_model.pkl": calibrated_model,
    "scaler.pkl": scaler,
    "model_features.pkl": FEATURES,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("\nAcademic Early Warning System saved successfully.")





In [13]:
# NOTE(review): this whole cell is a single triple-quoted string, so nothing
# in it runs — no artifacts are loaded and predict_student_risk is never
# defined. If it is re-enabled, review these points first:
#   * `shap` is used here but is not imported in the notebook's import cell.
#   * BACKGROUND is one all-zero row in *scaled* space; with a StandardScaler
#     that presumably corresponds to the training mean of every feature —
#     confirm this is the intended SHAP baseline.
#   * `decreases_risk` is filled in the loop but never read afterwards.
#   * The output text speaks of "failing", while the saved model predicts the
#     engineered `high_risk` label.
'''import joblib
import pandas as pd
import shap
import numpy as np

# ======================================================
# LOAD ARTIFACTS (ONCE)
# ======================================================
MODEL = joblib.load("academic_risk_model.pkl")
SCALER = joblib.load("scaler.pkl")
FEATURES = joblib.load("model_features.pkl")

# SHAP background (safe minimal background)
BACKGROUND = np.zeros((1, len(FEATURES)))
EXPLAINER = shap.Explainer(MODEL.predict_proba, BACKGROUND)

# ======================================================
# INFERENCE + NATURAL LANGUAGE EXPLANATION
# ======================================================
def predict_student_risk(student_dict):
    """
    Input:
        student_dict: dict with keys matching FEATURES

    Output:
        dict with probability + explanation text
    """

    # -----------------------------
    # 1. Prepare input
    # -----------------------------
    df = pd.DataFrame([student_dict])[FEATURES]
    scaled = SCALER.transform(df)

    # -----------------------------
    # 2. Predict probability
    # -----------------------------
    prob = MODEL.predict_proba(scaled)[0][1] * 100

    # -----------------------------
    # 3. SHAP explanation
    # -----------------------------
    shap_values = EXPLAINER(scaled)

    increases_risk = []
    decreases_risk = []

    for i, feature in enumerate(FEATURES):
        contribution = shap_values.values[0][i][1]
        if contribution > 0:
            increases_risk.append(feature)
        elif contribution < 0:
            decreases_risk.append(feature)

    # -----------------------------
    # 4. Construct explanation text
    # -----------------------------
    if increases_risk:
        explanation = (
            f"{', '.join(increases_risk)} "
            f"{'increase' if len(increases_risk) > 1 else 'increases'} "
            "the risk of failing."
        )
    else:
        explanation = "No risk factors detected."

    # -----------------------------
    # 5. Final output (same style as before)
    # -----------------------------
    return {
        "probability": f"Predicted probability of failing: {prob:.2f}%",
        "explanation_text": explanation
    }'''


In [14]:
# NOTE(review): disabled example call, kept as a string literal. It relies on
# predict_student_risk, which is itself only present inside a string literal
# in the preceding cell, so neither runs as written.
'''student = {
    "mid1_exam_30": 10,
    "mid2_exam_30": 5,
    "attendance_pct_100": 42,
    "prev_year_sgpa_10": 4.4,
    "backlogs": 0
}

result = predict_student_risk(student)
print(result)'''


{'probability': 'Predicted probability of failing: 99.02%', 'explanation_text': 'mid1_exam_30, mid2_exam_30, attendance_pct_100, prev_year_sgpa_10 increase the risk of failing.'}
