In [None]:
#classify the test_recs.csv split
from predict_classification import load_and_classify
import pandas as pd

# Read the hold-out split (Student_id, coping mechanisms, features, ...).
test_recs = pd.read_csv("test_recs.csv")

# Run the classification pipeline on that SAME CSV.
preds = load_and_classify("test_recs.csv")
# preds has columns: pred_int, pred_label, P_low, P_med, P_high

# The two frames are glued together by row position below, so they must have
# the same number of rows; pd.concat(axis=1) would otherwise pad the shorter
# one with NaN without any warning.
# NOTE(review): this also assumes load_and_classify keeps the CSV's row
# order -- confirm against predict_classification.
if len(preds) != len(test_recs):
    raise ValueError(
        f"load_and_classify returned {len(preds)} rows but test_recs.csv has "
        f"{len(test_recs)}; predictions cannot be aligned by position"
    )

# Attach Student_id and the coping mechanisms so the recommendation stage can
# join the predictions back to each student.
test_with_preds = pd.concat([
    test_recs[["Student_id","Stress Coping Mechanisms"]].reset_index(drop=True),
    preds.reset_index(drop=True)
], axis=1)

# Write out for Stage 2.
test_with_preds.to_csv("predictions.csv", index=False)
print("✅ Wrote predictions.csv")


✅ Wrote predictions.csv


In [None]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

def _split_mechanisms(column):
    """Turn a comma-separated text column into lists of mechanism names.

    Missing cells become an empty list (plain ``.str.split`` would leave NaN,
    which cannot be iterated later), and surrounding whitespace and empty
    fragments such as the one produced by a trailing comma are removed.
    """
    return (
        column.fillna("")
              .astype(str)
              .map(lambda text: [name.strip() for name in text.split(",") if name.strip()])
    )


# Reload train_recs (features, coping mechanisms, true stress category).
train = pd.read_csv("train_recs.csv")
train["Mechanisms"] = _split_mechanisms(train["Stress Coping Mechanisms"])
# "Success" flags the students whose true category is Low.
train["Success"]    = (train["Stress Level Category"]=="Low").astype(int)

# Feature columns used for the neighbour search (order matters: the same
# list is used to build query vectors for the test rows).
feature_cols = [
  'Age','Academic Performance (GPA)','Study Hours Per Week',
  'Social_Media_Usage_per_week','Sleep Duration (Hours per night)',
  'Physical Exercise (Hours per week)','Family Support','Financial Stress',
  'Peer Pressure','Relationship Stress','Counseling Attendance','Diet Quality',
  'Cognitive Distortions','Family Mental Health History','Medical Condition',
  'Substance Use','Gender_Female','Gender_Male','Gender_Other','Stress_Ratio'
]

# Fit the neighbour index on the training features. The matrix is cast to
# float explicitly so bool/int indicator columns do not yield an object array,
# and the default neighbour count never exceeds the number of training rows.
X_train = train[feature_cols].to_numpy(dtype=float)
knn = NearestNeighbors(n_neighbors=min(50, len(train)), metric="euclidean").fit(X_train)

# Reload test_recs (features & coping mechanisms) and the stage-1 predictions.
test_recs = pd.read_csv("test_recs.csv")
test_recs["Mechanisms"] = _split_mechanisms(test_recs["Stress Coping Mechanisms"])

preds = pd.read_csv("predictions.csv")[[
    "Student_id","pred_int","pred_label","P_low","P_med","P_high"
]]

test = test_recs.merge(preds, on="Student_id", how="left")
# A left merge multiplies rows when Student_id repeats in predictions.csv,
# which would silently inflate every metric computed from `test`.
if len(test) != len(test_recs):
    raise ValueError(
        f"Merging predictions changed the row count from {len(test_recs)} to "
        f"{len(test)}; Student_id is not unique in predictions.csv"
    )

#recommendation function
def _mechanism_list(value):
    """Normalise one ``Mechanisms`` cell to a list of non-empty, stripped names.

    Accepts a list/tuple/set of names or a comma-separated string; anything
    else (NaN, None, other scalars) is treated as "no mechanisms".
    """
    if isinstance(value, str):
        value = value.split(",")
    elif not isinstance(value, (list, tuple, set, frozenset)):
        return []
    return [name.strip() for name in value if isinstance(name, str) and name.strip()]


def recommend_knn(row, knn=knn, train_df=train, feature_cols=feature_cols, k=50, m=5):
    """Recommend up to ``m`` coping mechanisms for a single student.

    The ``k`` nearest training students are looked up in feature space. For
    every mechanism used by those neighbours the success rate is the sum of
    the users' ``Success`` values divided by the number of users. Mechanisms
    the student already uses are removed and the rest are ranked by success
    rate, ties broken by how many neighbours used the mechanism and then by
    neighbour order, so the result is deterministic.

    Parameters
    ----------
    row : pandas.Series
        One student record containing every name in ``feature_cols`` plus a
        ``"Mechanisms"`` entry (list of names; NaN is treated as empty).
    knn : object
        Fitted neighbour index exposing ``kneighbors(X, n_neighbors=...)``.
    train_df : pandas.DataFrame
        Frame whose positional rows correspond to the indices returned by
        ``knn``; needs ``"Mechanisms"`` and ``"Success"`` columns.
    feature_cols : list of str
        Columns of ``row`` that form the query vector, in fitting order.
    k : int
        Number of neighbours to consult (capped at ``len(train_df)``).
    m : int
        Maximum number of mechanisms to return.

    Returns
    -------
    str
        Comma-joined mechanism names, best first; ``""`` when there is
        nothing new to recommend.
    """
    # The defaults above are bound when this function is defined, so later
    # rebinding of the module-level `train` does not affect the lookup table.
    k = min(k, len(train_df))
    if k < 1:
        return ""

    # Explicit float cast: a row taken from a mixed-dtype frame is object-typed.
    x = row[feature_cols].to_numpy(dtype=float).reshape(1, -1)
    _, idxs = knn.kneighbors(x, n_neighbors=k)
    neigh = train_df.iloc[idxs[0]]

    # Tally usage and successes per mechanism across the neighbourhood.
    stats = {}
    for mechs, succ in zip(neigh["Mechanisms"], neigh["Success"]):
        for mech in _mechanism_list(mechs):
            tally = stats.setdefault(mech, {"used": 0, "succ": 0})
            tally["used"] += 1
            tally["succ"] += succ

    already = set(_mechanism_list(row["Mechanisms"]))
    candidates = [
        (mech, tally["succ"] / tally["used"], tally["used"])
        for mech, tally in stats.items()
        if mech not in already
    ]
    # list.sort is stable, so equal (rate, support) pairs keep neighbour order.
    candidates.sort(key=lambda item: (-item[1], -item[2]))
    return ",".join(mech for mech, _, _ in candidates[:m])

# Build one comma-joined recommendation string per hold-out student and store
# it alongside the predictions.
recommendation_strings = test.apply(recommend_knn, axis=1)
test["recommendations"] = recommendation_strings

#compute P_category_drop
def compute_p_drop(row):
    """Return the probability mass on the classes below the predicted one.

    ``row`` must expose ``pred_int`` and, depending on its value, ``P_low``
    and ``P_med`` as attributes:

    * ``pred_int == 1`` -> ``P_low``
    * ``pred_int == 2`` -> ``P_med + P_low``
    * anything else     -> ``0.0``
    """
    predicted_class = row.pred_int
    if predicted_class == 1:
        return row.P_low
    elif predicted_class == 2:
        return row.P_med + row.P_low
    else:
        return 0.0

test["P_category_drop"] = test.apply(compute_p_drop, axis=1)

# Classification metrics on the hold-out test set.
class_ids = [0, 1, 2]
class_names = ['Low','Medium','High']
true = test["Stress Level Category"].map({'Low':0,'Medium':1,'High':2})

# A category outside the mapping or a Student_id with no prediction (left
# merge) shows up as NaN here; report that directly instead of letting the
# metric functions fail with a less specific message.
unscorable = true.isna() | test["pred_int"].isna()
if unscorable.any():
    raise ValueError(
        f"{int(unscorable.sum())} hold-out rows have an unknown "
        "'Stress Level Category' or no prediction; cannot compute metrics"
    )

print("=== Classification on hold-out ===")
# Passing `labels` pins report rows / matrix axes to Low, Medium, High even
# when one class never occurs in either the truth or the predictions.
print(classification_report(true, test["pred_int"], labels=class_ids, target_names=class_names))
print("Confusion matrix:\n", confusion_matrix(true, test["pred_int"], labels=class_ids))

# Simple recommendation summaries.
print(f"\nAverage P_category_drop = {test['P_category_drop'].mean():.3f}")
# Count how often each mechanism is recommended across all hold-out students.
# Students with nothing to recommend carry an empty string, which would
# otherwise explode into a bogus "" mechanism, so blank names are dropped.
rec_df = (
    test["recommendations"]
      .str.split(",")
      .explode()
      .str.strip()
      .loc[lambda names: names.ne("")]
      .value_counts()
      .rename_axis("Mechanism")
      .reset_index(name="Count")
)
print("\nTop recommended mechanisms:\n", rec_df.head(10))

# Recommendation-model accuracy (train & test).
# NOTE(review): PredSuccess is "P_category_drop > thr", and compute_p_drop
# returns 0.0 whenever pred_int is neither 1 nor 2, so a row predicted as Low
# can never be PredSuccess=1, while Success marks rows whose true category is
# Low. Confirm this comparison is the intended definition of "recommendation
# accuracy" -- it may be close to a constant "Fail" baseline.
thr = 0.5

# TRAIN
# Classifier outputs for the training students are read from
# train_predictions.csv and joined on Student_id.
pred_cols = ["pred_int","P_low","P_med","P_high"]
train_preds = pd.read_csv("train_predictions.csv")[["Student_id"] + pred_cols]
# Drop any copies of these columns already on `train` (e.g. when they were
# saved into train_recs.csv or this section is re-run); otherwise the merge
# would create *_x / *_y columns and compute_p_drop would not find its inputs.
train = (
    train.drop(columns=pred_cols + ["P_category_drop"], errors="ignore")
         .merge(train_preds, on="Student_id", how="left")
)
train["P_category_drop"] = train.apply(compute_p_drop, axis=1)
train["PredSuccess"] = (train["P_category_drop"] > thr).astype(int)

print("\n=== Recommendation Accuracy on TRAIN ===")
print("Accuracy :", accuracy_score(train["Success"], train["PredSuccess"]))
# `labels` keeps both rows in the report even if only one class is present,
# which would otherwise make target_names mismatch and raise.
print(classification_report(train["Success"], train["PredSuccess"],
                            labels=[0, 1], target_names=["Fail","Success"]))

# TEST
# TrueSuccess: the student's true category is Low.
# PredSuccess: the probability of a lower category exceeds the threshold.
test["TrueSuccess"] = (test["Stress Level Category"]=="Low").astype(int)
test["PredSuccess"] = (test["P_category_drop"] > thr).astype(int)

print("\n=== Recommendation Accuracy on TEST ===")
print("Accuracy :", accuracy_score(test["TrueSuccess"], test["PredSuccess"]))
# `labels` keeps both rows in the report even if only one class is present,
# which would otherwise make target_names mismatch and raise.
print(classification_report(test["TrueSuccess"], test["PredSuccess"],
                            labels=[0, 1], target_names=["Fail","Success"]))

# Save the final per-student table (predictions + recommendations).
out_cols = [
  "Student_id","pred_label","P_low","P_med","P_high","P_category_drop",
  "Stress Coping Mechanisms","recommendations"
]
out_path = "knn_recommendations.csv"
try:
    test[out_cols].to_csv(out_path, index=False)
except PermissionError as err:
    # Typically raised when a previous copy of the file is still open in
    # another program; re-raise the same exception type with a hint.
    raise PermissionError(
        err.errno,
        f"{err.strerror} (is the file open in another program? close it and re-run this cell)",
        out_path,
    ) from err
print("✅ Wrote knn_recommendations.csv")


=== Classification on hold-out ===
              precision    recall  f1-score   support

         Low       0.63      0.51      0.57       519
      Medium       0.67      0.71      0.69       527
        High       0.64      0.71      0.67       682

    accuracy                           0.65      1728
   macro avg       0.65      0.64      0.64      1728
weighted avg       0.65      0.65      0.65      1728

Confusion matrix:
 [[265  87 167]
 [ 53 376  98]
 [100 101 481]]

Average P_category_drop = 0.291

Top recommended mechanisms:
                  Mechanism  Count
0          Watching Sports   1091
1               Travelling    941
2                     Yoga    919
3      Spending Time Alone    880
4  Social Media Engagement    874
5                 Exercise    872
6  Walking or Nature Walks    794
7                  Reading    793
8               Meditation    746
9       Talking to Friends    730

=== Recommendation Accuracy on TRAIN ===
Accuracy : 0.6830357142857143
          

PermissionError: [Errno 13] Permission denied: 'knn_recommendations.csv'

In [None]:
import joblib, json

# Persist the fitted k-NN index to disk.
model_path = "knn_model.joblib"
joblib.dump(knn, model_path)

# Persist the ordered list of feature columns as JSON.
columns_path = "rec_feature_columns.json"
with open(columns_path, "w") as handle:
    handle.write(json.dumps(feature_cols))

print("✅ Saved knn_model.joblib and rec_feature_columns.json")
