In [2]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import re

# Load your dataset
df = pd.read_excel("PHQ9_Student_Depression_Dataset_Updated.xlsx")

# Combine all PHQ9 answers into one text
# The answer columns are picked by position (columns 1..10). The target column
# is removed from that selection by name, so the label can never end up inside
# the text the classifier is trained on, even if the sheet's column order changes.
# NOTE(review): this slice spans ten columns while the interactive cell below
# collects nine answers - confirm which columns are meant to be included.
answer_columns = [col for col in df.columns[1:11] if col != 'Severity Level']
df['combined_text'] = df[answer_columns].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)
# Lower-case the text and keep only ASCII letters and whitespace
# (digits and punctuation are dropped by the regex).
df['combined_text'] = df['combined_text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x.lower()))

# Encode Severity Level
# LabelEncoder maps each distinct severity string to an integer code;
# le.classes_ keeps the original names for reporting and inverse lookup.
le = LabelEncoder()
df['label'] = le.fit_transform(df['Severity Level'])

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

# Convert text to BERT embeddings
def get_bert_embedding(text):
    """Embed `text` by averaging BERT's final-layer token states.

    The text is tokenized (truncated to at most 512 tokens), passed through
    `bert_model` with gradient tracking disabled, and the last hidden state
    is averaged over dimension 1. Size-1 dimensions are then squeezed away
    and the result is returned as a NumPy array.
    """
    tokenizer_options = {
        "return_tensors": "pt",
        "truncation": True,
        "padding": True,
        "max_length": 512,
    }
    encoded = tokenizer(text, **tokenizer_options)
    with torch.no_grad():
        token_states = bert_model(**encoded).last_hidden_state
        pooled = token_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Create embeddings
# get_bert_embedding runs once per row; the per-row vectors are stacked into one array.
embeddings = np.stack(df['combined_text'].apply(get_bert_embedding).values)

# Train/test split
# 80/20 split with a fixed seed so the same rows land in the test set on every run.
X_train, X_test, y_train, y_test = train_test_split(embeddings, df['label'], test_size=0.2, random_state=42)

# Train Random Forest classifier
# The forest is seeded as well; without it every re-run builds different trees
# and the reported scores (and the saved model) are not reproducible.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
# `labels` pins the report rows to the encoder's integer codes, so the names in
# `target_names` always line up and the call does not raise when a class is
# missing from the test split.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
))


                   precision    recall  f1-score   support

             Mild       1.00      1.00      1.00         5
          Minimal       1.00      1.00      1.00         6
         Moderate       1.00      1.00      1.00         9
Moderately Severe       1.00      1.00      1.00        11
           Severe       1.00      1.00      1.00        19

         accuracy                           1.00        50
        macro avg       1.00      1.00      1.00        50
     weighted avg       1.00      1.00      1.00        50



In [3]:
import joblib
# Write the fitted classifier and the label encoder to disk.
# The inference cell further down reloads both files by these exact names.
joblib.dump(clf, "bert_rf_model.pkl")
joblib.dump(le, "label_encoder.pkl")

['label_encoder.pkl']

In [4]:
# Quick look at the held-out feature matrix returned by train_test_split.
print(X_test)

[[-0.02315622  0.13593969  0.27234566 ... -0.1625643  -0.06535875
  -0.26565418]
 [ 0.09229317  0.09208242  0.13666631 ... -0.05577538  0.00715546
  -0.17037666]
 [-0.3104102   0.18086855  0.45210716 ... -0.1806463   0.11102752
  -0.14592333]
 ...
 [ 0.04119097 -0.04306969  0.2966259  ... -0.04961918  0.04527418
  -0.1550191 ]
 [-0.02671829  0.05145797  0.1348214  ... -0.11044373  0.07188667
  -0.08782027]
 [ 0.02813066  0.20905516  0.28225234 ... -0.22059192  0.07705928
  -0.1026603 ]]


In [None]:
import re
import torch
import joblib
import numpy as np
from transformers import BertTokenizer, BertModel

# PHQ-9 prompts, shown to the user one at a time in this order
phq9_questions = [
    "1. Little interest or pleasure in doing things:",
    "2. Feeling down, depressed, or hopeless:",
    "3. Trouble falling or staying asleep, or sleeping too much:",
    "4. Feeling tired or having little energy:",
    "5. Poor appetite or overeating:",
    "6. Feeling bad about yourself — or that you are a failure or have let yourself or your family down:",
    "7. Trouble concentrating on things, such as reading the newspaper or watching television:",
    "8. Moving or speaking slowly or being restless more than usual:",
    "9. Thoughts that you would be better off dead or of hurting yourself in some way:",
]

# Same pre-trained BERT checkpoint that the training cell used for its embeddings
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

# Restore the classifier and label encoder written by the earlier joblib.dump cell.
# joblib.load unpickles its input, so only load files you produced or trust.
clf = joblib.load("bert_rf_model.pkl")
label_encoder = joblib.load("label_encoder.pkl")

# Function to get BERT embedding
def get_bert_embedding(text):
    """Embed `text` with BERT and return it as a single-row 2-D NumPy array.

    The text is tokenized (truncated to at most 512 tokens) and passed through
    `bert_model` with gradient tracking disabled. The last hidden state is
    averaged over dimension 1 and squeezed, then reshaped to `(1, -1)`, the
    row layout that `clf.predict` is called with in this cell.
    """
    tokenizer_options = dict(
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    encoded = tokenizer(text, **tokenizer_options)
    with torch.no_grad():
        token_states = bert_model(**encoded).last_hidden_state
        pooled = token_states.mean(dim=1)
    flat = pooled.squeeze().numpy()
    return flat.reshape(1, -1)

# Predict from user input
def predict_severity():
    """Ask the PHQ-9 questions interactively and print the predicted severity.

    Each entry of `phq9_questions` is shown via `input()`. The answers are
    joined with spaces, lower-cased and stripped of everything except ASCII
    letters and whitespace, then embedded with `get_bert_embedding`. The
    classifier's first prediction is mapped back to its severity name with
    `label_encoder` and printed. Nothing is returned.
    """
    print("\n🧠 Please answer the following PHQ-9 questions:\n")

    answers = []
    for question in phq9_questions:
        answers.append(input(question + " "))

    raw_text = " ".join(answers).lower()
    letters_only = re.sub(r"[^a-zA-Z\s]", "", raw_text)

    features = get_bert_embedding(letters_only)
    predicted_code = clf.predict(features)[0]
    severity = label_encoder.inverse_transform([predicted_code])[0]

    print(f"\n📊 Predicted Depression Severity: **{severity}**")

# Run the function
# Starts the interactive questionnaire: the cell waits on input() once per
# entry in phq9_questions, then prints the predicted severity.
predict_severity()



🧠 Please answer the following PHQ-9 questions:

