In [9]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

# Load dataset
df = pd.read_csv("/content/final_training_dataset.csv")

# Extract inputs and labels
texts = df['post'].astype(str).tolist()
label_cols = df.columns[1:]
labels_raw = df[label_cols]

# Convert string labels (if necessary) to binary 0/1
labels = labels_raw.applymap(lambda x: 1 if str(x).strip().lower() in ['1', 'true', 'yes'] else 0)
labels_tensor = torch.tensor(labels.values).float()

# Load modern embedding model (BGE)
embed_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
embeddings = embed_model.encode(texts, batch_size=32, convert_to_tensor=True)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, labels_tensor, test_size=0.2, random_state=42
)

train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Define the classifier
class DepressionClassifier(nn.Module):
    def __init__(self, input_dim, output_dim=9):
        super(DepressionClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(512, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.dropout(self.relu(self.fc1(x)))
        return self.sigmoid(self.fc2(x))

# Model, loss, optimizer
model = DepressionClassifier(input_dim=embeddings.shape[1])
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=2e-4)

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
EPOCHS = 200
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{EPOCHS}, Loss: {avg_loss:.4f}")

trained_model = model  # for inference later
"Training complete ✅"


  labels = labels_raw.applymap(lambda x: 1 if str(x).strip().lower() in ['1', 'true', 'yes'] else 0)


Epoch 1/200, Loss: 0.5765
Epoch 2/200, Loss: 0.5011
Epoch 3/200, Loss: 0.4551
Epoch 4/200, Loss: 0.4296
Epoch 5/200, Loss: 0.4145
Epoch 6/200, Loss: 0.4037
Epoch 7/200, Loss: 0.3950
Epoch 8/200, Loss: 0.3890
Epoch 9/200, Loss: 0.3824
Epoch 10/200, Loss: 0.3791
Epoch 11/200, Loss: 0.3743
Epoch 12/200, Loss: 0.3721
Epoch 13/200, Loss: 0.3691
Epoch 14/200, Loss: 0.3663
Epoch 15/200, Loss: 0.3645
Epoch 16/200, Loss: 0.3618
Epoch 17/200, Loss: 0.3595
Epoch 18/200, Loss: 0.3586
Epoch 19/200, Loss: 0.3566
Epoch 20/200, Loss: 0.3545
Epoch 21/200, Loss: 0.3523
Epoch 22/200, Loss: 0.3498
Epoch 23/200, Loss: 0.3497
Epoch 24/200, Loss: 0.3473
Epoch 25/200, Loss: 0.3471
Epoch 26/200, Loss: 0.3457
Epoch 27/200, Loss: 0.3443
Epoch 28/200, Loss: 0.3426
Epoch 29/200, Loss: 0.3416
Epoch 30/200, Loss: 0.3388
Epoch 31/200, Loss: 0.3375
Epoch 32/200, Loss: 0.3371
Epoch 33/200, Loss: 0.3347
Epoch 34/200, Loss: 0.3350
Epoch 35/200, Loss: 0.3330
Epoch 36/200, Loss: 0.3305
Epoch 37/200, Loss: 0.3306
Epoch 38/2

'Training complete ✅'

In [10]:
import pandas as pd
import torch

# Load test dataset
test_df = pd.read_csv("/content/final_testing_dataset.csv")

# Extract post texts
test_texts = test_df['post'].astype(str).tolist()

# Encode posts using the same SentenceTransformer model
test_embeddings = embed_model.encode(test_texts, batch_size=32, convert_to_tensor=True).to(device)

# Put model in eval mode
trained_model.eval()

# Predict for each embedding
all_predictions = []
with torch.no_grad():
    outputs = trained_model(test_embeddings)
    predictions = (outputs > 0.5).int().cpu().numpy()

# Convert predictions to DataFrame
predicted_labels_df = pd.DataFrame(predictions, columns=label_cols)

# Combine with original test_df (optional)
final_df = pd.concat([test_df, predicted_labels_df], axis=1)

# Save results (optional)
final_df.to_csv("/content/test_predictions.csv", index=False)

# Preview results
print(final_df.head())


   Unnamed: 0                                               post  anger  \
0           0  I feel unloved, I feel like a burden to everyo...      0   
1           1  my grandfather had mental health problem, two ...      0   
2           2  The older I get the more and more I feel isola...      0   
3           3  I can’t handle poverty any more. Everything is...      1   
4           4  This my first Valentine’s Day post separation ...      0   

   brain dysfunction (forget)  emptiness  hopelessness  loneliness  sadness  \
0                           0          1             1           1        1   
1                           0          0             1           0        1   
2                           0          1             1           1        1   
3                           0          1             1           0        1   
4                           0          1             0           1        0   

   suicide intent  worthlessness  post  anger  brain dysfunction (forget) 

In [11]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Load true labels from test.csv (if available)
# Assuming your test.csv contains ground-truth labels
true_labels = test_df[label_cols].applymap(lambda x: 1 if str(x).strip().lower() in ['1', 'true', 'yes'] else 0).values

# Predict using trained model
trained_model.eval()
with torch.no_grad():
    test_embeddings = embed_model.encode(test_df['post'].astype(str).tolist(), batch_size=32, convert_to_tensor=True).to(device)
    outputs = trained_model(test_embeddings)
    predictions = (outputs > 0.5).int().cpu().numpy()

# Compute metrics
accuracy = accuracy_score(true_labels, predictions)
f1_micro = f1_score(true_labels, predictions, average='micro')
f1_macro = f1_score(true_labels, predictions, average='macro')
precision = precision_score(true_labels, predictions, average='micro')
recall = recall_score(true_labels, predictions, average='micro')

# Print results
print(f"F1 Score (Micro): {f1_micro:.4f}")
print(f"F1 Score (Macro): {f1_macro:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")


  true_labels = test_df[label_cols].applymap(lambda x: 1 if str(x).strip().lower() in ['1', 'true', 'yes'] else 0).values


F1 Score (Micro): 0.7563
F1 Score (Macro): 0.6219
Precision: 0.7789
Recall: 0.7350


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
