In [None]:
# Mount Google Drive at /content/drive; the CSV inputs and every saved artifact
# in this notebook are addressed through paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

# Architecture II – Dual-Tower Hybrid Network

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel

In [None]:

# Load the pretrained encoder from the "mental/mental-roberta-base" checkpoint.
# This object is later handed to DualTowerModel as its text tower.
Model = AutoModel.from_pretrained("mental/mental-roberta-base")


In [None]:
# Tokenizer loaded from the same checkpoint name as the encoder; HybridDataset
# uses it to turn each post's text into input ids and an attention mask.
tokenizer = AutoTokenizer.from_pretrained("mental/mental-roberta-base")

In [6]:
import os
import pandas as pd
import numpy as np

In [7]:
# Locations of the two pre-cleaned CSV inputs on the mounted Drive.
# NOTE(review): this folder is spelled "mental_health_prdiction", while the cells
# that save the model/scaler write to "mental_health_prediction" — confirm which
# spelling is intended so inputs and outputs end up in the same place.
student_file = "/content/drive/MyDrive/mental_health_prdiction/cleaned_student_dataset.csv"
reddit_file =  "/content/drive/MyDrive/mental_health_prdiction/cleaned_reddit_dataset.csv"

# Tabular student records and Reddit posts, each loaded into its own DataFrame.
student_df = pd.read_csv(student_file)
reddit_df = pd.read_csv(reddit_file)

In [None]:
# Quick look at the first rows of the student table right after loading.
student_df.head()

In [None]:
# List the column names so the label and feature columns can be identified.
student_df.columns

In [10]:
# Give the student label the same name ("binary_label") that the join function
# filters on, so both tables can be split by class with one column name.
student_df = student_df.rename(columns={"Depression":"binary_label"})

In [11]:
# Remove the "serialized_text" column from the student table; it is not used as a
# tabular feature. errors="ignore" keeps the cell idempotent: re-running it after
# the column is already gone no longer raises KeyError.
student_df = student_df.drop(columns="serialized_text", errors="ignore")

In [12]:
# encoding categorical variables
from sklearn.preprocessing import LabelEncoder

# Columns that hold words/categories rather than numbers.
cat_cols = ['Gender', 'Sleep Duration', 'Dietary Habits']

# Fit ONE encoder per column. Re-fitting a single shared LabelEncoder in the loop
# overwrites its classes_ each time, so only the last column's mapping would
# survive and the other columns could not be encoded again at inference time.
label_encoders = {}
for col in cat_cols:
    encoder = LabelEncoder()
    # astype(str) so missing values / mixed types become plain string categories.
    student_df[col] = encoder.fit_transform(student_df[col].astype(str))
    label_encoders[col] = encoder

# `le` is the name the persistence cell pickles; bind it to the full
# {column name -> fitted LabelEncoder} mapping so every encoder is saved.
le = label_encoders

In [13]:
# The raw 'Financial Stress' column contains '?' placeholders (three, according
# to the original author). Turn them into NaN and coerce the column to numeric;
# anything else that cannot be parsed also becomes NaN.
financial_stress = pd.to_numeric(
    student_df['Financial Stress'].replace('?', np.nan),
    errors='coerce',
)

# Impute the gaps with the column median (this column only).
financial_median = financial_stress.median()
student_df['Financial Stress'] = financial_stress.fillna(financial_median)

In [14]:
# FEATURE SCALING (NORMALIZATION)
from sklearn.preprocessing import StandardScaler
# Every student column except the label is treated as a feature to standardize.
student_features = [col for col in student_df.columns if col != 'binary_label']

# Standardize the feature columns in place with StandardScaler.
# NOTE(review): the scaler is fit on all student rows, before the later
# train/test split, so its statistics also reflect rows that end up in the test
# set — confirm this is acceptable or fit on the training portion only.
scaler = StandardScaler()
student_df[student_features] = scaler.fit_transform(student_df[student_features])


In [15]:
import pandas as pd

def one_to_many_join(reddit_df, student_df, random_state=42):
    """Pair every Reddit post with a randomly drawn student profile of the same class.

    Both inputs must carry a 'binary_label' column (0 = Normal, 1 = Depressed).
    For each class, one student row is sampled *with replacement* per Reddit post
    (the "many" side of one-to-many) and placed next to that post.

    Args:
        reddit_df: DataFrame of posts, including 'binary_label'.
        student_df: DataFrame of student profiles, including 'binary_label'.
        random_state: Seed used for both the per-class sampling and the final
            shuffle. Defaults to 42, the value that was previously hard-coded.

    Returns:
        A shuffled DataFrame with one row per Reddit post: the Reddit columns
        first (with 'binary_label' renamed to 'target'), followed by the student
        columns that do not already exist on the Reddit side.

    Raises:
        ValueError: if Reddit posts exist for a class that has no student rows.
    """
    label_col = 'binary_label'
    paired_frames = []

    for label in (0, 1):
        posts = reddit_df[reddit_df[label_col] == label].reset_index(drop=True)
        profiles = student_df[student_df[label_col] == label]

        # Fail with a clear message instead of the sampler's generic error.
        if len(posts) > 0 and profiles.empty:
            raise ValueError(
                f"student_df has no rows with {label_col} == {label} "
                f"to pair with {len(posts)} Reddit posts"
            )

        # Drop student columns that also exist on the Reddit side (always the
        # label) BEFORE concatenating, so no frame ever carries duplicate column
        # names. The Reddit copy wins, as in the previous keep-first dedup.
        overlapping = [col for col in profiles.columns if col in posts.columns]
        sampled = (
            profiles.drop(columns=overlapping)
            .sample(n=len(posts), replace=True, random_state=random_state)
            .reset_index(drop=True)
        )

        # Both sides now share a 0..n-1 index, so axis=1 aligns them row by row.
        paired_frames.append(pd.concat([posts, sampled], axis=1))

    # Stack both classes and shuffle so the classes are interleaved.
    final_df = (
        pd.concat(paired_frames)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    # The model-facing name of the label column is 'target'.
    return final_df.rename(columns={label_col: 'target'})


In [None]:
# 1. Defensive rename: if the label column is still called 'Depression' (e.g. the
#    earlier rename cell was skipped), give it the 'binary_label' name that
#    one_to_many_join filters on. When it was already renamed this is a no-op.
if 'Depression' in student_df.columns:
    student_df = student_df.rename(columns={'Depression': 'binary_label'})

# 2. Build the paired dataset from the DataFrames currently in memory.
#    reddit_df and student_df are the variables created by the loading cell,
#    so they are passed as objects, not as file-path strings.
final_df = one_to_many_join(reddit_df, student_df)

# Preview the first joined rows as plain text.
print(final_df.head())

In [None]:
# Re-runs the join and displays the result as the cell output; the return value
# is not assigned to anything.
# NOTE(review): final_df already holds the result of this same call — this cell
# looks redundant and could be dropped or replaced by a preview of final_df.
one_to_many_join(reddit_df, student_df)

In [19]:
import torch
import torch.nn as nn

class DualTowerModel(nn.Module):
    """Two-tower classifier that fuses a text encoder with a small tabular MLP.

    Tower 1 runs the supplied encoder over the tokenized text and takes its
    ``pooler_output``. Tower 2 maps the numeric student features to a 32-d vector.
    Both vectors are concatenated and classified into two logits
    ([Normal, Depressed]).

    Args:
        roberta_model: Encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments and must return an object with a
            ``pooler_output`` attribute of shape (batch, hidden_size).
        num_student_features: Number of numeric features per sample fed to the
            tabular tower.
    """

    def __init__(self, roberta_model, num_student_features):
        super().__init__()

        # TOWER 1: REDDIT TEXT (NLP)
        self.roberta = roberta_model

        # Width of the text vector. Read it from the encoder's config instead of
        # assuming 768, so a different-sized encoder does not cause a shape
        # mismatch in the fusion layer; fall back to 768 when no config exists.
        text_dim = getattr(getattr(roberta_model, "config", None), "hidden_size", 768)

        # TOWER 2: STUDENT STATS (TABULAR)
        self.student_tower = nn.Sequential(
            nn.Linear(num_student_features, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU()
        )

        # FUSION: text_dim (encoder output) + 32 (student tower output)
        self.classifier = nn.Sequential(
            nn.Linear(text_dim + 32, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 2)  # Final output: [Normal, Depressed]
        )

    def forward(self, input_ids, attention_mask, student_stats):
        """Return class logits of shape (batch, 2).

        Args:
            input_ids: Token ids for the text, passed through to the encoder.
            attention_mask: Attention mask for the text, passed to the encoder.
            student_stats: Float tensor of shape (batch, num_student_features).
        """
        # Text tower: use the encoder's pooled sentence vector.
        # NOTE(review): for RoBERTa-style encoders pooler_output is presumably the
        # first token's hidden state passed through an extra dense+tanh layer, not
        # the raw first-token state — confirm that this is the intended feature.
        text_outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        text_features = text_outputs.pooler_output

        # Tabular tower.
        student_features = self.student_tower(student_stats)

        # Concatenate along the feature axis and classify.
        combined = torch.cat((text_features, student_features), dim=1)
        return self.classifier(combined)

In [20]:
# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Width of the tabular tower's input. Derived from final_df with the same rule
# HybridDataset uses for its stat columns (everything except the text and the
# target), so the model cannot drift out of sync with the data if a column is
# added or removed. A hard-coded 10 would fail with a shape error in that case.
num_student_features = len(
    [col for col in final_df.columns if col not in ('target', 'clean_text')]
)

# Combine the pretrained encoder (Model) with the student tower.
model = DualTowerModel(Model, num_student_features=num_student_features).to(device)

# One optimizer over ALL parameters: encoder, student tower and fusion head.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Two-class classification loss on the raw logits.
criterion = nn.CrossEntropyLoss()

In [21]:
import torch
from torch.utils.data import Dataset

class HybridDataset(Dataset):
    """Torch dataset yielding tokenized text, numeric stats and label per row.

    Args:
        dataframe: DataFrame with a 'clean_text' column, a 'target' column and
            any number of numeric feature columns.
        tokenizer: Callable tokenizer. It is invoked as ``tokenizer(text, ...)``
            and must return a mapping with 'input_ids' and 'attention_mask'
            tensors.
        max_len: Sequence length every text is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.df = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

        # Every column that is neither the text nor the label is a stat feature.
        self.stat_columns = [col for col in dataframe.columns if col not in ['target', 'clean_text']]

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return a dict with 'input_ids', 'attention_mask', 'stats', 'label'."""
        # Fetch the row once (positional lookup, so a non-contiguous index left
        # behind by a train/test split is fine) instead of three separate times.
        row = self.df.iloc[index]

        # Text for the encoder; str() guards against non-string cells.
        text = str(row['clean_text'])

        # Numeric student features as a float32 tensor.
        stats = torch.tensor(row[self.stat_columns].values.astype(float), dtype=torch.float)

        # Call the tokenizer directly: encode_plus is deprecated in favour of
        # __call__, which takes the same arguments for a single text.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # return_tensors='pt' adds a leading batch axis of 1; flatten drops it
            # so the DataLoader can stack items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'stats': stats,
            'label': torch.tensor(int(row['target']), dtype=torch.long)
        }

In [None]:
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split

# Hold out the evaluation rows BEFORE any training happens. Training on all of
# final_df and then scoring on a test split drawn from that same frame would
# measure performance on rows the model has already seen.
# The arguments match the later split cell exactly (test_size, random_state,
# stratify), so that cell reproduces the same train_df / test_df partition.
train_df, test_df = train_test_split(
    final_df,
    test_size=0.20,
    random_state=42,
    stratify=final_df['target']
)

# Create the Dataset object from the training portion only
train_dataset = HybridDataset(
    dataframe=train_df,
    tokenizer=tokenizer,
    max_len=128 # Use 128 for faster training on Colab
)

# Define the train_loader
train_loader = DataLoader(
    train_dataset,
    batch_size=16, #rows to process at once
    shuffle=True
)

In [None]:
from tqdm import tqdm 

def train_model(model, data_loader, optimizer, criterion, device):
    """Run one pass over ``data_loader`` and return the mean batch loss.

    Each batch is a dict with 'input_ids', 'attention_mask', 'stats' and 'label'
    tensors. The model is put in training mode, and for every batch the
    gradients are reset, the loss is back-propagated and the optimizer steps.

    Returns:
        Sum of the per-batch loss values divided by ``len(data_loader)``.
    """
    model.train()
    batch_losses = []
    wanted_keys = ('input_ids', 'attention_mask', 'stats', 'label')

    for batch in tqdm(data_loader, desc="Training"):
        # Move every tensor of the batch to the target device.
        on_device = {key: batch[key].to(device) for key in wanted_keys}

        # Start from clean gradients for this step.
        optimizer.zero_grad()

        # Forward pass through both towers, then the loss against the labels.
        logits = model(
            on_device['input_ids'],
            on_device['attention_mask'],
            on_device['stats'],
        )
        batch_loss = criterion(logits, on_device['label'])

        # Backward pass and parameter update.
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(data_loader)


# Run a single pass (one call = one epoch over train_loader) and report the
# mean batch loss for that pass.
avg_loss = train_model(model, train_loader, optimizer, criterion, device)
print(f"Average Training Loss: {avg_loss:.4f}")

In [None]:
import os
# Persist the trained weights (state_dict only, not the full module) to Drive.
# NOTE(review): this folder is spelled "mental_health_prediction", whereas the
# CSV inputs are read from "mental_health_prdiction" — confirm the two are meant
# to be different folders.
save_folder = '/content/drive/MyDrive/mental_health_prediction'
model_save_path = f"{save_folder}/dual_tower_mental_health.pth"
# Create the folder if it does not exist yet; no error when it already does.
os.makedirs(save_folder, exist_ok=True)
torch.save(model.state_dict(), model_save_path)

In [None]:
from sklearn.model_selection import train_test_split
# Split final_df into 80% training and 20% testing rows.
# stratify keeps the class ratio of 'target' the same in both parts, and the
# fixed random_state makes the partition reproducible.
train_df, test_df = train_test_split(
    final_df,
    test_size=0.20,
    random_state=42,
    stratify=final_df['target']
)  # closing parenthesis was missing, which made the whole cell a SyntaxError

print(f"Training samples: {len(train_df)}")
print(f"Testing samples: {len(test_df)}")

In [None]:
# Wrap each split in a HybridDataset (same tokenizer, same 128-token length).
train_dataset = HybridDataset(dataframe=train_df, tokenizer=tokenizer, max_len=128)
test_dataset = HybridDataset(dataframe=test_df, tokenizer=tokenizer, max_len=128)

# Batch both datasets in groups of 16. Only the training loader shuffles;
# the testing loader keeps the rows in their original order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

def evaluate_model(model, data_loader, device):
    """Collect true labels and predicted classes for every row of ``data_loader``.

    The model is switched to eval mode and run without gradient tracking. Each
    batch is a dict with 'input_ids', 'attention_mask', 'stats' and 'label'
    tensors.

    Returns:
        (all_labels, all_preds): two Python lists of ints in loader order.
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            stats = batch['stats'].to(device)
            labels = batch['label'].to(device)

            # These two statements were fused onto one line, which was a
            # SyntaxError; they must be separate statements.
            outputs = model(input_ids, attention_mask, stats)
            # The predicted class is the index of the largest logit per row.
            _, preds = torch.max(outputs, dim=1)

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    return all_labels, all_preds

# Run the evaluation on the held-out loader and keep labels/predictions for the
# reports below (and for the confusion-matrix cell).
# NOTE(review): these numbers only measure generalization if the rows in test_df
# were not part of the data the model was trained on — confirm the training
# cell did not use the full final_df.
y_true, y_pred = evaluate_model(model, test_loader, device)

# Overall accuracy plus per-class precision / recall / F1.
print("\n--- Model Performance Results ---")
print(f"Overall Accuracy: {accuracy_score(y_true, y_pred):.4f}")
print(classification_report(y_true, y_pred, target_names=['Normal', 'Depressed']))

In [None]:
import pickle
# Destination files for the fitted preprocessing objects, next to the model.
scaler_path = '/content/drive/MyDrive/mental_health_prediction/scaler.pkl'
encoder_path = '/content/drive/MyDrive/mental_health_prediction/label_encoder.pkl'

# Save the fitted StandardScaler so the same scaling can be re-applied later.
with open(scaler_path, 'wb') as f:
    pickle.dump(scaler, f)

# Save whatever object is bound to `le` by the categorical-encoding cell.
# NOTE(review): verify that `le` covers every categorical column, not just the
# last one that was fitted, before relying on this file at inference time.
with open(encoder_path, 'wb') as f:
    pickle.dump(le, f)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix of the held-out predictions (rows = true, columns = predicted).
cm = confusion_matrix(y_true, y_pred)

# Draw it as an annotated heatmap with readable class names on both axes.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',xticklabels=['Normal', 'Depressed'],yticklabels=['Normal', 'Depressed'])

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix: Mental Health Dual-Tower Model')

# Save BEFORE showing: once plt.show() has rendered the figure it is no longer
# the current figure, so a later savefig() would write an empty image.
plt.savefig('/content/drive/MyDrive/mental_health_prediction/confusion_matrix.png')
plt.show()