In [1]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [1]:
import re
from transformers import BertTokenizer
import pandas as pd

# Load full datasets
reddit_df = pd.read_csv("C:/Users/priya/OneDrive/Desktop/depression_dataset_reddit_cleaned.csv")
mental_health_df = pd.read_csv("C:/Users/priya/OneDrive/Desktop/mental_health.csv")

# Rename columns for consistency
reddit_df.rename(columns={"clean_text": "text", "is_depression": "label"}, inplace=True)

# Merge both datasets
combined_df = pd.concat([reddit_df, mental_health_df], ignore_index=True)

# Text Cleaning Function
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    return text.strip()

# Apply cleaning
combined_df["text"] = combined_df["text"].astype(str).apply(clean_text)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenization & Encoding
def encode_texts(texts, tokenizer, max_length=128):
    return tokenizer(
        list(texts), padding='max_length', truncation=True, max_length=max_length, return_tensors="pt"
    )

# Encode texts
encoded_texts = encode_texts(combined_df["text"], tokenizer)

# Convert to DataFrame for easier handling
encoded_df = pd.DataFrame({
    "input_ids": encoded_texts["input_ids"].tolist(),
    "attention_mask": encoded_texts["attention_mask"].tolist(),
    "label": combined_df["label"].tolist()
})

# Save processed dataset
processed_file_path = "C:/Users/priya/OneDrive/Desktop/processed_depression_dataset.csv"
encoded_df.to_csv(processed_file_path, index=False)

# Display sample rows after preprocessing
encoded_df.head()


Unnamed: 0,input_ids,attention_mask,label
0,"[101, 2057, 3305, 2008, 2087, 2111, 2040, 7514...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
1,"[101, 6160, 2000, 1054, 6245, 1055, 4638, 1999...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
2,"[101, 3087, 2842, 2612, 1997, 5777, 2062, 2043...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
3,"[101, 1045, 2310, 2785, 1997, 11812, 2105, 103...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
4,"[101, 3637, 2003, 2026, 4602, 1998, 2087, 1633...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1


In [3]:
import re
import torch
import pandas as pd
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader

# Loaded dataset
processed_file_path = "C:/Users/priya/OneDrive/Desktop/processed_depression_dataset.csv"
processed_df = pd.read_csv(processed_file_path)

# custom dataset class
class DepressionDataset(Dataset):
    def __init__(self, data):
        self.input_ids = torch.tensor([eval(x) for x in data["input_ids"]], dtype=torch.long)
        self.attention_mask = torch.tensor([eval(x) for x in data["attention_mask"]], dtype=torch.long)
        self.labels = torch.tensor(data["label"].tolist(), dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_mask[idx], self.labels[idx]

# dataset instance
dataset = DepressionDataset(processed_df)

# Split dataset (80% train, 20% validation)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Define DataLoaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

print("DataLoaders are ready!")


DataLoaders are ready!


In [5]:
import torch.nn as nn
from transformers import BertModel

# Define the BERT + RNN model
class BertRNNClassifier(nn.Module):
    def __init__(self, bert_model_name="bert-base-uncased", hidden_dim=256, num_classes=2):
        super(BertRNNClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.rnn = nn.RNN(input_size=768, hidden_size=hidden_dim, num_layers=1, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.3)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        
        rnn_out, _ = self.rnn(bert_output.last_hidden_state)
        rnn_out = rnn_out[:, -1, :]  # Get the last RNN output
        output = self.fc(self.dropout(rnn_out))
        return self.softmax(output)


In [7]:
import torch.nn.functional as F

# Function to calculate accuracy
def compute_accuracy(preds, labels):
    pred_classes = torch.argmax(preds, dim=1)  # Get class with highest probability
    correct = (pred_classes == labels).sum().item()
    total = labels.size(0)
    return correct / total


In [9]:
import torch.optim as optim
import torch
from tqdm import tqdm


# Check device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize model, loss function, and optimizer
model = BertRNNClassifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss = 0
    total_acc = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}")


    for input_ids, attention_mask, labels in train_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_acc += compute_accuracy(outputs, labels)  # Calculate accuracy
        progress_bar.set_postfix(loss=loss.item(), accuracy=compute_accuracy(outputs, labels))



    avg_loss = total_loss / len(train_loader)
    avg_acc = total_acc / len(train_loader)

    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, Accuracy: {avg_acc:.4f}")


print("Training complete!")


Using device: cpu


Epoch 1/5:   0%|          | 0/1786 [1:32:32<?, ?it/s, accuracy=0.833, loss=0.48] 

Epoch 1/5, Loss: 0.4989, Accuracy: 0.8210



Epoch 1/5:   0%|          | 0/1786 [1:32:32<?, ?it/s, accuracy=0.833, loss=0.48]

Epoch 2/5:   0%|          | 0/1786 [00:03<?, ?it/s, accuracy=0.75, loss=0.564][A
Epoch 2/5:   0%|          | 0/1786 [00:06<?, ?it/s, accuracy=0.875, loss=0.442][A
Epoch 2/5:   0%|          | 0/1786 [00:09<?, ?it/s, accuracy=0.938, loss=0.39] [A
Epoch 2/5:   0%|          | 0/1786 [00:11<?, ?it/s, accuracy=1, loss=0.339]   [A
Epoch 2/5:   0%|          | 0/1786 [00:14<?, ?it/s, accuracy=0.812, loss=0.488][A
Epoch 2/5:   0%|          | 0/1786 [00:17<?, ?it/s, accuracy=0.875, loss=0.439][A
Epoch 2/5:   0%|          | 0/1786 [00:20<?, ?it/s, accuracy=1, loss=0.323]    [A
Epoch 2/5:   0%|          | 0/1786 [00:23<?, ?it/s, accuracy=0.938, loss=0.362][A
Epoch 2/5:   0%|          | 0/1786 [00:26<?, ?it/s, accuracy=0.75, loss=0.541] [A
Epoch 2/5:   0%|          | 0/1786 [00:29<?, ?it/s, accuracy=0.812, loss=0.497][A
Epoch 2/5:   0%|          | 0/1786 [00:32<?, ?it/s, accuracy=0.875, loss=0.44] [A
Epoch 

Epoch 2/5, Loss: 0.4591, Accuracy: 0.8543


Epoch 2/5:   0%|          | 0/1786 [1:33:02<?, ?it/s, accuracy=0.667, loss=0.646]
Epoch 3/5:   0%|          | 0/1786 [1:33:16<?, ?it/s, accuracy=0.667, loss=0.65] 

Epoch 3/5, Loss: 0.4719, Accuracy: 0.8411



Epoch 3/5:   0%|          | 0/1786 [1:33:16<?, ?it/s, accuracy=0.667, loss=0.65]

Epoch 4/5:   0%|          | 0/1786 [00:03<?, ?it/s, accuracy=0.875, loss=0.438][A
Epoch 4/5:   0%|          | 0/1786 [00:06<?, ?it/s, accuracy=0.812, loss=0.501][A
Epoch 4/5:   0%|          | 0/1786 [00:10<?, ?it/s, accuracy=0.875, loss=0.427][A
Epoch 4/5:   0%|          | 0/1786 [00:13<?, ?it/s, accuracy=0.812, loss=0.501][A
Epoch 4/5:   0%|          | 0/1786 [00:17<?, ?it/s, accuracy=0.688, loss=0.626][A
Epoch 4/5:   0%|          | 0/1786 [00:20<?, ?it/s, accuracy=0.812, loss=0.5]  [A
Epoch 4/5:   0%|          | 0/1786 [00:23<?, ?it/s, accuracy=0.938, loss=0.375][A
Epoch 4/5:   0%|          | 0/1786 [00:26<?, ?it/s, accuracy=0.938, loss=0.377][A
Epoch 4/5:   0%|          | 0/1786 [00:29<?, ?it/s, accuracy=0.938, loss=0.377][A
Epoch 4/5:   0%|          | 0/1786 [00:32<?, ?it/s, accuracy=0.75, loss=0.563] [A
Epoch 4/5:   0%|          | 0/1786 [00:35<?, ?it/s, accuracy=0.938, loss=0.376][A
Epoc

Epoch 4/5, Loss: 0.4740, Accuracy: 0.8396


Epoch 4/5:   0%|          | 0/1786 [1:31:19<?, ?it/s, accuracy=1, loss=0.316]
Epoch 5/5:   0%|          | 0/1786 [1:33:55<?, ?it/s, accuracy=0.833, loss=0.48] 

Epoch 5/5, Loss: 0.4507, Accuracy: 0.8622
Training complete!


In [11]:
model.eval()
total_acc = 0

with torch.no_grad():
    for input_ids, attention_mask, labels in val_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids, attention_mask)
        total_acc += compute_accuracy(outputs, labels)

val_acc = total_acc / len(val_loader)
print(f"Validation Accuracy: {val_acc:.4f}")


Validation Accuracy: 0.8718


In [None]:
# Save the trained model
model_save_path = "C:/Users/priya/OneDrive/Desktop/Depression detection/bert_rnn_depression_model.pth"
torch.save(model.state_dict(), model_save_path)

print(f"Model saved to {model_save_path}")