In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Load the raw dataset from a CSV in the current working directory.
# Later cells read its 'post_text' (tweet text) and 'label' columns.
df = pd.read_csv("Mental-Health-Twitter.csv")

# Clean text
def clean_text(text):
    """Normalise a raw post before it is handed to the tokenizer.

    Steps, applied in order:
      1. remove URLs (``http`` followed by any run of non-whitespace),
      2. remove ``@mentions``,
      3. remove ``#`` symbols while keeping the hashtag text,
      4. remove every remaining character that is neither a word character
         nor whitespace (this also drops apostrophes, so "can't" -> "cant"),
      5. lower-case and strip leading/trailing whitespace.

    Runs of interior whitespace left behind by removed tokens are not
    collapsed.

    Args:
        text: The raw post. Anything that is not a ``str`` (for example
            ``None`` or a float NaN standing in for a missing value) is
            treated as empty text instead of raising ``TypeError``.

    Returns:
        The cleaned string; ``''`` when ``text`` is not a string.
    """
    # Guard first: re.sub raises TypeError on non-string input, which would
    # abort a whole Series.apply() over a column containing missing values.
    if not isinstance(text, str):
        return ''

    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#', '', text)  # Remove hashtag symbols but keep text
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    return text.lower().strip()

# Add a normalised copy of every post as a new column; 'post_text' is left untouched.
df['cleaned_text'] = df['post_text'].apply(clean_text)

# Check class balance (the recorded run shows 10000 rows for each of the two labels)
print(df['label'].value_counts())

# Split data: 80% train / 20% held out, stratified on the label so both
# splits keep the same class ratio; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'], df['label'], 
    test_size=0.2, stratify=df['label'], random_state=42
)

  from .autonotebook import tqdm as notebook_tqdm


label
1    10000
0    10000
Name: count, dtype: int64


In [3]:
# Initialize tokenizer from the pretrained 'bert-base-uncased' checkpoint
# (the same checkpoint name the model cell loads), lower-casing input text.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize data
# MAX_LEN is the fixed sequence length used by tokenize() for padding and
# truncation, and as the truncation limit in predict_depression().
MAX_LEN = 128  # Reduced from 256 for memory efficiency

def tokenize(texts):
    """Encode a collection of texts into fixed-length model inputs.

    Every text is padded or truncated to exactly ``MAX_LEN`` tokens
    (special tokens included) and an attention mask is requested alongside
    the token ids.

    Args:
        texts: An object exposing ``.tolist()`` that yields the strings to
            encode (the notebook passes pandas Series).

    Returns:
        Whatever the global ``tokenizer`` returns for these options, with
        values as PyTorch tensors (``return_tensors='pt'``).
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    raw_texts = texts.tolist()
    return tokenizer(raw_texts, **encode_options)

# Encode both splits up front; the results are indexed per-sample by DepressionDataset.
train_encodings = tokenize(X_train)
test_encodings = tokenize(X_test)

# Create datasets
class DepressionDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-computed encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable batch of values
            (one row per sample); must expose ``.items()``.
        labels: Label container exposing ``.values`` and ``len()`` (the
            notebook passes a pandas Series). Labels are looked up by
            position, not by the Series index.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the label container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict: every encoding field plus 'labels'."""
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = rows[idx]
        # .values gives positional access, so a shuffled/non-contiguous
        # Series index (as produced by train_test_split) does not matter.
        label_value = self.labels.values[idx]
        sample['labels'] = torch.tensor(label_value)
        return sample

# Wrap each split's encodings together with its labels for use by the DataLoaders.
train_dataset = DepressionDataset(train_encodings, y_train)
test_dataset = DepressionDataset(test_encodings, y_test)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [4]:
# Seed PyTorch before the model is built so that the newly initialised
# classifier head, DataLoader shuffling and dropout are repeatable on
# "Restart & Run All" (same value as random_state in the train/test split).
torch.manual_seed(42)

# Initialize model: pretrained 'bert-base-uncased' encoder with a 2-class
# sequence-classification head; attention maps / hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

# Training parameters
batch_size = 16
epochs = 3
learning_rate = 2e-5

# Training batches are reshuffled every epoch; evaluation order is fixed.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the values the transformers implementation used
# by default, because the PyTorch defaults differ (eps=1e-8, weight_decay=1e-2).
optimizer = torch.optim.AdamW(
    model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0
)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(epochs):
    # --- optimisation pass over the training split ---
    model.train()
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()
        # Each batch dict already contains 'labels', so the model computes the loss itself.
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses (not weighted by batch size).
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} - Train loss: {avg_train_loss:.4f}")

    # Validation check
    # NOTE(review): this accuracy is computed on test_loader, i.e. the same
    # held-out split that is later reported as the test set. There is no
    # separate validation split, so do not use it for model selection.
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_loader:
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == inputs['labels']).sum().item()
            total += inputs['labels'].size(0)

    accuracy = correct / total
    print(f"Validation Accuracy: {accuracy:.4f}\n")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - Train loss: 0.4977
Validation Accuracy: 0.7867

Epoch 2/3 - Train loss: 0.3334
Validation Accuracy: 0.8117

Epoch 3/3 - Train loss: 0.1856
Validation Accuracy: 0.8110



In [5]:
from sklearn.metrics import classification_report, confusion_matrix

def evaluate_model(loader):
    """Run the global ``model`` over ``loader`` and print classification metrics.

    Switches the model to eval mode, collects the arg-max class of the
    logits for every sample with gradients disabled, then prints a
    classification report followed by the confusion matrix.

    Args:
        loader: Iterable of batches, each a dict of tensors that includes a
            'labels' entry and can be passed to the model as keyword arguments.

    Returns:
        None. Results are printed only.
    """
    model.eval()
    all_preds, all_targets = [], []

    with torch.no_grad():
        for batch in loader:
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            logits = model(**on_device).logits
            batch_preds = logits.argmax(dim=1)

            # Move back to CPU before converting so the metrics see plain arrays.
            all_preds.extend(batch_preds.cpu().numpy())
            all_targets.extend(on_device['labels'].cpu().numpy())

    report = classification_report(all_targets, all_preds)
    print(report)
    matrix = confusion_matrix(all_targets, all_preds)
    print("Confusion Matrix:\n", matrix)

# Report per-class metrics on the held-out split (the same split used for
# the per-epoch accuracy printed during training).
print("Test Set Evaluation:")
evaluate_model(test_loader)

Test Set Evaluation:
              precision    recall  f1-score   support

           0       0.81      0.81      0.81      2000
           1       0.81      0.81      0.81      2000

    accuracy                           0.81      4000
   macro avg       0.81      0.81      0.81      4000
weighted avg       0.81      0.81      0.81      4000

Confusion Matrix:
 [[1621  379]
 [ 377 1623]]


In [6]:
# Save for later use in chatbot
# Writes the fine-tuned model and the tokenizer files into two separate
# directories, relative to the current working directory.
model.save_pretrained("depression_bert_model")
tokenizer.save_pretrained("depression_bert_tokenizer")

('depression_bert_tokenizer\\tokenizer_config.json',
 'depression_bert_tokenizer\\special_tokens_map.json',
 'depression_bert_tokenizer\\vocab.txt',
 'depression_bert_tokenizer\\added_tokens.json')

In [8]:
from torch.utils.data import WeightedRandomSampler
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Check if class imbalance exists
class_counts = df['label'].value_counts()
print("Class distribution:\n", class_counts)

# Flag imbalance in either direction: ratio of the most frequent class to the
# least frequent one (a one-sided count[0] / count[1] test would miss the case
# where class 1 is the majority).
class_imbalance = class_counts.max() / class_counts.min() > 1.5  # Example threshold

if class_imbalance:
    print("\nApplying class imbalance mitigation...")

    # Everything below is derived from y_train, not the full frame: the sampler
    # draws positions into train_dataset, so it needs exactly one weight per
    # training row, in training-row order. Using all of df would yield indices
    # beyond the end of train_dataset and weights misaligned with its rows.
    train_labels = y_train.to_numpy()
    classes = np.unique(train_labels)

    class_weights = compute_class_weight(
        'balanced',
        classes=classes,
        y=train_labels
    )
    class_weights = torch.tensor(class_weights, dtype=torch.float32)

    # Translate each label into its position within the sorted `classes`
    # array, then look up that class's weight; this does not assume the
    # labels are literally 0..K-1.
    label_positions = torch.as_tensor(np.searchsorted(classes, train_labels))
    sample_weights = class_weights[label_positions]

    # Create sampler: one epoch draws len(train_dataset) samples with replacement
    sampler = WeightedRandomSampler(
        weights=sample_weights,
        num_samples=len(sample_weights),
        replacement=True
    )

    # Modify your DataLoader to use the sampler.
    # NOTE: this only rebinds train_loader; the training cell must be re-run
    # afterwards for the sampler to have any effect on the model.
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        sampler=sampler  # Instead of shuffle=True
    )

Class distribution:
 label
1    10000
0    10000
Name: count, dtype: int64


In [9]:
def predict_depression(text):
    """Classify a single piece of text with the fine-tuned global ``model``.

    The text goes through the same ``clean_text`` normalisation used for the
    training data, is tokenized (truncated to ``MAX_LEN`` tokens) and run
    through the model without gradient tracking.

    Side effect: the global model is switched to eval mode, so the result
    does not depend on which cell last touched the model.

    Args:
        text: The raw input text.

    Returns:
        dict with
          - "depression_prob": softmax probability of class index 1 (float),
          - "prediction": index of the highest-probability class (int).
    """
    cleaned = clean_text(text)
    inputs = tokenizer(cleaned, return_tensors='pt', truncation=True, max_length=MAX_LEN)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Disable dropout explicitly rather than relying on an earlier cell
    # having left the model in eval mode.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Single input -> batch of one; take row 0 and reduce over the class
    # dimension explicitly instead of arg-maxing the flattened tensor.
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
    return {
        "depression_prob": probs[1].item(),
        "prediction": torch.argmax(probs, dim=-1).item()
    }

# Smoke-test the predictor on one hand-written sentence.
print(predict_depression("I've been feeling hopeless and can't get out of bed"))
# Output format: {'depression_prob': <float in [0, 1]>, 'prediction': <0 or 1>}
# The exact probability depends on the training run; the recorded run
# printed depression_prob ~= 0.69 with prediction 1.

{'depression_prob': 0.6891149878501892, 'prediction': 1}
