In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.optim as optim
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification #light version of BeRT
import multiprocessing as mp
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# PREPARE
# Force the 'spawn' start method for any worker processes created later on.
mp.set_start_method('spawn', force=True)

# Load the tweet corpus and extract the text and label columns as plain lists.
# Labels: 0 = no depression, 1 = depression.
df = pd.read_csv('../../data/Mental-Health-Twitter.csv')
tweets, labels = (df[column].tolist() for column in ('post_text', 'label'))

# Hold out 10% of the rows for validation; the fixed seed keeps the split
# reproducible from run to run.
(train_texts, val_texts,
 train_labels, val_labels) = train_test_split(
    tweets,
    labels,
    test_size=0.1,
    random_state=42,
)


# TOKENIZATION + DATALOADERS
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Both splits are tokenized with the same settings: truncation and padding
# enabled, with a maximum length of 128.
_tokenizer_kwargs = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_texts, **_tokenizer_kwargs)
val_encodings = tokenizer(val_texts, **_tokenizer_kwargs)


def _as_tensor_dataset(encodings, targets):
    """Bundle input ids, attention mask and labels into one TensorDataset."""
    columns = (encodings['input_ids'], encodings['attention_mask'], targets)
    return TensorDataset(*(torch.tensor(column) for column in columns))


# Convert the tokenizer output and the labels to torch tensors.
train_dataset = _as_tensor_dataset(train_encodings, train_labels)
val_dataset = _as_tensor_dataset(val_encodings, val_labels)

# DataLoaders
# Training batches are shuffled and fetched by 4 worker processes
# (parallel loading to improve CPU efficiency).
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=4,
)

# Validation batches keep the dataset order so metrics are comparable across runs.
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=16,
    shuffle=False,
)

# print(f"Number of batches per epoch: {len(train_loader)}")


# MODEL SET UP + CLASSIFICATION
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# DistilBERT with a 2-class classification head, moved to the chosen device.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)
model.to(device)

# AdamW optimizer; weight decay acts as the regularization term.
optimizer = optim.AdamW(
    model.parameters(),
    lr=5e-5,
    weight_decay=0.01,
)

# Training loop: 3 passes over the training set, one optimizer step per batch.
model.train()
num_epochs = 3
for epoch in range(num_epochs):
    for input_ids, attention_mask, targets in train_loader:
        # Passing labels makes the model compute the loss itself.
        outputs = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            labels=targets.to(device),
        )
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        # Clear gradients right after the step so the next batch starts clean.
        optimizer.zero_grad()
        print(f"Epoch {epoch}, Loss: {loss.item()}")

# EVALUATE
# Run the fine-tuned model over the validation set, report the mean batch loss
# and the accuracy / precision / recall / F1 of the predicted classes.
model.eval()  # evaluation mode (e.g. disables dropout)
total_eval_loss = 0.0
all_labels = []
all_predictions = []

# Gradients are never needed during evaluation, so disable them once for the
# whole pass instead of re-entering the context on every batch.
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=batch_labels,
        )
        # Predicted class = index of the largest logit for each example.
        predictions = torch.argmax(outputs.logits, dim=-1)

        total_eval_loss += outputs.loss.item()
        # tolist() copies to host memory and yields plain Python ints.
        all_labels.extend(batch_labels.tolist())
        all_predictions.extend(predictions.tolist())

# Mean of the per-batch losses over the validation loader.
average_test_loss = total_eval_loss / len(val_loader)
print(f"Validation Loss: {average_test_loss}")

# Accuracy, precision, recall and F1 score; 'binary' averaging scores the
# positive class (label 1).
accuracy = accuracy_score(all_labels, all_predictions)
precision = precision_score(all_labels, all_predictions, average='binary')
recall = recall_score(all_labels, all_predictions, average='binary')
f1 = f1_score(all_labels, all_predictions, average='binary')

# Print the metrics
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")




Number of batches per epoch: 1125


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0, Loss: 0.6312433481216431
Epoch 0, Loss: 0.6847678422927856
Epoch 0, Loss: 0.7862696051597595
Epoch 0, Loss: 0.7430391311645508
Epoch 0, Loss: 0.7497271299362183
Epoch 0, Loss: 0.7545411586761475
Epoch 0, Loss: 0.6894848942756653
Epoch 0, Loss: 0.6850561499595642
Epoch 0, Loss: 0.6710523962974548
Epoch 0, Loss: 0.6861380934715271
Epoch 0, Loss: 0.7476205229759216
Epoch 0, Loss: 0.6361508369445801
Epoch 0, Loss: 0.6758924126625061
Epoch 0, Loss: 0.6124328374862671
Epoch 0, Loss: 0.6376152038574219
Epoch 0, Loss: 0.6788704991340637
Epoch 0, Loss: 0.6947864294052124
Epoch 0, Loss: 0.612500786781311
Epoch 0, Loss: 0.7122936844825745
Epoch 0, Loss: 0.6679898500442505
Epoch 0, Loss: 0.6573264002799988
Epoch 0, Loss: 0.6570717692375183
Epoch 0, Loss: 0.6732985377311707
Epoch 0, Loss: 0.6344191431999207
Epoch 0, Loss: 0.663353443145752
Epoch 0, Loss: 0.6128486394882202
Epoch 0, Loss: 0.597825288772583
Epoch 0, Loss: 0.6976607441902161
Epoch 0, Loss: 0.523962676525116
Epoch 0, Loss: 0.6