This notebook uses the simpler dataset
https://www.kaggle.com/datasets/infamouscoder/depression-reddit-cleaned/data

In [1]:
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

from datasets import Dataset, load_dataset

import numpy as np
import pandas as pd
import evaluate
import accelerate
from transformers import AutoTokenizer, pipeline
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig

from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
raw_df = pd.read_csv("./data/depression_dataset_reddit_cleaned.csv")

In [3]:
raw_df

Unnamed: 0,clean_text,is_depression
0,we understand that most people who reply immed...,1
1,welcome to r depression s check in post a plac...,1
2,anyone else instead of sleeping more when depr...,1
3,i ve kind of stuffed around a lot in my life d...,1
4,sleep is my greatest and most comforting escap...,1
...,...,...
7726,is that snow,0
7727,moulin rouge mad me cry once again,0
7728,trying to shout but can t find people on the list,0
7729,ughh can t find my red sox hat got ta wear thi...,0


In [4]:
train_temp, test_df = train_test_split(raw_df, test_size=0.15, stratify=raw_df['is_depression'], shuffle=True)

In [5]:
train_df, validation_df = train_test_split(train_temp, test_size=0.17, stratify=train_temp['is_depression'], shuffle=True)

In [6]:
train_df

Unnamed: 0,clean_text,is_depression
3677,my alt acc look like it s made by someone with...,1
7215,why do most video i play skip and jump,0
5741,missing the fab five,0
384,i just got another thing i have to look out an...,1
6448,didnt announce reading lineup,0
...,...,...
6138,amsterdamant unfortunately i didn t dream abou...,0
2048,i just took the pill i don t know how many it ...,1
7661,another set of ipod earbuds dying left going q...,0
6591,kristenkreuk fiuhh nice to get info from you i...,0


In [7]:
train_df = train_df.rename(columns={'clean_text': 'text', 'is_depression': 'label'})
validation_df = validation_df.rename(columns={'clean_text': 'text', 'is_depression': 'label'})
test_df = test_df.rename(columns={'clean_text': 'text', 'is_depression': 'label'})

In [8]:
model_name = 'roberta-base'
rob_tokenizer = AutoTokenizer.from_pretrained(model_name)
max_len = 512

def process_labels(example):
    new_label = {'not depression': 0, 'depression': 1}.get(example['label'], example['label'])
    return {'label': new_label}
    
def process_text(example, tokenizer=rob_tokenizer, max_len=512, padding='max_length', truncation=True):
    return tokenizer(example['text'], max_length=max_len, padding=padding, truncation=truncation)

In [9]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
validation_dataset = Dataset.from_pandas(validation_df)

In [10]:
train_dataset = train_dataset.map(process_labels)
train_dataset = train_dataset.map(process_text, batched=True)

test_dataset = test_dataset.map(process_labels)
test_dataset = test_dataset.map(process_text, batched=True)

validation_dataset = validation_dataset.map(process_labels)
validation_dataset = validation_dataset.map(process_text, batched=True)

Map: 100%|██████████| 5453/5453 [00:00<00:00, 13647.15 examples/s]
Map: 100%|██████████| 5453/5453 [00:00<00:00, 6189.10 examples/s]
Map: 100%|██████████| 1160/1160 [00:00<00:00, 20594.17 examples/s]
Map: 100%|██████████| 1160/1160 [00:00<00:00, 6664.25 examples/s]
Map: 100%|██████████| 1118/1118 [00:00<00:00, 21902.78 examples/s]
Map: 100%|██████████| 1118/1118 [00:00<00:00, 6482.60 examples/s]


In [11]:
NUM_LABELS = 2 ## 0 or 1 for no depression and depression
MODEL_LABEL = "infamous_coder"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
accuracy = evaluate.load("accuracy")
f1_score = evaluate.load('f1')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "f1": f1_score.compute(predictions=predictions, references=labels, average="macro")["f1"]
    }

In [13]:
training_args = TrainingArguments(
    output_dir=f"./results/{MODEL_LABEL}",
    evaluation_strategy="epoch",  # evaluates on dev after each epoch
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True, # <- Load best model based on F1
    metric_for_best_model="f1", # <- Use F1 as selection metric
    greater_is_better=True, # <- Higher F1 is better
    fp16=True
)




In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    # pos_weight=pos_weight # for weighted loss trainer
)


  trainer = Trainer(


In [15]:
train_log = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1971,0.080483,0.979428,0.979423
2,0.0994,0.114607,0.971377,0.971376
3,0.0318,0.10739,0.976744,0.976744
4,0.0241,0.119255,0.981216,0.981216
5,0.0122,0.122986,0.981216,0.981213
6,0.0045,0.137387,0.981216,0.981213
7,0.001,0.153274,0.979428,0.979425
8,0.0014,0.154373,0.980322,0.980319


In [16]:
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [26]:
import torch
import torch.nn.functional as F

In [21]:
device = device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [31]:
# Example input text
text = "I am quite happy right now."

# Tokenize input
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

# Get model outputs
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Convert logits to probabilities
probs = F.softmax(logits, dim=-1)

# Get predicted class
predicted_class = torch.argmax(probs, dim=1).item()
confidence = probs[0][predicted_class].item()

# Output result
print(f"Predicted Class: {predicted_class} ({'Depressed' if predicted_class == 1 else 'Not Depressed'})")
print(f"Confidence: {confidence:.2f}")

Predicted Class: 1 (Depressed)
Confidence: 1.00
