In [2]:
# Cell 1: Setup and Imports
import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F

import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
from lightning.pytorch.profilers import SimpleProfiler  # Updated for Lightning 2.0

from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_linear_schedule_with_warmup
from torchmetrics import Accuracy

# Seed the global random number generators through Lightning for repeatable runs.
pl.seed_everything(42)

# Release cached, currently unused GPU memory left over from earlier runs in this kernel.
torch.cuda.empty_cache()
# NOTE: the backend flags below favour speed over bit-exact reproducibility, so
# results may still differ slightly between runs despite the fixed seed.
torch.backends.cudnn.deterministic = False  # Non-deterministic cuDNN algorithms are allowed
torch.backends.cudnn.benchmark = True  # Let cuDNN benchmark and pick the fastest algorithm
torch.backends.cuda.matmul.allow_tf32 = True  # Allow TF32 for float32 matmuls on GPUs that support it
torch.set_float32_matmul_precision('high')  # Trade some float32 matmul precision for speed

Seed set to 42


In [3]:
# Cell 2: Data Module and Dataset Definition

class DepressionDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable sequence of raw texts; each item is coerced with ``str()``.
        labels: Indexable sequence of class labels, aligned with ``texts``.
        tokenizer: Callable Hugging Face-style tokenizer that returns a mapping
            with ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length every example is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Fail fast here instead of with an IndexError inside a DataLoader worker.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize the example at ``index``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``labels`` tensor of dtype ``torch.long``.
        """
        text = str(self.texts[index])
        label = self.labels[index]
        # Call the tokenizer directly: ``encode_plus`` is deprecated in favour of ``__call__``.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            # return_tensors='pt' adds a leading batch dimension; flatten it away
            # so the DataLoader can stack examples into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

class DepressionDataModule(pl.LightningDataModule):
    """LightningDataModule that splits a DataFrame into train/validation datasets.

    Args:
        df: DataFrame with a ``body`` column (text) and a ``label`` column.
        tokenizer: Tokenizer handed to every ``DepressionDataset``.
        max_length: Token length each example is padded / truncated to.
        batch_size: Batch size for both dataloaders.
        num_workers: Worker processes per dataloader (default 4, as before).
        val_size: Fraction of rows held out for validation (default 0.2).
        seed: Random state of the stratified split (default 42).
    """

    def __init__(self, df, tokenizer, max_length=128, batch_size=16,
                 num_workers=4, val_size=0.2, seed=42):
        super().__init__()
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.val_size = val_size
        self.seed = seed
        # Populated by setup(); None means "not split yet".
        self.train_dataset = None
        self.val_dataset = None

    def _build_dataset(self, frame):
        """Wrap the ``body`` / ``label`` columns of ``frame`` in a DepressionDataset."""
        return DepressionDataset(
            frame['body'].tolist(), frame['label'].tolist(), self.tokenizer, self.max_length
        )

    def setup(self, stage=None):
        """Create the stratified train/validation datasets once.

        The notebook calls ``setup()`` by hand and ``trainer.fit`` calls it
        again; the split is deterministic, so repeating it only wastes time
        and memory on a large frame. Later calls are therefore no-ops.
        """
        if self.train_dataset is not None and self.val_dataset is not None:
            return
        # Stratify on the label so both splits keep the class balance.
        train_df, val_df = train_test_split(
            self.df, test_size=self.val_size, random_state=self.seed, stratify=self.df['label']
        )
        self.train_dataset = self._build_dataset(train_df)
        self.val_dataset = self._build_dataset(val_df)

    def train_dataloader(self):
        """Return the shuffled training dataloader."""
        return DataLoader(
            self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers
        )

    def val_dataloader(self):
        """Return the validation dataloader (no shuffling)."""
        return DataLoader(
            self.val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers
        )

In [4]:
# Cell 3: Model Definition

from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup

class DepressionClassifier(pl.LightningModule):
    """DistilRoBERTa sequence classifier wrapped as a LightningModule.

    Args:
        n_classes: Number of target classes (size of the classification head).
        steps_per_epoch: Optimizer steps per epoch. Together with ``n_epochs``
            it sizes the linear warmup/decay schedule; ``None`` disables the
            scheduler.
        n_epochs: Number of epochs the schedule spans; ``None`` disables the
            scheduler.
        lr: Learning rate passed to AdamW.
    """

    def __init__(self, n_classes, steps_per_epoch=None, n_epochs=None, lr=2e-5):
        super().__init__()
        # Store the constructor arguments in every checkpoint so that
        # load_from_checkpoint() can rebuild the module without them being
        # passed again (explicit keyword arguments still override them).
        self.save_hyperparameters()
        # Use DistilRoBERTa for faster training.
        # from_pretrained() returns the network in eval mode; .train() switches
        # it to training mode explicitly so dropout is active during fitting.
        self.model = AutoModelForSequenceClassification.from_pretrained('distilroberta-base', num_labels=n_classes).train()
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs
        self.lr = lr
        # Separate accuracy metrics for training and validation
        self.train_acc = Accuracy(task="multiclass", num_classes=n_classes)
        self.val_acc = Accuracy(task="multiclass", num_classes=n_classes)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model; its output carries ``logits`` and, when labels are given, ``loss``."""
        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    def _shared_step(self, batch, metric):
        """Forward one labelled batch, update ``metric`` with the predictions and return the loss."""
        outputs = self(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['labels']
        )
        preds = torch.argmax(outputs.logits, dim=1)
        metric.update(preds, batch['labels'])
        return outputs.loss

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._shared_step(batch, self.train_acc)
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def on_train_epoch_end(self):
        """Report the accuracy accumulated over the training epoch, then reset it."""
        train_epoch_acc = self.train_acc.compute()
        print(f"Epoch {self.current_epoch} - Training Accuracy: {train_epoch_acc:.4f}")
        # Also send the value to the logger so it is not only visible in stdout.
        self.log('train_acc', train_epoch_acc, prog_bar=True)
        self.train_acc.reset()

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self._shared_step(batch, self.val_acc)
        self.log('val_loss', loss, prog_bar=True)
        return loss

    def on_validation_epoch_end(self):
        """Report the accuracy accumulated over the validation run, then reset it."""
        val_epoch_acc = self.val_acc.compute()
        print(f"Epoch {self.current_epoch} - Validation Accuracy: {val_epoch_acc:.4f}")
        # Logged as well so callbacks / loggers can consume 'val_acc'.
        self.log('val_acc', val_epoch_acc, prog_bar=True)
        self.val_acc.reset()

    def configure_optimizers(self):
        """Return AdamW, plus a per-step linear warmup/decay schedule when its length is known.

        The scheduler is only created when both ``steps_per_epoch`` and
        ``n_epochs`` were supplied; the first 10% of the total steps are warmup.
        """
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        if self.steps_per_epoch is None or self.n_epochs is None:
            return optimizer
        total_steps = self.steps_per_epoch * self.n_epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps
        )
        # 'interval': 'step' makes Lightning advance the schedule after every optimizer step.
        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]


In [5]:
# Cell 4: Training Setup and Execution

# Load the dataset (update the path to your CSV file).
# The CSV must provide a "body" column (the text that gets tokenized) and a "label" column.
df = pd.read_csv('reddit_depression_dataset.csv')

print("Original dataset shape:", df.shape)

# Drop rows where 'body' or 'label' contains NaN values
df = df.dropna(subset=['body', 'label'])
print("Dataset shape after dropping NaNs:", df.shape)

# Initialize the tokenizer from the same checkpoint name as the model
# ('distilroberta-base'), matching the tokenizer used in the inference cell.
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')

# Create the DataModule
data_module = DepressionDataModule(df, tokenizer, max_length=128, batch_size=128)
data_module.setup()

# Calculate steps per epoch (for learning rate scheduling)
steps_per_epoch = len(data_module.train_dataloader())
n_epochs = 1  # Adjust the number of epochs as needed

# Create the LightningModule model (n_classes is derived from the labels present in the data)
n_classes = df['label'].nunique()
model = DepressionClassifier(n_classes=n_classes, steps_per_epoch=steps_per_epoch, n_epochs=n_epochs, lr=2e-5)

# Set up callbacks for early stopping and checkpointing.
# NOTE: with patience=3, early stopping can only trigger once n_epochs is
# raised well above 1; for a single-epoch run it has no effect.
early_stop_callback = EarlyStopping(monitor='val_loss', patience=3, verbose=True, mode='min')
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath='checkpoints_new',
    filename='best-checkpoint',
    save_top_k=1,
    mode='min'
)

# Setup the profiler (using SimpleProfiler for Lightning 2.0)
profiler = SimpleProfiler(dirpath='profiler_logs')

# Configure the Trainer for Lightning 2.0 (bf16 mixed precision for speed and memory efficiency)
trainer = pl.Trainer(
    max_epochs=n_epochs,
    accelerator='gpu',
    devices=1,
    callbacks=[early_stop_callback, checkpoint_callback],
    profiler=profiler,
    precision="bf16-mixed",
    log_every_n_steps=10,
    enable_progress_bar=True,
)

# Start training using the DataModule
trainer.fit(model, datamodule=data_module)


  df = pd.read_csv('reddit_depression_dataset.csv')  # CSV should have "text" and "label" columns


Original dataset shape: (2470778, 8)
Dataset shape after dropping NaNs: (2009643, 8)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                             | Params | Mode 
-----------------------------------------------------------------------
0 | model     | RobertaForSequenceClassification | 82.1 M | train
1 | train_acc | MulticlassAccuracy               | 0      | train
2 | val_acc   | MulticlassAccuracy               | 0      | train
-----------------------------------------------------------------------
82.1 M    Trainable

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Epoch 0 - Validation Accuracy: 0.8125


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 0.135


Epoch 0 - Validation Accuracy: 0.9487
Epoch 0 - Training Accuracy: 0.9385


`Trainer.fit` stopped: `max_epochs=1` reached.
FIT Profiler Report

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|  Action                                                                                                                                                               	|  Mean duration (s)	|  Num calls      	|  Total time (s) 	|  Percentage %   	|
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|  Total                                                                                                                             

In [7]:
# Continue fine-tuning from the saved weights.
# NOTE: this restores only the model weights. The optimizer, LR schedule and
# epoch counter start fresh because no `ckpt_path` is passed to `trainer.fit`.

# The CSV must provide a "body" column (the text that gets tokenized) and a "label" column.
df = pd.read_csv('reddit_depression_dataset.csv')

print("Original dataset shape:", df.shape)

# Drop rows where 'body' or 'label' contains NaN values
df = df.dropna(subset=['body', 'label'])
print("Dataset shape after dropping NaNs:", df.shape)

# Initialize the tokenizer from the same checkpoint name as the model
# ('distilroberta-base'), matching the tokenizer used in the inference cell.
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')

# Create the DataModule
data_module = DepressionDataModule(df, tokenizer, max_length=128, batch_size=128)
data_module.setup()

# Specify the checkpoint path (update if necessary)
checkpoint_path = "checkpoints_new/best-checkpoint.ckpt"

# Load the trained weights into a new module.
# The constructor arguments are passed explicitly because the module's
# __init__ requires them; steps_per_epoch / n_epochs size a new warmup schedule.
model = DepressionClassifier.load_from_checkpoint(
    checkpoint_path,
    n_classes=df['label'].nunique(),
    steps_per_epoch=len(data_module.train_dataloader()),
    n_epochs=1,  # number of additional epochs the new LR schedule spans
    lr=2e-5
)

# Setup callbacks.
# NOTE: with patience=3, early stopping cannot trigger in a single-epoch run.
# NOTE(review): this run writes its best checkpoint to 'checkpoints/', not to
# 'checkpoints_new/' where the weights were loaded from — confirm this is intended.
early_stop_callback = EarlyStopping(monitor='val_loss', patience=3, verbose=True, mode='min')
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath='checkpoints',
    filename='best-checkpoint',
    save_top_k=1,
    mode='min'
)

# Configure the Trainer (ensure settings match your available GPU and desired precision)
trainer = pl.Trainer(
    max_epochs=1,  # additional epochs to run; counting starts again at epoch 0
    accelerator='gpu',
    devices=1,
    callbacks=[early_stop_callback, checkpoint_callback],
    precision="bf16-mixed",  # Mixed precision for efficiency
    log_every_n_steps=10,
    enable_progress_bar=True,
)

# Run the additional training epoch(s)
trainer.fit(model, datamodule=data_module)

  df = pd.read_csv('reddit_depression_dataset.csv') # CSV should have "text" and "label" columns


Original dataset shape: (2470778, 8)
Dataset shape after dropping NaNs: (2009643, 8)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                             | Params | Mode 
-----------------------------------------------------------------------
0 | model     | RobertaForSequenceClassification | 82.1 M | train
1 | train_acc | MulticlassAccuracy               | 0      | train
2 | val_acc   | MulticlassAccuracy               | 0      | train
-----------------------------------------------------------------------
82.1 M    Trainable

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Epoch 0 - Validation Accuracy: 0.9727


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 0.129


Epoch 0 - Validation Accuracy: 0.9510
Epoch 0 - Training Accuracy: 0.9499


`Trainer.fit` stopped: `max_epochs=1` reached.


In [None]:
# Cell 5: Load Best Model and Save

# Get the path to the best checkpoint
best_model_path = checkpoint_callback.best_model_path
print("Best model saved at:", best_model_path)

# Load the best model from checkpoint.
# load_from_checkpoint is a classmethod, so it is called on the class itself;
# building a throwaway instance first would load the pretrained weights for
# nothing, and Lightning 2.x rejects the call on an instance. n_classes is
# passed explicitly because __init__ requires it.
best_model = DepressionClassifier.load_from_checkpoint(best_model_path, n_classes=2)

# Optionally, save the best model to a desired location (e.g., a .pt file)
torch.save(best_model.state_dict(), "best_depression_classifier.pt")
print("Best model state_dict saved to best_depression_classifier.pt")


In [5]:
# --- Inference configuration ---
# Checkpoint to restore and the number of classes its head was trained with
# (binary classification: 0 = Not Depressed, 1 = Depressed).
checkpoint_path = "checkpoints_new/best-checkpoint.ckpt"
n_classes = 2

# Pick the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Restore the trained module; __init__ arguments it needs are passed as keywords.
model = DepressionClassifier.load_from_checkpoint(checkpoint_path, n_classes=n_classes)
model.to(device)
# Evaluation mode for prediction.
model.eval()

# Tokenizer used to encode the texts passed to predict().
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')

def predict(text):
    """Classify a single text with the module-level ``model`` and ``tokenizer``.

    Args:
        text: Input text; coerced with ``str()`` the same way the training
            dataset coerces its items.

    Returns:
        ``"Depressed"`` when the arg-max class is 1, otherwise ``"Not Depressed"``.
    """
    # Call the tokenizer directly: ``encode_plus`` is deprecated in favour of ``__call__``.
    # Same length / padding / truncation settings as the training dataset.
    encoding = tokenizer(
        str(text),
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    # Move the inputs to the device the model lives on.
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # No gradients are needed for prediction.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=1).item()
    return "Depressed" if prediction == 1 else "Not Depressed"

# Example: classify one hand-written sentence and show the predicted label.
sample_text = "I feel happy and am looking forward to tommorrow"
sample_prediction = predict(sample_text)
print("Prediction:", sample_prediction)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Prediction: Not Depressed
