RoBERTa with 3 Classes

https://huggingface.co/MichalMlodawski/nsfw-text-detection-large

In [1]:
import os

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Step 1: Load and preprocess the dataset
# The CSV location can be overridden with the NSFW_DATASET_CSV environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original local path is used unchanged.
DATASET_CSV = os.environ.get(
    "NSFW_DATASET_CSV",
    r"C:\VSCode Codes\ResAI\Fine Tuning Pre-Trained RoBERTa Model\Classified Synthetic Dataset.csv",
)
data = pd.read_csv(DATASET_CSV)

# Fail early, with a readable message, if the columns consumed by the
# train/validation split ("text" and "Label") are not present in the file.
missing_columns = {"text", "Label"} - set(data.columns)
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {sorted(missing_columns)}")

In [7]:
# Hold out 20% of the rows for validation. The "text" column supplies the
# inputs and the "Label" column the targets; random_state pins the split.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    *(data[column].tolist() for column in ("text", "Label")),
    random_state=42,
    test_size=0.2,
)

In [8]:
# Step 2: Tokenize the text
tokenizer = AutoTokenizer.from_pretrained("TostAI/nsfw-text-detection-large")

# Both splits are encoded with the same settings: truncation enabled,
# padding enabled, and a maximum length of 512. The training split is
# encoded first, then the validation split.
train_encodings, val_encodings = (
    tokenizer(split_texts, max_length=512, padding=True, truncation=True)
    for split_texts in (train_texts, val_texts)
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [9]:
# Step 3: Create a Dataset class
class NSFWTextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping whose ``.items()`` yields ``(field, values)``
            pairs, where ``values[idx]`` is the entry for sample ``idx``.
        labels: Sequence of labels, one per sample; its length defines the
            dataset length. Labels for 3 classes: 0 (SAFE),
            1 (QUESTIONABLE), 2 (UNSAFE).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample ``idx``.

        Every encoding field is converted to a tensor, and the sample's label
        is added under the ``"labels"`` key.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a dataset (training first, then
# validation).
train_dataset, val_dataset = (
    NSFWTextDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)


In [10]:
# Step 4: Load the pre-trained model
# The classifier is configured for three classes:
# 0 (SAFE), 1 (QUESTIONABLE), 2 (UNSAFE)
model = AutoModelForSequenceClassification.from_pretrained(
    "TostAI/nsfw-text-detection-large", num_labels=3
)

# Turn off gradients for every parameter of `base_model` (the backbone), so
# only parameters outside it -- the classification head -- remain trainable.
for param in model.base_model.parameters():
    param.requires_grad_(False)

In [11]:
# Step 5: Create data loaders
# The training shuffle draws from its own seeded generator, so on a fresh
# run the batch order is repeatable and does not depend on how much of the
# global RNG earlier cells consumed. The seed matches the split's random_state.
shuffle_generator = torch.Generator().manual_seed(42)
train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    generator=shuffle_generator,
)

# Validation data is iterated without shuffling.
val_loader = DataLoader(val_dataset, batch_size=8)

In [12]:
# Step 6: Define the optimizer (only for trainable parameters)
# Parameters whose requires_grad flag is False are left out of the optimizer.
optimizer = AdamW(
    [p for p in model.parameters() if p.requires_grad],
    lr=2e-5,
)

In [13]:
# Step 7: Define the learning rate scheduler
# Total optimizer steps = batches per epoch x 3 epochs.
num_training_steps = 3 * len(train_loader)

# "linear" schedule with no warmup steps, spanning the whole training run.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [14]:
# Step 8: Training loop
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Seed the global RNG so stochastic layers (e.g. dropout while in train mode)
# start from a known state. NOTE(review): this does not by itself guarantee
# bit-identical results on GPU -- confirm if strict determinism is required.
torch.manual_seed(42)

# Derive the epoch count from the scheduler's step budget so the loop, the
# progress bar and the learning-rate schedule cannot drift apart.
steps_per_epoch = len(train_loader)
num_epochs = num_training_steps // steps_per_epoch if steps_per_epoch else 0

# The context manager closes the progress bar even if training is interrupted.
with tqdm(range(num_training_steps)) as progress_bar:
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        model.train()
        for batch in train_loader:
            # Move data to device (GPU or CPU)
            batch = {k: v.to(device) for k, v in batch.items()}

            # Forward pass; the batch includes "labels", so the model output
            # carries a loss.
            outputs = model(**batch)
            loss = outputs.loss

            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            progress_bar.update(1)

        # Validation loop
        model.eval()
        val_loss_sum = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                batch_size = batch["labels"].size(0)
                # Weight each batch's loss by its size so a smaller final batch
                # does not skew the average. Assumes outputs.loss is the
                # per-batch mean -- TODO confirm for this model's loss.
                val_loss_sum += outputs.loss.item() * batch_size
                predictions = torch.argmax(outputs.logits, dim=-1)
                correct += (predictions == batch["labels"]).sum().item()
                total += batch_size

        # Guard against dividing by zero when there is nothing to validate on.
        if total == 0:
            print("Validation set is empty; skipping validation metrics.")
            continue

        val_loss = val_loss_sum / total
        accuracy = correct / total
        print(f"Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}")

  0%|          | 0/3090 [00:00<?, ?it/s]

Epoch 1/3


 33%|███▎      | 1030/3090 [08:06<15:42,  2.19it/s]

Validation Loss: 0.1968, Accuracy: 0.9485
Epoch 2/3


 67%|██████▋   | 2060/3090 [17:27<08:04,  2.12it/s]   

Validation Loss: 0.1923, Accuracy: 0.9408
Epoch 3/3


100%|██████████| 3090/3090 [26:50<00:00,  2.10it/s]  

Validation Loss: 0.1971, Accuracy: 0.9369


In [15]:
# Step 9: Save the updated model
# The model and its tokenizer are written to the same directory.
# NOTE(review): this saves the weights as they stand after the final epoch;
# in the recorded run, validation accuracy was highest after epoch 1 --
# consider keeping the best checkpoint instead.
save_dir = "./finetuned_NSFW_text_detection_RoBERTa_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print("Model training complete and saved.")

Model training complete and saved.
