In [1]:
# Install necessary libraries
!pip install transformers datasets wandb torch psutil scikit-learn sacremoses

# Import required modules
import wandb
import psutil
import pandas as pd
import torch
import sacremoses
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.model_selection import train_test_split

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


KeyboardInterrupt: 

In [None]:
# Initialize W&B
# SECURITY: the API key must never be hardcoded in the notebook. Called without
# `key`, wandb.login() picks the key up from the WANDB_API_KEY environment
# variable (or an existing login) and otherwise prompts for it.
# On Kaggle, store the key as a notebook secret and export it to WANDB_API_KEY
# before running this cell.
# NOTE(review): any key that was ever committed in this cell should be treated
# as leaked and rotated in the W&B account settings.
wandb.login()

# Start the run; `config` is only metadata shown in the W&B UI, so keep it in
# sync with the values actually passed to TrainingArguments.
wandb.init(
    project="herbert-hate-detector",  # Replace with your project name
    entity="m-baloniakk",            # Replace with your W&B username/entity
    name="fine-tune-ban-pl-2",       # Optional: Run name
    config={                         # Hyperparameters for tracking
        "learning_rate": 2e-5,
        "epochs": 3,
        "batch_size": 16,
        "model": "HerBERT",
        "dataset": "BAN_PL_1"
    },
    settings=wandb.Settings(start_method="fork")  # Fixes some runtime issues with multiprocessing
)

# Load dataset
dataset_path = "/kaggle/input/ban-pl-1/BAN-PL.csv"  # Update this path
data = pd.read_csv(dataset_path, encoding="utf-8")

# Fail fast if the CSV lacks the columns the later cells index into
# ("Text" for the inputs, "Class" for the labels).
missing_columns = {"Text", "Class"} - set(data.columns)
assert not missing_columns, f"Dataset is missing expected columns: {sorted(missing_columns)}"

# Preview the dataset (a bare last expression renders as a rich table in the notebook)
data.head()

In [None]:
# Fetch the tokenizer published alongside the HerBERT base (cased) checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    "allegro/herbert-base-cased",
)

# Tokenize text
def tokenize_function(examples):
    """Tokenize a batch of examples with the module-level ``tokenizer``.

    Args:
        examples: Mapping with a ``"Text"`` entry holding the raw strings.
            The key matches the column name of the loaded dataset
            (the lowercase ``"text"`` used before raised ``KeyError`` on it).

    Returns:
        Whatever ``tokenizer`` returns for the batch, truncated and padded to
        exactly 128 tokens per example.

    Note:
        This helper is not called by the cells visible in this notebook; it is
        intended for batched mapping (e.g. ``Dataset.map``) — TODO confirm or remove.
    """
    return tokenizer(
        examples["Text"],
        truncation=True,
        padding="max_length",  # fixed-length padding, unlike the dynamic padding used for the splits
        max_length=128,
    )

# Pull the text and label columns out of the DataFrame as plain Python lists
texts, labels = data["Text"].tolist(), data["Class"].tolist()

# Hold out 20% of the examples for validation; the fixed seed keeps the split reproducible
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Encode both splits with identical settings: truncate to 128 tokens, pad each
# split to its longest sequence, and return PyTorch tensors.
encode_options = dict(truncation=True, padding=True, max_length=128, return_tensors="pt")
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)

# Define PyTorch dataset class
class BANPLDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            (a tensor or a nested list); only ``.items()`` and integer
            indexing of each value are used.
        labels: Sequence with one class label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the features of example ``idx`` plus a ``"labels"`` tensor."""
        # torch.as_tensor reuses a value that is already a tensor (no copy and no
        # "copy construct from a tensor" UserWarning, which torch.tensor() emits)
        # and still converts list-backed encodings.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        # Class indices are forced to int64 so float- or bool-typed label columns
        # still yield integer labels.
        item["labels"] = torch.as_tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap each split's encodings and labels in a BANPLDataset
train_dataset = BANPLDataset(encodings=train_encodings, labels=train_labels)
val_dataset = BANPLDataset(encodings=val_encodings, labels=val_labels)

In [None]:
# Instantiate HerBERT with a sequence-classification head sized for two classes
model = AutoModelForSequenceClassification.from_pretrained(
    "allegro/herbert-base-cased",
    num_labels=2,
)

# Define training arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",          # output directory for checkpoints
    evaluation_strategy="steps",     # evaluate every `eval_steps` steps
    eval_steps=100,                  # explicit (previously implied by logging_steps)
    logging_dir="./logs",            # directory for storing logs
    logging_steps=100,               # log every 100 steps
    save_steps=300,                  # save a checkpoint every 300 steps (must stay a multiple of eval_steps
                                     # because load_best_model_at_end=True)
    learning_rate=2e-5,              # matches the value logged in the W&B config; previously unset,
                                     # so the library default (5e-5) was silently used instead
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    gradient_accumulation_steps=1,   # no gradient accumulation
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA device exists (fp16=True fails on CPU)
    num_train_epochs=3,              # number of training epochs
    weight_decay=0.01,               # strength of weight decay
    push_to_hub=False,               # don't push to HuggingFace hub
    load_best_model_at_end=True,     # load the best model when finished
    report_to="wandb",               # log to W&B
    remove_unused_columns=False,     # keep every key the dataset items provide
    metric_for_best_model="accuracy", # metric to track for best model (key returned by compute_metrics)
    greater_is_better=True,          # higher accuracy is better
)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Metrics callback handed to the Trainer
def compute_metrics(p):
    """Score a batch of predictions against the reference labels.

    Args:
        p: Two-element iterable ``(predictions, labels)``; ``predictions`` must
            support ``.argmax(axis=-1)`` (one score per class in the last axis).

    Returns:
        dict with ``accuracy`` plus class-weighted ``f1``, ``precision`` and
        ``recall``.
    """
    scores, references = p
    predicted = scores.argmax(axis=-1)  # index of the highest-scoring class

    metrics = {'accuracy': accuracy_score(references, predicted)}
    weighted_scorers = (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    for metric_name, scorer in weighted_scorers:
        metrics[metric_name] = scorer(references, predicted, average='weighted')
    return metrics


In [None]:
# Wire the model, arguments, datasets and metrics callback into a Trainer
trainer = Trainer(
    model=model,                         # your model
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # your training dataset
    eval_dataset=val_dataset,            # your eval dataset
    tokenizer=tokenizer,                 # tokenizer
    compute_metrics=compute_metrics,     # metrics function (e.g., accuracy)
)

# Train the model
trainer.train()

# Evaluate the model on the validation split
results = trainer.evaluate()

# Show the final metrics as the cell output (previously computed but never displayed)
results


In [None]:
# Finish the W&B run started by wandb.init() earlier in the notebook
wandb.finish()