In [1]:
# Install necessary libraries
!pip install transformers datasets wandb torch psutil scikit-learn sacremoses

# Import required modules
import wandb
import psutil
import pandas as pd
import torch
import sacremoses
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.model_selection import train_test_split

from kaggle_secrets import UserSecretsClient

# Client for the Kaggle secrets store; queried later for the "wandb_key" secret.
user_secrets = UserSecretsClient()

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [2]:
# Load dataset
dataset_path = "/kaggle/input/ban-pl-1/BAN-PL.csv"  # Update this path
data = pd.read_csv(dataset_path, encoding="utf-8")

# Fail fast if the CSV lacks the columns the rest of the notebook reads.
required_columns = {"Text", "Class"}
missing_columns = required_columns - set(data.columns)
assert not missing_columns, f"{dataset_path} is missing columns: {sorted(missing_columns)}"

# Missing values would otherwise flow into the tokenizer input and the label
# tensors further down, where the resulting error is much harder to trace back.
assert data[["Text", "Class"]].notna().all().all(), "Text/Class columns contain missing values"

# Preview the dataset
print(data.head())

     id                                               Text  Class
0  2200  \n\n\n            Polska wtedy oficjalnie powi...      0
1  2201  \n  Gigantyczna różnica\n\n{USERNAME}: biorac ...      0
2  2202  \n\n            {USERNAME}: Moj kumpel budowla...      0
3  2203         kura, rzodkiewka za 3pln to nie jest tanio      0
4  2204  {USERNAME}: większość nie idzie w marszu za PO...      0


In [3]:
# Load HerBERT tokenizer
# Fetched by checkpoint name; the same name is passed to the model loader later
# in the notebook.
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")

# Tokenize text
def tokenize_function(examples, text_column=None):
    """Tokenize a batch of examples with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping from column name to the values of that column
            (presumably a batch as passed by ``datasets.Dataset.map`` with
            ``batched=True`` -- confirm against the caller).
        text_column: Name of the column holding the raw text. When omitted,
            ``"text"`` is used if present, otherwise ``"Text"`` (the column
            name used by the BAN-PL CSV loaded in this notebook).

    Returns:
        The tokenizer output, truncated and padded to 128 tokens.

    Raises:
        KeyError: If no text column can be found in ``examples``.
    """
    if text_column is None:
        # Prefer the original lower-case key so existing callers keep working,
        # then fall back to the capitalised column this notebook's data uses.
        text_column = next((name for name in ("text", "Text") if name in examples), None)
        if text_column is None:
            raise KeyError('examples has neither a "text" nor a "Text" column')
    return tokenizer(examples[text_column], truncation=True, padding="max_length", max_length=128)

# Split dataset into training and validation sets
# Pull the raw texts and their class labels out of the frame as plain lists.
texts, labels = (data[column].tolist() for column in ("Text", "Class"))

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    random_state=42,
    test_size=0.2,
)

tokenizer_config.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/907k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/556k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

In [4]:
# Tokenize splits
# Both splits are encoded with identical settings: truncation at 128 tokens,
# padding enabled, and PyTorch tensors as the return type.
encode_options = {
    "truncation": True,
    "padding": True,
    "max_length": 128,
    "return_tensors": "pt",
}
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)

# Define PyTorch dataset class
class BANPLDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to an
            indexable container (tensor or nested list) whose first index
            selects a sample.
        labels: Indexable sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a tensor that does not share storage with the source."""
        if torch.is_tensor(value):
            # torch.tensor(<tensor>) emits a UserWarning on every call;
            # clone().detach() is the recommended way to copy an existing tensor.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return one sample as a dict of feature tensors plus a ``labels`` entry."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = self._to_tensor(self.labels[idx])
        return item

# Create PyTorch datasets
# One dataset per split, each pairing that split's encodings with its labels.
train_dataset = BANPLDataset(encodings=train_encodings, labels=train_labels)
val_dataset = BANPLDataset(encodings=val_encodings, labels=val_labels)

In [5]:
# Load the pre-trained HerBERT model with a 2-class sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained("allegro/herbert-base-cased", num_labels=2)

# Mixed-precision (fp16) training needs a GPU; fall back to full precision when no
# CUDA device is visible so the notebook still runs on a CPU-only session.
use_fp16 = torch.cuda.is_available()

# Define training arguments
training_args = TrainingArguments(
    output_dir="/kaggle/working/results",          # output directory
    evaluation_strategy="epoch",         # evaluate at the end of each epoch
    save_strategy="epoch",               # checkpoint at the end of each epoch as well
    logging_dir="/kaggle/working/results/logs",           # directory for storing logs
    per_device_train_batch_size=32,      # training batch size per device
    per_device_eval_batch_size=32,       # evaluation batch size per device
    gradient_accumulation_steps=2,       # accumulate gradients over 2 batches before each optimizer step
    learning_rate=1e-5,                  # initial learning rate
    num_train_epochs=5,                  # number of passes over the training set
    weight_decay=0.01,                   # regularization
    fp16=use_fp16,                       # mixed precision training when a GPU is available
    load_best_model_at_end=True,         # load the best model when finished
    metric_for_best_model="accuracy",    # use accuracy for determining the best model
    greater_is_better=True,              # higher is better
    report_to="wandb",                   # log to W&B
    remove_unused_columns=False,         # do not let the Trainer drop dataset columns
)

pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Initialize W&B
# Authenticate with the API key stored under the "wandb_key" secret.
wandb.login(key=user_secrets.get_secret("wandb_key"))

# The complete set of training arguments is attached to the run as its config.
run_config = training_args.to_dict()
run_settings = wandb.Settings(start_method="fork", _disable_stats=True)  # Fixes some runtime issues with multiprocessing

wandb.init(
    entity="m-baloniakk",             # W&B entity
    project="herbert-hate-detector",  # W&B project name
    name="fine-tune-ban-run-2",       # W&B run name
    settings=run_settings,
    config=run_config,
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mm-baloniak[0m ([33mm-baloniakk[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Tracking run with wandb version 0.19.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250122_230215-vzhhlluz[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mfine-tune-ban-run-2[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/m-baloniakk/herbert-hate-detector[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/m-baloniakk/herbert-hate-detector/runs/vzhhlluz[0m


In [7]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Define the metrics function
def compute_metrics(p):
    """Score a batch of predictions against the reference labels.

    Args:
        p: Pair that unpacks into ``(predictions, labels)``; ``predictions``
            must support ``argmax(axis=-1)`` (presumably per-class scores
            produced by the Trainer -- confirm against the caller).

    Returns:
        Dict with ``accuracy`` plus support-weighted ``f1``, ``precision``
        and ``recall``.
    """
    scores, references = p
    # The predicted class is the index of the largest score along the last axis.
    predicted = scores.argmax(axis=-1)

    metrics = {'accuracy': accuracy_score(references, predicted)}

    # The remaining metrics all share the same weighted-average call signature.
    weighted_scorers = (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    for metric_name, scorer in weighted_scorers:
        metrics[metric_name] = scorer(references, predicted, average='weighted')

    return metrics

In [8]:
# Collect everything the Trainer needs in one place before constructing it.
trainer_options = dict(
    model=model,                      # model to fine-tune
    args=training_args,               # training arguments
    train_dataset=train_dataset,      # training split
    eval_dataset=val_dataset,         # validation split
    tokenizer=tokenizer,              # tokenizer
    compute_metrics=compute_metrics,  # metrics function
)
trainer = Trainer(**trainer_options)

# Train the model, then evaluate it on the validation split
trainer.train()
results = trainer.evaluate()

# Finish W&B run
wandb.finish()

  trainer = Trainer(
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.268217,0.89,0.89,0.890006,0.89
2,No log,0.236345,0.910625,0.910623,0.910643,0.910625
3,No log,0.229794,0.91375,0.91375,0.913759,0.91375
4,0.275400,0.22917,0.914583,0.914583,0.914587,0.914583
5,0.275400,0.229293,0.912917,0.912906,0.91305,0.912917


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:           eval/accuracy ▁▇████
[34m[1mwandb[0m:                 eval/f1 ▁▇████
[34m[1mwandb[0m:               eval/loss █▂▁▁▁▁
[34m[1mwandb[0m:          eval/precision ▁▇████
[34m[1mwandb[0m:             eval/recall ▁▇████
[34m[1mwandb[0m:            eval/runtime ▁██▇▅▆
[34m[1mwandb[0m: eval/samples_per_second █▁▁▂▄▃
[34m[1mwandb[0m:   eval/steps_per_second █▁▁▂▄▃
[34m[1mwandb[0m:             train/epoch ▁▃▅▅▆███
[34m[1mwandb[0m:       train/global_step ▁▃▅▅▆███
[34m[1mwandb[0m:         train/grad_norm ▁
[34m[1mwandb[0m:     train/learning_rate ▁
[34m[1mwandb[0m:              train/loss ▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:            eval/accuracy 0.91458
[34m[1mwandb[0m:                  eval/f1 0.91458
[34m[1mwandb[0m:               