In [1]:
# Install necessary libraries
!pip install transformers datasets wandb torch psutil scikit-learn sacremoses

# Import required modules
import os

import pandas as pd
import psutil
import sacremoses
import torch
import wandb
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [2]:
# Initialize W&B
# The API key is taken from the WANDB_API_KEY environment variable instead of
# being written into the notebook. If the variable is unset, key=None is passed
# and wandb is left to resolve credentials itself (see the wandb.login docs).
# NOTE(review): the key that used to be hardcoded on this line was exposed in
# the notebook and should be revoked in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
wandb.init(
    project="herbert-hate-detector",  # Replace with your project name
    entity="m-baloniakk",            # Replace with your W&B username/entity
    name="fine-tune-ban-pl-2",       # Optional: Run name
    # Hyperparameters recorded with the run (tracking only; nothing here is
    # passed to the trainer).
    # NOTE(review): epochs/batch_size below differ from the TrainingArguments
    # cell (num_train_epochs=10, per_device_train_batch_size=128) - confirm
    # which values are intended and align them.
    config={
        "learning_rate": 2e-5,
        "epochs": 3,
        "batch_size": 16,
        "model": "HerBERT",
        "dataset": "BAN_PL_1"
    },
    settings=wandb.Settings(start_method="fork")  # Fixes some runtime issues with multiprocessing
)

# Load dataset
dataset_path = "/kaggle/input/ban-pl-1/BAN-PL.csv"  # Update this path
data = pd.read_csv(dataset_path, encoding="utf-8")

# Preview the dataset
print(data.head())

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mm-baloniak[0m ([33mm-baloniakk[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


     id                                               Text  Class
0  2200  \n\n\n            Polska wtedy oficjalnie powi...      0
1  2201  \n  Gigantyczna różnica\n\n{USERNAME}: biorac ...      0
2  2202  \n\n            {USERNAME}: Moj kumpel budowla...      0
3  2203         kura, rzodkiewka za 3pln to nie jest tanio      0
4  2204  {USERNAME}: większość nie idzie w marszu za PO...      0


In [3]:
# Load HerBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")

# Tokenize text
def tokenize_function(examples):
    """Tokenize the text field of a batch of examples.

    Args:
        examples: Mapping-like batch holding the raw texts under the "Text"
            key (the column name used by the loaded CSV). A lowercase "text"
            key is still accepted as a fallback.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts, called
        with truncation=True, padding="max_length" and max_length=128.
    """
    # The dataset column is "Text"; looking up only "text" raised KeyError.
    texts = examples["Text"] if "Text" in examples else examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=128)

# Pull the raw texts and their class labels out of the DataFrame as plain lists
texts, labels = (data[column].tolist() for column in ("Text", "Class"))

# Hold out 20% of the examples for validation; random_state fixes the shuffle
split_parts = train_test_split(texts, labels, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = split_parts

tokenizer_config.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/907k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/556k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

In [4]:
# Tokenize splits
# Both splits go through the tokenizer with the same keyword arguments;
# return_tensors="pt" requests PyTorch tensors in the result.
encode_kwargs = dict(truncation=True, padding=True, max_length=128, return_tensors="pt")
train_encodings = tokenizer(train_texts, **encode_kwargs)
val_encodings = tokenizer(val_texts, **encode_kwargs)

# Define PyTorch dataset class
class BANPLDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: Mapping from feature name to an indexable collection with
            one entry per example (tensors or nested lists).
        labels: Indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    @staticmethod
    def _as_new_tensor(value):
        """Return ``value`` as a tensor that owns its own copy of the data."""
        # torch.tensor(existing_tensor) emits a UserWarning on every call;
        # clone().detach() is the copy PyTorch recommends for tensor inputs.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict of per-feature tensors for example ``idx`` plus "labels"."""
        item = {key: self._as_new_tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = self._as_new_tensor(self.labels[idx])
        return item

# Create PyTorch datasets
# Wrap each (encodings, labels) pair in a BANPLDataset: training first, then validation.
train_dataset, val_dataset = (
    BANPLDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)

In [9]:
# Load the pre-trained HerBERT model
model = AutoModelForSequenceClassification.from_pretrained("allegro/herbert-base-cased", num_labels=2)

# Define training arguments
# Evaluation, logging and checkpointing all run every 50 optimizer steps.
# eval_steps and save_strategy are spelled out (previously left to library
# defaults) because load_best_model_at_end relies on checkpoints lining up
# with evaluations.
# NOTE(review): learning_rate is not set here, so the library default applies,
# while the W&B run config records 2e-5 - confirm the intended value.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; switch once the installed version is pinned.
training_args = TrainingArguments(
    output_dir="./results",           # checkpoints are written here
    evaluation_strategy="steps",      # evaluate on a step schedule
    eval_steps=50,                    # evaluate every 50 steps
    logging_dir="./logs",             # directory for storing logs
    logging_steps=50,                 # log every 50 steps
    save_strategy="steps",            # checkpoint on a step schedule
    save_steps=50,                    # save a checkpoint every 50 steps
    per_device_train_batch_size=128,  # batch size per device during training
    per_device_eval_batch_size=128,   # batch size per device for evaluation
    gradient_accumulation_steps=2,    # accumulate 2 batches per optimizer step
    fp16=True,                        # mixed-precision training
    num_train_epochs=10,              # number of training epochs
    weight_decay=0.01,                # strength of weight decay
    push_to_hub=False,                # don't push to HuggingFace hub
    load_best_model_at_end=True,      # load the best model when finished
    report_to="wandb",                # log to W&B
    remove_unused_columns=False,      # keep every key the dataset returns
    metric_for_best_model="accuracy", # metric to track for best model
    greater_is_better=True,           # higher accuracy is better
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Define the metrics function
def compute_metrics(p):
    """Score a batch of predictions against the reference labels.

    Args:
        p: A (predictions, labels) pair. ``predictions`` holds per-class
            scores; the index of the largest score along the last axis is
            taken as the predicted class.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'; the
        last three use weighted averaging.
    """
    scores_per_class, labels = p
    predicted = scores_per_class.argmax(axis=-1)

    metrics = {'accuracy': accuracy_score(labels, predicted)}
    weighted_scorers = (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    for metric_name, scorer in weighted_scorers:
        metrics[metric_name] = scorer(labels, predicted, average='weighted')
    return metrics

In [10]:
!rm -r /kaggle/working/results

In [11]:
# Collect everything the Trainer needs: model, arguments, both datasets,
# the tokenizer and the metrics callback.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "tokenizer": tokenizer,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Fine-tune the model
trainer.train()

# Run a final evaluation with the trainer's eval dataset and keep the metrics
results = trainer.evaluate()

  trainer = Trainer(
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.3674,0.226217,0.908542,0.908537,0.90859,0.908542
100,0.1844,0.217282,0.9175,0.917423,0.918868,0.9175
150,0.1252,0.227634,0.9225,0.922486,0.922911,0.9225
200,0.0778,0.271127,0.921875,0.921868,0.921971,0.921875
250,0.0492,0.28522,0.925833,0.925834,0.925839,0.925833
300,0.0321,0.32686,0.922917,0.922912,0.922971,0.922917
350,0.022,0.33132,0.924583,0.92458,0.924616,0.924583


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [12]:
# Finish the W&B run started by wandb.init() above; the run history and
# summary tables are printed as this cell's output.
wandb.finish()

0,1
eval/accuracy,▁▆▁▆▂▅▇▇█▇██
eval/f1,▁▆▁▆▂▅▇▇█▇██
eval/loss,▂▁▄▅▂▁▂▅▅██▅
eval/precision,▁▆▂▆▂▅▇▇█▇██
eval/recall,▁▆▁▆▂▅▇▇█▇██
eval/runtime,▆▃▁▇▃▇▇▇▇▃█▇
eval/samples_per_second,▃▆█▂▆▂▂▂▂▆▁▂
eval/steps_per_second,▂▇█▂▇▂▂▂▂▇▁▂
train/epoch,▁▁▂▂▃▃▄▄▁▁▂▂▃▃▄▄▅▅▆▆████
train/global_step,▁▁▂▂▃▃▄▄▁▁▂▂▃▃▄▄▅▅▆▆████

0,1
eval/accuracy,0.92583
eval/f1,0.92583
eval/loss,0.28522
eval/precision,0.92584
eval/recall,0.92583
eval/runtime,19.7487
eval/samples_per_second,243.054
eval/steps_per_second,0.962
total_flos,1.230938761396224e+16
train/epoch,9.74667


In [15]:
# Recursively archive the results/ directory (all saved checkpoint folders
# and their files) into results.zip in the working directory.
!zip -r results.zip results/

  adding: results/ (stored 0%)
  adding: results/checkpoint-350/ (stored 0%)
  adding: results/checkpoint-350/trainer_state.json (deflated 75%)
  adding: results/checkpoint-350/model.safetensors (deflated 9%)
  adding: results/checkpoint-350/scheduler.pt (deflated 55%)
  adding: results/checkpoint-350/tokenizer.json (deflated 82%)
  adding: results/checkpoint-350/tokenizer_config.json (deflated 76%)
  adding: results/checkpoint-350/rng_state.pth (deflated 25%)
  adding: results/checkpoint-350/training_args.bin (deflated 51%)
  adding: results/checkpoint-350/optimizer.pt (deflated 17%)
  adding: results/checkpoint-350/merges.txt (deflated 60%)
  adding: results/checkpoint-350/vocab.json (deflated 62%)
  adding: results/checkpoint-350/config.json (deflated 50%)
  adding: results/checkpoint-350/special_tokens_map.json (deflated 53%)
  adding: results/checkpoint-370/ (stored 0%)
  adding: results/checkpoint-370/trainer_state.json (deflated 75%)
  adding: results/checkpoint-370/model.safete