In [None]:
!pip -q install "transformers>=4.42" "datasets>=2.20" "evaluate>=0.4.2" "scikit-learn>=1.5" "accelerate>=0.30" "tqdm>=4.66"


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
import sys, numpy, torch, transformers, datasets, sklearn
# Record the interpreter and library versions used for this run.
print(sys.version)
print("numpy", numpy.__version__)
print("torch", torch.__version__)
print("transformers", transformers.__version__)
print("datasets", datasets.__version__)


3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
numpy 2.0.2
torch 2.8.0+cu126
transformers 4.56.0
datasets 4.0.0


# Clone the repository (includes the dataset)

In [7]:
REPO_URL = "" # some url...
BRANCH   = "main"
VERIFY_DATA = True

import os, subprocess, shutil, csv
from pathlib import Path

%cd /content
repo_dir = REPO_URL.rsplit("/", 1)[-1].replace(".git", "")
if Path(repo_dir).exists():
    print(f"Repo '{repo_dir}' exists — pulling latest…")
    %cd /content/{repo_dir}
    if BRANCH:
        subprocess.run(["git", "fetch", "origin", BRANCH], check=True)
        subprocess.run(["git", "checkout", BRANCH], check=True)
        subprocess.run(["git", "reset", "--hard", f"origin/{BRANCH}"], check=True)
    else:
        subprocess.run(["git", "pull"], check=True)
else:
    print("Cloning the repo")
    subprocess.run(["git", "clone", REPO_URL], check=True)
    %cd /content/{repo_dir}
    if BRANCH:
        subprocess.run(["git", "checkout", BRANCH], check=True)

print("Repo root:", Path.cwd())

def short_tree(root=".", max_depth=2):
    """Print an indented listing of ``root`` down to ``max_depth`` levels.

    Entries are printed parent-first, with the children of each directory in
    sorted order. Every entry is prefixed with "├─ " and indented by two
    spaces per level below the first.

    Directories deeper than ``max_depth`` are never opened, so large hidden
    trees (e.g. ``.git/objects``) are not traversed just to be filtered out.
    Symlinked directories are listed but not descended into.

    Args:
        root: Directory to list (str or Path).
        max_depth: Deepest level to print; entries directly under ``root``
            are at depth 1.
    """
    root = Path(root)

    def _walk(directory, depth):
        # Stop before opening anything that would not be printed anyway.
        if depth > max_depth:
            return
        try:
            children = sorted(directory.iterdir())
        except OSError:
            # Missing/unreadable directory or a non-directory root: print nothing.
            return
        for child in children:
            print("  " * (depth - 1) + "├─ " + child.name)
            try:
                descend = child.is_dir() and not child.is_symlink()
            except OSError:
                descend = False
            if descend:
                _walk(child, depth + 1)

    _walk(root, 1)

short_tree(".", max_depth=2)


if VERIFY_DATA:
    # CSV files the rest of the notebook reads, relative to the repo root.
    expected = [
        "src/data/processed_bert/train.csv",
        "src/data/processed_bert/train_augmented.csv",
        "src/data/processed_bert/val.csv",
        "src/data/processed_bert/test.csv",
    ]
    missing = []
    for rel in expected:
        p = Path(rel)
        if p.exists():
            print(f"{rel} ({p.stat().st_size/1e6:.2f} MB) exists")
        else:
            missing.append(rel)

    # Report absent files explicitly instead of skipping them silently.
    if missing:
        print("Missing expected data files:")
        for rel in missing:
            print(f"  - {rel}")

    def peek_csv(path, n=2):
        """Print the header, a required-column check and the first ``n`` rows of a CSV.

        String values longer than 80 characters are truncated for display.
        Any error while reading is reported with a message rather than raised,
        so one unreadable file does not stop the remaining checks.

        Args:
            path: CSV file to inspect (str or Path).
            n: Number of data rows to print.
        """
        path = Path(path)
        try:
            with path.open(newline="", encoding="utf-8") as f:
                reader = csv.DictReader(f)
                headers = reader.fieldnames or []
                print(f"\n{path} headers:", headers)
                # Column names are compared case-insensitively.
                required = {"text", "clean_text", "label"}
                lowered = {h.lower() for h in headers}
                print("Columns OK." if required.issubset(lowered)
                      else f"Missing columns: {sorted(required - lowered)}")
                for _, row in zip(range(n), reader):
                    row_show = {k: (v[:80]+"…") if isinstance(v, str) and len(v) > 80 else v
                                for k, v in row.items()}
                    print(row_show)
        except Exception as e:
            print(f"Peek failed for {path}: {e}")

    for rel in expected:
        if Path(rel).exists():
            peek_csv(rel)


/content
Cloning the repo
/content/nlp-offensive-language-classifier
Repo root: /content/nlp-offensive-language-classifier
├─ .DS_Store
├─ .git
  ├─ HEAD
  ├─ branches
  ├─ config
  ├─ description
  ├─ hooks
  ├─ index
  ├─ info
  ├─ logs
  ├─ objects
  ├─ packed-refs
  ├─ refs
├─ .gitignore
├─ .idea
  ├─ inspectionProfiles
  ├─ libraries
  ├─ misc.xml
  ├─ modules.xml
  ├─ nlp-offensive-language-classifier.iml
  ├─ prettier.xml
  ├─ vcs.xml
  ├─ workspace.xml
├─ .python-version
├─ LICENSE
├─ README.md
├─ models
  ├─ .DS_Store
  ├─ .gitkeep
  ├─ logreg
  ├─ majority
├─ notebooks
  ├─ stage1_evaluation.ipynb
  ├─ {01_eda.ipynb}
├─ predictions
  ├─ .DS_Store
  ├─ logreg
  ├─ majority
├─ requirements.txt
├─ results
  ├─ stage1_confusion_matrices.png
  ├─ stage1_overall_metrics.csv
  ├─ stage1_per_class_comparison.png
├─ run_phase1.py
├─ src
  ├─ .DS_Store
  ├─ __init__.py
  ├─ config.py
  ├─ data
  ├─ data_augmentation.py
  ├─ data_loader.py
  ├─ majority_baseline.py
  ├─ preprocessing.py

In [8]:
import torch
import pandas as pd
import numpy as np
from pathlib import Path
import json

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"[setup] Device: {device}")
if device.type == 'cuda':
    print(f"[setup] GPU: {torch.cuda.get_device_name(0)}")

[setup] Device: cuda
[setup] GPU: Tesla T4


# Config

In [9]:
# Project directories. These are relative paths, so they resolve against the
# current working directory (presumably the repo root set by the clone cell).
BASE_DIR = Path('src/data')
MODELS_DIR = Path('models')
RESULTS_DIR = Path('results')

# Validation and test splits of the BERT-preprocessed data.
VAL_PATH = BASE_DIR / 'processed_bert/val.csv'
TEST_PATH = BASE_DIR / 'processed_bert/test.csv'

# Shared experiment settings.
CONFIG = {
    'model_name': 'bert-base-uncased',
    'max_length': 128,
    'num_labels': 3,
    'seed': 42,
}

# Integer label id -> human-readable class name.
CLASS_NAMES = {0: 'hate_speech', 1: 'offensive', 2: 'neither'}

# Seed NumPy's and PyTorch's global random number generators.
np.random.seed(CONFIG['seed'])
torch.manual_seed(CONFIG['seed'])

<torch._C.Generator at 0x7f6824108470>

# Data loading

In [10]:
def load_train_data(use_augmented=False):
    """Read one of the two training CSVs into a DataFrame.

    Args:
        use_augmented: When True load ``train_augmented.csv``, otherwise
            ``train.csv`` (both under ``BASE_DIR / 'processed_bert'``).

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    if use_augmented:
        banner, rel_path = "Loading AUGMENTED training data", 'processed_bert/train_augmented.csv'
    else:
        banner, rel_path = "Loading ORIGINAL training data", 'processed_bert/train.csv'

    csv_path = BASE_DIR / rel_path
    print(banner)

    frame = pd.read_csv(csv_path)
    print(f"Loaded {len(frame):,} samples from {csv_path.name}")
    return frame

In [11]:
# Load both training variants plus the validation and test splits.
train_orig = load_train_data(use_augmented=False)
train_aug = load_train_data(use_augmented=True)
val_df = pd.read_csv(VAL_PATH)
test_df = pd.read_csv(TEST_PATH)

Loading ORIGINAL training data
Loaded 17,347 samples from train.csv
Loading AUGMENTED training data
Loaded 22,263 samples from train_augmented.csv


In [12]:
print(f"Val: {len(val_df):,} samples")

Val: 3,718 samples


In [13]:
print(f"Test: {len(test_df):,} samples")

Test: 3,718 samples


In [14]:
def peek_data(df, name="Dataset"):
    """Print shape, column names and the per-class label distribution of ``df``.

    Args:
        df: DataFrame with a ``label`` column whose values are keys of
            ``CLASS_NAMES``.
        name: Title shown in the summary header.
    """
    total_rows = len(df)

    summary_lines = [
        f"\n[peek] {name}",
        f"  Shape: {df.shape}",
        f"  Columns: {list(df.columns)}",
    ]
    for line in summary_lines:
        print(line)

    # Counts per label id, ordered by label id.
    per_class = df['label'].value_counts().sort_index()
    print("  Classes:")
    for class_id, n_rows in per_class.items():
        share = (n_rows / total_rows) * 100
        print(f"    {CLASS_NAMES[class_id]:12s}: {n_rows:5d} ({share:5.1f}%)")


peek_data(train_orig, "Original Training")
peek_data(train_aug, "Augmented Training")
peek_data(val_df, "Validation")


[peek] Original Training
  Shape: (17347, 3)
  Columns: ['text', 'label', 'clean_text']
  Classes:
    hate_speech :  1001 (  5.8%)
    offensive   : 13432 ( 77.4%)
    neither     :  2914 ( 16.8%)

[peek] Augmented Training
  Shape: (22263, 3)
  Columns: ['text', 'label', 'clean_text']
  Classes:
    hate_speech :  3003 ( 13.5%)
    offensive   : 13432 ( 60.3%)
    neither     :  5828 ( 26.2%)

[peek] Validation
  Shape: (3718, 3)
  Columns: ['text', 'label', 'clean_text']
  Classes:
    hate_speech :   215 (  5.8%)
    offensive   :  2879 ( 77.4%)
    neither     :   624 ( 16.8%)


# Class Weights Calculation

In [15]:
def calculate_class_weights(df):
    """Compute scikit-learn "balanced" class weights for label ids 0, 1 and 2.

    Args:
        df: DataFrame with a ``label`` column.

    Returns:
        A float32 ``torch.Tensor`` with one weight per class, ordered by label id.
    """
    from sklearn.utils.class_weight import compute_class_weight

    class_ids = np.array([0, 1, 2])
    balanced = compute_class_weight(
        class_weight='balanced',
        classes=class_ids,
        y=df['label'].values,
    )

    # Echo the weights next to their class names.
    print(f"[weights] Computed for {len(df):,} samples:")
    for class_id, class_weight in zip(class_ids, balanced):
        print(f"  {CLASS_NAMES[class_id]:12s}: {class_weight:.4f}")

    return torch.tensor(balanced, dtype=torch.float32)

In [16]:
weights_orig = calculate_class_weights(train_orig)
weights_aug = calculate_class_weights(train_aug)

[weights] Computed for 17,347 samples:
  hate_speech : 5.7766
  offensive   : 0.4305
  neither     : 1.9843
[weights] Computed for 22,263 samples:
  hate_speech : 2.4712
  offensive   : 0.5525
  neither     : 1.2733


# Tokenization

In [17]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [18]:
from datasets import Dataset

# Wrap each DataFrame as a `datasets.Dataset`, keeping only the cleaned text
# and its label.
train_orig_dataset = Dataset.from_pandas(train_orig[['clean_text', 'label']])
print(f"Original train: {len(train_orig_dataset):,} samples")

train_aug_dataset = Dataset.from_pandas(train_aug[['clean_text', 'label']])
print(f"Augmented train: {len(train_aug_dataset):,} samples")

val_dataset = Dataset.from_pandas(val_df[['clean_text', 'label']])
print(f"Validation: {len(val_dataset):,} samples")

Original train: 17,347 samples
Augmented train: 22,263 samples
Validation: 3,718 samples


In [19]:
def tokenize_texts(examples):
    """Tokenize a batch of examples for BERT.

    Every sequence is padded or truncated to ``CONFIG['max_length']`` tokens,
    so the sequence length is controlled from the shared config instead of a
    duplicated literal.

    Args:
        examples: Batch mapping with a ``clean_text`` entry, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples['clean_text'],
        padding='max_length',
        truncation=True,
        max_length=CONFIG['max_length']
    )

In [20]:
train_orig_tok = train_orig_dataset.map(
    tokenize_texts,
    batched=True,
    desc="Tokenizing original"
)

Tokenizing original:   0%|          | 0/17347 [00:00<?, ? examples/s]

In [21]:
train_aug_tok = train_aug_dataset.map(
    tokenize_texts,
    batched=True,
    desc="Tokenizing augmented"
)

Tokenizing augmented:   0%|          | 0/22263 [00:00<?, ? examples/s]

In [22]:
val_tok = val_dataset.map(
    tokenize_texts,
    batched=True,
    desc="Tokenizing validation"
)

Tokenizing validation:   0%|          | 0/3718 [00:00<?, ? examples/s]

In [23]:
# Rename the target column to `labels`, the key the custom trainer's
# compute_loss pops from each batch.
# NOTE(review): after this cell has run once the `label` column no longer
# exists, so re-running it without re-tokenizing presumably fails — confirm.
train_orig_tok = train_orig_tok.rename_column('label', 'labels')
train_aug_tok = train_aug_tok.rename_column('label', 'labels')
val_tok = val_tok.rename_column('label', 'labels')

# Have the datasets return PyTorch tensors when indexed.
train_orig_tok.set_format('torch')
train_aug_tok.set_format('torch')
val_tok.set_format('torch')

In [24]:
sample = train_orig_tok[0]

print(f"Label: {sample['labels']} ({CLASS_NAMES[int(sample['labels'])]})")
print(f"Input IDs shape: {sample['input_ids'].shape}")
print(f"Attention mask shape: {sample['attention_mask'].shape}")

decoded = tokenizer.decode(sample['input_ids'], skip_special_tokens=True)
print(f"Decoded text (first 100 chars): '{decoded[:100]}...'")

num_real_tokens = sample['attention_mask'].sum().item()
print(f"Real tokens: {num_real_tokens} / 128")

  Label: 1 (offensive)
  Input IDs shape: torch.Size([128])
  Attention mask shape: torch.Size([128])
  Decoded text (first 100 chars): 'lmfaoooooo gay as fuck rt @ user get these two faggots off my tl [ url ]...'
  Real tokens: 28 / 128


# Setting up the training

In [37]:
def get_training_config(use_augmented=False):
    """Build the training hyper-parameter dict for one experiment.

    The two experiments share every hyper-parameter and differ only in
    ``output_dir`` and ``run_name``. Mixed-precision (``fp16``) is enabled
    only when a CUDA device is available, so the same config stays usable on
    the CPU fallback; on a GPU runtime the value is ``True`` as before.

    Args:
        use_augmented: Select the ``bert_augmented`` run instead of
            ``bert_original``.

    Returns:
        A dict of training settings plus ``output_dir`` and ``run_name``.
    """
    train_config = {
        'num_train_epochs': 4,
        'learning_rate': 3e-5,
        'per_device_train_batch_size': 16,
        'per_device_eval_batch_size': 32,
        'gradient_accumulation_steps': 2,
        'weight_decay': 0.01,
        'warmup_ratio': 0.1,
        # fp16 mixed precision is a GPU feature; do not request it on CPU.
        'fp16': torch.cuda.is_available(),
        'seed': CONFIG['seed'],
        'eval_strategy': 'epoch',
        'save_strategy': 'epoch',
        'save_total_limit': 2,
        'load_best_model_at_end': True,
        'metric_for_best_model': 'f1_macro',
        'greater_is_better': True,
        'logging_steps': 100,
    }

    if use_augmented:
        train_config['output_dir'] = str(MODELS_DIR / 'bert_augmented')
        train_config['run_name'] = 'bert_augmented'
    else:
        train_config['output_dir'] = str(MODELS_DIR / 'bert_original')
        train_config['run_name'] = 'bert_original'

    # Effective batch = per-device batch size x gradient accumulation steps.
    print(f"[config] Training config for {train_config['run_name']}")
    print(f"  Output: {train_config['output_dir']}")
    print(f"  Epochs: {train_config['num_train_epochs']}")
    print(f"  Effective batch: {train_config['per_device_train_batch_size'] * train_config['gradient_accumulation_steps']}")

    return train_config


In [38]:
config_orig = get_training_config(use_augmented=False)
config_aug = get_training_config(use_augmented=True)

[config] Training config for bert_original
  Output: models/bert_original
  Epochs: 4
  Effective batch: 32
[config] Training config for bert_augmented
  Output: models/bert_augmented
  Epochs: 4
  Effective batch: 32


In [39]:
config_orig

{'num_train_epochs': 4,
 'learning_rate': 3e-05,
 'per_device_train_batch_size': 16,
 'per_device_eval_batch_size': 32,
 'gradient_accumulation_steps': 2,
 'weight_decay': 0.01,
 'warmup_ratio': 0.1,
 'fp16': True,
 'seed': 42,
 'eval_strategy': 'epoch',
 'save_strategy': 'epoch',
 'save_total_limit': 2,
 'load_best_model_at_end': True,
 'metric_for_best_model': 'f1_macro',
 'greater_is_better': True,
 'logging_steps': 100,
 'output_dir': 'models/bert_original',
 'run_name': 'bert_original'}

In [40]:
config_aug

{'num_train_epochs': 4,
 'learning_rate': 3e-05,
 'per_device_train_batch_size': 16,
 'per_device_eval_batch_size': 32,
 'gradient_accumulation_steps': 2,
 'weight_decay': 0.01,
 'warmup_ratio': 0.1,
 'fp16': True,
 'seed': 42,
 'eval_strategy': 'epoch',
 'save_strategy': 'epoch',
 'save_total_limit': 2,
 'load_best_model_at_end': True,
 'metric_for_best_model': 'f1_macro',
 'greater_is_better': True,
 'logging_steps': 100,
 'output_dir': 'models/bert_augmented',
 'run_name': 'bert_augmented'}

# Define Metrics Function

In [41]:
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
import numpy as np

def compute_metrics(eval_pred):
    """Turn raw evaluation output into overall and per-class scores.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        Dict with ``accuracy``, ``f1_macro``, ``f1_weighted`` and, for each of
        the three classes, ``<class>_precision``, ``<class>_recall`` and
        ``<class>_f1``.
    """
    scores, y_true = eval_pred
    y_pred = np.argmax(scores, axis=1)

    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1_macro': f1_score(y_true, y_pred, average='macro'),
        'f1_weighted': f1_score(y_true, y_pred, average='weighted'),
    }

    # Per-class arrays, indexed by label id 0, 1, 2.
    per_class_precision, per_class_recall, per_class_f1, _ = precision_recall_fscore_support(
        y_true, y_pred,
        average=None,
        labels=[0, 1, 2]
    )

    for class_id, prefix in enumerate(('hate_speech', 'offensive', 'neither')):
        metrics[f'{prefix}_precision'] = per_class_precision[class_id]
        metrics[f'{prefix}_recall'] = per_class_recall[class_id]
        metrics[f'{prefix}_f1'] = per_class_f1[class_id]

    return metrics

# Custom Trainer with Weighted Loss

In [51]:
from transformers import Trainer
from torch import nn

class WeightedTrainer(Trainer):
    """``Trainer`` whose loss is cross-entropy with per-class weights.

    Args:
        class_weights: 1-D tensor with one weight per class; it is moved to
            the training device.
        **kwargs: Forwarded to ``Trainer``. A ``tokenizer=`` keyword is
            accepted and passed on as ``processing_class``.
    """

    def __init__(self, class_weights, **kwargs):
        # Map the older `tokenizer` keyword onto `processing_class`.
        if 'tokenizer' in kwargs:
            kwargs['processing_class'] = kwargs.pop('tokenizer')
        super().__init__(**kwargs)
        # self.args is only available after Trainer.__init__ has run.
        self.loss_fn = nn.CrossEntropyLoss(weight=class_weights.to(self.args.device))

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        ``labels`` is removed from ``inputs`` before the forward pass so the
        model does not compute its own unweighted loss. ``num_items_in_batch``
        is accepted for signature compatibility and is not used.

        Returns:
            ``loss``, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Take the class count from the logits instead of hard-coding 3.
        loss = self.loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Train Function

# Training original model

In [45]:
print(f"{train_orig_tok.column_names}")

['clean_text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask']


The tokenized datasets still contain the original `clean_text` column, which holds strings and can't be converted to tensors for training. We therefore remove it and then double-check that the remaining columns are in the proper format.

In [46]:
train_orig_tok = train_orig_tok.remove_columns(['clean_text'])
train_aug_tok = train_aug_tok.remove_columns(['clean_text'])
val_tok = val_tok.remove_columns(['clean_text'])

In [48]:
train_orig_tok.set_format('torch')
train_aug_tok.set_format('torch')
val_tok.set_format('torch')

In [54]:
from transformers import BertForSequenceClassification, TrainingArguments, EarlyStoppingCallback
import json
from pathlib import Path
import warnings

# The classifier head is expected to be newly initialised when loading the base
# checkpoint, so try to silence the corresponding notice.
# NOTE(review): the recorded training output still shows this message, so this
# filter does not appear to suppress it — presumably it is emitted through
# logging rather than the `warnings` module; confirm.
warnings.filterwarnings('ignore', message='Some weights of BertForSequenceClassification were not initialized')

def train_bert(use_augmented=False):
    """Fine-tune BERT on the original or augmented training set and save the run.

    Selects the tokenized training set, class weights and training config for
    the chosen variant, trains with ``WeightedTrainer`` (evaluating on
    ``val_tok``), then writes the model, tokenizer, training history, class
    weights, training config and dataset info under the config's
    ``output_dir``.

    Args:
        use_augmented: Train on the augmented data instead of the original.

    Returns:
        The output directory as a string, or ``None`` if training raised an
        exception (the error and its traceback are printed).
    """
    import traceback  # local import: only used to report a failed run

    if use_augmented:
        print("\n" + "="*60)
        print("TRAINING ON AUGMENTED DATA")
        print("="*60)
        train_dataset = train_aug_tok
        class_weights = weights_aug
        config = config_aug
    else:
        print("\n" + "="*60)
        print("TRAINING ON ORIGINAL DATA")
        print("="*60)
        train_dataset = train_orig_tok
        class_weights = weights_orig
        config = config_orig

    print(f"[setup] Training samples: {len(train_dataset):,}")
    print(f"[setup] Validation samples: {len(val_tok):,}")
    print(f"[setup] Output directory: {config['output_dir']}")

    output_dir = Path(config['output_dir'])
    output_dir.mkdir(parents=True, exist_ok=True)

    print("\n[model] Loading BERT-base-uncased...")
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=3,
        ignore_mismatched_sizes=True
    )
    print(f"[model] Parameters: {sum(p.numel() for p in model.parameters()):,}")
    print(f"[model] Classifier layer initialized randomly (expected)")

    # 'run_name' is kept out of the arguments passed to TrainingArguments.
    training_config = {k: v for k, v in config.items() if k != 'run_name'}

    training_args = TrainingArguments(
        **training_config,
        report_to='none',
        push_to_hub=False,
        remove_unused_columns=True,
        dataloader_drop_last=False,
        dataloader_num_workers=2,
    )

    # Informational estimate only: floor division ignores the final partial
    # batch, so the trainer's own step count may be slightly higher.
    steps_per_epoch = len(train_dataset) // (config['per_device_train_batch_size'] * config['gradient_accumulation_steps'])
    total_steps = steps_per_epoch * config['num_train_epochs']

    print(f"\n[train] Training configuration:")
    print(f"  Epochs: {config['num_train_epochs']}")
    print(f"  Steps per epoch: {steps_per_epoch}")
    print(f"  Total steps: {total_steps}")
    print(f"  Effective batch size: {config['per_device_train_batch_size'] * config['gradient_accumulation_steps']}")
    print(f"  Learning rate: {config['learning_rate']}")

    print(f"\n[train] Using class weights: {class_weights.numpy()}")

    trainer = WeightedTrainer(
        class_weights=class_weights,
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_tok,
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    # Training
    print("\n[train] Starting training")
    print("-" * 60)

    try:
        train_result = trainer.train()
    except Exception as e:
        # Keep the "return None on failure" contract, but show the traceback
        # so the cause can be diagnosed from the cell output.
        print(f"[ERROR] Training failed: {e}")
        traceback.print_exc()
        return None

    print("-" * 60)
    print(f"\n[train] Training complete!")
    print(f"[train] Final loss: {train_result.training_loss:.4f}")

    # Saving everything
    best_model_dir = output_dir / 'best_model'
    print(f"\n[save] Saving model to {best_model_dir}")

    # Save model and tokenizer.
    # NOTE(review): with load_best_model_at_end in the config this is
    # presumably the best checkpoint rather than the last one — confirm.
    trainer.save_model(best_model_dir)
    tokenizer.save_pretrained(best_model_dir)

    # Save training history
    history_file = output_dir / 'training_history.json'

    # Convert log entries to JSON-serialisable values.
    log_history = []
    for entry in trainer.state.log_history:
        clean_entry = {}
        for k, v in entry.items():
            if hasattr(v, 'item'):  # numpy scalar
                clean_entry[k] = v.item()
            elif isinstance(v, (list, dict, str, int, float, bool, type(None))):
                clean_entry[k] = v
            else:
                clean_entry[k] = str(v)
        log_history.append(clean_entry)

    # Compare against None explicitly: a best metric of 0.0 is a valid value
    # and must not be reported as missing.
    best_metric = trainer.state.best_metric

    history = {
        'loss_history': log_history,
        'best_metric': float(best_metric) if best_metric is not None else None,
        'best_model_checkpoint': trainer.state.best_model_checkpoint,
        'total_steps': int(trainer.state.global_step),
        'epochs_trained': float(trainer.state.epoch),
    }

    with open(history_file, 'w') as f:
        json.dump(history, f, indent=2)
    print(f"[save] Training history saved")

    # Save class weights
    weights_file = output_dir / 'class_weights.json'
    weights_dict = {str(i): float(w) for i, w in enumerate(class_weights.numpy())}
    with open(weights_file, 'w') as f:
        json.dump(weights_dict, f, indent=2)
    print(f"[save] Class weights saved")

    # Save config
    config_file = output_dir / 'training_config.json'
    with open(config_file, 'w') as f:
        json.dump(config, f, indent=2)
    print(f"[save] Training config saved")

    # Save dataset info
    dataset_info = {
        'train_samples': len(train_dataset),
        'val_samples': len(val_tok),
        'augmented': use_augmented,
        'dataset_name': 'augmented' if use_augmented else 'original'
    }
    info_file = output_dir / 'dataset_info.json'
    with open(info_file, 'w') as f:
        json.dump(dataset_info, f, indent=2)
    print(f"[save] Dataset info saved")

    print("\n" + "="*60)
    print("TRAINING COMPLETE")
    print("="*60)
    print(f"Model saved to: {output_dir}")

    if best_metric is not None:
        print(f"\nBest validation F1: {best_metric:.4f}")
        print(f"Training stopped at epoch: {trainer.state.epoch:.1f}")

    return str(output_dir)

In [53]:
model_dir_original = train_bert(use_augmented=False)
print(f"\nOriginal model saved to: {model_dir_original}")


TRAINING ON ORIGINAL DATA
[check] Dataset columns: ['labels', 'input_ids', 'token_type_ids', 'attention_mask']
[setup] Training samples: 17,347
[setup] Validation samples: 3,718
[setup] Output directory: models/bert_original

[model] Loading BERT-base-uncased...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[model] Parameters: 109,484,547
[model] Classifier layer initialized randomly (expected)

[train] Training configuration:
  Epochs: 4
  Steps per epoch: 542
  Total steps: 2168
  Effective batch size: 32
  Learning rate: 3e-05

[train] Using class weights: [5.776557  0.4304894 1.9843285]

[train] Starting training...
[train] Validation runs after each epoch
[train] Estimated time: 15-25 minutes on Colab GPU

------------------------------------------------------------


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted,Hate Speech Precision,Hate Speech Recall,Hate Speech F1,Offensive Precision,Offensive Recall,Offensive F1,Neither Precision,Neither Recall,Neither F1
1,0.5346,0.476008,0.854761,0.732568,0.878337,0.275676,0.711628,0.397403,0.975851,0.8562,0.912118,0.879121,0.897436,0.888184
2,0.3856,0.520018,0.89645,0.763734,0.90383,0.378462,0.572093,0.455556,0.964809,0.914206,0.938826,0.869173,0.926282,0.896819
3,0.3395,0.528357,0.879505,0.749186,0.892879,0.325243,0.623256,0.427432,0.966504,0.891976,0.927746,0.875193,0.910256,0.89238
4,0.2029,0.670096,0.894836,0.760344,0.901153,0.387821,0.562791,0.459203,0.955988,0.920458,0.937887,0.876972,0.891026,0.883943


------------------------------------------------------------

[train] Training complete!
[train] Final loss: 0.4074

[save] Saving model to models/bert_original/best_model
[save] Training history saved
[save] Class weights saved
[save] Training config saved
[save] Dataset info saved

TRAINING COMPLETE
Model saved to: models/bert_original

Best validation F1: 0.7637
Training stopped at epoch: 4.0

Original model saved to: models/bert_original


# Training augmented model

In [55]:
model_dir_augmented = train_bert(use_augmented=True)
print(f"\n[complete] Augmented model saved to: {model_dir_augmented}")


TRAINING ON AUGMENTED DATA
[check] Dataset columns: ['labels', 'input_ids', 'token_type_ids', 'attention_mask']
[setup] Training samples: 22,263
[setup] Validation samples: 3,718
[setup] Output directory: models/bert_augmented

[model] Loading BERT-base-uncased...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[model] Parameters: 109,484,547
[model] Classifier layer initialized randomly (expected)

[train] Training configuration:
  Epochs: 4
  Steps per epoch: 695
  Total steps: 2780
  Effective batch size: 32
  Learning rate: 3e-05

[train] Using class weights: [2.4711955 0.5524866 1.2733356]

[train] Starting training
------------------------------------------------------------


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted,Hate Speech Precision,Hate Speech Recall,Hate Speech F1,Offensive Precision,Offensive Recall,Offensive F1,Neither Precision,Neither Recall,Neither F1
1,0.4056,0.382379,0.89376,0.758213,0.900388,0.387622,0.553488,0.455939,0.968889,0.908649,0.937802,0.827004,0.942308,0.880899
2,0.2459,0.529157,0.901022,0.760615,0.904155,0.409091,0.502326,0.450939,0.946906,0.935394,0.941115,0.9,0.879808,0.889789
3,0.1355,0.675929,0.906939,0.753195,0.905829,0.44,0.409302,0.424096,0.94142,0.948941,0.945165,0.896104,0.884615,0.890323
4,0.0506,0.846072,0.903712,0.750012,0.90338,0.422535,0.418605,0.420561,0.938747,0.947551,0.943129,0.904841,0.86859,0.886345


------------------------------------------------------------

[train] Training complete!
[train] Final loss: 0.2646

[save] Saving model to models/bert_augmented/best_model
[save] Training history saved
[save] Class weights saved
[save] Training config saved
[save] Dataset info saved

TRAINING COMPLETE
Model saved to: models/bert_augmented

Best validation F1: 0.7606
Training stopped at epoch: 4.0

[complete] Augmented model saved to: models/bert_augmented
