In [1]:
import os
import random
import numpy as np
import torch

def set_seeds(seed):
    """Set seeds for reproducibility """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        

set_seeds(seed=42)

In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PRETRAINED_MODEL = "bert-base-multilingual-cased"

# Data

In [3]:
import pandas as pd

kaggle_path = '/kaggle/input/unlp-2025-shared-task-span-identification/train.parquet'
df = pd.read_parquet(kaggle_path) #pd.read_parquet('train.parquet')

In [4]:
cv_split = pd.read_csv("/kaggle/input/unlp25-cross-validation-split/cv_split.csv")

df = df.merge(cv_split, on="id")
df['is_valid'] = (df['fold'] == 4).astype(int)

In [5]:
df.head()

Unnamed: 0,id,content,lang,manipulative,techniques,trigger_words,fold,is_valid
0,0bb0c7fa-101b-4583-a5f9-9d503339141c,Новий огляд мапи DeepState від російського вій...,uk,True,"[euphoria, loaded_language]","[[27, 63], [65, 88], [90, 183], [186, 308]]",1,0
1,7159f802-6f99-4e9d-97bd-6f565a4a0fae,Недавно 95 квартал жёстко поглумился над русск...,ru,True,"[loaded_language, cherry_picking]","[[0, 40], [123, 137], [180, 251], [253, 274]]",3,0
2,e6a427f1-211f-405f-bd8b-70798458d656,🤩\nТим часом йде евакуація Бєлгородського авто...,uk,True,"[loaded_language, euphoria]","[[55, 100]]",1,0
3,1647a352-4cd3-40f6-bfa1-d87d42e34eea,В Україні найближчим часом мають намір посилит...,uk,False,,,2,0
4,9c01de00-841f-4b50-9407-104e9ffb03bf,"Расчёты 122-мм САУ 2С1 ""Гвоздика"" 132-й бригад...",ru,True,[loaded_language],"[[114, 144]]",2,0


# Targets Prep

## Classification

In [6]:
from collections.abc import Iterable

techniques = ['straw_man', 'appeal_to_fear', 'fud', 'bandwagon', 'whataboutism', 'loaded_language', 'glittering_generalities', 'euphoria', 'cherry_picking', 'cliche']

for col in techniques:
    df[col] = 0

import numpy as np
for ind, row in df.iterrows():
    if isinstance(row['techniques'], Iterable):
        for t in row['techniques']:
            df.loc[ind, t] = 1

df['clf_labels'] = list(df[techniques].values)
df.drop(columns=techniques, inplace=True)

In [7]:
df.head()

Unnamed: 0,id,content,lang,manipulative,techniques,trigger_words,fold,is_valid,clf_labels
0,0bb0c7fa-101b-4583-a5f9-9d503339141c,Новий огляд мапи DeepState від російського вій...,uk,True,"[euphoria, loaded_language]","[[27, 63], [65, 88], [90, 183], [186, 308]]",1,0,"[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]"
1,7159f802-6f99-4e9d-97bd-6f565a4a0fae,Недавно 95 квартал жёстко поглумился над русск...,ru,True,"[loaded_language, cherry_picking]","[[0, 40], [123, 137], [180, 251], [253, 274]]",3,0,"[0, 0, 0, 0, 0, 1, 0, 0, 1, 0]"
2,e6a427f1-211f-405f-bd8b-70798458d656,🤩\nТим часом йде евакуація Бєлгородського авто...,uk,True,"[loaded_language, euphoria]","[[55, 100]]",1,0,"[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]"
3,1647a352-4cd3-40f6-bfa1-d87d42e34eea,В Україні найближчим часом мають намір посилит...,uk,False,,,2,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,9c01de00-841f-4b50-9407-104e9ffb03bf,"Расчёты 122-мм САУ 2С1 ""Гвоздика"" 132-й бригад...",ru,True,[loaded_language],"[[114, 144]]",2,0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"


## Span

In [8]:
import spacy

from spacy.training.iob_utils import biluo_to_iob, doc_to_biluo_tags
from tqdm.autonotebook import tqdm
tqdm.pandas()

df.trigger_words = df.trigger_words.apply(lambda x: [] if x is None else x)
df['target'] = df.trigger_words.apply(lambda x: [[y[0], y[1], 'TRIGGER'] for y in x])

def resolve_overlapping_spans(spans):
    if not spans:
        return []
    spans = sorted(spans, key=lambda x: x[0])  # Sort by start index
    resolved = [spans[0]]
    for current in spans[1:]:
        last = resolved[-1]
        if current[0] < last[1]:  # Overlap
            new_span = (last[0], max(last[1], current[1]), 'TRIGGER')
            resolved[-1] = new_span
            print('resolved')
        else:
            resolved.append(current)
    return resolved

df['target'] = df.target.apply(resolve_overlapping_spans)

nlp = spacy.blank("xx")

def convert_to_conll(row):
    data = {
        "text": row['content'],
        "label": row['target']
    }
    doc = nlp(data["text"])
    ents = []
    for start, end, label in data["label"]:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            ents.append(span)
        else:
            pass
        #TODO fix not align to token case
        '''
            print(
                "Skipping span (does not align to tokens):",
                start,
                end,
                label,
                doc.text[start:end],
            )
        '''
    doc.ents = ents
    return {
        'tokens': list([t.text for t in doc]),
        'labels': list(biluo_to_iob(doc_to_biluo_tags(doc)))
    }

df['conll'] = df.progress_apply(convert_to_conll, axis=1)

resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved


  0%|          | 0/3822 [00:00<?, ?it/s]

In [9]:
label2id = {'O': 0, 'B-TRIGGER': 1, 'I-TRIGGER': 2}

df['tokens'] = df.conll.str['tokens']
df['ner_tags'] = df.conll.str['labels'].apply(lambda x: [label2id[t] for t in x])

df_train = df[df.is_valid == 0]
df_valid = df[df.is_valid == 1]

In [10]:
df_train.head()

Unnamed: 0,id,content,lang,manipulative,techniques,trigger_words,fold,is_valid,clf_labels,target,conll,tokens,ner_tags
0,0bb0c7fa-101b-4583-a5f9-9d503339141c,Новий огляд мапи DeepState від російського вій...,uk,True,"[euphoria, loaded_language]","[[27, 63], [65, 88], [90, 183], [186, 308]]",1,0,"[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]","[[27, 63, TRIGGER], [65, 88, TRIGGER], [90, 18...","{'tokens': ['Новий', 'огляд', 'мапи', 'DeepSta...","[Новий, огляд, мапи, DeepState, від, російсько...","[0, 0, 0, 0, 1, 2, 2, 2, 0, 1, 2, 2, 2, 0, 1, ..."
1,7159f802-6f99-4e9d-97bd-6f565a4a0fae,Недавно 95 квартал жёстко поглумился над русск...,ru,True,"[loaded_language, cherry_picking]","[[0, 40], [123, 137], [180, 251], [253, 274]]",3,0,"[0, 0, 0, 0, 0, 1, 0, 0, 1, 0]","[[0, 40, TRIGGER], [123, 137, TRIGGER], [180, ...","{'tokens': ['Недавно', '95', 'квартал', 'жёстк...","[Недавно, 95, квартал, жёстко, поглумился, над...","[1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,e6a427f1-211f-405f-bd8b-70798458d656,🤩\nТим часом йде евакуація Бєлгородського авто...,uk,True,"[loaded_language, euphoria]","[[55, 100]]",1,0,"[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]","[[55, 100, TRIGGER]]","{'tokens': ['🤩', ' ', 'Тим', 'часом', 'йде', '...","[🤩, \n, Тим, часом, йде, евакуація, Бєлгородсь...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, ..."
3,1647a352-4cd3-40f6-bfa1-d87d42e34eea,В Україні найближчим часом мають намір посилит...,uk,False,,[],2,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",[],"{'tokens': ['В', 'Україні', 'найближчим', 'час...","[В, Україні, найближчим, часом, мають, намір, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,9c01de00-841f-4b50-9407-104e9ffb03bf,"Расчёты 122-мм САУ 2С1 ""Гвоздика"" 132-й бригад...",ru,True,[loaded_language],"[[114, 144]]",2,0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]","[[114, 144, TRIGGER]]","{'tokens': ['Расчёты', '122-мм', 'САУ', '2С1',...","[Расчёты, 122-мм, САУ, 2С1, "", Гвоздика, "", 13...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# Datasets

In [11]:
import os
os.makedirs('data', exist_ok=True)

df_train[['tokens', 'clf_labels', 'ner_tags']].to_json(
    './data/train_processed.json', orient='records', lines=True)
df_valid[['tokens', 'clf_labels', 'ner_tags']].to_json(
    './data/valid_processed.json', orient='records', lines=True)

In [12]:
from datasets import load_dataset

raw_datasets_ua = load_dataset(
    "json",
    data_files={
        'train': './data/train_processed.json',
        'val': './data/valid_processed.json'
    }
)
raw_datasets_ua

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'clf_labels', 'ner_tags'],
        num_rows: 3058
    })
    val: Dataset({
        features: ['tokens', 'clf_labels', 'ner_tags'],
        num_rows: 764
    })
})

## Tokenization

In [13]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [14]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_token_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_token_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_token_labels
    tokenized_inputs["sequence_labels"] = examples["clf_labels"]
    
    return tokenized_inputs

In [15]:
tokenized_datasets_ua = raw_datasets_ua.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets_ua["train"].column_names,
)

Map:   0%|          | 0/3058 [00:00<?, ? examples/s]

Map:   0%|          | 0/764 [00:00<?, ? examples/s]

In [16]:
tokenized_datasets_ua['train'].to_pandas().head()

Unnamed: 0,input_ids,token_type_ids,attention_mask,labels,sequence_labels
0,"[101, 100325, 555, 41824, 97744, 20785, 18891,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, ...","[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]"
1,"[101, 21124, 95227, 11978, 69055, 50680, 10517...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[0, 0, 0, 0, 0, 1, 0, 0, 1, 0]"
2,"[101, 100, 61059, 60019, 550, 12265, 546, 1085...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]"
3,"[101, 511, 21567, 15861, 61394, 10191, 12025, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"[101, 525, 18291, 56604, 10292, 17484, 118, 14...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"


# Model

In [17]:
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoModel
from transformers import BertModel, BertPreTrainedModel

In [18]:
class BertForTokenSequenceClassification(BertPreTrainedModel):
    def __init__(self, model_name, num_token_labels, num_sequence_labels):
        bert_model = BertModel.from_pretrained(model_name)
        super().__init__(bert_model.config)
        self.bert = bert_model
        hidden_size = self.config.hidden_size

        # Token Classification Head
        self.token_classifier = nn.Linear(hidden_size, num_token_labels)

        # Sequence Classification Head
        self.sequence_classifier = nn.Linear(hidden_size, num_sequence_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(self, input_ids, attention_mask, labels=None, sequence_labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state  # Shape: (batch, seq_len, hidden)

        # Token Classification Output (Apply to each token)
        token_logits = self.token_classifier(sequence_output)  # (batch, seq_len, num_token_labels)

        # Sequence Classification Output (Use [CLS] token's representation)
        cls_output = sequence_output[:, 0, :]  # Take first token (CLS)
        sequence_logits = self.sequence_classifier(cls_output)  # (batch, num_sequence_labels)

        loss = None
        if labels is not None and sequence_labels is not None:
            token_loss_fn = nn.CrossEntropyLoss()
            seq_loss_fn = nn.BCEWithLogitsLoss()  # For multi-label classification

            token_loss = token_loss_fn(token_logits.view(-1, token_logits.shape[-1]), labels.view(-1))
            seq_loss = seq_loss_fn(sequence_logits, sequence_labels.float())

            loss = token_loss + seq_loss  # Combine losses

        return {
            "loss": loss,
            "token_logits": token_logits,
            "sequence_logits": sequence_logits,
        }

## Init and Test

In [19]:
sample = tokenized_datasets_ua['train'][0]

# Convert input to batch format (add batch dimension)
input_ids = torch.tensor([sample["input_ids"]])
attention_mask = torch.tensor([sample["attention_mask"]])
token_labels = torch.tensor([sample["labels"]])
sequence_labels = torch.tensor([sample["sequence_labels"]])

In [20]:
model = BertForTokenSequenceClassification(
    model_name=PRETRAINED_MODEL,
    num_token_labels=3,
    num_sequence_labels=10
)

model

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

BertForTokenSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), 

## Metrics

In [21]:
!pip install evaluate seqeval

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m26.5 kB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=433a1b3828c92a35aeedda0f209e10a78ee8d9c29c9540be961856e758045f5d
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval, evaluate
Successfully installed evaluate-0.4.3 seqeva

In [22]:
import evaluate
import numpy as np
from sklearn.metrics import f1_score
seqeval = evaluate.load("seqeval")

label_names = list(label2id.keys())

def compute_metrics(eval_pred):
    # print(eval_pred.predictions[0].shape)
    # print(eval_pred.predictions[1].shape)
    
    token_logits, sequence_logits = eval_pred.predictions
    token_labels, sequence_labels = eval_pred.label_ids

    # Token classification metrics
    token_metrics = compute_token_metrics(token_logits, token_labels)
    
    # Sequence classification metrics (multi-label)
    sequence_metrics = compute_sequence_metrics(sequence_logits, sequence_labels)

    return {
        **{f"sequence_{key}": value for key, value in sequence_metrics.items()},
        **{f"token_{key}": value for key, value in token_metrics.items()}
    }


def compute_token_metrics(logits, labels):
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }
    

def compute_sequence_metrics(logits, labels):
    predictions = (logits >= 0.0).astype(int)

    return {
        "f1": f1_score(labels, predictions, average="macro")
    }

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

## Train

In [23]:
EPOCHS = 5

In [24]:
from transformers import DataCollatorForTokenClassification

class CustomDataCollator(DataCollatorForTokenClassification):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    
    def __call__(self, features):
        # Separate token-level and sequence-level labels
        sequence_labels = [f.pop("sequence_labels") for f in features]
        
        # Use Hugging Face's built-in collator for token classification
        batch = super().torch_call(features)
        
        # Convert sequence labels to tensor
        batch["sequence_labels"] = torch.tensor(sequence_labels, dtype=torch.int64)
        
        return batch

# Use the custom data collator
data_collator = CustomDataCollator(tokenizer=tokenizer)

In [25]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW([
    {'params': list(model.bert.parameters()), 'lr': 2e-5},
    {'params': list(model.token_classifier.parameters()), 'lr': 1e-4},
    {'params': list(model.sequence_classifier.parameters()), 'lr': 1e-4}
])

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0.1*EPOCHS*(tokenized_datasets_ua['train'].num_rows/16),
    num_training_steps=EPOCHS*(tokenized_datasets_ua['train'].num_rows/16)
)



In [26]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=EPOCHS,
    
    output_dir="./results",
    logging_strategy="steps",
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets_ua["train"],
    eval_dataset=tokenized_datasets_ua["val"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    optimizers=(optimizer, scheduler),
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Sequence F1,Token Precision,Token Recall,Token F1,Token Accuracy
1,0.6644,0.749648,0.116178,0.012184,0.024149,0.016196,0.781049
2,0.7332,0.730565,0.169089,0.017191,0.025387,0.0205,0.785329
3,0.6353,0.725438,0.235562,0.021934,0.047059,0.029921,0.780908
4,0.5596,0.825035,0.23099,0.026801,0.029721,0.028186,0.784698
5,0.3799,0.810022,0.235821,0.032787,0.048297,0.039059,0.782384


TrainOutput(global_step=960, training_loss=0.6700409390032291, metrics={'train_runtime': 874.7457, 'train_samples_per_second': 17.479, 'train_steps_per_second': 1.097, 'total_flos': 3685202400667932.0, 'train_loss': 0.6700409390032291, 'epoch': 5.0})

In [34]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "/kaggle/working/results/checkpoint-384"
# model_checkpoint = "/kaggle/working/results/checkpoint-576"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at /kaggle/working/results/checkpoint-384 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


In [35]:
preds = token_classifier.predict(df_valid.content.tolist())

In [36]:
val_sub = [str([(p['start'], p['end']) for p in row]) for row in preds]

In [37]:
from copy import deepcopy

def safe_string(row):
    if row is None:
        return '[]'
    else:
        return str([(s[0], s[1]) for s in row])

valid_sub = deepcopy(df_valid)
valid_sub['trigger_words'] = valid_sub.trigger_words.apply(safe_string)
valid_sub_gt = deepcopy(valid_sub[['id', 'trigger_words']])
valid_sub_hat = deepcopy(valid_sub[['id', 'trigger_words']])
valid_sub_hat['trigger_words'] = val_sub

In [38]:
import pandas as pd
import pandas.api.types
from sklearn.metrics import f1_score
import ast


class ParticipantVisibleError(Exception):
    """Custom exception for participant-visible errors."""
    pass


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Compute span-level F1 score based on overlap.

    Parameters:
    - solution (pd.DataFrame): Ground truth DataFrame with row ID and token labels.
    - submission (pd.DataFrame): Submission DataFrame with row ID and token labels.
    - row_id_column_name (str): Column name for the row identifier.

    Returns:
    - float: The token-level weighted F1 score.

    Example:
    >>> solution = pd.DataFrame({
    ...     "id": [1, 2, 3],
    ...     "trigger_words": [[(612, 622), (725, 831)], [(300, 312)], []]
    ... })
    >>> submission = pd.DataFrame({
    ...     "id": [1, 2, 3],
    ...     "trigger_words": [[(612, 622), (700, 720)], [(300, 312)], [(100, 200)]]
    ... })
    >>> score(solution, submission, "id")
    0.16296296296296295
    """
    if not all(col in solution.columns for col in ["id", "trigger_words"]):
        raise ValueError("Solution DataFrame must contain 'id' and 'trigger_words' columns.")
    if not all(col in submission.columns for col in ["id", "trigger_words"]):
        raise ValueError("Submission DataFrame must contain 'id' and 'trigger_words' columns.")
    
    def safe_parse_spans(trigger_words):
        if isinstance(trigger_words, str):
            try:
                return ast.literal_eval(trigger_words)
            except (ValueError, SyntaxError):
                return []
        if isinstance(trigger_words, (list, tuple)):
            return trigger_words
        return []

    def extract_tokens_from_spans(spans):
        tokens = set()
        for start, end in spans:
            tokens.update(range(start, end))
        return tokens
    
    solution = solution.copy()
    submission = submission.copy()

    solution["trigger_words"] = solution["trigger_words"].apply(safe_parse_spans)
    submission["trigger_words"] = submission["trigger_words"].apply(safe_parse_spans)

    merged = pd.merge(
        solution,
        submission,
        on="id",
        suffixes=("_solution", "_submission")
    )

    total_true_tokens = 0
    total_pred_tokens = 0
    overlapping_tokens = 0

    for _, row in merged.iterrows():
        true_spans = row["trigger_words_solution"]
        pred_spans = row["trigger_words_submission"]

        true_tokens = extract_tokens_from_spans(true_spans)
        pred_tokens = extract_tokens_from_spans(pred_spans)

        total_true_tokens += len(true_tokens)
        total_pred_tokens += len(pred_tokens)
        overlapping_tokens += len(true_tokens & pred_tokens)

    precision = overlapping_tokens / total_pred_tokens if total_pred_tokens > 0 else 0
    recall = overlapping_tokens / total_true_tokens if total_true_tokens > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return f1

In [39]:
score(solution=valid_sub_gt, submission=valid_sub_hat, row_id_column_name='id')

0.40750596564023467