In [1]:
import pandas as pd

kaggle_path = '/kaggle/input/unlp-2025-shared-task-span-identification/train.parquet'
df = pd.read_parquet(kaggle_path) #pd.read_parquet('train.parquet')


In [2]:
import spacy

from spacy.training.iob_utils import biluo_to_iob, doc_to_biluo_tags
from tqdm.autonotebook import tqdm
tqdm.pandas()

df.trigger_words = df.trigger_words.apply(lambda x: [] if x is None else x)
df['target'] = df.trigger_words.apply(lambda x: [[y[0], y[1], 'TRIGGER'] for y in x])

def resolve_overlapping_spans(spans):
    if not spans:
        return []
    spans = sorted(spans, key=lambda x: x[0])  # Sort by start index
    resolved = [spans[0]]
    for current in spans[1:]:
        last = resolved[-1]
        if current[0] < last[1]:  # Overlap
            new_span = (last[0], max(last[1], current[1]), 'TRIGGER')
            resolved[-1] = new_span
            print('resolved')
        else:
            resolved.append(current)
    return resolved

df['target'] = df.target.apply(resolve_overlapping_spans)

nlp = spacy.blank("xx")

def convert_to_conll(row):
    data = {
        "text": row['content'],
        "label": row['target']
    }
    doc = nlp(data["text"])
    ents = []
    for start, end, label in data["label"]:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            ents.append(span)
        else:
            pass
        #TODO fix not align to token case
        '''
            print(
                "Skipping span (does not align to tokens):",
                start,
                end,
                label,
                doc.text[start:end],
            )
        '''
    doc.ents = ents
    return {
        'tokens': list([t.text for t in doc]),
        'labels': list(biluo_to_iob(doc_to_biluo_tags(doc)))
    }

df['conll'] = df.progress_apply(convert_to_conll, axis=1)

resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved
resolved


  0%|          | 0/3822 [00:00<?, ?it/s]

In [3]:
import numpy as np


df['is_valid'] = np.random.binomial(1, 0.2, df.shape[0])

In [4]:
label2id = {'O': 0, 'B-TRIGGER': 1, 'I-TRIGGER': 2}

df['tokens'] = df.conll.str['tokens']
df['ner_tags'] = df.conll.str['labels'].apply(lambda x: [label2id[t] for t in x])

df_train = df[df.is_valid == 0]
df_valid = df[df.is_valid == 1]

In [5]:
import os
os.makedirs('data', exist_ok=True)

df_train[['tokens', 'ner_tags', 'trigger_words', 'content']].to_json(
    './data/train_processed.json', orient='records', lines=True)
df_valid[['tokens', 'ner_tags', 'trigger_words', 'content']].to_json(
    './data/valid_processed.json', orient='records', lines=True)

In [6]:
from datasets import load_dataset

raw_datasets_ua = load_dataset(
    "json",
    data_files={
        'train': './data/train_processed.json',
        'val': './data/valid_processed.json'
    }
)
raw_datasets_ua

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'trigger_words', 'content'],
        num_rows: 3025
    })
    val: Dataset({
        features: ['tokens', 'ner_tags', 'trigger_words', 'content'],
        num_rows: 797
    })
})

In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification


id2label = {v: k for k, v in label2id.items()}

model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-multilingual-cased',
    id2label=id2label,
    label2id=label2id,
)
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs


In [9]:
tokenized_datasets_ua = raw_datasets_ua.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=[col for col in raw_datasets_ua["train"].column_names if col not in ["content", "trigger_words"]]
)

Map:   0%|          | 0/3025 [00:00<?, ? examples/s]

Map:   0%|          | 0/797 [00:00<?, ? examples/s]

In [10]:
from transformers import AdamW, get_linear_schedule_with_warmup


EPOCHS = 2

optimizer = AdamW([
    {'params': list(model.bert.parameters()), 'lr': 2e-5},
    {'params': list(model.classifier.parameters()), 'lr': 1e-4}
])

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0.1*EPOCHS*(tokenized_datasets_ua['train'].num_rows/16),
    num_training_steps=EPOCHS*(tokenized_datasets_ua['train'].num_rows/16)
)



In [11]:
import math
from transformers import Trainer, pipeline, TrainingArguments
from typing import Any
from transformers.trainer_utils import EvalPrediction

def extract_chars_from_spans(spans):
    """
    Given a list of spans (each a tuple (start, end)),
    return a set of character indices for all spans.
    """
    char_set = set()
    for start, end in spans:
        # Each span covers positions start, start+1, ..., end-1.
        char_set.update(range(start, end))
    return char_set

class SpanEvaluationTrainer(Trainer):
    def __init__(
        self,
        model: Any = None,
        args: TrainingArguments = None,
        data_collator: Any = None,
        train_dataset: Any = None,
        eval_dataset: Any = None,
        tokenizer: Any = None,
        **kwargs,
    ):
        """
        Initialize the Trainer with our custom compute_metrics.
        """
        super().__init__(
            model=model,
            args=args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            compute_metrics=self.compute_metrics,  # assign our custom compute_metrics
            **kwargs,
        )
        
    def compute_metrics(self, eval_pred: EvalPrediction) -> dict:
        """
        Perform character-level span evaluation using a batch call to the pipeline.
        
        This method assumes that each sample in the evaluation dataset is a dict with:
          - "content": the text string to run inference on,
          - "trigger_words": the ground truth spans (list of (start, end) tuples or a string representation).
          
        It builds a token-classification pipeline (with aggregation_strategy="simple")
        from self.model and self.tokenizer, then processes the whole evaluation dataset
        in one batch. Predicted spans are compared against ground truth spans by expanding them
        into sets of character indices. Finally, precision, recall, F1 and accuracy are computed.
        """
        # Get the evaluation dataset (assumed to be an iterable of dicts).
        eval_dataset = self.eval_dataset

        # Build a list of texts from the evaluation dataset.
        texts = [sample["content"] for sample in eval_dataset]
        
        # Build the token-classification pipeline once.
        token_classifier = pipeline(
            "token-classification",
            model=self.model,
            tokenizer=self.tokenizer,
            aggregation_strategy="simple"
        )
        
        # Process the entire list of texts in one call.
        all_predictions = token_classifier(texts)
        
        total_true_chars = 0
        total_pred_chars = 0
        total_overlap_chars = 0
        total_chars = 0
        total_correct_chars = 0
        
        # Iterate over the evaluation samples and corresponding predictions.
        for sample, predictions in zip(eval_dataset, all_predictions):
            text = sample["content"]
            L = len(text)
            total_chars += L

            # Get the ground truth spans.
            true_spans = sample["trigger_words"]
            if isinstance(true_spans, str):
                try:
                    true_spans = eval(true_spans)
                except Exception:
                    true_spans = []
            
            # Extract predicted spans from the pipeline output.
            # Each prediction is expected to have "start" and "end" keys.
            pred_spans = [(pred["start"], pred["end"]) for pred in predictions]
            
            # Convert spans to sets of character indices.
            true_chars = extract_chars_from_spans(true_spans)
            pred_chars = extract_chars_from_spans(pred_spans)
            
            total_true_chars += len(true_chars)
            total_pred_chars += len(pred_chars)
            total_overlap_chars += len(true_chars.intersection(pred_chars))
            
            # For accuracy: correct characters are those predicted correctly as entity (intersection)
            # plus those correctly predicted as non-entity (i.e. not in the union).
            union_chars = true_chars.union(pred_chars)
            correct_chars = len(true_chars.intersection(pred_chars)) + (L - len(union_chars))
            total_correct_chars += correct_chars
        
        # Compute precision, recall, and F1.
        precision = total_overlap_chars / total_pred_chars if total_pred_chars > 0 else 0
        recall = total_overlap_chars / total_true_chars if total_true_chars > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        # Compute overall accuracy.
        accuracy = total_correct_chars / total_chars if total_chars > 0 else 0
        
        metrics = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "accuracy": accuracy,
        }
        return metrics



In [12]:
os.environ['WANDB_DISABLED'] = 'true'

from transformers import TrainingArguments

strategy = 'epoch'

args = TrainingArguments(
    "bert-ua-loc-ner",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_strategy=strategy,
    eval_strategy=strategy,
    save_strategy=strategy,
    # eval_steps=5,
    # save_steps=5,
    metric_for_best_model='eval_f1',
    num_train_epochs=EPOCHS,
    save_total_limit=5,
    seed=42,
    data_seed=42
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [13]:
from transformers import Trainer

trainer = SpanEvaluationTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets_ua["train"],
    eval_dataset=tokenized_datasets_ua["val"],
    data_collator=data_collator,
    # compute_metrics=compute_metrics_char,
    tokenizer=tokenizer,
    optimizers=(optimizer, scheduler),
    
)
trainer.train()

  super().__init__(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.536,0.451981,0.61937,0.378916,0.470184,0.796338
2,0.4283,0.450355,0.66739,0.326363,0.438361,0.800549


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Device set to use cuda:0
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Device set to use cuda:0


TrainOutput(global_step=380, training_loss=0.48216542695697984, metrics={'train_runtime': 379.5475, 'train_samples_per_second': 15.94, 'train_steps_per_second': 1.001, 'total_flos': 1458430315932294.0, 'train_loss': 0.48216542695697984, 'epoch': 2.0})

In [14]:
# preds = trainer.predict(tokenized_datasets_ua["val"])

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Device set to use cuda:0


In [27]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "./bert-ua-loc-ner/checkpoint-380/"#"./bert-ua-loc-ner/checkpoint-955/"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

Device set to use cuda:0


In [28]:
preds = token_classifier.predict(df_valid.content.tolist())

In [29]:
val_sub = [str([(p['start'], p['end']) for p in row]) for row in preds]

In [30]:
from copy import deepcopy

def safe_string(row):
    if row is None:
        return '[]'
    else:
        return str([(s[0], s[1]) for s in row])

valid_sub = deepcopy(df_valid)
valid_sub['trigger_words'] = valid_sub.trigger_words.apply(safe_string)
valid_sub_gt = deepcopy(valid_sub[['id', 'trigger_words']])
valid_sub_hat = deepcopy(valid_sub[['id', 'trigger_words']])
valid_sub_hat['trigger_words'] = val_sub

In [32]:
import pandas as pd
import pandas.api.types
from sklearn.metrics import f1_score
import ast


class ParticipantVisibleError(Exception):
    """Custom exception for participant-visible errors."""
    pass


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Compute span-level F1 score based on overlap.

    Parameters:
    - solution (pd.DataFrame): Ground truth DataFrame with row ID and token labels.
    - submission (pd.DataFrame): Submission DataFrame with row ID and token labels.
    - row_id_column_name (str): Column name for the row identifier.

    Returns:
    - float: The token-level weighted F1 score.

    Example:
    >>> solution = pd.DataFrame({
    ...     "id": [1, 2, 3],
    ...     "trigger_words": [[(612, 622), (725, 831)], [(300, 312)], []]
    ... })
    >>> submission = pd.DataFrame({
    ...     "id": [1, 2, 3],
    ...     "trigger_words": [[(612, 622), (700, 720)], [(300, 312)], [(100, 200)]]
    ... })
    >>> score(solution, submission, "id")
    0.16296296296296295
    """
    if not all(col in solution.columns for col in ["id", "trigger_words"]):
        raise ValueError("Solution DataFrame must contain 'id' and 'trigger_words' columns.")
    if not all(col in submission.columns for col in ["id", "trigger_words"]):
        raise ValueError("Submission DataFrame must contain 'id' and 'trigger_words' columns.")
    
    def safe_parse_spans(trigger_words):
        if isinstance(trigger_words, str):
            try:
                return ast.literal_eval(trigger_words)
            except (ValueError, SyntaxError):
                return []
        if isinstance(trigger_words, (list, tuple)):
            return trigger_words
        return []

    def extract_tokens_from_spans(spans):
        tokens = set()
        for start, end in spans:
            tokens.update(range(start, end))
        return tokens
    
    solution = solution.copy()
    submission = submission.copy()

    solution["trigger_words"] = solution["trigger_words"].apply(safe_parse_spans)
    submission["trigger_words"] = submission["trigger_words"].apply(safe_parse_spans)

    merged = pd.merge(
        solution,
        submission,
        on="id",
        suffixes=("_solution", "_submission")
    )

    total_true_tokens = 0
    total_pred_tokens = 0
    overlapping_tokens = 0

    for _, row in merged.iterrows():
        true_spans = row["trigger_words_solution"]
        pred_spans = row["trigger_words_submission"]

        true_tokens = extract_tokens_from_spans(true_spans)
        pred_tokens = extract_tokens_from_spans(pred_spans)

        total_true_tokens += len(true_tokens)
        total_pred_tokens += len(pred_tokens)
        overlapping_tokens += len(true_tokens & pred_tokens)

    precision = overlapping_tokens / total_pred_tokens if total_pred_tokens > 0 else 0
    recall = overlapping_tokens / total_true_tokens if total_true_tokens > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return f1

In [33]:
score(solution=valid_sub_gt, submission=valid_sub_hat, row_id_column_name='id')

0.43836133430728025

In [21]:
# test = pd.read_csv("/kaggle/input/unlp-2025-shared-task-span-identification/test.csv")#'test.csv')

In [22]:
# preds_test = token_classifier.predict(test.content.tolist())

In [23]:
# test_sub = [str([(p['start'], p['end']) for p in row]) for row in preds_test]

In [24]:
# ss = pd.read_csv('sample_submission.csv')

In [25]:
# ss['trigger_words'] = test_sub

In [26]:
# ss.to_csv('submissions/bert-base-ml-cv0.459.csv', index=False)