In [None]:
# Install / upgrade the libraries used below (quietly).
# NOTE(review): versions are not pinned, so a re-run may pick up different releases —
# consider pinning for reproducibility.
!pip install -q -U transformers bitsandbytes accelerate datasets peft scipy wandb scikit-learn

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h

In [None]:
# Log the Hugging Face and Weights & Biases CLIs in, taking the tokens from the
# HUGGINGFACE_TOKEN / WANDB_TOKEN shell environment variables.
# NOTE(review): no visible cell sets these variables (the Kaggle secrets are only read
# into `Config` further down) — confirm the environment exports them.
!huggingface-cli login --token "$HUGGINGFACE_TOKEN"
!wandb login "$WANDB_TOKEN"

In [2]:
import os
import copy
import random
from dataclasses import dataclass
from tqdm.autonotebook import tqdm

import numpy as np
import torch
from datasets import Dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from sklearn.metrics import log_loss, accuracy_score

  from tqdm.autonotebook import tqdm


# Configurations

In [None]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

@dataclass
class Config:
    """Run-wide settings: input paths, base checkpoint, API secrets, W&B run info and LoRA hyper-parameters.

    The two secret fields are resolved through ``user_secrets`` while the class body is
    evaluated, i.e. as soon as this cell runs rather than when ``Config()`` is called.
    """
    # Directory prefix of the competition files; file names are appended with `+`,
    # so the trailing slash is required.
    data_path: str = '/kaggle/input/unlp-2025-shared-task-span-identification/'
    # CSV merged onto the training frame by `id` (expected to supply the `fold` column — TODO confirm).
    cv_path: str = "/kaggle/input/unlp25-cross-validation-split/cv_split.csv"
    
    # Checkpoint name handed to the tokenizer and model loaders.
    pretrained: str = "CohereForAI/aya-101"
    # Truncation length in tokens; only the training split is preprocessed with it.
    max_length: int = 2048

    # Secrets are looked up by these exact labels (note the spelling "hugginface_key").
    hugginface_key: str = user_secrets.get_secret("hugginface_key")
    wandb_key: str = user_secrets.get_secret("wandb_key")
    # No type annotation: this dict is a plain class attribute shared by all instances,
    # not a dataclass field. It is unpacked into `wandb.init`, and "name" is also reused
    # for the checkpoint/log directory names.
    wandb_init_args = {
        'project': "unlp-span-ident-task",
        'entity': "ivan-havlytskyiz",
        'name': "aya-101-encoder-a100"
    }

    # Also a plain class attribute; unpacked into `LoraConfig`.
    lora_args = {
        'r': 16,
        'bias': "none",
        'lora_alpha': 32,
        'lora_dropout': 0.05,
        # 'layers_to_transform': list(range(16, 42))
    }

# Single shared configuration object used by the rest of the notebook.
config = Config()

In [4]:
def set_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Covers the ``PYTHONHASHSEED`` environment variable, Python's ``random``,
    NumPy, and PyTorch (CPU, plus all CUDA devices when CUDA is available).

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same call order as before: random -> numpy -> torch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
        

# Fix all RNG seeds once, before any model or data work.
set_seeds(seed=42)
# Register the `progress_apply` methods on pandas objects (preprocess_df relies on them).
tqdm.pandas()

In [None]:
import huggingface_hub
import wandb

# Start the W&B run with the project / entity / run name stored on Config.
# NOTE(review): `config.wandb_key` is not passed here and `huggingface_hub` is imported
# in this cell but unused — authentication presumably relies on the CLI logins; confirm.
wandb.init(**config.wandb_init_args)

# Training Arguments

In [None]:
# Hyper-parameters for the Hugging Face Trainer. Checkpoint and log directories are
# named after the W&B run name.
training_args = TrainingArguments(
    output_dir=f'./model_checkpoints_{config.wandb_init_args["name"]}',
    logging_dir=f'./model_logs_{config.wandb_init_args["name"]}',
    # Optimisation schedule: cosine learning-rate schedule starting from 1e-4, no warm-up.
    learning_rate=1e-4,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    warmup_ratio=0.0,
    num_train_epochs=5,
    # 4 samples per device x 2 accumulation steps = 8 samples per optimizer step per device.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    bf16=True,
    report_to="wandb",
    optim='adamw_8bit',
    # Evaluate and checkpoint on the same 200-step cadence, so every evaluation has a
    # matching checkpoint to restore at the end.
    eval_strategy='steps',
    save_strategy="steps",
    eval_steps=200,
    logging_steps=20,
    save_steps=200,
    save_total_limit=10,
    # Model selection uses the "f1" value returned by the trainer's compute_metrics
    # (presumably exposed by the Trainer under the "eval_" prefix — TODO confirm).
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    load_best_model_at_end=True,
)

# LoRA config

In [None]:
# LoRA adapter definition for token classification; rank/alpha/dropout/bias come from Config.
lora_config = LoraConfig(
    **config.lora_args,
    # Adapters are attached to the attention projections (q, k, v, o) and additionally
    # to `wi_0`, so this is NOT attention-only.
    # NOTE(review): `wi_0` looks like a feed-forward projection of the T5 block —
    # confirm that including it is intended.
    target_modules=['o', 'v', "q", "k", "wi_0"],
    task_type=TaskType.TOKEN_CLS,
)

# Instantiate the tokenizer & model

In [None]:
# Tokenizer matching the base checkpoint configured on Config.
tokenizer = AutoTokenizer.from_pretrained(config.pretrained)
# NOTE(review): whether this tokenizer class honours an `add_eos_token` attribute is not
# visible here — verify that it actually changes the encoded output.
tokenizer.add_eos_token = True  # We'll add <eos> at the end
# Pad on the right so real tokens keep their positions.
tokenizer.padding_side = "right"

In [None]:
import torch
from transformers.models.t5.modeling_t5 import T5ForTokenClassification
from transformers import BitsAndBytesConfig
from peft import get_peft_config, prepare_model_for_kbit_training, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType

# 4-bit NF4 quantisation settings (single quantisation, bfloat16 compute dtype).
nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=False,
   bnb_4bit_compute_dtype=torch.bfloat16
)


# Load the base checkpoint as a 2-label token classifier, quantised with the config above
# and placed on the available devices automatically.
model = T5ForTokenClassification.from_pretrained(
    config.pretrained,
    num_labels=2,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=nf4_config
)


# Order matters: disable the cache flag, prepare the quantised model for training,
# then wrap it with the LoRA adapters.
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Report how many parameters remain trainable after wrapping.
model.print_trainable_parameters()

# Data

In [None]:
import pandas as pd

# Training rows joined (left join on `id`) with the cross-validation split file.
cv = pd.read_csv(config.cv_path)
df = pd.read_parquet(config.data_path + "train.parquet").merge(cv, on='id', how='left')

# Competition test rows.
df_test = pd.read_csv(config.data_path + "test.csv")

In [None]:
def convert_to_seq_labeling(text, tokenizer, max_length=None, trigger_spans=None):
    """Tokenize ``text`` and attach a binary label for every token.

    A token gets label 1 when its character range overlaps at least one span in
    ``trigger_spans`` and 0 otherwise. Tokens whose offset is ``(0, 0)`` are never
    labelled positive.

    Args:
        text: Raw input string.
        tokenizer: Callable tokenizer that supports ``return_offsets_mapping=True``
            and returns a mapping with ``"input_ids"`` and ``"offset_mapping"``.
        max_length: Optional truncation length; truncation is enabled only when
            this is not ``None``.
        trigger_spans: Optional iterable of ``(start, end)`` character spans
            (end exclusive). ``None`` yields all-zero labels.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry: a list of 0/1 ints,
        one per token.
    """
    tokenized_output = tokenizer(
        text,
        return_offsets_mapping=True,
        add_special_tokens=True,
        max_length=max_length,
        truncation=(max_length is not None),
        padding=False,
    )
    offsets = tokenized_output["offset_mapping"]

    # Start with every token labelled 0.
    labels = [0] * len(tokenized_output["input_ids"])

    if trigger_spans is not None:
        # Unpack once so malformed spans still fail loudly and one-shot iterables work.
        spans = [(start, end) for start, end in trigger_spans]
        for i, (tok_start, tok_end) in enumerate(offsets):
            if tok_start == 0 and tok_end == 0:
                # (0, 0) offsets are skipped and keep label 0.
                continue
            # Half-open interval overlap test against every trigger span.
            if any(tok_start < end and tok_end > start for start, end in spans):
                labels[i] = 1

    tokenized_output['labels'] = labels
    return tokenized_output


def preprocess_df(df, max_length):
    """Tokenize and label every row of ``df``, adding the results as new columns.

    Each row's ``content`` is encoded with the notebook-level ``tokenizer``; the
    row's ``trigger_words`` are used for labelling when that column exists
    (otherwise ``None`` is passed). The full encoding is stored in ``seq_labels``
    and every key of the encoding is also expanded into its own column.

    Args:
        df: Frame with a ``content`` column; modified in place.
        max_length: Truncation length forwarded to the tokenizer, or ``None``.

    Returns:
        The same ``df`` object with the added columns.
    """
    tqdm.pandas()

    def _encode_row(row):
        # `row.get` covers frames that have no `trigger_words` column.
        return convert_to_seq_labeling(
            text=row['content'],
            tokenizer=tokenizer,
            trigger_spans=row.get('trigger_words', None),
            max_length=max_length,
        )

    df['seq_labels'] = df.progress_apply(_encode_row, axis=1)

    # One column per tokenizer output key, read from the first row's encoding.
    encoding_keys = df.seq_labels.iloc[0].keys()
    for key in encoding_keys:
        df[key] = df.seq_labels.apply(lambda encoding, key=key: encoding.get(key))

    return df

In [None]:
# Rows with no annotation hold None; use an empty list so span iteration works uniformly.
df.trigger_words = df.trigger_words.apply(lambda x: [] if x is None else x)

# Fold 4 is held out for validation; all other folds are used for training.
is_valid_mask = (df.fold == 4)
df_train = df[~is_valid_mask].copy()
df_valid = df[is_valid_mask].copy()


# Only the training split is truncated to config.max_length; validation and test are
# tokenized without truncation (max_length=None).
df_train = preprocess_df(df_train, max_length=config.max_length)
df_valid = preprocess_df(df_valid, max_length=None)
# NOTE(review): df_test presumably has no `trigger_words` column, in which case its
# labels are all 0 placeholders — confirm.
df_test = preprocess_df(df_test, max_length=None)

In [None]:
# Tokenizer output keys (read from the first training row) plus the raw text;
# labelled splits additionally keep the gold spans.
train_columns = [*df_train.seq_labels.iloc[0].keys(), 'content', 'trigger_words']
test_columns = [*df_train.seq_labels.iloc[0].keys(), 'content']

# Convert each split to a Hugging Face Dataset with a fresh 0..n-1 index.
ds_train, ds_valid = (
    Dataset.from_pandas(labelled_df[train_columns].reset_index(drop=True))
    for labelled_df in (df_train, df_valid)
)
ds_test = Dataset.from_pandas(df_test[test_columns].reset_index(drop=True))

# Custom Trainer

In [None]:
from itertools import chain

# Share of positive (label 1) tokens across the training AND validation splits;
# later used as the target positive ratio for threshold selection.
train_labels = df_train.labels.tolist() + df_valid.labels.tolist()
positive_class_balance = pd.Series(list(chain.from_iterable(train_labels))).mean()
positive_class_balance

In [None]:
import math
from transformers import Trainer, pipeline, TrainingArguments
from typing import Any
from tqdm.autonotebook import tqdm
from transformers.trainer_utils import EvalPrediction

def extract_chars_from_spans(spans):
    """Expand ``(start, end)`` spans into the set of character indices they cover.

    Each span is half-open: it contributes ``start, start + 1, ..., end - 1``.
    Overlapping spans are merged naturally because the result is a set.

    Args:
        spans: Iterable of ``(start, end)`` integer pairs.

    Returns:
        A ``set`` of integer character positions (empty when ``spans`` is empty).
    """
    return {
        position
        for span_start, span_end in spans
        for position in range(span_start, span_end)
    }

class SpanEvaluationTrainer(Trainer):
    """Trainer whose evaluation metric is a character-level span precision/recall/F1.

    Token probabilities are binarised with a threshold chosen so that the share of
    predicted-positive tokens matches ``desired_positive_ratio``; consecutive positive
    tokens are merged into character spans and compared with the gold spans.
    """

    def __init__(
        self,
        model: Any = None,
        args: TrainingArguments = None,
        data_collator: Any = None,
        train_dataset: Any = None,
        eval_dataset: Any = None,
        tokenizer: Any = None,
        desired_positive_ratio: float = 0.25,
        **kwargs,
    ):
        """
        Initialize the Trainer with our custom compute_metrics.

        Args:
            model, args, data_collator, train_dataset, eval_dataset, tokenizer:
                Forwarded unchanged to ``Trainer.__init__``.
            desired_positive_ratio: Target fraction of tokens predicted positive,
                used by ``_find_optimal_threshold``.
            **kwargs: Any further ``Trainer`` keyword arguments.
        """
        super().__init__(
            model=model,
            args=args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            compute_metrics=self.compute_metrics,  # assign our custom compute_metrics
            **kwargs,
        )
        self.desired_positive_ratio = desired_positive_ratio

    @staticmethod
    def _decode_spans(token_labels, offsets):
        """Merge runs of positive tokens into ``(start, end)`` character spans.

        Tokens with a ``(0, 0)`` offset carry no text, so they are treated as
        negative. Otherwise a positive EOS token would reset the end of the open
        span to 0 and the whole span would be lost.
        """
        spans = []
        current_span = None
        for token_label, (tok_start, tok_end) in zip(token_labels, offsets):
            is_textless = tok_start == 0 and tok_end == 0
            if token_label == 1 and not is_textless:
                if current_span is None:
                    current_span = [tok_start, tok_end]  # open a new span
                else:
                    current_span[1] = tok_end  # extend the open span
            elif current_span is not None:
                spans.append(tuple(current_span))  # close the open span
                current_span = None

        # A span still open after the last token is closed here.
        if current_span is not None:
            spans.append(tuple(current_span))
        return spans

    def _calculate_inner_metric(self, gt_spans_all, pred_spans_all):
        """Micro-averaged character-level precision, recall and F1.

        Args:
            gt_spans_all: Per-sample gold spans; a sample may also be the string
                representation of a span list.
            pred_spans_all: Per-sample predicted spans.

        Returns:
            Dict with ``"precision"``, ``"recall"`` and ``"f1"`` (0 whenever a
            denominator is 0).
        """
        import ast  # local import keeps this class independent of later cells

        total_true_chars = 0
        total_pred_chars = 0
        total_overlap_chars = 0
        for true_spans, pred_spans in zip(gt_spans_all, pred_spans_all):
            if isinstance(true_spans, str):
                # literal_eval parses Python literals only, unlike eval() which
                # would execute arbitrary code. Unparseable input -> no gold spans.
                try:
                    true_spans = ast.literal_eval(true_spans)
                except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                    true_spans = []

            # Convert spans to sets of character indices.
            true_chars = extract_chars_from_spans(true_spans)
            pred_chars = extract_chars_from_spans(pred_spans)

            total_true_chars += len(true_chars)
            total_pred_chars += len(pred_chars)
            total_overlap_chars += len(true_chars & pred_chars)

        # Compute precision, recall, and F1.
        precision = total_overlap_chars / total_pred_chars if total_pred_chars > 0 else 0
        recall = total_overlap_chars / total_true_chars if total_true_chars > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        return {
            "precision": precision,
            "recall": recall,
            "f1": f1,
        }

    def _find_optimal_threshold(self, probabilities, labels):
        """Find the threshold whose predicted-positive ratio is closest to the target.

        Args:
            probabilities: Array indexed as ``[sample, token, class]``; class 1 is
                the positive class.
            labels: Array aligned with the first two axes of ``probabilities``;
                positions equal to ``-100`` are ignored.

        Returns:
            The first of 100 evenly spaced thresholds in ``[0.01, 0.99]`` that
            minimises ``|positive_ratio - desired_positive_ratio|``.
        """
        # Select the scored (non -100) positions once instead of per threshold.
        positive_probs = np.asarray(probabilities)[:, :, 1]
        scored_probs = positive_probs[np.asarray(labels) != -100]
        total = scored_probs.size

        best_diff = float("inf")
        optimal_th = 0.5  # fallback, as before, if no candidate improves on infinity

        for thold in np.linspace(0.01, 0.99, num=100):
            total_pos = int(np.count_nonzero(scored_probs >= thold))
            positive_ratio = total_pos / total if total > 0 else 0

            diff = abs(positive_ratio - self.desired_positive_ratio)
            if diff < best_diff:  # strict: ties keep the smallest threshold
                best_diff = diff
                optimal_th = thold

        return optimal_th

    def compute_metrics(self, eval_pred: EvalPrediction) -> dict:
        """Score predictions against the gold spans of ``self.eval_dataset``.

        NOTE(review): offsets and gold spans are always read from
        ``self.eval_dataset``, so the result is only meaningful when ``eval_pred``
        was produced from that same dataset in the same order.

        Args:
            eval_pred: ``(logits, labels)`` pair.

        Returns:
            Dict with ``"precision"``, ``"recall"``, ``"f1"`` and the ``"thold"``
            that was applied.
        """
        eval_dataset = self.eval_dataset
        logits, labels = eval_pred
        probabilities = torch.softmax(torch.tensor(logits), dim=-1).cpu().numpy()

        # Read the dataset columns once; they do not depend on the threshold.
        offset_mappings = eval_dataset['offset_mapping']
        gt_spans_all = eval_dataset['trigger_words']

        #thresholds = np.linspace(0.1, 0.5, num=41)
        thresholds = [self._find_optimal_threshold(probabilities, labels)]
        best_f1 = -1
        best_metrics = None

        for thold in tqdm(thresholds):
            # Apply thresholding instead of argmax
            predictions = (probabilities[:, :, 1] >= thold).astype(int)

            # Drop padded positions (label -100) so predictions line up with offsets.
            true_predictions = [
                [p for (p, l) in zip(prediction, label) if l != -100]
                for prediction, label in zip(predictions, labels)
            ]

            pred_spans_all = [
                self._decode_spans(pred, offsets)
                for pred, offsets in zip(true_predictions, offset_mappings)
            ]

            current_metrics = self._calculate_inner_metric(gt_spans_all, pred_spans_all)
            if current_metrics['f1'] >= best_f1:
                best_f1 = current_metrics['f1']
                best_metrics = current_metrics
                best_metrics['thold'] = thold

        return best_metrics

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator that batches token-classification samples using this tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Train on ds_train and evaluate on ds_valid. The threshold search targets the
# positive-token share measured on the train + validation labels.
trainer = SpanEvaluationTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
    desired_positive_ratio=positive_class_balance
)

In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

# Predict

In [None]:
# Checkpoint directory used for inference.
# NOTE(review): the step number 600 is hard-coded — confirm this checkpoint exists and
# is the one that should be used.
FINETUNED_MODEL = f'./model_checkpoints_{config.wandb_init_args["name"]}/checkpoint-600'

In [None]:
# Restore the weights from the chosen checkpoint.
# NOTE(review): `_load_from_checkpoint` is an underscore-prefixed (private) Trainer
# method, and training was configured with load_best_model_at_end=True — confirm this
# explicit reload is still wanted.
trainer._load_from_checkpoint(FINETUNED_MODEL)

In [None]:
# Raw model outputs for the validation split.
valid_preds = trainer.predict(ds_valid)

In [None]:
# Re-run the metric on the validation predictions only to read back the threshold it chose.
val_th = trainer.compute_metrics((valid_preds.predictions, valid_preds.label_ids))['thold']

In [None]:
# Raw model outputs for the test split, converted to class probabilities.
test_preds = trainer.predict(ds_test)
test_probabilities = torch.softmax(torch.tensor(test_preds.predictions), dim=-1).cpu().numpy()

# Threshold at which the share of predicted-positive test tokens is closest to the
# trainer's desired positive ratio; the labels are only used to skip -100 positions.
test_th = trainer._find_optimal_threshold(test_probabilities, test_preds.label_ids)

In [None]:
# Final decision threshold: mean of the validation and test thresholds, lowered by 0.15.
# NOTE(review): the 0.15 offset is an undocumented magic number, and nothing keeps the
# result inside (0, 1) — consider moving it to Config with a rationale.
final_th = (val_th+test_th)/2 - 0.15
final_th

## Metric

In [None]:
import pandas as pd
import pandas.api.types
from sklearn.metrics import f1_score
import ast


class ParticipantVisibleError(Exception):
    """Exception type reserved for errors that participants are allowed to see."""


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Compute the micro-averaged, character-level F1 between gold and predicted spans.

    Every span ``(start, end)`` is expanded to the character positions
    ``start .. end - 1``. Overlap, gold and predicted position counts are summed
    over all rows present in both frames, and precision / recall / F1 are computed
    from those totals.

    Parameters:
    - solution (pd.DataFrame): Ground truth with the row-id column and a
      ``trigger_words`` column (list of spans, or its string representation).
    - submission (pd.DataFrame): Predictions with the same two columns.
    - row_id_column_name (str): Name of the row-id column used to validate and
      join the two frames.

    Returns:
    - float: The character-level micro F1 (0 when precision + recall is 0).

    Raises:
    - ValueError: If either frame lacks the row-id or ``trigger_words`` column.

    Example:
    >>> solution = pd.DataFrame({
    ...     "id": [1, 2, 3],
    ...     "trigger_words": [[(612, 622), (725, 831)], [(300, 312)], []]
    ... })
    >>> submission = pd.DataFrame({
    ...     "id": [1, 2, 3],
    ...     "trigger_words": [[(612, 622), (700, 720)], [(300, 312)], [(100, 200)]]
    ... })
    >>> score(solution, submission, "id")
    0.16296296296296295
    """
    required_columns = [row_id_column_name, "trigger_words"]
    if not all(col in solution.columns for col in required_columns):
        raise ValueError(
            f"Solution DataFrame must contain '{row_id_column_name}' and 'trigger_words' columns."
        )
    if not all(col in submission.columns for col in required_columns):
        raise ValueError(
            f"Submission DataFrame must contain '{row_id_column_name}' and 'trigger_words' columns."
        )

    def safe_parse_spans(trigger_words):
        # Strings are parsed as Python literals; anything unparseable or of an
        # unexpected type counts as "no spans".
        if isinstance(trigger_words, str):
            try:
                return ast.literal_eval(trigger_words)
            except (ValueError, SyntaxError):
                return []
        if isinstance(trigger_words, (list, tuple, np.ndarray)):
            return trigger_words
        return []

    def extract_tokens_from_spans(spans):
        # Half-open spans -> set of covered character positions.
        tokens = set()
        for start, end in spans:
            tokens.update(range(start, end))
        return tokens

    # Work on copies so the callers' frames are not modified.
    solution = solution.copy()
    submission = submission.copy()

    solution["trigger_words"] = solution["trigger_words"].apply(safe_parse_spans)
    submission["trigger_words"] = submission["trigger_words"].apply(safe_parse_spans)

    # Inner join: only ids present in both frames are scored.
    merged = pd.merge(
        solution,
        submission,
        on=row_id_column_name,
        suffixes=("_solution", "_submission")
    )

    total_true_tokens = 0
    total_pred_tokens = 0
    overlapping_tokens = 0

    for true_spans, pred_spans in zip(
        merged["trigger_words_solution"], merged["trigger_words_submission"]
    ):
        true_tokens = extract_tokens_from_spans(true_spans)
        pred_tokens = extract_tokens_from_spans(pred_spans)

        total_true_tokens += len(true_tokens)
        total_pred_tokens += len(pred_tokens)
        overlapping_tokens += len(true_tokens & pred_tokens)

    precision = overlapping_tokens / total_pred_tokens if total_pred_tokens > 0 else 0
    recall = overlapping_tokens / total_true_tokens if total_true_tokens > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return f1

## Span level

In [None]:
def _spans_from_token_labels(token_labels, offsets):
    """Merge runs of positive tokens into ``(start, end)`` character spans."""
    spans = []
    current_span = None
    for token_label, (tok_start, tok_end) in zip(token_labels, offsets):
        # Tokens with a (0, 0) offset carry no text. Counting them as positive would
        # reset the end of the open span to 0 and drop it, so treat them as negative.
        is_textless = tok_start == 0 and tok_end == 0
        if token_label == 1 and not is_textless:
            if current_span is None:
                current_span = [tok_start, tok_end]  # open a new span
            else:
                current_span[1] = tok_end  # extend the open span
        elif current_span is not None:
            spans.append(tuple(current_span))  # close the open span
            current_span = None

    # A span still open after the last token is closed here.
    if current_span is not None:
        spans.append(tuple(current_span))
    return spans


def inference_aggregation(probabilities, labels, offset_mappings, thold):
    """Turn token probabilities into per-sample character spans.

    Args:
        probabilities: Array indexed as ``[sample, token, class]``; class 1 is the
            positive class.
        labels: Per-sample label sequences aligned with the token axis; positions
            equal to ``-100`` are dropped before decoding.
        offset_mappings: Per-sample sequences of ``(start, end)`` character offsets,
            one per remaining token.
        thold: A token is positive when its class-1 probability is ``>= thold``.

    Returns:
        A list with one string per sample: the ``str()`` of that sample's list of
        ``(start, end)`` spans.
    """
    predictions = (probabilities[:, :, 1] >= thold).astype(int)

    # Drop the -100 positions so predictions line up with the offsets.
    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    pred_spans_all = [
        _spans_from_token_labels(pred, offsets)
        for pred, offsets in zip(true_predictions, offset_mappings)
    ]
    return [str(row) for row in pred_spans_all]

In [None]:
# Class probabilities for the validation split, decoded into span strings at the final threshold.
valid_probabilities = torch.softmax(torch.tensor(valid_preds.predictions), dim=-1).cpu().numpy()
valid_results = inference_aggregation(valid_probabilities, valid_preds.label_ids, ds_valid['offset_mapping'], final_th)

In [None]:
from copy import deepcopy

# Gold spans of the validation fold (fold 4) and a same-shaped frame carrying the
# decoded predictions; rows are assumed to be in the same order as `valid_results`.
df_gt = df.loc[df.fold == 4, ['id', 'trigger_words']].reset_index(drop=True)
df_pred = df_gt.copy(deep=True)
df_pred['trigger_words'] = valid_results

# Cross-validation score at the final threshold.
cv_f1 = score(df_gt, df_pred, row_id_column_name='id')

cv_f1

In [None]:
# Decode the test-split probabilities into span strings with the same final threshold.
test_results = inference_aggregation(test_probabilities, test_preds.label_ids, ds_test['offset_mapping'], final_th)

In [None]:
# Build the submission from the sample file, reusing the configured data directory
# instead of repeating the path.
# NOTE(review): this assumes sample_submission.csv lists rows in the same order as
# test.csv — confirm, or align on `id`.
ss = pd.read_csv(config.data_path + 'sample_submission.csv')
ss['trigger_words'] = test_results

In [None]:
# Save the test-set submission, tagging the file name with the run name and CV score.
# - Inner single quotes: reusing double quotes inside an f-string is a SyntaxError
#   before Python 3.12.
# - `ss` (test predictions) is written rather than `df_pred` (validation predictions),
#   which was otherwise the only frame saved while the submission was never written.
# - index=False keeps the file to the submission columns only.
ss.to_csv(f"{config.wandb_init_args['name']}-cv{cv_f1:.2f}.csv", index=False)