In [1]:
import os
import random
import numpy as np
import torch
from tqdm.autonotebook import tqdm

def set_seeds(seed):
    """Set seeds for reproducibility """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        

set_seeds(seed=42)
tqdm.pandas()

  from tqdm.autonotebook import tqdm


In [2]:
DATA_PATH = '/kaggle/input/unlp-2025-shared-task-span-identification/'
CV_PATH = "/kaggle/input/unlp25-cross-validation-split/cv_split.csv"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
PRETRAINED_MODEL = "microsoft/mdeberta-v3-base"
TRAIN_LEN = 512

# Data

In [4]:
import pandas as pd

df = pd.read_parquet(DATA_PATH + "train.parquet")
cv = pd.read_csv(CV_PATH)
df = df.merge(cv, on='id', how='left')

df_test = pd.read_csv(DATA_PATH + "test.csv")

In [5]:
df.head()

Unnamed: 0,id,content,lang,manipulative,techniques,trigger_words,fold
0,0bb0c7fa-101b-4583-a5f9-9d503339141c,Новий огляд мапи DeepState від російського вій...,uk,True,"[euphoria, loaded_language]","[[27, 63], [65, 88], [90, 183], [186, 308]]",1
1,7159f802-6f99-4e9d-97bd-6f565a4a0fae,Недавно 95 квартал жёстко поглумился над русск...,ru,True,"[loaded_language, cherry_picking]","[[0, 40], [123, 137], [180, 251], [253, 274]]",3
2,e6a427f1-211f-405f-bd8b-70798458d656,🤩\nТим часом йде евакуація Бєлгородського авто...,uk,True,"[loaded_language, euphoria]","[[55, 100]]",1
3,1647a352-4cd3-40f6-bfa1-d87d42e34eea,В Україні найближчим часом мають намір посилит...,uk,False,,,2
4,9c01de00-841f-4b50-9407-104e9ffb03bf,"Расчёты 122-мм САУ 2С1 ""Гвоздика"" 132-й бригад...",ru,True,[loaded_language],"[[114, 144]]",2


# Targets Prep

## Classification

In [6]:
from collections.abc import Iterable

techniques = ['straw_man', 'appeal_to_fear', 'fud', 'bandwagon', 'whataboutism', 'loaded_language', 'glittering_generalities', 'euphoria', 'cherry_picking', 'cliche']

for col in techniques:
    df[col] = 0

import numpy as np
for ind, row in df.iterrows():
    if isinstance(row['techniques'], Iterable):
        for t in row['techniques']:
            df.loc[ind, t] = 1

df['sequence_labels'] = list(df[techniques].values)
# df.drop(columns=techniques, inplace=True)

In [7]:
df.head()

Unnamed: 0,id,content,lang,manipulative,techniques,trigger_words,fold,straw_man,appeal_to_fear,fud,bandwagon,whataboutism,loaded_language,glittering_generalities,euphoria,cherry_picking,cliche,sequence_labels
0,0bb0c7fa-101b-4583-a5f9-9d503339141c,Новий огляд мапи DeepState від російського вій...,uk,True,"[euphoria, loaded_language]","[[27, 63], [65, 88], [90, 183], [186, 308]]",1,0,0,0,0,0,1,0,1,0,0,"[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]"
1,7159f802-6f99-4e9d-97bd-6f565a4a0fae,Недавно 95 квартал жёстко поглумился над русск...,ru,True,"[loaded_language, cherry_picking]","[[0, 40], [123, 137], [180, 251], [253, 274]]",3,0,0,0,0,0,1,0,0,1,0,"[0, 0, 0, 0, 0, 1, 0, 0, 1, 0]"
2,e6a427f1-211f-405f-bd8b-70798458d656,🤩\nТим часом йде евакуація Бєлгородського авто...,uk,True,"[loaded_language, euphoria]","[[55, 100]]",1,0,0,0,0,0,1,0,1,0,0,"[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]"
3,1647a352-4cd3-40f6-bfa1-d87d42e34eea,В Україні найближчим часом мають намір посилит...,uk,False,,,2,0,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,9c01de00-841f-4b50-9407-104e9ffb03bf,"Расчёты 122-мм САУ 2С1 ""Гвоздика"" 132-й бригад...",ru,True,[loaded_language],"[[114, 144]]",2,0,0,0,0,0,1,0,0,0,0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"


## Span

In [8]:
from transformers import AutoTokenizer
import pandas as pd

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]



In [9]:
def convert_to_seq_labeling(text, tokenizer, max_length=None, trigger_spans=None):
    tokenized_output = tokenizer(
        text,
        return_offsets_mapping=True,
        add_special_tokens=True,
        
        max_length=max_length,
        truncation=(max_length is not None),
        padding=False
    )
    tokens = tokenized_output["input_ids"]
    offsets = tokenized_output["offset_mapping"]

    # Get subword tokenized versions of the text
    token_strings = tokenizer.convert_ids_to_tokens(tokens)

    
    # Initialize labels as 'O'
    labels = [0] * len(tokens)

    if trigger_spans is not None:
        # Assign 'TRIGGER' to overlapping tokens
        for start, end in trigger_spans:
            for i, (tok_start, tok_end) in enumerate(offsets):
                if tok_start == 0 and tok_end == 0:
                    continue
                if tok_start < end and tok_end > start:  # If token overlaps with the trigger span
                    labels[i] = 1

    tokenized_output['labels'] = labels
    return tokenized_output

In [10]:
def preprocess_df(df, max_length):
    """Modified processing incorporating trigger span handling"""
    tqdm.pandas()
    
    df['seq_labels'] = df.progress_apply(
        lambda row: convert_to_seq_labeling(
            text=row['content'],
            tokenizer=tokenizer,
            trigger_spans=row.get('trigger_words', None),  # Handle both validation and test cases
            max_length=max_length
        ),
        axis=1
    )
    
    # Extract all tokenizer outputs
    for column in df.seq_labels.iloc[0].keys():
        df[column] = df.seq_labels.apply(lambda x: x.get(column))

    if "sequence_labels" not in df.columns:
        df["sequence_labels"] = [[0]*10]*df.shape[0]
    
    return df

In [11]:
df.trigger_words = df.trigger_words.apply(lambda x: [] if x is None else x)

is_valid_mask = (df.fold == 4)
df_train = df[~is_valid_mask].copy()
df_valid = df[is_valid_mask].copy()


df_train = preprocess_df(df_train, max_length=TRAIN_LEN)
df_valid = preprocess_df(df_valid, max_length=None)
df_test = preprocess_df(df_test, max_length=None)

  0%|          | 0/3058 [00:00<?, ?it/s]

  0%|          | 0/764 [00:00<?, ?it/s]

  0%|          | 0/5735 [00:00<?, ?it/s]

In [12]:
df_train.head(2)

Unnamed: 0,id,content,lang,manipulative,techniques,trigger_words,fold,straw_man,appeal_to_fear,fud,...,euphoria,cherry_picking,cliche,sequence_labels,seq_labels,input_ids,token_type_ids,attention_mask,offset_mapping,labels
0,0bb0c7fa-101b-4583-a5f9-9d503339141c,Новий огляд мапи DeepState від російського вій...,uk,True,"[euphoria, loaded_language]","[[27, 63], [65, 88], [90, 183], [186, 308]]",1,0,0,0,...,1,0,0,"[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]","[input_ids, token_type_ids, attention_mask, of...","[1, 55816, 544, 260, 84748, 3554, 14381, 29189...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 4), (4, 5), (5, 6), (6, 11), (11,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ..."
1,7159f802-6f99-4e9d-97bd-6f565a4a0fae,Недавно 95 квартал жёстко поглумился над русск...,ru,True,"[loaded_language, cherry_picking]","[[0, 40], [123, 137], [180, 251], [253, 274]]",3,0,0,0,...,0,1,0,"[0, 0, 0, 0, 0, 1, 0, 0, 1, 0]","[input_ids, token_type_ids, attention_mask, of...","[1, 1909, 21922, 6943, 64148, 1774, 20485, 456...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 2), (2, 7), (7, 10), (10, 18), (1...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ..."


# Datasets

In [15]:
from datasets import Dataset
import numpy as np

df['is_valid'] = (df.fold == 4)

train_columns = list(df_train.seq_labels.iloc[0].keys()) +\
                ['content', 'trigger_words', 'sequence_labels']
test_columns = list(df_train.seq_labels.iloc[0].keys()) + ['content', 'sequence_labels']

ds_train = Dataset.from_pandas(df_train[train_columns].reset_index(drop=True))
ds_valid = Dataset.from_pandas(df_valid[train_columns].reset_index(drop=True))
ds_test = Dataset.from_pandas(df_test[test_columns].reset_index(drop=True))

In [16]:
ds_train.to_pandas().head(2)

Unnamed: 0,input_ids,token_type_ids,attention_mask,offset_mapping,labels,content,trigger_words,sequence_labels
0,"[1, 55816, 544, 260, 84748, 3554, 14381, 29189...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[0, 0], [0, 4], [4, 5], [5, 6], [6, 11], [11,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...",Новий огляд мапи DeepState від російського вій...,"[[27, 63], [65, 88], [90, 183], [186, 308]]","[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]"
1,"[1, 1909, 21922, 6943, 64148, 1774, 20485, 456...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[0, 0], [0, 2], [2, 7], [7, 10], [10, 18], [1...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...",Недавно 95 квартал жёстко поглумился над русск...,"[[0, 40], [123, 137], [180, 251], [253, 274]]","[0, 0, 0, 0, 0, 1, 0, 0, 1, 0]"


In [17]:
max_length_train = max(len(x) for x in ds_train['input_ids'])
max_length_val = max(len(x) for x in ds_valid['input_ids'])
max_length_test = max(len(x) for x in ds_test['input_ids'])

print(max_length_train)
print(max_length_val)
print(max_length_test)

512
1516
1445


# Model

In [18]:
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoModel
from transformers import DebertaV2Model, DebertaV2PreTrainedModel

In [19]:
class BertForTokenSequenceClassification(DebertaV2PreTrainedModel):
    def __init__(self, model_name,
                 token_loss_weight, sequence_loss_weight,
                 num_token_labels, num_sequence_labels):
        bert_model = DebertaV2Model.from_pretrained(model_name)
        super().__init__(bert_model.config)
        
        self.bert = bert_model
        hidden_size = self.config.hidden_size
        self.token_classifier = nn.Linear(hidden_size, num_token_labels)
        self.sequence_classifier = nn.Linear(hidden_size, num_sequence_labels)
        self.token_loss_weight=token_loss_weight
        self.sequence_loss_weight=sequence_loss_weight

        # Initialize weights and apply final processing
        self.post_init()

    def forward(self, input_ids, attention_mask, labels=None, sequence_labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state  # Shape: (batch, seq_len, hidden)

        # Token Classification Output (Apply to each token)
        token_logits = self.token_classifier(sequence_output)  # (batch, seq_len, num_token_labels)

        # Sequence Classification Output (Use [CLS] token's representation)
        cls_output = sequence_output[:, 0, :]  # Take first token (CLS)
        sequence_logits = self.sequence_classifier(cls_output)  # (batch, num_sequence_labels)

        loss = None
        if labels is not None and sequence_labels is not None:
            token_loss_fn = nn.CrossEntropyLoss()
            seq_loss_fn = nn.BCEWithLogitsLoss()  # For multi-label classification

            token_loss = token_loss_fn(token_logits.view(-1, token_logits.shape[-1]), labels.view(-1))
            seq_loss = seq_loss_fn(sequence_logits, sequence_labels.float())

            loss = self.token_loss_weight * token_loss +\
                   self.sequence_loss_weight * seq_loss  # Combine losses

        return {
            "loss": loss,
            "token_logits": token_logits,
            "sequence_logits": sequence_logits,
        }

## Init and Test

In [20]:
sample = ds_train[0]

# Convert input to batch format (add batch dimension)
input_ids = torch.tensor([sample["input_ids"]])
attention_mask = torch.tensor([sample["attention_mask"]])
token_labels = torch.tensor([sample["labels"]])
sequence_labels = torch.tensor([sample["sequence_labels"]])

In [21]:
model = BertForTokenSequenceClassification(
    model_name=PRETRAINED_MODEL,
    token_loss_weight=1, sequence_loss_weight=1,
    num_token_labels=2, num_sequence_labels=10
)

model

pytorch_model.bin:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

BertForTokenSequenceClassification(
  (bert): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(251000, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

## Metrics

In [22]:
from itertools import chain

TOKEN_CLASS_DISTRIBUTION = pd.Series(list(chain(*df_train.labels.tolist()))).mean()
SEQUENCE_CLASS_DISTRIBUTION = df_train[techniques].mean().values

In [23]:
import math
from scipy.optimize import minimize_scalar
from transformers import Trainer, pipeline, TrainingArguments
from typing import Any, Dict, List, Optional, Union, Tuple
from tqdm.autonotebook import tqdm
from transformers.trainer_utils import EvalPrediction
from sklearn.metrics import f1_score


from transformers.trainer import nested_detach, is_sagemaker_mp_enabled


def extract_chars_from_spans(spans):
    """
    Given a list of spans (each a tuple (start, end)),
    return a set of character indices for all spans.
    """
    char_set = set()
    for start, end in spans:
        # Each span covers positions start, start+1, ..., end-1.
        char_set.update(range(start, end))
    return char_set


class TokenSequenceEvaluationTrainer(Trainer):
    def __init__(
        self,
        model: Any = None,
        args: TrainingArguments = None,
        data_collator: Any = None,
        train_dataset: Any = None,
        eval_dataset: Any = None,
        tokenizer: Any = None,
        sequence_class_distribution: list[float] = [0.1]*10,
        token_class_distribution: float = 0.25, # mean
        predict_tokens=True,
        **kwargs,
    ):
        """
        Initialize the Trainer with our custom compute_metrics.
        """
        super().__init__(
            model=model,
            args=args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            compute_metrics=self.compute_metrics,  # assign our custom compute_metrics
            **kwargs,
        )
        self.sequence_class_distribution = sequence_class_distribution
        self.token_class_distribution = token_class_distribution
        self.predict_tokens = predict_tokens
        
    def compute_metrics(self, eval_pred: EvalPrediction) -> dict:        
        token_logits, sequence_logits = eval_pred.predictions
        token_labels, sequence_labels = eval_pred.label_ids

        # Sequence classification metrics (multi-label)
        sequence_metrics = self._compute_sequence_metrics(sequence_logits, sequence_labels)
    
        # Token classification metrics
        token_metrics = self._compute_token_metrics(token_logits, token_labels)
        
        return {
            **{f"sequence_{key}": value for key, value in sequence_metrics.items()},
            **{f"token_{key}": value for key, value in token_metrics.items()}
        }


    # SEQUENCE
    def _compute_sequence_metrics(self, logits, labels):
        proba = torch.nn.functional.sigmoid(torch.tensor(logits)).numpy()
        optimal_thresholds = self._find_thresholds_for_distribution(
            proba, desired_distribution=self.sequence_class_distribution
        )
        binarized_preds = (proba >= np.array(optimal_thresholds)).astype(int)
        
        return {"f1": f1_score(labels, binarized_preds, average="macro")}

        
    def _find_thresholds_for_distribution(self, preds, desired_distribution):
        """
        Find thresholds for each class to achieve the desired class distribution.
    
        Args:
            preds (ndarray): Array of shape (num_samples, num_classes) with probabilities (after sigmoid).
            desired_distribution (list): Desired proportion of positive samples for each class.
    
        Returns:
            thresholds (list): List of thresholds for each class.
        """
        num_classes = preds.shape[1]
        thresholds = []
    
        for class_idx in range(num_classes):
            probs = preds[:, class_idx]
            desired_ratio = desired_distribution[class_idx]
    
            # Function to minimize the difference between actual and desired positive ratios
            def objective(threshold):
                predicted_ratio = (probs >= threshold).mean()
                return abs(predicted_ratio - desired_ratio)
    
            # Find the threshold using optimization
            result = minimize_scalar(objective, bounds=(0, 1), method="bounded")
            thresholds.append(result.x)
    
        return thresholds


    # TOKEN

    def _compute_token_metrics(self, logits, labels):
        eval_dataset = self.eval_dataset
        probabilities = torch.softmax(torch.tensor(logits), dim=-1).cpu().numpy()
    
        #thresholds = np.linspace(0.1, 0.5, num=41)
        thresholds = [self._find_optimal_threshold(probabilities, labels)]
        results = []
        best_f1 = -1
        best_th = 0
        best_metrics = None
    
        for thold in tqdm(thresholds):
            # Apply thresholding instead of argmax
            predictions = (probabilities[:, :, 1] >= thold).astype(int)
    
            true_predictions = [
                [p for (p, l) in zip(prediction, label) if l != -100]
                for prediction, label in zip(predictions, labels)
            ]
    
            pred_spans_all = []
            for pred, offsets in zip(true_predictions, eval_dataset['offset_mapping']):
                samplewise_spans = []
                current_span = None
                for token_label, span in zip(pred, offsets):
                    if token_label == 1:  # If the current token is labeled as an entity (1)
                        if current_span is None:
                            current_span = [span[0], span[1]]  # Start a new span
                        else:
                            current_span[1] = span[1]  # Extend the span to include the current token
                    else:  # If token_label == 0 (not an entity)
                        if current_span is not None:
                            samplewise_spans.append(tuple(current_span))  # Save completed span
                            current_span = None  # Reset for the next entity
    
                # If the last token was part of a span, save it
                if current_span is not None:
                    samplewise_spans.append(tuple(current_span))
    
                pred_spans_all.append(samplewise_spans)
    
            # Store results for this threshold
            current_metrics = self._calculate_inner_metric(eval_dataset['trigger_words'], pred_spans_all)
            if current_metrics['f1'] >= best_f1:
                best_f1 = current_metrics['f1']
                best_th = thold
                best_metrics = current_metrics
                best_metrics['thold'] = thold
                
            
            results.append(current_metrics)
        return best_metrics


    def _calculate_inner_metric(self, gt_spans_all, pred_spans_all):
        total_true_chars = 0
        total_pred_chars = 0
        total_overlap_chars = 0
        for true_spans, pred_spans in zip(gt_spans_all, pred_spans_all):
            if isinstance(true_spans, str):
                try:
                    true_spans = eval(true_spans)
                except Exception:
                    true_spans = []
                    
            # Convert spans to sets of character indices.
            true_chars = extract_chars_from_spans(true_spans)
            pred_chars = extract_chars_from_spans(pred_spans)
            
            total_true_chars += len(true_chars)
            total_pred_chars += len(pred_chars)
            total_overlap_chars += len(true_chars.intersection(pred_chars))
            
            union_chars = true_chars.union(pred_chars)
            
        # Compute precision, recall, and F1.
        precision = total_overlap_chars / total_pred_chars if total_pred_chars > 0 else 0
        recall = total_overlap_chars / total_true_chars if total_true_chars > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        metrics = {
            "precision": precision,
            "recall": recall,
            "f1": f1
        }
        return metrics

    def _find_optimal_threshold(self, probabilities, labels):
        """Finds the threshold that achieves the desired positive class balance."""
        best_th = 0.5  # Default starting point
        best_diff = float("inf")
        optimal_th = best_th
        
        for thold in np.linspace(0.01, 0.99, num=100):
            predictions = (probabilities[:, :, 1] >= thold).astype(int)
            true_predictions = [
                [p for (p, l) in zip(prediction, label) if l != -100]
                for prediction, label in zip(predictions, labels)
            ]
            total_pos = sum([sum(row for row in prediction) for prediction in true_predictions])
            total = sum([len(prediction) for prediction in true_predictions])
            
            positive_ratio = total_pos / total if total > 0 else 0
            
            diff = abs(positive_ratio - self.token_class_distribution)
            if diff < best_diff:
                best_diff = diff
                optimal_th = thold
        
        return optimal_th


    
            

## Train

In [24]:
EPOCHS = 2

In [25]:
from transformers import DataCollatorForTokenClassification

class CustomDataCollator(DataCollatorForTokenClassification):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    
    def __call__(self, features):
        # Separate token-level and sequence-level labels
        sequence_labels = [f.pop("sequence_labels", None) for f in features]
        
        # Use Hugging Face's built-in collator for token classification
        batch = super().torch_call(features)
        
        # Convert sequence labels to tensor
        if sequence_labels[0] is not None:
            batch["sequence_labels"] = torch.tensor(sequence_labels, dtype=torch.int64)
        
        return batch

# Use the custom data collator
data_collator = CustomDataCollator(tokenizer=tokenizer)

In [26]:
# from transformers import AdamW, get_linear_schedule_with_warmup

# optimizer = AdamW([
#     {'params': list(model.bert.parameters()), 'lr': 2e-5},
#     {'params': list(model.token_classifier.parameters()), 'lr': 1e-4},
#     {'params': list(model.sequence_classifier.parameters()), 'lr': 1e-4}
# ])

# scheduler = get_linear_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=0.1*EPOCHS*(ds_train.num_rows/16),
#     num_training_steps=EPOCHS*(ds_train.num_rows/16)
# )

In [27]:
# from transformers import TrainingArguments, Trainer

# training_args = TrainingArguments(
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=EPOCHS,
    
#     output_dir="./results",
#     logging_strategy="steps",
#     logging_dir="./logs",
#     logging_steps=10,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     report_to="none"
# )

# trainer = TokenSequenceEvaluationTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=ds_train,
#     eval_dataset=ds_valid,
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     optimizers=(optimizer, scheduler),
#     sequence_class_distribution=SEQUENCE_CLASS_DISTRIBUTION,
#     token_class_distribution=TOKEN_CLASS_DISTRIBUTION
# )

In [28]:
os.environ['WANDB_DISABLED'] = 'true'

import math
from transformers import Trainer, pipeline, TrainingArguments
from typing import Any
from transformers.trainer_utils import EvalPrediction


train_args = TrainingArguments(
    output_dir='model_checkpoints_mdebertav3',
    logging_dir='./model_logs_mdebertav3',
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    warmup_ratio=0.0,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    #bf16=True,
    # report_to="wandb",
    optim='adamw_torch',
    eval_strategy='steps',
    save_strategy="steps",
    eval_steps=100,
    logging_steps=10,
    save_steps=100,
    save_total_limit=10,
    # metric_for_best_model='eval_f1',
    greater_is_better=True,
    load_best_model_at_end=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [29]:
trainer = TokenSequenceEvaluationTrainer(
    model=model,
    args=train_args,
    train_dataset=ds_train,
    eval_dataset=ds_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
    sequence_class_distribution=SEQUENCE_CLASS_DISTRIBUTION,
    token_class_distribution=TOKEN_CLASS_DISTRIBUTION
)

  super().__init__(


In [32]:
torch.cuda.empty_cache()

In [33]:
trainer.train()

Step,Training Loss,Validation Loss,Sequence F1,Token Precision,Token Recall,Token F1,Token Thold
100,0.7063,0.759248,0.075924,0.594398,0.571025,0.582477,0.30697
200,0.6151,0.731915,0.137278,0.598613,0.571432,0.584707,0.376263
300,0.7161,0.704062,0.188732,0.602733,0.581318,0.591832,0.396061


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Predict

In [34]:
FINETUNED_MODEL = '/kaggle/working/model_checkpoints_mdebertav3/checkpoint-300'

In [35]:
trainer._load_from_checkpoint(FINETUNED_MODEL)
trainer.model = trainer.model.cuda().eval()

In [36]:
valid_preds = trainer.predict(ds_valid)
trainer.compute_metrics(valid_preds)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'sequence_f1': 0.18873196476404694,
 'token_precision': 0.6027327356390607,
 'token_recall': 0.581317991276751,
 'token_f1': 0.5918317098982744,
 'token_thold': 0.39606060606060606}

In [39]:
trainer.label_names

['labels', 'sequence_labels']

In [40]:
test_preds = trainer.predict(ds_test)

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


  0%|          | 0/1 [00:00<?, ?it/s]

In [44]:
test_probabilities = torch.softmax(torch.tensor(test_preds.predictions[0]), dim=-1).cpu().numpy()

In [45]:
trainer._find_optimal_threshold(test_probabilities, test_preds.label_ids[0])

0.36636363636363634

In [46]:
# optimal th on (val set + test set) / 2
final_th = (0.396 + 0.366)/2
final_th

0.381

In [47]:
def inference_aggregation(probabilities, labels, offset_mappings, thold):
    predictions = (probabilities[:, :, 1] >= thold).astype(int)
    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)
    ]
    pred_spans_all = []
    for pred, offsets in zip(true_predictions, offset_mappings):
        samplewise_spans = []
        current_span = None
        for token_label, span in zip(pred, offsets):
            if token_label == 1:  # If the current token is labeled as an entity (1)
                if current_span is None:
                    current_span = [span[0], span[1]]  # Start a new span
                else:
                    current_span[1] = span[1]  # Extend the span to include the current token
            else:  # If token_label == 0 (not an entity)
                if current_span is not None:
                    samplewise_spans.append(tuple(current_span))  # Save completed span
                    current_span = None  # Reset for the next entity
        
                    # If the last token was part of a span, save it
        if current_span is not None:
            samplewise_spans.append(tuple(current_span))
        
        pred_spans_all.append(samplewise_spans)
    return [str(row) for row in pred_spans_all]

In [48]:
valid_probabilities = torch.softmax(torch.tensor(valid_preds.predictions[0]), dim=-1).cpu().numpy()
valid_results = inference_aggregation(
    valid_probabilities, valid_preds.label_ids[0],
    ds_valid['offset_mapping'],
    final_th
    )

In [49]:
import pandas as pd
import pandas.api.types
from sklearn.metrics import f1_score
import ast


class ParticipantVisibleError(Exception):
    """Custom exception for participant-visible errors."""
    pass


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Compute span-level F1 score based on overlap.

    Parameters:
    - solution (pd.DataFrame): Ground truth DataFrame with row ID and token labels.
    - submission (pd.DataFrame): Submission DataFrame with row ID and token labels.
    - row_id_column_name (str): Column name for the row identifier.

    Returns:
    - float: The token-level weighted F1 score.

    Example:
    >>> solution = pd.DataFrame({
    ...     "id": [1, 2, 3],
    ...     "trigger_words": [[(612, 622), (725, 831)], [(300, 312)], []]
    ... })
    >>> submission = pd.DataFrame({
    ...     "id": [1, 2, 3],
    ...     "trigger_words": [[(612, 622), (700, 720)], [(300, 312)], [(100, 200)]]
    ... })
    >>> score(solution, submission, "id")
    0.16296296296296295
    """
    if not all(col in solution.columns for col in ["id", "trigger_words"]):
        raise ValueError("Solution DataFrame must contain 'id' and 'trigger_words' columns.")
    if not all(col in submission.columns for col in ["id", "trigger_words"]):
        raise ValueError("Submission DataFrame must contain 'id' and 'trigger_words' columns.")
    
    def safe_parse_spans(trigger_words):
        if isinstance(trigger_words, str):
            try:
                return ast.literal_eval(trigger_words)
            except (ValueError, SyntaxError):
                return []
        if isinstance(trigger_words, (list, tuple, np.ndarray)):
            return trigger_words
        return []

    def extract_tokens_from_spans(spans):
        tokens = set()
        for start, end in spans:
            tokens.update(range(start, end))
        return tokens
    
    solution = solution.copy()
    submission = submission.copy()

    solution["trigger_words"] = solution["trigger_words"].apply(safe_parse_spans)
    submission["trigger_words"] = submission["trigger_words"].apply(safe_parse_spans)

    # print(solution)
    # print()
    # print(submission)

    merged = pd.merge(
        solution,
        submission,
        on="id",
        suffixes=("_solution", "_submission")
    )

    total_true_tokens = 0
    total_pred_tokens = 0
    overlapping_tokens = 0

    for _, row in merged.iterrows():
        true_spans = row["trigger_words_solution"]
        pred_spans = row["trigger_words_submission"]
        # print(true_spans)
        # print()
        # print(pred_spans)
        # print()

        true_tokens = extract_tokens_from_spans(true_spans)
        pred_tokens = extract_tokens_from_spans(pred_spans)

        # print(true_tokens)
        # print()
        # print(pred_tokens)

        total_true_tokens += len(true_tokens)
        total_pred_tokens += len(pred_tokens)
        overlapping_tokens += len(true_tokens & pred_tokens)

    # print(true_tokens)
    # print()
    # print(pred_tokens)
    
    precision = overlapping_tokens / total_pred_tokens if total_pred_tokens > 0 else 0
    recall = overlapping_tokens / total_true_tokens if total_true_tokens > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return f1

In [52]:
from copy import deepcopy

df_gt = df_valid[['id', 'trigger_words']].reset_index(drop=True)
df_pred = deepcopy(df_gt)
df_pred['trigger_words'] = valid_results
score(df_gt, df_pred, row_id_column_name='id')

0.5974773461274381

In [53]:
test_results = inference_aggregation(
    test_probabilities, test_preds.label_ids[0],
    ds_test['offset_mapping'],
    final_th
    )

In [55]:
ss = pd.read_csv('/kaggle/input/unlp-2025-shared-task-span-identification/sample_submission.csv')
ss['trigger_words'] = test_results

In [57]:
ss.head()

Unnamed: 0,id,trigger_words
0,521cd2e8-dd9f-42c4-98ba-c0c8890ff1ba,"[(0, 253)]"
1,9b2a61e4-d14e-4ff7-b304-e73d720319bf,"[(373, 425)]"
2,f0f1c236-80a8-4d25-b30c-a420a39be632,"[(14, 46), (47, 127)]"
3,31ea05ba-2c2b-4b84-aba7-f3cf6841b204,[]
4,a79e13ec-6d9a-40b5-b54c-7f4f743a7525,"[(55, 59), (63, 72), (86, 103), (126, 309)]"


In [58]:
os.makedirs('submissions', exist_ok=True)

ss.to_csv('submissions/aux_trunc_mdebertav3-binary-cv0.597.csv', index=False)