In [1]:
!pip install bitsandbytes==0.43.2

Collecting bitsandbytes==0.43.2
  Downloading bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0mm
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.2


In [2]:
from kaggle_secrets import UserSecretsClient
# Read the API tokens from Kaggle's secret store so that no credential is hard-coded in the notebook.
user_secrets = UserSecretsClient()
# Hugging Face token, passed to `login(...)` in the next cell.
# NOTE(review): the secret name is spelled "hugginface_key"; presumably it matches the name under
# which the secret was stored, so it must not be "corrected" here without renaming the secret too.
secret_value_0 = user_secrets.get_secret("hugginface_key")
# Weights & Biases API key, passed to `wandb.login(...)` further down.
secret_value_1 = user_secrets.get_secret("wandb_key")

In [3]:
from huggingface_hub import login
# Authenticate against the Hugging Face Hub with the token fetched from the Kaggle secrets above.
login(token=secret_value_0)

In [4]:
import pandas as pd

# Training split of the shared task; `trigger_words` is later used as the list of gold spans.
df = pd.read_parquet("/kaggle/input/unlp-2025-shared-task-span-identification/train.parquet")
# Cross-validation assignment keyed by `id` (presumably the source of the `fold` column used below).
cv = pd.read_csv("/kaggle/input/span-ident-cv-split-csv/cv_split.csv")
# Left join: every training row is kept even if it has no entry in the split file.
df = df.merge(cv, on='id', how='left')

# Test split; only its `content` column is used by the cells below.
df_test = pd.read_csv("/kaggle/input/unlp-2025-shared-task-span-identification/test.csv")

In [5]:
import spacy

from spacy.training.iob_utils import biluo_to_iob, doc_to_biluo_tags
from tqdm.autonotebook import tqdm
# Register tqdm with pandas so that `progress_apply` is available on DataFrames/Series.
tqdm.pandas()

# Rows without annotated spans carry None; replace it with an empty list so that
# downstream code can always iterate over `trigger_words`.
df.trigger_words = df.trigger_words.apply(lambda x: [] if x is None else x)

In [10]:
# Checkpoint identifier used for both the tokenizer and the token-classification model.
PRETRAINED_MODEL ="unsloth/gemma-2-2b-it-bnb-4bit"
# Alternative checkpoints kept for reference:
#"unsloth/gemma-2-9b-it-bnb-4bit"
#"unsloth/gemma-2-2b-it-bnb-4bit"
#'google/gemma-2-2b-it'
#'google/gemma-2-9b-it'

# Maximum number of tokens per example; passed as `max_length` (with truncation enabled)
# to the tokenizer call in `convert_to_seq_labeling`.
MAX_LEN = 1400

In [11]:
from transformers import AutoTokenizer
import pandas as pd
# Tokenizer matching the selected checkpoint; used for encoding and by the data collator.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

In [12]:
def convert_to_seq_labeling(text, tokenizer, trigger_spans=None, max_length=None):
    """Tokenize ``text`` and attach a binary label to every token.

    A token is labelled 1 when its character range overlaps at least one
    trigger span and 0 otherwise. Tokens whose offsets are ``(0, 0)`` are
    never labelled positive.

    Args:
        text: Raw input string.
        tokenizer: Callable tokenizer that accepts ``return_offsets_mapping``,
            ``add_special_tokens``, ``max_length``, ``truncation`` and
            ``padding`` and returns a mapping with ``"input_ids"`` and
            ``"offset_mapping"``.
        trigger_spans: Iterable of ``(start, end)`` character offsets with an
            exclusive end, or ``None`` to label every token 0.
        max_length: Truncation length in tokens. Defaults to the notebook-level
            ``MAX_LEN`` when omitted, which is the original behaviour.

    Returns:
        The tokenizer output with an additional ``"labels"`` entry holding one
        integer (0 or 1) per token.
    """
    if max_length is None:
        # Resolved at call time so the notebook constant can still be changed between runs.
        max_length = MAX_LEN

    tokenized_output = tokenizer(
        text,
        return_offsets_mapping=True,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding=False,
    )
    offsets = tokenized_output["offset_mapping"]

    # Every token starts as negative (0).
    labels = [0] * len(tokenized_output["input_ids"])

    if trigger_spans is not None:
        for start, end in trigger_spans:
            for i, (tok_start, tok_end) in enumerate(offsets):
                # Offsets of (0, 0) carry no text span, so they are skipped.
                if tok_start == 0 and tok_end == 0:
                    continue
                # Half-open interval overlap test between token and trigger span.
                if tok_start < end and tok_end > start:
                    labels[i] = 1

    tokenized_output['labels'] = labels
    return tokenized_output

In [13]:
from tqdm.autonotebook import tqdm

tqdm.pandas()


def add_seq_label_columns(frame, tokenizer, spans_column=None):
    """Encode every row of ``frame`` and expand the encoding into columns.

    The full tokenizer output of each row is stored in ``frame['seq_labels']``;
    every key of that output (taken from the first row) is then copied into a
    column of the same name.

    Args:
        frame: DataFrame with a ``content`` column. It is modified in place.
        tokenizer: Tokenizer forwarded to ``convert_to_seq_labeling``.
        spans_column: Name of the column holding trigger spans, or ``None``
            when the frame has no annotations (all labels become 0).

    Returns:
        The same ``frame`` object, for convenience.
    """
    def _encode_row(row):
        spans = None if spans_column is None else row[spans_column]
        return convert_to_seq_labeling(row['content'], tokenizer, spans)

    frame['seq_labels'] = frame.progress_apply(_encode_row, axis=1)
    for column in frame['seq_labels'].iloc[0].keys():
        # `key=column` pins the current key on the lambda instead of relying on the loop variable.
        frame[column] = frame['seq_labels'].apply(lambda encoding, key=column: encoding.get(key))
    return frame


# Training data carries gold spans; the test data does not.
df = add_seq_label_columns(df, tokenizer, spans_column='trigger_words')
df_test = add_seq_label_columns(df_test, tokenizer)

  0%|          | 0/3822 [00:00<?, ?it/s]

  0%|          | 0/5735 [00:00<?, ?it/s]

In [14]:
from datasets import Dataset
import numpy as np

# Fold 4 is held out for validation; every other row is used for training.
df['is_valid'] = df.fold == 4

# Train/validation datasets keep the tokenizer outputs plus the raw text and the gold spans
# (the custom trainer reads `offset_mapping` and `trigger_words` when computing metrics).
columns = list(df.seq_labels.iloc[0].keys()) + ['content', 'trigger_words']
ds_train = Dataset.from_pandas(df[df.is_valid==0][columns].reset_index(drop=True))
ds_valid = Dataset.from_pandas(df[df.is_valid==1][columns].reset_index(drop=True))

# `columns` is rebound here: for the test set only the tokenizer outputs and the raw text
# are selected (no `trigger_words` column).
columns = list(df.seq_labels.iloc[0].keys()) + ['content']
ds_test = Dataset.from_pandas(df_test[columns].reset_index(drop=True))

In [17]:
import torch
from transformers import Gemma2ForTokenClassification, LlamaForTokenClassification, BitsAndBytesConfig
from peft import get_peft_config, prepare_model_for_kbit_training, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType

# Binary token classification: class ids map to themselves (0 = outside, 1 = inside a trigger span).
id2label = {0: 0, 1: 1}
label2id = {0: 0, 1: 1}

# 4-bit NF4 quantization with float16 compute and double quantization disabled.
nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=False,
   bnb_4bit_compute_dtype=torch.float16
)

# Load the quantized backbone with a token-classification head on the first GPU.
# The captured output below reports `score.weight` / `score.bias` as newly initialized.
model = Gemma2ForTokenClassification.from_pretrained(
    PRETRAINED_MODEL,
    id2label=id2label,
    label2id=label2id,
    torch_dtype=torch.float16,
    device_map="cuda:0",
    quantization_config=nf4_config
)

# LoRA adapters on the attention and MLP projection layers listed in `target_modules`.
lora_config = LoraConfig(
    r=64,  # the dimension of the low-rank matrices
    lora_alpha=32, # scaling factor for LoRA activations vs pre-trained weight activations
    lora_dropout=0.05, 
    bias='none',
    inference_mode=False,
    task_type=TaskType.TOKEN_CLS,
    target_modules=['o_proj', 'v_proj', "q_proj", "k_proj", "gate_proj", "down_proj", "up_proj"]
) 

# Prepare the quantized model for training, then wrap it with the LoRA adapters.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
# Report how many parameters are trainable (the captured output shows about 3.08% of all parameters).
model.print_trainable_parameters()

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/2.22G [00:00<?, ?B/s]

Some weights of Gemma2ForTokenClassification were not initialized from the model checkpoint at unsloth/gemma-2-2b-it-bnb-4bit and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 83,071,490 || all params: 2,697,417,988 || trainable%: 3.0797


In [None]:
# for name, param in model.named_parameters():
#     print(name, param.dtype)

In [18]:
import os
import random
import numpy as np
import torch

def set_seeds(seed):
    """Seed the random number generators used in this notebook.

    Sets ``PYTHONHASHSEED`` in the environment and seeds ``random``, NumPy and
    PyTorch; when CUDA is available the CUDA generators are seeded as well.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


set_seeds(seed=42)

In [19]:
import math
from transformers import Trainer, pipeline, TrainingArguments
from typing import Any
from transformers.trainer_utils import EvalPrediction


# Training configuration for the LoRA fine-tuning run.
train_args = TrainingArguments(
    output_dir='model_checkpoints_gemma2_2b_binary',
    logging_dir='./model_logs_gemma2_2b_binary',
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    warmup_ratio=0.0,
    num_train_epochs=3,
    per_device_train_batch_size=4,  # alternative value noted by the author: 2
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,  # alternative value noted by the author: 4 (4 x 2 = 8 samples per device per optimizer step)
    # bf16=True,
    report_to="wandb",
    optim='adamw_8bit',
    # Evaluate and checkpoint on the same 200-step cadence so that every checkpoint has a metric.
    eval_strategy='steps',
    save_strategy="steps",
    eval_steps=200,
    logging_steps=20,
    save_steps=200,
    save_total_limit=10,
    # 'f1' is the key returned by SpanEvaluationTrainer.compute_metrics; higher is better.
    metric_for_best_model='eval_f1',
    greater_is_better=True,
    load_best_model_at_end=True,
)

In [20]:
import wandb
# Authenticate with Weights & Biases using the key fetched from Kaggle secrets.
wandb.login(key=secret_value_1)

# Initialize with team/entity
wandb.init(
    project="unlp-span-ident-task",
    entity="IASA-BA-Diploma-Ivan-Bashtovyi", 
    name='unsloth-gemma2-2b-binary-full-seq',
    settings=wandb.Settings(init_timeout=180)  # Increase timeout
)

import os
# Ask the W&B integration to upload checkpoints; the training log below shows each
# checkpoint directory being added as an artifact.
os.environ["WANDB_LOG_MODEL"] = "checkpoint"


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mivanbashtovyi1[0m ([33mIASA-BA-Diploma-Ivan-Bashtovyi[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [21]:
from itertools import chain

# Share of positive (label == 1) tokens over every row of `df` (training and validation folds alike).
# It is later passed to the trainer as the target positive ratio for threshold selection.
all_token_labels = list(chain.from_iterable(df['labels']))
positive_class_balance = pd.Series(all_token_labels).mean()
positive_class_balance

0.2291704630193249

In [22]:
import math
from transformers import Trainer, TrainingArguments
from typing import Any
from tqdm.autonotebook import tqdm
from transformers.trainer_utils import EvalPrediction

def extract_chars_from_spans(spans):
    """Expand ``(start, end)`` spans into the set of character positions they cover.

    Each span contributes the positions ``start, start + 1, ..., end - 1``;
    overlapping spans are merged naturally because the result is a set.

    Args:
        spans: Iterable of two-element ``(start, end)`` pairs with an exclusive end.

    Returns:
        A ``set`` of integer character indices.
    """
    return {position for start, end in spans for position in range(start, end)}

class SpanEvaluationTrainer(Trainer):
    """``Trainer`` that reports character-level span precision / recall / F1.

    At evaluation time the positive-class probability of every token is
    thresholded, consecutive positive tokens are merged into character spans
    using the ``offset_mapping`` column of ``self.eval_dataset``, and the spans
    are compared with the ``trigger_words`` column of the same dataset.

    The threshold is not fixed: it is the value whose share of positive token
    predictions is closest to ``desired_positive_ratio``.
    """

    def __init__(
        self,
        model: Any = None,
        args: TrainingArguments = None,
        data_collator: Any = None,
        train_dataset: Any = None,
        eval_dataset: Any = None,
        tokenizer: Any = None,
        desired_positive_ratio: float = 0.25,
        **kwargs,
    ):
        """Initialize the Trainer with the custom ``compute_metrics``.

        Args:
            model, args, data_collator, train_dataset, eval_dataset, tokenizer:
                Forwarded unchanged to ``Trainer.__init__``.
            desired_positive_ratio: Target share of tokens predicted positive,
                used by ``_find_optimal_threshold``.
            **kwargs: Any further ``Trainer`` keyword arguments.
        """
        # NOTE(review): the truncated warning printed under the training cell
        # ("super().__init__(") originates from this call - presumably the
        # deprecation of `tokenizer` in newer transformers releases; confirm
        # against the installed version before renaming the argument.
        super().__init__(
            model=model,
            args=args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            compute_metrics=self.compute_metrics,  # assign our custom compute_metrics
            **kwargs,
        )
        self.desired_positive_ratio = desired_positive_ratio

    @staticmethod
    def _decode_spans(token_labels, offsets):
        """Merge runs of consecutive positive tokens into ``(start, end)`` character spans."""
        spans = []
        current_span = None
        for token_label, (tok_start, tok_end) in zip(token_labels, offsets):
            if token_label == 1:
                if current_span is None:
                    current_span = [tok_start, tok_end]  # open a new span
                else:
                    current_span[1] = tok_end  # extend the open span
            elif current_span is not None:
                spans.append(tuple(current_span))  # a negative token closes the span
                current_span = None

        # A span that is still open at the last token must be flushed as well.
        if current_span is not None:
            spans.append(tuple(current_span))
        return spans

    def _calculate_inner_metric(self, gt_spans_all, pred_spans_all):
        """Compute micro-averaged character-level precision, recall and F1.

        Args:
            gt_spans_all: Per-example gold spans. A string entry is parsed as a
                Python literal; an unparsable string counts as "no spans".
            pred_spans_all: Per-example predicted spans.

        Returns:
            Dict with ``precision``, ``recall`` and ``f1``.
        """
        import ast  # local import keeps this cell self-contained

        total_true_chars = 0
        total_pred_chars = 0
        total_overlap_chars = 0
        for true_spans, pred_spans in zip(gt_spans_all, pred_spans_all):
            if isinstance(true_spans, str):
                try:
                    # literal_eval only accepts Python literals, unlike eval(),
                    # which would execute arbitrary code found in the data.
                    true_spans = ast.literal_eval(true_spans)
                except (ValueError, SyntaxError, TypeError):
                    true_spans = []

            # Convert spans to sets of character indices.
            true_chars = extract_chars_from_spans(true_spans)
            pred_chars = extract_chars_from_spans(pred_spans)

            total_true_chars += len(true_chars)
            total_pred_chars += len(pred_chars)
            total_overlap_chars += len(true_chars & pred_chars)

        # Compute precision, recall, and F1 (0 whenever a denominator is empty).
        precision = total_overlap_chars / total_pred_chars if total_pred_chars > 0 else 0
        recall = total_overlap_chars / total_true_chars if total_true_chars > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        return {
            "precision": precision,
            "recall": recall,
            "f1": f1,
        }

    def _find_optimal_threshold(self, probabilities, labels):
        """Find the threshold whose positive-prediction ratio is closest to the target.

        Args:
            probabilities: Array of shape ``(examples, tokens, classes)``; class
                index 1 is the positive class.
            labels: Array of shape ``(examples, tokens)``; positions equal to
                ``-100`` are ignored.

        Returns:
            The best of 100 evenly spaced thresholds in ``[0.01, 0.99]``; ties
            are resolved in favour of the smallest threshold.
        """
        labels = np.asarray(labels)
        # Positive-class probabilities of all non-ignored tokens, flattened once
        # so that each candidate threshold is a single vectorised comparison.
        positive_probs = np.asarray(probabilities)[:, :, 1][labels != -100]
        total = positive_probs.size

        optimal_th = 0.5  # returned unchanged only if no candidate improves on +inf
        best_diff = float("inf")
        for thold in np.linspace(0.01, 0.99, num=100):
            positive_ratio = np.count_nonzero(positive_probs >= thold) / total if total > 0 else 0
            diff = abs(positive_ratio - self.desired_positive_ratio)
            if diff < best_diff:
                best_diff = diff
                optimal_th = thold

        return optimal_th

    def compute_metrics(self, eval_pred: EvalPrediction) -> dict:
        """Turn token logits into span metrics against ``self.eval_dataset``.

        Args:
            eval_pred: ``(logits, labels)`` pair; logits have shape
                ``(examples, tokens, 2)`` and labels ``(examples, tokens)``.

        Returns:
            Dict with ``precision``, ``recall``, ``f1`` and the threshold used
            (``thold``). When the predictions cannot belong to
            ``self.eval_dataset`` (different number of examples, e.g. when
            ``predict`` is called on another dataset) only ``thold`` is
            returned, because pairing them with the evaluation spans would
            yield meaningless scores.
        """
        import warnings  # local import keeps this cell self-contained

        eval_dataset = self.eval_dataset
        logits, labels = eval_pred
        labels = np.asarray(labels)
        probabilities = torch.softmax(torch.tensor(logits), dim=-1).cpu().numpy()

        #thresholds = np.linspace(0.1, 0.5, num=41)
        thresholds = [self._find_optimal_threshold(probabilities, labels)]

        if eval_dataset is None or len(eval_dataset) != len(labels):
            warnings.warn(
                "compute_metrics received predictions that do not line up with "
                "self.eval_dataset; span metrics are skipped and only the threshold is reported.",
                RuntimeWarning,
            )
            return {"thold": thresholds[0]}

        # Read the dataset columns once instead of once per threshold.
        offset_mappings = eval_dataset['offset_mapping']
        gt_spans_all = eval_dataset['trigger_words']

        best_f1 = -1
        best_metrics = None
        for thold in tqdm(thresholds):
            # Apply thresholding instead of argmax
            predictions = (probabilities[:, :, 1] >= thold).astype(int)

            # Drop ignored positions (-100) so predictions align with the unpadded offsets.
            pred_spans_all = [
                self._decode_spans(prediction[label != -100], offsets)
                for prediction, label, offsets in zip(predictions, labels, offset_mappings)
            ]

            current_metrics = self._calculate_inner_metric(gt_spans_all, pred_spans_all)
            if current_metrics['f1'] >= best_f1:
                best_f1 = current_metrics['f1']
                best_metrics = current_metrics
                best_metrics['thold'] = thold

        return best_metrics

In [23]:
from transformers import DataCollatorForTokenClassification

# Collator that batches the variable-length token-classification examples.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# The positive-token share measured above becomes the target ratio used to pick
# the decision threshold at every evaluation.
trainer = SpanEvaluationTrainer(
    model=model,
    args=train_args,
    train_dataset=ds_train,
    eval_dataset=ds_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
    desired_positive_ratio=positive_class_balance
)
# The captured log below ends with a KeyboardInterrupt after the step-1000 checkpoint,
# i.e. this run was stopped manually before finishing.
trainer.train()

  super().__init__(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Thold
200,0.4483,0.426781,0.575162,0.563956,0.569504,0.435657
400,0.3724,0.419625,0.592053,0.572484,0.582104,0.415859
600,0.4064,0.415686,0.589642,0.585094,0.587359,0.524747
800,0.4148,0.4192,0.590919,0.585213,0.588052,0.40596
1000,0.3825,0.418223,0.590361,0.583007,0.586661,0.485152


The 'batch_size' argument of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'max_batch_size' argument instead.
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


  0%|          | 0/1 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./model_checkpoints_gemma2_2b_binary/checkpoint-200)... Done. 1.3s
  return fn(*args, **kwargs)


  0%|          | 0/1 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./model_checkpoints_gemma2_2b_binary/checkpoint-400)... Done. 1.2s
  return fn(*args, **kwargs)


  0%|          | 0/1 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./model_checkpoints_gemma2_2b_binary/checkpoint-600)... Done. 1.2s
  return fn(*args, **kwargs)


  0%|          | 0/1 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./model_checkpoints_gemma2_2b_binary/checkpoint-800)... Done. 1.2s
  return fn(*args, **kwargs)


  0%|          | 0/1 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./model_checkpoints_gemma2_2b_binary/checkpoint-1000)... Done. 1.3s
  return fn(*args, **kwargs)


KeyboardInterrupt: 

In [24]:
import torch
from transformers import Gemma2ForTokenClassification, BitsAndBytesConfig
from peft import PeftModel, get_peft_config, prepare_model_for_kbit_training, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType



# Rebuild the quantized base model with the same settings as for training, then attach
# the LoRA weights saved at step 800 (the row with the highest F1 in the training log above).
nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=False,
   bnb_4bit_compute_dtype=torch.float16
)

model = Gemma2ForTokenClassification.from_pretrained(
    PRETRAINED_MODEL,
    id2label=id2label,
    label2id=label2id,
    torch_dtype=torch.float16,
    device_map="cuda:0",
    quantization_config=nf4_config
)

# NOTE(review): this mirrors the training setup; whether the k-bit *training* preparation
# is required for inference-only use is not visible from this notebook - confirm before removing.
model = prepare_model_for_kbit_training(model)

model = PeftModel.from_pretrained(model, "./model_checkpoints_gemma2_2b_binary/checkpoint-800/")
# Trainable Parameters (the captured output reports 4,610 for the reloaded model)
model.print_trainable_parameters()

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Some weights of Gemma2ForTokenClassification were not initialized from the model checkpoint at unsloth/gemma-2-2b-it-bnb-4bit and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 4,610 || all params: 2,697,417,988 || trainable%: 0.0002


In [25]:
# Swap the reloaded checkpoint into the existing trainer, on the GPU and in eval mode,
# so that the `predict` calls below use it.
trainer.model = model.cuda().eval()

In [26]:
# Raw logits and label ids for the validation fold.
valid_preds = trainer.predict(ds_valid)

  0%|          | 0/1 [00:00<?, ?it/s]

In [27]:
# Sanity check: logits are (examples, tokens, classes) and label ids are (examples, tokens).
valid_preds.predictions.shape, valid_preds.label_ids.shape

((764, 1400, 2), (764, 1400))

In [28]:
# Recompute the span metrics from the stored validation predictions; the values shown below
# match the step-800 row of the training log.
trainer.compute_metrics((valid_preds.predictions, valid_preds.label_ids))

  0%|          | 0/1 [00:00<?, ?it/s]

{'precision': 0.5909192171916235,
 'recall': 0.5852129049776827,
 'f1': 0.5880522182525134,
 'thold': 0.40595959595959596}

In [29]:
# Raw logits for the test set (its `labels` column is all zeros because no spans were supplied).
test_preds = trainer.predict(ds_test)

  0%|          | 0/1 [00:00<?, ?it/s]

In [30]:
# Convert test logits to class probabilities; index 1 of the last axis is the positive class.
test_probabilities = torch.softmax(torch.tensor(test_preds.predictions), dim=-1).cpu().numpy()

In [31]:
# Threshold at which the share of positive test tokens is closest to the target positive ratio.
trainer._find_optimal_threshold(test_probabilities, test_preds.label_ids)

0.36636363636363634

In [32]:
# Final decision threshold: the mean of the validation threshold (0.405959, the `thold`
# printed by the validation-metrics cell above) and the test threshold (0.366363, printed by the
# previous cell). Both numbers are copied by hand from those outputs, so they must be
# updated manually whenever the model or the data changes.
final_th = (0.405959+0.366363)/2
final_th

0.386161

In [33]:
def inference_aggregation(probabilities, labels, offset_mappings, thold):
    """Convert token probabilities into stringified character spans.

    For each example the positive-class probability (last-axis index 1) is
    thresholded, positions whose label is ``-100`` are dropped, and runs of
    consecutive positive tokens are merged into ``(start, end)`` spans using the
    token offsets.

    Args:
        probabilities: Array of shape ``(examples, tokens, classes)``.
        labels: Per-example label ids; ``-100`` marks positions to ignore.
        offset_mappings: Per-example sequences of ``(start, end)`` token offsets.
        thold: A token is positive when its probability is ``>= thold``.

    Returns:
        One string per example: ``str()`` of that example's list of span tuples.
    """
    binary_predictions = (probabilities[:, :, 1] >= thold).astype(int)

    serialized = []
    for row_predictions, row_labels, row_offsets in zip(binary_predictions, labels, offset_mappings):
        kept = [flag for flag, gold in zip(row_predictions, row_labels) if gold != -100]

        spans = []
        open_span = None
        for flag, offset in zip(kept, row_offsets):
            if flag != 1:
                # A negative token closes whatever span is currently open.
                if open_span is not None:
                    spans.append(tuple(open_span))
                    open_span = None
                continue
            if open_span is None:
                open_span = [offset[0], offset[1]]
            else:
                open_span[1] = offset[1]

        # Flush a span that runs up to the last token.
        if open_span is not None:
            spans.append(tuple(open_span))

        serialized.append(str(spans))
    return serialized

In [34]:
# Decode the validation predictions with the final threshold so it can be scored locally.
valid_probabilities = torch.softmax(torch.tensor(valid_preds.predictions), dim=-1).cpu().numpy()
valid_results = inference_aggregation(valid_probabilities, valid_preds.label_ids, ds_valid['offset_mapping'], final_th)

In [35]:
import pandas as pd
import pandas.api.types
from sklearn.metrics import f1_score
import ast


class ParticipantVisibleError(Exception):
    """Exception type reserved for errors that should be visible to participants."""


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Compute the micro-averaged character-level F1 score between span annotations.

    Every ``(start, end)`` span is expanded into the character positions it
    covers (``end`` exclusive). Overlap, predicted and true position counts are
    summed over all rows before precision, recall and F1 are computed.

    Parameters:
    - solution (pd.DataFrame): Ground truth with the row ID column and ``trigger_words``.
    - submission (pd.DataFrame): Predictions with the row ID column and ``trigger_words``.
    - row_id_column_name (str): Name of the row identifier column; rows are
      matched on it with an inner join.

    ``trigger_words`` entries may be lists/tuples/arrays of spans or their string
    representation; anything else (or an unparsable string) counts as no spans.

    Returns:
    - float: The character-level F1 score (0 when precision + recall is 0).

    Raises:
    - ValueError: If either frame lacks the row ID or ``trigger_words`` column.

    Example:
    >>> solution = pd.DataFrame({
    ...     "id": [1, 2, 3],
    ...     "trigger_words": [[(612, 622), (725, 831)], [(300, 312)], []]
    ... })
    >>> submission = pd.DataFrame({
    ...     "id": [1, 2, 3],
    ...     "trigger_words": [[(612, 622), (700, 720)], [(300, 312)], [(100, 200)]]
    ... })
    >>> score(solution, submission, "id")
    0.16296296296296295
    """
    required_columns = [row_id_column_name, "trigger_words"]
    if not all(col in solution.columns for col in required_columns):
        raise ValueError(
            f"Solution DataFrame must contain '{row_id_column_name}' and 'trigger_words' columns."
        )
    if not all(col in submission.columns for col in required_columns):
        raise ValueError(
            f"Submission DataFrame must contain '{row_id_column_name}' and 'trigger_words' columns."
        )

    def safe_parse_spans(trigger_words):
        """Return the spans as a sequence, parsing string input as a Python literal."""
        if isinstance(trigger_words, str):
            try:
                return ast.literal_eval(trigger_words)
            except (ValueError, SyntaxError):
                return []
        if isinstance(trigger_words, (list, tuple, np.ndarray)):
            return trigger_words
        return []

    def extract_tokens_from_spans(spans):
        """Expand spans into the set of character positions they cover."""
        tokens = set()
        for start, end in spans:
            tokens.update(range(start, end))
        return tokens

    # Work on copies so that the caller's frames are not modified.
    solution = solution.copy()
    submission = submission.copy()

    solution["trigger_words"] = solution["trigger_words"].apply(safe_parse_spans)
    submission["trigger_words"] = submission["trigger_words"].apply(safe_parse_spans)

    merged = pd.merge(
        solution,
        submission,
        on=row_id_column_name,
        suffixes=("_solution", "_submission")
    )

    total_true_tokens = 0
    total_pred_tokens = 0
    overlapping_tokens = 0

    # Zipping the two columns avoids building a Series per row as iterrows() does.
    for true_spans, pred_spans in zip(
        merged["trigger_words_solution"], merged["trigger_words_submission"]
    ):
        true_tokens = extract_tokens_from_spans(true_spans)
        pred_tokens = extract_tokens_from_spans(pred_spans)

        total_true_tokens += len(true_tokens)
        total_pred_tokens += len(pred_tokens)
        overlapping_tokens += len(true_tokens & pred_tokens)

    precision = overlapping_tokens / total_pred_tokens if total_pred_tokens > 0 else 0
    recall = overlapping_tokens / total_true_tokens if total_true_tokens > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return f1

In [36]:
from copy import deepcopy

# Score the decoded validation predictions with the competition metric.
# Ground truth: ids and gold spans of the held-out fold (fold 4).
df_gt = df.loc[df['fold'] == 4, ['id', 'trigger_words']].reset_index(drop=True)
# Predictions: same ids, with the gold spans replaced by the stringified predicted spans.
df_pred = df_gt.copy()
df_pred['trigger_words'] = valid_results
score(df_gt, df_pred, row_id_column_name='id')

0.5935790586726836

In [37]:
# Decode the test predictions into stringified spans with the final threshold.
test_results = inference_aggregation(test_probabilities, test_preds.label_ids, ds_test['offset_mapping'], final_th)

In [38]:
# Start from the sample submission and overwrite its `trigger_words` column.
# NOTE(review): this assumes the sample submission rows are in the same order as test.csv - confirm.
ss = pd.read_csv("/kaggle/input/unlp-2025-shared-task-span-identification/sample_submission.csv")
ss['trigger_words'] = test_results

In [39]:
# Write the submission file; the "cv0.593" suffix records the local validation score obtained above.
ss.to_csv('unsloth-full-seq-gemma2-2b-binary-cv0.593.csv', index=False)

In [None]:
import pickle

# Persist the raw prediction outputs. The `with` blocks guarantee that each file
# is flushed and closed even if pickling fails part-way.
with open('valid_preds_gemma2_binary.pkl', 'wb') as valid_file:
    pickle.dump(valid_preds, valid_file)
with open('test_preds_gemma2_binary.pkl', 'wb') as test_file:
    pickle.dump(test_preds, test_file)