In [1]:
import os
import random
import numpy as np
import torch
from tqdm.autonotebook import tqdm

def set_seeds(seed):
    """Set seeds for reproducibility """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        

set_seeds(seed=42)
tqdm.pandas()

  from tqdm.autonotebook import tqdm


In [2]:
DATA_PATH = '/kaggle/input/unlp-2025-shared-task-span-identification/'
CV_PATH = "/kaggle/input/unlp25-cross-validation-split/cv_split.csv"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

PRETRAINED_MODEL = "microsoft/mdeberta-v3-base"
MAX_LEN = 512

# Data

In [3]:
import pandas as pd

df = pd.read_parquet(DATA_PATH + "train.parquet")
cv = pd.read_csv(CV_PATH)
df = df.merge(cv, on='id', how='left')

df_test = pd.read_csv(DATA_PATH + "test.csv")

In [4]:
df.head()

Unnamed: 0,id,content,lang,manipulative,techniques,trigger_words,fold
0,0bb0c7fa-101b-4583-a5f9-9d503339141c,Новий огляд мапи DeepState від російського вій...,uk,True,"[euphoria, loaded_language]","[[27, 63], [65, 88], [90, 183], [186, 308]]",1
1,7159f802-6f99-4e9d-97bd-6f565a4a0fae,Недавно 95 квартал жёстко поглумился над русск...,ru,True,"[loaded_language, cherry_picking]","[[0, 40], [123, 137], [180, 251], [253, 274]]",3
2,e6a427f1-211f-405f-bd8b-70798458d656,🤩\nТим часом йде евакуація Бєлгородського авто...,uk,True,"[loaded_language, euphoria]","[[55, 100]]",1
3,1647a352-4cd3-40f6-bfa1-d87d42e34eea,В Україні найближчим часом мають намір посилит...,uk,False,,,2
4,9c01de00-841f-4b50-9407-104e9ffb03bf,"Расчёты 122-мм САУ 2С1 ""Гвоздика"" 132-й бригад...",ru,True,[loaded_language],"[[114, 144]]",2


# Targets Prep

## Classification

In [5]:
from collections.abc import Iterable

techniques = ['straw_man', 'appeal_to_fear', 'fud', 'bandwagon', 'whataboutism', 'loaded_language', 'glittering_generalities', 'euphoria', 'cherry_picking', 'cliche']

for col in techniques:
    df[col] = 0

import numpy as np
for ind, row in df.iterrows():
    if isinstance(row['techniques'], Iterable):
        for t in row['techniques']:
            df.loc[ind, t] = 1

df['sequence_labels'] = list(df[techniques].values)
# df.drop(columns=techniques, inplace=True)

In [6]:
df.head()

Unnamed: 0,id,content,lang,manipulative,techniques,trigger_words,fold,straw_man,appeal_to_fear,fud,bandwagon,whataboutism,loaded_language,glittering_generalities,euphoria,cherry_picking,cliche,sequence_labels
0,0bb0c7fa-101b-4583-a5f9-9d503339141c,Новий огляд мапи DeepState від російського вій...,uk,True,"[euphoria, loaded_language]","[[27, 63], [65, 88], [90, 183], [186, 308]]",1,0,0,0,0,0,1,0,1,0,0,"[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]"
1,7159f802-6f99-4e9d-97bd-6f565a4a0fae,Недавно 95 квартал жёстко поглумился над русск...,ru,True,"[loaded_language, cherry_picking]","[[0, 40], [123, 137], [180, 251], [253, 274]]",3,0,0,0,0,0,1,0,0,1,0,"[0, 0, 0, 0, 0, 1, 0, 0, 1, 0]"
2,e6a427f1-211f-405f-bd8b-70798458d656,🤩\nТим часом йде евакуація Бєлгородського авто...,uk,True,"[loaded_language, euphoria]","[[55, 100]]",1,0,0,0,0,0,1,0,1,0,0,"[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]"
3,1647a352-4cd3-40f6-bfa1-d87d42e34eea,В Україні найближчим часом мають намір посилит...,uk,False,,,2,0,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,9c01de00-841f-4b50-9407-104e9ffb03bf,"Расчёты 122-мм САУ 2С1 ""Гвоздика"" 132-й бригад...",ru,True,[loaded_language],"[[114, 144]]",2,0,0,0,0,0,1,0,0,0,0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"


## Span

In [7]:
from transformers import AutoTokenizer
import pandas as pd

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]



In [8]:
def convert_to_seq_labeling(text, tokenizer, trigger_spans=None):
    tokenized_output = tokenizer(
        text, return_offsets_mapping=True, add_special_tokens=True, max_length=MAX_LEN,
        truncation=True, padding=False
    )
    tokens = tokenized_output["input_ids"]
    offsets = tokenized_output["offset_mapping"]

    # Get subword tokenized versions of the text
    token_strings = tokenizer.convert_ids_to_tokens(tokens)

    
    # Initialize labels as 'O'
    labels = [0] * len(tokens)

    if trigger_spans is not None:
        # Assign 'TRIGGER' to overlapping tokens
        for start, end in trigger_spans:
            for i, (tok_start, tok_end) in enumerate(offsets):
                if tok_start == 0 and tok_end == 0:
                    continue
                if tok_start < end and tok_end > start:  # If token overlaps with the trigger span
                    labels[i] = 1

    tokenized_output['labels'] = labels
    return tokenized_output

In [9]:
df.trigger_words = df.trigger_words.apply(lambda x: [] if x is None else x)

df['seq_labels'] = df.progress_apply(lambda row: convert_to_seq_labeling(row['content'], tokenizer, row['trigger_words']), axis=1)
for column in df.seq_labels.iloc[0].keys():
    df[column] = df.seq_labels.apply(lambda x: x.get(column))

df_test['seq_labels'] = df_test.progress_apply(lambda row: convert_to_seq_labeling(row['content'], tokenizer, None), axis=1)
for column in df_test.seq_labels.iloc[0].keys():
    df_test[column] = df_test.seq_labels.apply(lambda x: x.get(column))

  0%|          | 0/3822 [00:00<?, ?it/s]

  0%|          | 0/5735 [00:00<?, ?it/s]

In [10]:
df.head(2)

Unnamed: 0,id,content,lang,manipulative,techniques,trigger_words,fold,straw_man,appeal_to_fear,fud,...,euphoria,cherry_picking,cliche,sequence_labels,seq_labels,input_ids,token_type_ids,attention_mask,offset_mapping,labels
0,0bb0c7fa-101b-4583-a5f9-9d503339141c,Новий огляд мапи DeepState від російського вій...,uk,True,"[euphoria, loaded_language]","[[27, 63], [65, 88], [90, 183], [186, 308]]",1,0,0,0,...,1,0,0,"[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]","[input_ids, token_type_ids, attention_mask, of...","[1, 55816, 544, 260, 84748, 3554, 14381, 29189...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 4), (4, 5), (5, 6), (6, 11), (11,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ..."
1,7159f802-6f99-4e9d-97bd-6f565a4a0fae,Недавно 95 квартал жёстко поглумился над русск...,ru,True,"[loaded_language, cherry_picking]","[[0, 40], [123, 137], [180, 251], [253, 274]]",3,0,0,0,...,0,1,0,"[0, 0, 0, 0, 0, 1, 0, 0, 1, 0]","[input_ids, token_type_ids, attention_mask, of...","[1, 1909, 21922, 6943, 64148, 1774, 20485, 456...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 2), (2, 7), (7, 10), (10, 18), (1...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ..."


# Datasets

In [11]:
from datasets import Dataset
import numpy as np

df['is_valid'] = (df.fold == 4)

columns = list(df.seq_labels.iloc[0].keys()) + ['content', 'trigger_words', 'sequence_labels']
ds_train = Dataset.from_pandas(df[df.is_valid==0][columns].reset_index(drop=True))
ds_valid = Dataset.from_pandas(df[df.is_valid==1][columns].reset_index(drop=True))

columns = list(df.seq_labels.iloc[0].keys()) + ['content']
ds_test = Dataset.from_pandas(df_test[columns].reset_index(drop=True))

In [12]:
ds_train.to_pandas().head(2)

Unnamed: 0,input_ids,token_type_ids,attention_mask,offset_mapping,labels,content,trigger_words,sequence_labels
0,"[1, 55816, 544, 260, 84748, 3554, 14381, 29189...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[0, 0], [0, 4], [4, 5], [5, 6], [6, 11], [11,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...",Новий огляд мапи DeepState від російського вій...,"[[27, 63], [65, 88], [90, 183], [186, 308]]","[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]"
1,"[1, 1909, 21922, 6943, 64148, 1774, 20485, 456...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[0, 0], [0, 2], [2, 7], [7, 10], [10, 18], [1...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...",Недавно 95 квартал жёстко поглумился над русск...,"[[0, 40], [123, 137], [180, 251], [253, 274]]","[0, 0, 0, 0, 0, 1, 0, 0, 1, 0]"


# Model

In [13]:
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoModel
from transformers import DebertaV2Model, DebertaV2PreTrainedModel

In [14]:
class BertForTokenSequenceClassification(DebertaV2PreTrainedModel):
    def __init__(self, model_name, num_token_labels, num_sequence_labels):
        bert_model = DebertaV2Model.from_pretrained(model_name)
        super().__init__(bert_model.config)
        self.bert = bert_model
        hidden_size = self.config.hidden_size

        # Token Classification Head
        self.token_classifier = nn.Linear(hidden_size, num_token_labels)

        # Sequence Classification Head
        self.sequence_classifier = nn.Linear(hidden_size, num_sequence_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(self, input_ids, attention_mask, labels=None, sequence_labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state  # Shape: (batch, seq_len, hidden)

        # Token Classification Output (Apply to each token)
        token_logits = self.token_classifier(sequence_output)  # (batch, seq_len, num_token_labels)

        # Sequence Classification Output (Use [CLS] token's representation)
        cls_output = sequence_output[:, 0, :]  # Take first token (CLS)
        sequence_logits = self.sequence_classifier(cls_output)  # (batch, num_sequence_labels)

        loss = None
        if labels is not None and sequence_labels is not None:
            token_loss_fn = nn.CrossEntropyLoss()
            seq_loss_fn = nn.BCEWithLogitsLoss()  # For multi-label classification

            token_loss = token_loss_fn(token_logits.view(-1, token_logits.shape[-1]), labels.view(-1))
            seq_loss = seq_loss_fn(sequence_logits, sequence_labels.float())

            loss = token_loss + seq_loss  # Combine losses

        return {
            "loss": loss,
            "token_logits": token_logits,
            "sequence_logits": sequence_logits,
        }

## Init and Test

In [15]:
sample = ds_train[0]

# Convert input to batch format (add batch dimension)
input_ids = torch.tensor([sample["input_ids"]])
attention_mask = torch.tensor([sample["attention_mask"]])
token_labels = torch.tensor([sample["labels"]])
sequence_labels = torch.tensor([sample["sequence_labels"]])

In [16]:
model = BertForTokenSequenceClassification(
    model_name=PRETRAINED_MODEL,
    num_token_labels=2,
    num_sequence_labels=10
)

model

pytorch_model.bin:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

BertForTokenSequenceClassification(
  (bert): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(251000, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

## Metrics

In [17]:
from itertools import chain

TOKEN_CLASS_DISTRIBUTION = pd.Series(list(chain(*df.labels.tolist()))).mean()
SEQUENCE_CLASS_DISTRIBUTION = df[techniques].mean().values

In [18]:
import math
from scipy.optimize import minimize_scalar
from transformers import Trainer, pipeline, TrainingArguments
from typing import Any
from tqdm.autonotebook import tqdm
from transformers.trainer_utils import EvalPrediction
from sklearn.metrics import f1_score


def extract_chars_from_spans(spans):
    """
    Given a list of spans (each a tuple (start, end)),
    return a set of character indices for all spans.
    """
    char_set = set()
    for start, end in spans:
        # Each span covers positions start, start+1, ..., end-1.
        char_set.update(range(start, end))
    return char_set


class TokenSequenceEvaluationTrainer(Trainer):
    def __init__(
        self,
        model: Any = None,
        args: TrainingArguments = None,
        data_collator: Any = None,
        train_dataset: Any = None,
        eval_dataset: Any = None,
        tokenizer: Any = None,
        sequence_class_distribution: list[float] = [0.1]*10,
        token_class_distribution: float = 0.25, # mean
        **kwargs,
    ):
        """
        Initialize the Trainer with our custom compute_metrics.
        """
        super().__init__(
            model=model,
            args=args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            compute_metrics=self.compute_metrics,  # assign our custom compute_metrics
            **kwargs,
        )
        self.sequence_class_distribution = sequence_class_distribution
        self.token_class_distribution = token_class_distribution
        
    def compute_metrics(self, eval_pred: EvalPrediction) -> dict:        
        token_logits, sequence_logits = eval_pred.predictions
        token_labels, sequence_labels = eval_pred.label_ids

        # Sequence classification metrics (multi-label)
        sequence_metrics = self._compute_sequence_metrics(sequence_logits, sequence_labels)
    
        # Token classification metrics
        token_metrics = self._compute_token_metrics(token_logits, token_labels)
        
        return {
            **{f"sequence_{key}": value for key, value in sequence_metrics.items()},
            **{f"token_{key}": value for key, value in token_metrics.items()}
        }


    # SEQUENCE
    def _compute_sequence_metrics(self, logits, labels):
        proba = torch.nn.functional.sigmoid(torch.tensor(logits)).numpy()
        optimal_thresholds = self._find_thresholds_for_distribution(
            proba, desired_distribution=self.sequence_class_distribution
        )
        binarized_preds = (proba >= np.array(optimal_thresholds)).astype(int)
        
        return {"f1": f1_score(labels, binarized_preds, average="macro")}

        
    def _find_thresholds_for_distribution(self, preds, desired_distribution):
        """
        Find thresholds for each class to achieve the desired class distribution.
    
        Args:
            preds (ndarray): Array of shape (num_samples, num_classes) with probabilities (after sigmoid).
            desired_distribution (list): Desired proportion of positive samples for each class.
    
        Returns:
            thresholds (list): List of thresholds for each class.
        """
        num_classes = preds.shape[1]
        thresholds = []
    
        for class_idx in range(num_classes):
            probs = preds[:, class_idx]
            desired_ratio = desired_distribution[class_idx]
    
            # Function to minimize the difference between actual and desired positive ratios
            def objective(threshold):
                predicted_ratio = (probs >= threshold).mean()
                return abs(predicted_ratio - desired_ratio)
    
            # Find the threshold using optimization
            result = minimize_scalar(objective, bounds=(0, 1), method="bounded")
            thresholds.append(result.x)
    
        return thresholds


    # TOKEN

    def _compute_token_metrics(self, logits, labels):
        eval_dataset = self.eval_dataset
        probabilities = torch.softmax(torch.tensor(logits), dim=-1).cpu().numpy()
    
        #thresholds = np.linspace(0.1, 0.5, num=41)
        thresholds = [self._find_optimal_threshold(probabilities, labels)]
        results = []
        best_f1 = -1
        best_th = 0
        best_metrics = None
    
        for thold in tqdm(thresholds):
            # Apply thresholding instead of argmax
            predictions = (probabilities[:, :, 1] >= thold).astype(int)
    
            true_predictions = [
                [p for (p, l) in zip(prediction, label) if l != -100]
                for prediction, label in zip(predictions, labels)
            ]
    
            pred_spans_all = []
            for pred, offsets in zip(true_predictions, eval_dataset['offset_mapping']):
                samplewise_spans = []
                current_span = None
                for token_label, span in zip(pred, offsets):
                    if token_label == 1:  # If the current token is labeled as an entity (1)
                        if current_span is None:
                            current_span = [span[0], span[1]]  # Start a new span
                        else:
                            current_span[1] = span[1]  # Extend the span to include the current token
                    else:  # If token_label == 0 (not an entity)
                        if current_span is not None:
                            samplewise_spans.append(tuple(current_span))  # Save completed span
                            current_span = None  # Reset for the next entity
    
                # If the last token was part of a span, save it
                if current_span is not None:
                    samplewise_spans.append(tuple(current_span))
    
                pred_spans_all.append(samplewise_spans)
    
            # Store results for this threshold
            current_metrics = self._calculate_inner_metric(eval_dataset['trigger_words'], pred_spans_all)
            if current_metrics['f1'] >= best_f1:
                best_f1 = current_metrics['f1']
                best_th = thold
                best_metrics = current_metrics
                best_metrics['thold'] = thold
                
            
            results.append(current_metrics)
        return best_metrics


    def _calculate_inner_metric(self, gt_spans_all, pred_spans_all):
        total_true_chars = 0
        total_pred_chars = 0
        total_overlap_chars = 0
        for true_spans, pred_spans in zip(gt_spans_all, pred_spans_all):
            if isinstance(true_spans, str):
                try:
                    true_spans = eval(true_spans)
                except Exception:
                    true_spans = []
                    
            # Convert spans to sets of character indices.
            true_chars = extract_chars_from_spans(true_spans)
            pred_chars = extract_chars_from_spans(pred_spans)
            
            total_true_chars += len(true_chars)
            total_pred_chars += len(pred_chars)
            total_overlap_chars += len(true_chars.intersection(pred_chars))
            
            union_chars = true_chars.union(pred_chars)
            
        # Compute precision, recall, and F1.
        precision = total_overlap_chars / total_pred_chars if total_pred_chars > 0 else 0
        recall = total_overlap_chars / total_true_chars if total_true_chars > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        metrics = {
            "precision": precision,
            "recall": recall,
            "f1": f1
        }
        return metrics

    def _find_optimal_threshold(self, probabilities, labels):
        """Finds the threshold that achieves the desired positive class balance."""
        best_th = 0.5  # Default starting point
        best_diff = float("inf")
        optimal_th = best_th
        
        for thold in np.linspace(0.01, 0.99, num=100):
            predictions = (probabilities[:, :, 1] >= thold).astype(int)
            true_predictions = [
                [p for (p, l) in zip(prediction, label) if l != -100]
                for prediction, label in zip(predictions, labels)
            ]
            total_pos = sum([sum(row for row in prediction) for prediction in true_predictions])
            total = sum([len(prediction) for prediction in true_predictions])
            
            positive_ratio = total_pos / total if total > 0 else 0
            
            diff = abs(positive_ratio - self.token_class_distribution)
            if diff < best_diff:
                best_diff = diff
                optimal_th = thold
        
        return optimal_th

## Train

In [19]:
EPOCHS = 5

In [20]:
from transformers import DataCollatorForTokenClassification

class CustomDataCollator(DataCollatorForTokenClassification):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    
    def __call__(self, features):
        # Separate token-level and sequence-level labels
        sequence_labels = [f.pop("sequence_labels") for f in features]
        
        # Use Hugging Face's built-in collator for token classification
        batch = super().torch_call(features)
        
        # Convert sequence labels to tensor
        batch["sequence_labels"] = torch.tensor(sequence_labels, dtype=torch.int64)
        
        return batch

# Use the custom data collator
data_collator = CustomDataCollator(tokenizer=tokenizer)

In [21]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW([
    {'params': list(model.bert.parameters()), 'lr': 2e-5},
    {'params': list(model.token_classifier.parameters()), 'lr': 1e-4},
    {'params': list(model.sequence_classifier.parameters()), 'lr': 1e-4}
])

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0.1*EPOCHS*(ds_train.num_rows/16),
    num_training_steps=EPOCHS*(ds_train.num_rows/16)
)



In [22]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=EPOCHS,
    
    output_dir="./results",
    logging_strategy="steps",
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="none"
)

trainer = TokenSequenceEvaluationTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
    optimizers=(optimizer, scheduler),
    sequence_class_distribution=SEQUENCE_CLASS_DISTRIBUTION,
    token_class_distribution=TOKEN_CLASS_DISTRIBUTION
)

  super().__init__(


In [23]:
trainer.train()



Epoch,Training Loss,Validation Loss,Sequence F1,Token Precision,Token Recall,Token F1,Token Thold
1,0.6651,0.713234,0.107196,0.611214,0.55508,0.581797,0.534646
2,0.6825,0.720087,0.227798,0.619421,0.551075,0.583253,0.316869
3,0.6298,0.676411,0.297652,0.614783,0.551915,0.581655,0.504949
4,0.5442,0.747947,0.299959,0.619465,0.550804,0.58312,0.326768
5,0.4214,0.735057,0.338227,0.615062,0.550804,0.581162,0.425758


  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=960, training_loss=0.6608349964022636, metrics={'train_runtime': 1488.5825, 'train_samples_per_second': 10.272, 'train_steps_per_second': 0.645, 'total_flos': 3638254796488656.0, 'train_loss': 0.6608349964022636, 'epoch': 5.0})

In [27]:
FINETUNED_MODEL = '/kaggle/working/results/checkpoint-960'