In [1]:
import os
import numpy as np
import pandas as pd
import torch
import random
import nltk

from tqdm import tqdm
from nltk.corpus import wordnet
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import f1_score, accuracy_score, classification_report

# Download necessary NLTK data (WordNet for synonyms)
nltk.download("wordnet")
nltk.download("omw-1.4")

# ----------------------------------------------------------------------------
#  1.1 Check GPU availability
# ----------------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Backe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Backe\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
#!unzip /kaggle/working/nltk_data/corpora/omw-1.4.zip -d /kaggle/working/nltk_data/corpora/
#!unzip /kaggle/working/nltk_data/corpora/wordnet.zip -d /kaggle/working/nltk_data/corpora/

In [23]:
# ----------------------------------------------------------------------------
#  2. LOAD YOUR DATA
#     Adjust these file paths to your environment. 
#     The CSVs must contain columns:
#       train.csv: claim, evidence, label
#       dev.csv:   claim, evidence, label
#       test.csv:  claim, evidence, (no label)
# ----------------------------------------------------------------------------

TRAIN_PATH = "data\\train.csv"
DEV_PATH   = "data\\dev.csv"
TEST_PATH  = "data\\test.csv"

BEST_MODEL_PATH = "data\\taskC\\best_deberta_model.pt"
OUTPUT_PATH = "data\\taskC\\predictions.csv"

train_df = pd.read_csv(TRAIN_PATH)
dev_df   = pd.read_csv(DEV_PATH)
test_df  = pd.read_csv(TEST_PATH)

train_df.rename(columns={"Claim": "claim", "Evidence": "evidence"}, inplace=True)
dev_df.rename(columns={"Claim": "claim", "Evidence": "evidence"}, inplace=True)
test_df.rename(columns={"Claim": "claim", "Evidence": "evidence"}, inplace=True)

print("Train samples:", len(train_df))
print("Dev samples:", len(dev_df))
print("Test samples:", len(test_df))

# If labels are strings, map them to integer {0,1} or {0,1,2,...}.
# For ED, assume 2 classes: 0 = not evidence, 1 = relevant evidence
train_df["label"] = train_df["label"].astype(int)
dev_df["label"]   = dev_df["label"].astype(int)

train_df.head(3)


Train samples: 21508
Dev samples: 5926
Test samples: 4688


Unnamed: 0,claim,evidence,label
0,We should introduce school vouchers,"Among the many educational reform efforts, suc...",0
1,We should legalize insider trading,The U.S. Securities and Exchange Commission wa...,0
2,We should subsidize investigative journalism,"The film won an Emmy Award (1980), George Polk...",0


In [4]:
# ----------------------------------------------------------------------------
#  3. (OPTIONAL) DATA AUGMENTATION (Synonym Replacement)
#      - We'll replace 1 random word in claim/evidence with a WordNet synonym
#      - For demonstration, there's a 15% chance per example to create an
#        augmented copy.
# 
# ----------------------------------------------------------------------------

def synonym_replacement(sentence, n=1):
    """
    Replace 'n' words in 'sentence' with synonyms from WordNet, if possible.
    """
    words = sentence.split()
    if len(words) < 2:
        return sentence

    indices_to_replace = random.sample(range(len(words)), k=min(n, len(words)))
    new_words = words[:]
    for i in indices_to_replace:
        word = words[i]
        syns = wordnet.synsets(word)
        if not syns:
            continue
        # For simplicity, pick from the first synset's lemmas
        lemmas = syns[0].lemma_names()
        # Filter out lemmas that are the same as the original
        lemmas = [l for l in lemmas if l.lower() != word.lower()]
        if len(lemmas) == 0:
            continue
        new_words[i] = random.choice(lemmas)
    return " ".join(new_words)


def augment_dataframe(df, alpha=0.15):
    """
    For each row, with probability alpha, create an augmented copy.
    Return a new DataFrame with both original and augmented samples.
    """
    augmented_rows = []
    for _, row in df.iterrows():
        # Original row
        augmented_rows.append(row.to_dict())
        
        if random.random() < alpha:
            new_row = row.copy()
            # Randomly augment claim or evidence
            if random.random() < 0.5:
                new_row["claim"] = synonym_replacement(row["claim"], n=1)
            else:
                new_row["evidence"] = synonym_replacement(row["evidence"], n=1)
            augmented_rows.append(new_row.to_dict())
    return pd.DataFrame(augmented_rows)

# Let's do a random seed for reproducibility
random.seed(42)

# AUGMENT the training set (remove if undesired)
augmented_train_df = augment_dataframe(train_df, alpha=0.15)
print("Original train size:", len(train_df), 
      " => After augmentation:", len(augmented_train_df))

train_df = augmented_train_df.reset_index(drop=True)


Original train size: 21508  => After augmentation: 24819


In [5]:
# ----------------------------------------------------------------------------
#  4. CREATE HUGGING FACE DATASETS
# ----------------------------------------------------------------------------

train_dataset = Dataset.from_pandas(train_df)
dev_dataset   = Dataset.from_pandas(dev_df)
test_dataset  = Dataset.from_pandas(test_df)

dataset_dict = DatasetDict({
    "train": train_dataset,
    "dev":   dev_dataset,
    "test":  test_dataset
})
dataset_dict


DatasetDict({
    train: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 24819
    })
    dev: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 5926
    })
    test: Dataset({
        features: ['claim', 'evidence'],
        num_rows: 4688
    })
})

In [6]:
# ----------------------------------------------------------------------------
#  5. TOKENIZATION
#     We use a powerful model: DeBERTa v3 (microsoft/deberta-v3-base)
#     which is known to outperform standard BERT on many tasks.
# ----------------------------------------------------------------------------

model_name = "microsoft/deberta-v3-base"

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(
        examples["claim"],
        examples["evidence"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

encoded_dataset = dataset_dict.map(tokenize_function, batched=True)




Map:   0%|          | 0/24819 [00:00<?, ? examples/s]

Map:   0%|          | 0/5926 [00:00<?, ? examples/s]

Map:   0%|          | 0/4688 [00:00<?, ? examples/s]

In [7]:
#Handling the labels in the dataset
encoded_dataset["train"] = encoded_dataset["train"].rename_column("label", "labels")
encoded_dataset["dev"]   = encoded_dataset["dev"].rename_column("label", "labels")

encoded_dataset["train"] = encoded_dataset["train"].remove_columns(["claim", "evidence"])
encoded_dataset["dev"]   = encoded_dataset["dev"].remove_columns(["claim", "evidence"])

encoded_dataset["test"]  = encoded_dataset["test"].remove_columns(["claim", "evidence"])

encoded_dataset["train"].set_format("torch")
encoded_dataset["dev"].set_format("torch")
encoded_dataset["test"].set_format("torch")

In [11]:
# ----------------------------------------------------------------------------
#  6. CUSTOM MODEL: Focal Loss or Label Smoothing
#     overridomg forward() to allow advanced loss functions.
# ----------------------------------------------------------------------------

import torch.nn as nn

class CustomDebertaModel(nn.Module):
    def __init__(self, model_name, num_labels=2, use_focal_loss=False, gamma=2.0, label_smoothing=0.0):
        super().__init__()
        self.num_labels = num_labels
        self.use_focal_loss = use_focal_loss
        self.gamma = gamma
        self.label_smoothing = label_smoothing
        
        # Load the pre-trained DeBERTa classification model
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        # DeBERTa forward pass (omit internal CE)
        kwargs.pop("num_items_in_batch", None)

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=None,
            **kwargs
        )
        logits = outputs.logits  # shape: (batch_size, num_labels)
        
        loss = None
        if labels is not None:
            if self.use_focal_loss:
                loss = self.focal_loss(logits, labels, self.gamma)
            else:
                loss = self.label_smoothing_loss(logits, labels, self.label_smoothing)
        
        return {"loss": loss, "logits": logits}

    def focal_loss(self, logits, targets, gamma=2.0):
        # Focal Loss
        ce = nn.CrossEntropyLoss(reduction='none')(logits, targets)
        pt = torch.exp(-ce)
        focal = (1 - pt)**gamma * ce
        return focal.mean()

    def label_smoothing_loss(self, logits, targets, smoothing=0.0):
        if smoothing == 0.0:
            return nn.CrossEntropyLoss()(logits, targets)

        log_probs = nn.LogSoftmax(dim=-1)(logits)
        n_class = logits.size(1)
        with torch.no_grad():
            true_dist = torch.zeros_like(log_probs)
            true_dist.fill_(smoothing / (n_class - 1))
            true_dist.scatter_(1, targets.unsqueeze(1), 1.0 - smoothing)
        return torch.mean(torch.sum(-true_dist * log_probs, dim=1))


In [13]:
# ----------------------------------------------------------------------------
#  7. HYPERPARAMETER SEARCH WITH HYPEROPT
#     We'll define:
#       - learning_rate
#       - epochs
#       - batch_size
#       - use_focal_loss
#       - gamma (for focal loss)
#       - label_smoothing
# ----------------------------------------------------------------------------

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"f1": f1, "accuracy": acc}


def objective(space):
    """
    Hyperopt Objective:
      1) Create a CustomDebertaModel with the candidate hyperparams
      2) Train on train_dataset, evaluate on dev_dataset
      3) Return negative F1 (since Hyperopt minimizes)
    """
    learning_rate = space["learning_rate"]
    epochs        = int(space["epochs"])
    batch_size    = int(space["batch_size"])
    use_focal_loss = space["use_focal_loss"]
    gamma          = space["gamma"]
    label_smoothing = space["label_smoothing"]

    # Build the model
    model = CustomDebertaModel(
        model_name=model_name,
        num_labels=2,
        use_focal_loss=use_focal_loss,
        gamma=gamma,
        label_smoothing=label_smoothing
    )
    model.to(device)

    training_args = TrainingArguments(
        output_dir="./sota-ed-checkpoints",
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=learning_rate,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=8,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        save_total_limit=1,
        report_to="none", 
        logging_steps=1
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["dev"],
        processing_class=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    metrics = trainer.evaluate(encoded_dataset["dev"])
    f1 = metrics["eval_f1"]

    print(f"[Hyperopt] params={space} => F1={f1:.4f}")
    return {"loss": -f1, "status": STATUS_OK}


# Define search space
search_space = {
    "learning_rate":   hp.loguniform("learning_rate", np.log(1e-5), np.log(5e-4)),
    "epochs":          hp.choice("epochs", [2, 3, 4]),
    "batch_size":      hp.choice("batch_size", [4, 8, 16]),
    "use_focal_loss":  hp.choice("use_focal_loss", [False, True]),
    "gamma":           hp.quniform("gamma", 1.0, 5.0, 0.5),    
    "label_smoothing": hp.uniform("label_smoothing", 0.0, 0.2)
}

max_evals = 30 
trials = Trials()

best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=max_evals,
    trials=trials
)

print("\nHyperopt best param indices:", best)


  0%|          | 0/30 [00:00<?, ?trial/s, best loss=?]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.3004,0.297859,0.607102,0.723253
2,0.2969,0.299362,0.607102,0.723253


[Hyperopt] params={'batch_size': 4, 'epochs': 2, 'gamma': 1.0, 'label_smoothing': 0.08672050344959666, 'learning_rate': 0.00012634788355162484, 'use_focal_loss': True} => F1=0.6071
  3%|▎         | 1/30 [31:31<15:14:12, 1891.46s/trial, best loss: -0.607102296650199]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0381,0.037999,0.607102,0.723253
2,0.0381,0.037954,0.607102,0.723253
3,0.0379,0.037944,0.607102,0.723253
4,0.0379,0.037944,0.607102,0.723253


[Hyperopt] params={'batch_size': 16, 'epochs': 4, 'gamma': 4.0, 'label_smoothing': 0.10773162807203462, 'learning_rate': 9.964250922861787e-05, 'use_focal_loss': True} => F1=0.6071
  7%|▋         | 2/30 [1:00:33<14:01:32, 1803.31s/trial, best loss: -0.607102296650199]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6307,0.627734,0.607102,0.723253
2,0.6269,0.627803,0.607102,0.723253
3,0.6258,0.62844,0.607102,0.723253


[Hyperopt] params={'batch_size': 4, 'epochs': 3, 'gamma': 3.0, 'label_smoothing': 0.0993708290133283, 'learning_rate': 0.00027118448777552456, 'use_focal_loss': False} => F1=0.6071
 10%|█         | 3/30 [1:44:06<16:17:52, 2173.04s/trial, best loss: -0.607102296650199]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0183,0.014377,0.869561,0.866183
2,0.0109,0.016152,0.856087,0.850152
3,0.0059,0.024856,0.881629,0.880526
4,0.0019,0.049745,0.880927,0.878839


[Hyperopt] params={'batch_size': 8, 'epochs': 4, 'gamma': 4.5, 'label_smoothing': 0.18027208373731465, 'learning_rate': 4.5080800504417184e-05, 'use_focal_loss': True} => F1=0.8816
 13%|█▎        | 4/30 [2:18:15<15:20:28, 2124.19s/trial, best loss: -0.8816287159700587]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0386,0.037987,0.607102,0.723253
2,0.0381,0.037948,0.607102,0.723253
3,0.038,0.037954,0.607102,0.723253
4,0.0378,0.037944,0.607102,0.723253


[Hyperopt] params={'batch_size': 16, 'epochs': 4, 'gamma': 4.0, 'label_smoothing': 0.011327595605151087, 'learning_rate': 0.00021970189070193293, 'use_focal_loss': True} => F1=0.6071
 17%|█▋        | 5/30 [2:56:27<15:10:18, 2184.73s/trial, best loss: -0.8816287159700587]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.4052,0.355265,0.88767,0.886095
2,0.2884,0.405907,0.893369,0.891158


[Hyperopt] params={'batch_size': 8, 'epochs': 2, 'gamma': 1.5, 'label_smoothing': 0.031926145594475486, 'learning_rate': 1.879441696809551e-05, 'use_focal_loss': False} => F1=0.8934
 20%|██        | 6/30 [3:13:50<11:58:37, 1796.56s/trial, best loss: -0.893369349629818] 

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.052,0.043226,0.892043,0.891158
2,0.0302,0.056921,0.889431,0.886939


[Hyperopt] params={'batch_size': 4, 'epochs': 2, 'gamma': 3.0, 'label_smoothing': 0.05051277325799206, 'learning_rate': 1.5929841121371466e-05, 'use_focal_loss': True} => F1=0.8920
 23%|██▎       | 7/30 [13:50:03<87:27:20, 13688.71s/trial, best loss: -0.893369349629818]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.1349,0.110107,0.86965,0.867364
2,0.077,0.108732,0.883664,0.88137
3,0.0345,0.20696,0.876313,0.873102
4,0.0121,0.228995,0.88068,0.87867


[Hyperopt] params={'batch_size': 16, 'epochs': 4, 'gamma': 1.5, 'label_smoothing': 0.16069659461521935, 'learning_rate': 6.336198076397712e-05, 'use_focal_loss': True} => F1=0.8837
 27%|██▋       | 8/30 [14:18:49<60:22:50, 9880.48s/trial, best loss: -0.893369349629818] 

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6282,0.626398,0.607102,0.723253
2,0.626,0.625895,0.607102,0.723253
3,0.6253,0.626997,0.607102,0.723253
4,0.6247,0.627043,0.607102,0.723253


[Hyperopt] params={'batch_size': 4, 'epochs': 4, 'gamma': 1.0, 'label_smoothing': 0.09395626219382402, 'learning_rate': 0.0001097628027946798, 'use_focal_loss': False} => F1=0.6071
 30%|███       | 9/30 [15:24:28<46:48:03, 8023.03s/trial, best loss: -0.893369349629818]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.3879,0.404272,0.855033,0.848802
2,0.2769,0.365201,0.884653,0.882045


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 4.5, 'label_smoothing': 0.024482437807113323, 'learning_rate': 1.1850862886557e-05, 'use_focal_loss': False} => F1=0.8847
 33%|███▎      | 10/30 [15:38:03<32:12:39, 5798.00s/trial, best loss: -0.893369349629818]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0386,0.037994,0.607102,0.723253
2,0.0379,0.038001,0.607102,0.723253
3,0.0378,0.037954,0.607102,0.723253


[Hyperopt] params={'batch_size': 4, 'epochs': 3, 'gamma': 4.0, 'label_smoothing': 0.06070184050785832, 'learning_rate': 0.00018683738197035072, 'use_focal_loss': True} => F1=0.6071
 37%|███▋      | 11/30 [16:21:37<25:27:23, 4823.36s/trial, best loss: -0.893369349629818]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0274,0.026912,0.607102,0.723253
2,0.0269,0.026954,0.607102,0.723253
3,0.0269,0.026877,0.607102,0.723253
4,0.0268,0.026877,0.607102,0.723253


[Hyperopt] params={'batch_size': 4, 'epochs': 4, 'gamma': 4.5, 'label_smoothing': 0.12992113965102156, 'learning_rate': 0.00012137691764950135, 'use_focal_loss': True} => F1=0.6071
 40%|████      | 12/30 [17:21:08<22:12:42, 4442.38s/trial, best loss: -0.893369349629818]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0542,0.05358,0.607102,0.723253
2,0.0538,0.053557,0.607102,0.723253
3,0.0536,0.053553,0.607102,0.723253
4,0.0534,0.05355,0.607102,0.723253


[Hyperopt] params={'batch_size': 16, 'epochs': 4, 'gamma': 3.5, 'label_smoothing': 0.04895674939753514, 'learning_rate': 8.814181397671885e-05, 'use_focal_loss': True} => F1=0.6071
 43%|████▎     | 13/30 [17:47:43<16:54:15, 3579.74s/trial, best loss: -0.893369349629818]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5086,0.484265,0.881844,0.878839
2,0.4383,0.484767,0.892397,0.89082
3,0.3999,0.503069,0.893377,0.890989


[Hyperopt] params={'batch_size': 8, 'epochs': 3, 'gamma': 3.0, 'label_smoothing': 0.11205288449302564, 'learning_rate': 1.8158929545468875e-05, 'use_focal_loss': False} => F1=0.8934
 47%|████▋     | 14/30 [18:15:21<13:19:51, 2999.47s/trial, best loss: -0.8933765059050405]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0498,0.044836,0.889247,0.889133
2,0.031,0.061949,0.891791,0.889976


[Hyperopt] params={'batch_size': 4, 'epochs': 2, 'gamma': 3.0, 'label_smoothing': 0.05304744261829811, 'learning_rate': 1.2955055291914709e-05, 'use_focal_loss': True} => F1=0.8918
 50%|█████     | 15/30 [18:49:39<11:18:52, 2715.51s/trial, best loss: -0.8933765059050405]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.1326,0.102923,0.885283,0.885589
2,0.0676,0.143638,0.885038,0.88272


[Hyperopt] params={'batch_size': 8, 'epochs': 2, 'gamma': 1.5, 'label_smoothing': 0.183157384568861, 'learning_rate': 4.393573366421746e-05, 'use_focal_loss': True} => F1=0.8853
 53%|█████▎    | 16/30 [19:08:07<8:40:42, 2231.63s/trial, best loss: -0.8933765059050405] 

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5829,0.569749,0.883758,0.881033
2,0.5462,0.572334,0.887388,0.884914


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 3.5, 'label_smoothing': 0.19984721687915863, 'learning_rate': 1.4139342385561376e-05, 'use_focal_loss': False} => F1=0.8874
 57%|█████▋    | 17/30 [19:20:59<6:28:27, 1792.86s/trial, best loss: -0.8933765059050405]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5364,0.517599,0.881972,0.88002
2,0.4765,0.545029,0.870022,0.865339
3,0.4427,0.532794,0.887634,0.884914
4,0.4236,0.536544,0.891133,0.889133


[Hyperopt] params={'batch_size': 8, 'epochs': 4, 'gamma': 2.0, 'label_smoothing': 0.13967662389764965, 'learning_rate': 2.7204750836743727e-05, 'use_focal_loss': False} => F1=0.8911
 60%|██████    | 18/30 [19:57:34<6:22:42, 1913.58s/trial, best loss: -0.8933765059050405]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0624,0.050648,0.884913,0.882551
2,0.0366,0.057256,0.894367,0.892339
3,0.0207,0.101759,0.884469,0.88137
4,0.0102,0.143188,0.892096,0.889976


[Hyperopt] params={'batch_size': 16, 'epochs': 4, 'gamma': 2.5, 'label_smoothing': 0.06252359010244561, 'learning_rate': 1.785454033277208e-05, 'use_focal_loss': True} => F1=0.8944
 63%|██████▎   | 19/30 [20:22:36<5:28:08, 1789.90s/trial, best loss: -0.8943666045652502]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.61,0.610744,0.607102,0.723253
2,0.6072,0.611502,0.607102,0.723253


[Hyperopt] params={'batch_size': 8, 'epochs': 2, 'gamma': 5.0, 'label_smoothing': 0.04806870998017668, 'learning_rate': 0.00012525585317361445, 'use_focal_loss': False} => F1=0.6071
 67%|██████▋   | 20/30 [20:41:01<4:24:03, 1584.34s/trial, best loss: -0.8943666045652502]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.4707,0.430107,0.887714,0.886433
2,0.3831,0.465321,0.889007,0.88677
3,0.3303,0.480223,0.885971,0.883226


[Hyperopt] params={'batch_size': 8, 'epochs': 3, 'gamma': 2.5, 'label_smoothing': 0.08024144968797006, 'learning_rate': 2.5696553188284025e-05, 'use_focal_loss': False} => F1=0.8890
 70%|███████   | 21/30 [21:08:22<4:00:12, 1601.37s/trial, best loss: -0.8943666045652502]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5123,0.493611,0.884505,0.882383
2,0.4524,0.499779,0.888457,0.886095
3,0.4222,0.512575,0.886555,0.883564


[Hyperopt] params={'batch_size': 8, 'epochs': 3, 'gamma': 2.5, 'label_smoothing': 0.11612128420639324, 'learning_rate': 1.0065107974453222e-05, 'use_focal_loss': False} => F1=0.8885
 73%|███████▎  | 22/30 [21:35:46<3:35:12, 1614.11s/trial, best loss: -0.8943666045652502]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.4582,0.424796,0.881389,0.878502
2,0.3691,0.452577,0.87942,0.875464
3,0.3215,0.459614,0.888278,0.885589


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 2.5, 'label_smoothing': 0.07385518682696005, 'learning_rate': 2.372892642546355e-05, 'use_focal_loss': False} => F1=0.8883
 77%|███████▋  | 23/30 [21:54:29<2:51:08, 1466.89s/trial, best loss: -0.8943666045652502]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.538,0.514711,0.879428,0.876477
2,0.4776,0.525495,0.886994,0.884576
3,0.4423,0.53084,0.889181,0.88677


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 2.0, 'label_smoothing': 0.14355111866704404, 'learning_rate': 3.789493852966346e-05, 'use_focal_loss': False} => F1=0.8892
 80%|████████  | 24/30 [22:13:12<2:16:22, 1363.76s/trial, best loss: -0.8943666045652502]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6357,0.63454,0.607102,0.723253
2,0.6332,0.63461,0.607102,0.723253
3,0.6327,0.634664,0.607102,0.723253


[Hyperopt] params={'batch_size': 8, 'epochs': 3, 'gamma': 3.5, 'label_smoothing': 0.12036839329901747, 'learning_rate': 0.0004561999311834353, 'use_focal_loss': False} => F1=0.6071
 83%|████████▎ | 25/30 [22:40:32<2:00:32, 1446.57s/trial, best loss: -0.8943666045652502]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0925,0.077817,0.875211,0.873777
2,0.0493,0.081563,0.890812,0.890145
3,0.0199,0.130365,0.882864,0.880864


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 2.0, 'label_smoothing': 0.06784922911079358, 'learning_rate': 5.987226405908773e-05, 'use_focal_loss': True} => F1=0.8908
 87%|████████▋ | 26/30 [22:59:16<1:29:58, 1349.60s/trial, best loss: -0.8943666045652502]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0654,0.054065,0.886545,0.886095
2,0.0347,0.073362,0.88998,0.888289
3,0.0146,0.118762,0.891779,0.889808


[Hyperopt] params={'batch_size': 8, 'epochs': 3, 'gamma': 2.5, 'label_smoothing': 0.0009259536821840803, 'learning_rate': 3.136651912519059e-05, 'use_focal_loss': True} => F1=0.8918
 90%|█████████ | 27/30 [23:26:35<1:11:50, 1436.69s/trial, best loss: -0.8943666045652502]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.551,0.533854,0.874205,0.870233
2,0.4988,0.538673,0.884148,0.881201
3,0.4682,0.544811,0.887429,0.885251
4,0.4531,0.548516,0.892464,0.89082


[Hyperopt] params={'batch_size': 8, 'epochs': 4, 'gamma': 3.5, 'label_smoothing': 0.1566917294437482, 'learning_rate': 2.0084866507935567e-05, 'use_focal_loss': False} => F1=0.8925
 93%|█████████▎| 28/30 [24:02:45<55:13, 1656.69s/trial, best loss: -0.8943666045652502]  

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.1734,0.174015,0.867277,0.862302
2,0.1104,0.161441,0.888519,0.886264
3,0.0774,0.205045,0.88288,0.88002
4,0.056,0.248037,0.88264,0.88002


[Hyperopt] params={'batch_size': 16, 'epochs': 4, 'gamma': 1.0, 'label_smoothing': 0.08299773877656061, 'learning_rate': 1.0303387461831919e-05, 'use_focal_loss': True} => F1=0.8885
 97%|█████████▋| 29/30 [24:27:26<26:43, 1603.91s/trial, best loss: -0.8943666045652502]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.3835,0.329326,0.885684,0.884239
2,0.269,0.379126,0.883801,0.880526
3,0.2079,0.401091,0.885612,0.882551


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 2.0, 'label_smoothing': 0.02416071096173772, 'learning_rate': 1.7804767303328657e-05, 'use_focal_loss': False} => F1=0.8857
100%|██████████| 30/30 [24:46:10<00:00, 2972.35s/trial, best loss: -0.8943666045652502]

Hyperopt best param indices: {'batch_size': np.int64(2), 'epochs': np.int64(2), 'gamma': np.float64(2.5), 'label_smoothing': np.float64(0.06252359010244561), 'learning_rate': np.float64(1.785454033277208e-05), 'use_focal_loss': np.int64(1)}


In [16]:
# ----------------------------------------------------------------------------
#  7.1 Interpret best param indices from Hyperopt
# ----------------------------------------------------------------------------

epochs_options = [2, 3, 4]
batch_options  = [4, 8, 16]
use_focal_options = [False, True]

final_params = {
    "learning_rate":    best["learning_rate"],
    "epochs":           epochs_options[best["epochs"]],
    "batch_size":       batch_options[ best["batch_size"] ],
    "use_focal_loss":   use_focal_options[ best["use_focal_loss"] ],
    "gamma":            best["gamma"],
    "label_smoothing":  best["label_smoothing"]
}

print("Interpreted best hyperparams:\n", final_params)


Interpreted best hyperparams:
 {'learning_rate': np.float64(1.785454033277208e-05), 'epochs': 4, 'batch_size': 16, 'use_focal_loss': True, 'gamma': np.float64(2.5), 'label_smoothing': np.float64(0.06252359010244561)}


In [21]:
# ----------------------------------------------------------------------------
#  8. TRAIN A FINAL MODEL USING THE BEST HYPERPARAMS
# ----------------------------------------------------------------------------

best_model = CustomDebertaModel(
    model_name=model_name,
    num_labels=2,
    use_focal_loss=final_params["use_focal_loss"],
    gamma=final_params["gamma"],
    label_smoothing=final_params["label_smoothing"]
)
best_model.to(device)

training_args = TrainingArguments(
    output_dir="./final-sota-model",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=final_params["learning_rate"],
    num_train_epochs=final_params["epochs"],
    per_device_train_batch_size=final_params["batch_size"],
    per_device_eval_batch_size=8,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=1,
    report_to="none",  
    logging_steps=1
)

trainer = Trainer(
    model=best_model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["dev"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
results_dev = trainer.evaluate(encoded_dataset["dev"])
print("Final Dev Results:", results_dev)

# Classification report
preds_output = trainer.predict(encoded_dataset["dev"])
dev_preds = np.argmax(preds_output.predictions, axis=1)
dev_labels = preds_output.label_ids
print("\nDetailed Classification Report (Dev):")
print(classification_report(dev_labels, dev_preds, digits=4))


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0632,0.049908,0.882377,0.879514
2,0.0375,0.053875,0.887228,0.884576
3,0.0215,0.084432,0.888672,0.886601
4,0.0106,0.114946,0.8894,0.887276


Final Dev Results: {'eval_loss': 0.11494570225477219, 'eval_f1': 0.8893995170056225, 'eval_accuracy': 0.8872764090448869, 'eval_runtime': 29.9141, 'eval_samples_per_second': 198.101, 'eval_steps_per_second': 24.771, 'epoch': 4.0}

Detailed Classification Report (Dev):
              precision    recall  f1-score   support

           0     0.9458    0.8955    0.9199      4286
           1     0.7602    0.8659    0.8096      1640

    accuracy                         0.8873      5926
   macro avg     0.8530    0.8807    0.8648      5926
weighted avg     0.8944    0.8873    0.8894      5926



In [25]:
# ----------------------------------------------------------------------------
# 8.1 SAVE THE BEST MODEL
# ----------------------------------------------------------------------------

torch.save(trainer.model.state_dict(), BEST_MODEL_PATH)
print(f"Best model saved to {BEST_MODEL_PATH}")

Best model saved to data\taskC\best_deberta_model.pt


In [26]:
# ----------------------------------------------------------------------------
# 8.2 LOAD THE BEST MODEL
# ----------------------------------------------------------------------------

# Re-initialize the same architecture
loaded_model = CustomDebertaModel(
    model_name=model_name,
    num_labels=2,
    use_focal_loss=final_params["use_focal_loss"],
    gamma=final_params["gamma"],
    label_smoothing=final_params["label_smoothing"]
)

# Load the saved weights
loaded_model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=device))
loaded_model.to(device)
loaded_model.eval()

print("Successfully loaded state_dict into loaded_model.")

# Assign the loaded model to the Trainer
trainer.model = loaded_model


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded state_dict into loaded_model.


In [31]:
# ----------------------------------------------------------------------------
# 9.1 INFERENCE ON THE DEV SET (codebench debugging)
# ----------------------------------------------------------------------------

from torch.utils.data import DataLoader

DEV_OUTPUT_PATH = "dev_predictions.csv"

# Create a DataLoader for the dev set
dev_loader = DataLoader(encoded_dataset["dev"], batch_size=8)

loaded_model.eval()
all_dev_preds = []

for batch in dev_loader:
    for key in batch:
        batch[key] = batch[key].to(device)
    
    with torch.no_grad():
        outputs = loaded_model(**batch)  
    
    logits = outputs["logits"]                  
    preds = torch.argmax(logits, dim=1)          
    all_dev_preds.extend(preds.cpu().tolist())   

# Create a single-column DataFrame for the dev predictions
dev_pred_df = pd.DataFrame({"prediction": all_dev_preds})
dev_pred_df.to_csv(DEV_OUTPUT_PATH, index=False)
print(f"Dev predictions saved to {DEV_OUTPUT_PATH}")



Dev predictions saved to dev_predictions.csv


In [None]:
# ----------------------------------------------------------------------------
# 9.2 INFERENCE ON THE TEST SET 
# ----------------------------------------------------------------------------

from torch.utils.data import DataLoader

# Create a DataLoader for the test set
test_loader = DataLoader(encoded_dataset["test"], batch_size=8)

loaded_model.eval()
all_test_preds = []

for batch in test_loader:
    # Move inputs to GPU/CPU
    for key in batch:
        batch[key] = batch[key].to(device)
    
    # No gradient calculation needed
    with torch.no_grad():
        outputs = loaded_model(**batch)  # outputs contains "logits"
    
    logits = outputs["logits"]                   # shape (B, num_labels)
    preds = torch.argmax(logits, dim=1)          # shape (B,)
    all_test_preds.extend(preds.cpu().tolist())  # Collect predictions on CPU

# Create a single-column DataFrame for the test predictions
test_pred_df = pd.DataFrame({"prediction": all_test_preds})
test_pred_df.to_csv(OUTPUT_PATH, index=False)
print(f"Test predictions saved to {OUTPUT_PATH}")



TypeError: Unsupported types (<class 'NoneType'>) passed to `_pad_across_processes`. Only nested list/tuple/dicts of objects that are valid for `is_torch_tensor` should be passed.