In [1]:
import os
import random

import nltk
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from nltk.corpus import wordnet
from sklearn.metrics import f1_score, accuracy_score, classification_report
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Fetch the NLTK corpora that the WordNet synonym lookup needs
for corpus_name in ("wordnet", "omw-1.4"):
    nltk.download(corpus_name)

# ----------------------------------------------------------------------------
#  1.1 Pick the compute device: CUDA when it is available, otherwise the CPU
# ----------------------------------------------------------------------------
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


Using device: cuda


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Backe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Backe\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
#!unzip /kaggle/working/nltk_data/corpora/omw-1.4.zip -d /kaggle/working/nltk_data/corpora/
#!unzip /kaggle/working/nltk_data/corpora/wordnet.zip -d /kaggle/working/nltk_data/corpora/

In [2]:
# ----------------------------------------------------------------------------
#  2. LOAD YOUR DATA
#     Adjust these file paths to your environment.
#     The CSVs must contain the columns below (capitalised "Claim"/"Evidence"
#     headers are also accepted; they are renamed to lower case after loading):
#       train.csv: claim, evidence, label
#       dev.csv:   claim, evidence, label
#       test.csv:  claim, evidence, (no label)
# ----------------------------------------------------------------------------

# Paths are built with os.path.join so the notebook runs unchanged on Windows,
# Linux and macOS; hard-coded "\\" separators only resolve on Windows.
DATA_DIR = "data"
TASK_DIR = os.path.join(DATA_DIR, "taskC")

TRAIN_PATH = os.path.join(DATA_DIR, "train.csv")
DEV_PATH   = os.path.join(DATA_DIR, "dev.csv")
TEST_PATH  = os.path.join(DATA_DIR, "test.csv")

BEST_MODEL_PATH = os.path.join(TASK_DIR, "best_deberta_model.pt")
OUTPUT_PATH = os.path.join(TASK_DIR, "predictions.csv")

# Normalise header capitalisation while loading; rename() returns a new frame,
# so no in-place mutation is needed.
COLUMN_RENAMES = {"Claim": "claim", "Evidence": "evidence"}

train_df = pd.read_csv(TRAIN_PATH).rename(columns=COLUMN_RENAMES)
dev_df   = pd.read_csv(DEV_PATH).rename(columns=COLUMN_RENAMES)
test_df  = pd.read_csv(TEST_PATH).rename(columns=COLUMN_RENAMES)

print("Train samples:", len(train_df))
print("Dev samples:", len(dev_df))
print("Test samples:", len(test_df))

# If labels are strings, map them to integer {0,1} or {0,1,2,...}.
# For ED, assume 2 classes: 0 = not evidence, 1 = relevant evidence
train_df["label"] = train_df["label"].astype(int)
dev_df["label"]   = dev_df["label"].astype(int)

train_df.head(3)


Train samples: 21508
Dev samples: 5926
Test samples: 4688


Unnamed: 0,claim,evidence,label
0,We should introduce school vouchers,"Among the many educational reform efforts, suc...",0
1,We should legalize insider trading,The U.S. Securities and Exchange Commission wa...,0
2,We should subsidize investigative journalism,"The film won an Emmy Award (1980), George Polk...",0


In [3]:
# ----------------------------------------------------------------------------
#  3. (OPTIONAL) DATA AUGMENTATION (Synonym Replacement)
#      - We'll replace 1 random word in claim/evidence with a WordNet synonym
#      - For demonstration, there's a 15% chance per example to create an
#        augmented copy.
# 
# ----------------------------------------------------------------------------

def synonym_replacement(sentence, n=1):
    """
    Replace up to 'n' randomly chosen words in 'sentence' with WordNet synonyms.

    Args:
        sentence: Text to augment. Words are obtained by whitespace splitting.
        n: Maximum number of word positions to try. Values below zero are
           treated as zero.

    Returns:
        The sentence with the chosen words replaced and all words re-joined by
        single spaces. A non-string input (for example a missing value) or a
        sentence with fewer than two words is returned unchanged. Positions
        for which WordNet offers no usable synonym keep their original word.
    """
    # Non-string cells cannot be split into words; hand them back untouched
    # instead of raising AttributeError.
    if not isinstance(sentence, str):
        return sentence

    words = sentence.split()
    if len(words) < 2:
        return sentence

    # Clamp the sample size to [0, len(words)] so random.sample never raises.
    sample_size = max(0, min(n, len(words)))
    indices_to_replace = random.sample(range(len(words)), k=sample_size)
    new_words = words[:]
    for i in indices_to_replace:
        word = words[i]
        syns = wordnet.synsets(word)
        if not syns:
            continue
        # For simplicity, pick from the first synset's lemmas.
        # Multi-word lemma names use "_" as the word separator; convert it to
        # a space so that no underscore-joined tokens end up in the text.
        lemmas = [name.replace("_", " ") for name in syns[0].lemma_names()]
        # Filter out lemmas that are the same as the original
        lemmas = [name for name in lemmas if name.lower() != word.lower()]
        if len(lemmas) == 0:
            continue
        new_words[i] = random.choice(lemmas)
    return " ".join(new_words)


def augment_dataframe(df, alpha=0.15):
    """
    For each row, with probability alpha, create an augmented copy.

    The copy has a synonym replacement applied to either its "claim" or its
    "evidence" field (chosen with equal probability) and is placed directly
    after the row it was derived from.

    Args:
        df: DataFrame with at least "claim" and "evidence" columns.
        alpha: Probability, per row, of emitting an augmented copy.

    Returns:
        A new DataFrame with both original and augmented samples. The column
        set and order of 'df' are preserved, including when 'df' is empty.

    Note:
        When synonym_replacement cannot change the selected text, the copy is
        an exact duplicate of its source row.
    """
    augmented_rows = []
    # to_dict("records") yields one plain dict per row, avoiding the per-row
    # Series construction that iterrows() performs.
    for record in df.to_dict("records"):
        # Original row
        augmented_rows.append(record)

        if random.random() < alpha:
            new_row = dict(record)
            # Randomly augment claim or evidence
            target = "claim" if random.random() < 0.5 else "evidence"
            new_row[target] = synonym_replacement(record[target], n=1)
            augmented_rows.append(new_row)

    # Passing columns explicitly keeps the schema even when no rows exist.
    return pd.DataFrame(augmented_rows, columns=df.columns)

# Seed Python's RNG so the augmentation below draws the same samples each run
random.seed(42)

# Build the augmented training set (drop this step if augmentation is undesired)
augmented_train_df = augment_dataframe(train_df, alpha=0.15)
print(
    "Original train size:",
    len(train_df),
    " => After augmentation:",
    len(augmented_train_df),
)

# From here on train_df refers to the augmented data, re-indexed from 0
train_df = augmented_train_df.reset_index(drop=True)


Original train size: 21508  => After augmentation: 24819


In [4]:
# ----------------------------------------------------------------------------
#  4. CREATE HUGGING FACE DATASETS
# ----------------------------------------------------------------------------

# Wrap each pandas split in a Hugging Face Dataset, keyed by split name
split_frames = {"train": train_df, "dev": dev_df, "test": test_df}
dataset_dict = DatasetDict(
    {name: Dataset.from_pandas(frame) for name, frame in split_frames.items()}
)

# Keep the per-split handles available under their usual names
train_dataset = dataset_dict["train"]
dev_dataset   = dataset_dict["dev"]
test_dataset  = dataset_dict["test"]

dataset_dict


DatasetDict({
    train: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 24819
    })
    dev: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 5926
    })
    test: Dataset({
        features: ['claim', 'evidence'],
        num_rows: 4688
    })
})

In [5]:
# ----------------------------------------------------------------------------
#  5. TOKENIZATION
#     We use a powerful model: DeBERTa v3 (microsoft/deberta-v3-base)
#     which is known to outperform standard BERT on many tasks.
# ----------------------------------------------------------------------------

model_name = "microsoft/deberta-v3-base"

# AutoTokenizer is already imported together with the other transformers names
# in the notebook's import cell, so it is not imported a second time here.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """
    Tokenize claim/evidence pairs with the notebook-level tokenizer.

    Args:
        examples: Mapping with "claim" and "evidence" entries, passed to the
                  tokenizer as its first and second text arguments.

    Returns:
        Whatever the tokenizer returns for the pair, with truncation enabled
        and padding to a fixed max_length of 128.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    claims, evidences = examples["claim"], examples["evidence"]
    return tokenizer(claims, evidences, **encode_options)

# Apply tokenize_function to every split of dataset_dict (train, dev and test);
# batched=True is passed so the function is applied in batched mode.
encoded_dataset = dataset_dict.map(tokenize_function, batched=True)




Map:   0%|          | 0/24819 [00:00<?, ? examples/s]

Map:   0%|          | 0/5926 [00:00<?, ? examples/s]

Map:   0%|          | 0/4688 [00:00<?, ? examples/s]

In [6]:
# Handling the labels in the dataset.
# Each step checks the current column names first, so re-running this cell
# after it has already been applied is a no-op instead of an error.
for split_name in ("train", "dev", "test"):
    split_ds = encoded_dataset[split_name]

    # The model's forward() takes the target as `labels`. The test split has
    # no "label" column, so the membership check skips it.
    if "label" in split_ds.column_names:
        split_ds = split_ds.rename_column("label", "labels")

    # Drop the raw text columns; only the tokenizer outputs are kept.
    text_columns = [col for col in ("claim", "evidence") if col in split_ds.column_names]
    if text_columns:
        split_ds = split_ds.remove_columns(text_columns)

    split_ds.set_format("torch")
    encoded_dataset[split_name] = split_ds

In [7]:
# ----------------------------------------------------------------------------
#  6. CUSTOM MODEL: Focal Loss or Label Smoothing
#     Overriding forward() to allow advanced loss functions.
# ----------------------------------------------------------------------------

import torch.nn as nn

class CustomDebertaModel(nn.Module):
    """
    Wrapper around a pre-trained sequence-classification model that computes
    its own training loss: focal loss when `use_focal_loss` is set, otherwise
    cross-entropy with optional label smoothing.

    Args:
        model_name: Checkpoint name passed to
            AutoModelForSequenceClassification.from_pretrained.
        num_labels: Number of output classes.
        use_focal_loss: Select focal loss instead of (smoothed) cross-entropy.
        gamma: Focusing exponent used by the focal loss.
        label_smoothing: Smoothing mass used by the cross-entropy branch.
    """

    def __init__(self, model_name, num_labels=2, use_focal_loss=False, gamma=2.0, label_smoothing=0.0):
        super().__init__()
        # Loss configuration
        self.use_focal_loss = use_focal_loss
        self.gamma = gamma
        self.label_smoothing = label_smoothing
        self.num_labels = num_labels

        # Pre-trained classification backbone
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=num_labels
        )

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """
        Run the backbone and attach the custom loss.

        Returns:
            dict with "loss" (None when no labels are given) and "logits".
        """
        # This key is not forwarded to the backbone
        # (presumably injected by the Trainer - TODO confirm).
        kwargs.pop("num_items_in_batch", None)

        # labels=None keeps the backbone from computing its own loss
        backbone_out = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=None,
            **kwargs
        )
        logits = backbone_out.logits

        if labels is None:
            loss = None
        elif self.use_focal_loss:
            loss = self.focal_loss(logits, labels, self.gamma)
        else:
            loss = self.label_smoothing_loss(logits, labels, self.label_smoothing)

        return {"loss": loss, "logits": logits}

    def focal_loss(self, logits, targets, gamma=2.0):
        """Mean focal loss: (1 - p_t)^gamma * CE, with p_t = exp(-CE) per sample."""
        per_sample_ce = nn.functional.cross_entropy(logits, targets, reduction="none")
        prob_true_class = torch.exp(-per_sample_ce)
        weighted = (1 - prob_true_class) ** gamma * per_sample_ce
        return weighted.mean()

    def label_smoothing_loss(self, logits, targets, smoothing=0.0):
        """
        Mean cross-entropy against a smoothed target distribution.

        The true class receives 1 - smoothing and the remaining mass is split
        evenly over the other classes. With smoothing == 0.0 this is plain
        cross-entropy.
        """
        if smoothing == 0.0:
            return nn.functional.cross_entropy(logits, targets)

        log_probs = torch.log_softmax(logits, dim=-1)
        num_classes = logits.size(1)
        with torch.no_grad():
            # Start from the off-target mass everywhere, then overwrite the
            # true-class entry of each row.
            soft_targets = torch.full_like(log_probs, smoothing / (num_classes - 1))
            soft_targets.scatter_(1, targets.unsqueeze(1), 1.0 - smoothing)
        return (-soft_targets * log_probs).sum(dim=1).mean()


In [8]:
# ----------------------------------------------------------------------------
#  7. HYPERPARAMETER SEARCH WITH HYPEROPT
#     We'll define:
#       - learning_rate
#       - epochs
#       - batch_size
#       - use_focal_loss
#       - gamma (for focal loss)
#       - label_smoothing
# ----------------------------------------------------------------------------

def compute_metrics(eval_pred):
    """
    Score predictions for the Trainer.

    Args:
        eval_pred: A (logits, labels) pair; the predicted class of each row is
                   the argmax of its logits along axis 1.

    Returns:
        dict with the weighted-average "f1" and the "accuracy".
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=1)
    return {
        "f1": f1_score(gold, predicted, average="weighted"),
        "accuracy": accuracy_score(gold, predicted),
    }


def objective(space):
    """
    Hyperopt objective for a single trial.

    Steps:
      1) Build a CustomDebertaModel from the candidate hyperparameters
      2) Train it on encoded_dataset["train"], evaluating on
         encoded_dataset["dev"] every epoch
      3) Re-evaluate on the dev split and report the negated F1
         (Hyperopt minimizes its objective)

    Args:
        space: dict with keys "learning_rate", "epochs", "batch_size",
               "use_focal_loss", "gamma" and "label_smoothing".

    Returns:
        {"loss": -F1, "status": STATUS_OK}
    """
    # Optimiser / schedule settings taken from the sampled point
    tuned_args = {
        "learning_rate": space["learning_rate"],
        "num_train_epochs": int(space["epochs"]),
        "per_device_train_batch_size": int(space["batch_size"]),
    }
    # Loss settings taken from the sampled point
    loss_settings = {
        "use_focal_loss": space["use_focal_loss"],
        "gamma": space["gamma"],
        "label_smoothing": space["label_smoothing"],
    }

    # A fresh model is created for every trial
    candidate = CustomDebertaModel(model_name=model_name, num_labels=2, **loss_settings)
    candidate.to(device)

    # Evaluate, save and log once per epoch; keep a single checkpoint and
    # request the best-F1 model at the end of training.
    run_config = TrainingArguments(
        output_dir="./sota-ed-checkpoints",
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        per_device_eval_batch_size=8,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        save_total_limit=1,
        report_to="none",
        logging_steps=1,
        **tuned_args
    )

    dev_split = encoded_dataset["dev"]
    trainer = Trainer(
        model=candidate,
        args=run_config,
        train_dataset=encoded_dataset["train"],
        eval_dataset=dev_split,
        processing_class=tokenizer,
        compute_metrics=compute_metrics
    )
    trainer.train()

    dev_f1 = trainer.evaluate(encoded_dataset["dev"])["eval_f1"]
    print(f"[Hyperopt] params={space} => F1={dev_f1:.4f}")

    return {"loss": -dev_f1, "status": STATUS_OK}


# Define search space
# NOTE(review): in the run logged below, the trials with learning_rate above
# roughly 7e-5 all finished with dev F1 <= 0.75 (most at a constant 0.6071),
# while every trial scoring above 0.85 used a rate below about 5.3e-5.
# Consider lowering the upper bound to avoid spending trials on that region.
search_space = {
    "learning_rate":   hp.loguniform("learning_rate", np.log(1e-5), np.log(5e-4)),
    "epochs":          hp.choice("epochs", [2, 3, 4]),
    "batch_size":      hp.choice("batch_size", [4, 8, 16]),
    "use_focal_loss":  hp.choice("use_focal_loss", [False, True]),
    "gamma":           hp.quniform("gamma", 1.0, 5.0, 0.5),
    "label_smoothing": hp.uniform("label_smoothing", 0.0, 0.2)
}

max_evals = 30
trials = Trials()

best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=max_evals,
    trials=trials
)

# For hp.choice parameters, fmin reports the index of the selected option
# (e.g. "epochs": 0 means the first entry of the list), not the value itself.
print("\nHyperopt best param indices:", best)

# Decode those indices back into the actual hyperparameter values so they can
# be read (and reused) directly. `best` is left untouched.
best_params = space_eval(search_space, best)
print("Hyperopt best param values:", best_params)


  0%|          | 0/30 [00:00<?, ?trial/s, best loss=?]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0161,0.013316,0.88701,0.884576
2,0.0095,0.01507,0.888316,0.885926


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 4.5, 'label_smoothing': 0.12159883445297202, 'learning_rate': 1.403352626309778e-05, 'use_focal_loss': True} => F1=0.8883
  3%|▎         | 1/30 [13:31<6:32:00, 811.05s/trial, best loss: -0.8883157289201107]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.3877,0.338183,0.884733,0.882889
2,0.2724,0.368012,0.884334,0.881876
3,0.1922,0.500047,0.873865,0.869389
4,0.146,0.475074,0.889748,0.887445


[Hyperopt] params={'batch_size': 8, 'epochs': 4, 'gamma': 2.5, 'label_smoothing': 0.017174488963293922, 'learning_rate': 1.804114416744061e-05, 'use_focal_loss': False} => F1=0.8897
  7%|▋         | 2/30 [50:43<12:48:42, 1647.22s/trial, best loss: -0.8897480309880822]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5925,0.590224,0.607102,0.723253
2,0.5879,0.590458,0.607102,0.723253
3,0.5873,0.59037,0.607102,0.723253
4,0.5867,0.590262,0.607102,0.723253


[Hyperopt] params={'batch_size': 8, 'epochs': 4, 'gamma': 4.5, 'label_smoothing': 0.0008609566740792607, 'learning_rate': 0.0003437195506520273, 'use_focal_loss': False} => F1=0.6071
 10%|█         | 3/30 [1:28:09<14:24:13, 1920.49s/trial, best loss: -0.8897480309880822]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0543,0.053659,0.607102,0.723253
2,0.0538,0.05371,0.607102,0.723253
3,0.0534,0.05104,0.671278,0.752109
4,0.0486,0.04421,0.750358,0.761897


[Hyperopt] params={'batch_size': 8, 'epochs': 4, 'gamma': 3.5, 'label_smoothing': 0.13308782780412667, 'learning_rate': 7.109681895877819e-05, 'use_focal_loss': True} => F1=0.7504
 13%|█▎        | 4/30 [2:04:55<14:41:10, 2033.47s/trial, best loss: -0.8897480309880822]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6331,0.63226,0.607102,0.723253
2,0.6304,0.63289,0.607102,0.723253


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 3.5, 'label_smoothing': 0.1131348005430801, 'learning_rate': 0.0001601445807601044, 'use_focal_loss': False} => F1=0.6071
 17%|█▋        | 5/30 [2:17:54<10:58:39, 1580.79s/trial, best loss: -0.8897480309880822]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.549,0.524318,0.887981,0.886601
2,0.5008,0.534548,0.887337,0.885083
3,0.4736,0.554314,0.880816,0.876983
4,0.4564,0.549849,0.889528,0.887276


[Hyperopt] params={'batch_size': 8, 'epochs': 4, 'gamma': 4.5, 'label_smoothing': 0.1552095854820624, 'learning_rate': 1.3105059712462589e-05, 'use_focal_loss': False} => F1=0.8895
 20%|██        | 6/30 [2:53:51<11:50:47, 1776.99s/trial, best loss: -0.8897480309880822]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6506,0.648645,0.607102,0.723253
2,0.6478,0.648656,0.607102,0.723253
3,0.6474,0.649094,0.607102,0.723253


[Hyperopt] params={'batch_size': 4, 'epochs': 3, 'gamma': 2.5, 'label_smoothing': 0.1683797151233103, 'learning_rate': 0.0001528471392864525, 'use_focal_loss': False} => F1=0.6071
 23%|██▎       | 7/30 [3:41:48<13:39:00, 2136.56s/trial, best loss: -0.8897480309880822]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.1199,0.107524,0.877162,0.873439
2,0.0669,0.118573,0.891459,0.889639


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 1.5, 'label_smoothing': 0.165068225725836, 'learning_rate': 2.084358534961521e-05, 'use_focal_loss': True} => F1=0.8915
 27%|██▋       | 8/30 [3:54:42<10:24:20, 1702.75s/trial, best loss: -0.8914590908738702]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.152,0.150061,0.607102,0.723253
2,0.1494,0.150258,0.607102,0.723253


[Hyperopt] params={'batch_size': 4, 'epochs': 2, 'gamma': 2.0, 'label_smoothing': 0.1228314311873115, 'learning_rate': 0.00022976060375887023, 'use_focal_loss': True} => F1=0.6071
 30%|███       | 9/30 [4:26:56<10:21:15, 1775.02s/trial, best loss: -0.8914590908738702]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6251,0.624345,0.607102,0.723253
2,0.6208,0.618916,0.607102,0.723253


[Hyperopt] params={'batch_size': 8, 'epochs': 2, 'gamma': 2.5, 'label_smoothing': 0.08792093552471075, 'learning_rate': 0.00014921331904601946, 'use_focal_loss': False} => F1=0.6071
 33%|███▎      | 10/30 [4:44:48<8:39:21, 1558.07s/trial, best loss: -0.8914590908738702]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0457,0.035875,0.887565,0.885758
2,0.0284,0.050691,0.892011,0.889639


[Hyperopt] params={'batch_size': 8, 'epochs': 2, 'gamma': 3.0, 'label_smoothing': 0.05736481208349751, 'learning_rate': 1.0495183700025125e-05, 'use_focal_loss': True} => F1=0.8920
 37%|███▋      | 11/30 [5:02:36<7:25:53, 1408.08s/trial, best loss: -0.892011039832924] 

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.2141,0.211329,0.607102,0.723253
2,0.2106,0.211273,0.607102,0.723253
3,0.2102,0.211493,0.607102,0.723253


[Hyperopt] params={'batch_size': 4, 'epochs': 3, 'gamma': 1.5, 'label_smoothing': 0.18935308979340293, 'learning_rate': 0.00022060936202683698, 'use_focal_loss': True} => F1=0.6071
 40%|████      | 12/30 [5:51:50<9:23:28, 1878.23s/trial, best loss: -0.892011039832924]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6139,0.613915,0.607102,0.723253
2,0.6114,0.615828,0.607102,0.723253


[Hyperopt] params={'batch_size': 4, 'epochs': 2, 'gamma': 5.0, 'label_smoothing': 0.05654789517538768, 'learning_rate': 8.412609132075768e-05, 'use_focal_loss': False} => F1=0.6071
 43%|████▎     | 13/30 [6:25:46<9:05:43, 1926.07s/trial, best loss: -0.892011039832924]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0193,0.019059,0.607102,0.723253
2,0.0192,0.019037,0.607102,0.723253
3,0.0185,0.018315,0.682661,0.733547


[Hyperopt] params={'batch_size': 16, 'epochs': 3, 'gamma': 5.0, 'label_smoothing': 0.18398608845659972, 'learning_rate': 0.00012816468457413227, 'use_focal_loss': True} => F1=0.6827
 47%|████▋     | 14/30 [6:45:51<7:35:31, 1708.22s/trial, best loss: -0.892011039832924]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0313,0.02795,0.876324,0.872427
2,0.0187,0.027338,0.890845,0.888964
3,0.0107,0.054854,0.88437,0.881708
4,0.0051,0.087415,0.891595,0.890145


[Hyperopt] params={'batch_size': 16, 'epochs': 4, 'gamma': 3.5, 'label_smoothing': 0.14481766643999597, 'learning_rate': 1.5629540193973018e-05, 'use_focal_loss': True} => F1=0.8916
 50%|█████     | 15/30 [7:12:51<7:00:24, 1681.63s/trial, best loss: -0.892011039832924]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.2126,0.211189,0.607102,0.723253
2,0.2105,0.21152,0.607102,0.723253


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 1.5, 'label_smoothing': 0.09001359678431926, 'learning_rate': 0.0002415762344714122, 'use_focal_loss': True} => F1=0.6071
 53%|█████▎    | 16/30 [7:26:40<5:32:28, 1424.90s/trial, best loss: -0.892011039832924]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.603,0.598805,0.607102,0.723253
2,0.5991,0.599181,0.607102,0.723253
3,0.5979,0.600134,0.607102,0.723253
4,0.5971,0.600049,0.607102,0.723253


[Hyperopt] params={'batch_size': 4, 'epochs': 4, 'gamma': 3.0, 'label_smoothing': 0.02124180043594457, 'learning_rate': 0.00024079132520729045, 'use_focal_loss': False} => F1=0.6071
 57%|█████▋    | 17/30 [8:35:30<8:05:00, 2238.47s/trial, best loss: -0.892011039832924]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6062,0.601562,0.607102,0.723253
2,0.6017,0.601579,0.607102,0.723253
3,0.6001,0.604901,0.607102,0.723253


[Hyperopt] params={'batch_size': 4, 'epochs': 3, 'gamma': 1.0, 'label_smoothing': 0.028101331397877828, 'learning_rate': 0.00031678710574485977, 'use_focal_loss': False} => F1=0.6071
 60%|██████    | 18/30 [9:26:45<8:17:56, 2489.73s/trial, best loss: -0.892011039832924]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6579,0.656417,0.607102,0.723253
2,0.656,0.656364,0.607102,0.723253
3,0.6554,0.656937,0.607102,0.723253


[Hyperopt] params={'batch_size': 4, 'epochs': 3, 'gamma': 4.5, 'label_smoothing': 0.19806543660787856, 'learning_rate': 9.296619262909997e-05, 'use_focal_loss': False} => F1=0.6071
 63%|██████▎   | 19/30 [10:17:50<8:08:07, 2662.50s/trial, best loss: -0.892011039832924]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6121,0.623946,0.81324,0.810834
2,0.5768,0.591409,0.822775,0.81674
3,0.5369,0.560643,0.857246,0.856564


[Hyperopt] params={'batch_size': 4, 'epochs': 3, 'gamma': 1.5, 'label_smoothing': 0.15589210799291897, 'learning_rate': 3.4469853421405605e-05, 'use_focal_loss': False} => F1=0.8572
 67%|██████▋   | 20/30 [11:08:03<7:41:17, 2767.79s/trial, best loss: -0.892011039832924]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0341,0.027983,0.870079,0.866352
2,0.0201,0.02666,0.8826,0.880695
3,0.0097,0.06212,0.882917,0.88137
4,0.0031,0.095838,0.885691,0.88407


[Hyperopt] params={'batch_size': 8, 'epochs': 4, 'gamma': 3.5, 'label_smoothing': 0.06652181199212014, 'learning_rate': 3.999504592504038e-05, 'use_focal_loss': True} => F1=0.8857
 70%|███████   | 21/30 [11:46:47<6:35:12, 2634.68s/trial, best loss: -0.892011039832924]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0231,0.019342,0.879236,0.875802
2,0.0147,0.020928,0.878882,0.874958
3,0.0101,0.029247,0.880384,0.877152
4,0.0066,0.042514,0.88584,0.883564


[Hyperopt] params={'batch_size': 16, 'epochs': 4, 'gamma': 4.0, 'label_smoothing': 0.051267275213648905, 'learning_rate': 1.0398989510547704e-05, 'use_focal_loss': True} => F1=0.8858
 73%|███████▎  | 22/30 [12:13:14<5:09:21, 2320.23s/trial, best loss: -0.892011039832924]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0478,0.038136,0.88286,0.881033
2,0.0274,0.042338,0.887868,0.886939
3,0.0132,0.083416,0.888895,0.887445
4,0.0047,0.135628,0.888505,0.886601


[Hyperopt] params={'batch_size': 8, 'epochs': 4, 'gamma': 3.0, 'label_smoothing': 0.07708953726284096, 'learning_rate': 2.9729599618629024e-05, 'use_focal_loss': True} => F1=0.8889
 77%|███████▋  | 23/30 [12:52:12<4:31:18, 2325.44s/trial, best loss: -0.892011039832924]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0227,0.018936,0.888721,0.887108
2,0.0121,0.020919,0.892705,0.890989


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 4.0, 'label_smoothing': 0.1414757766530963, 'learning_rate': 4.798455916219888e-05, 'use_focal_loss': True} => F1=0.8927
 80%|████████  | 24/30 [13:05:47<3:07:14, 1872.33s/trial, best loss: -0.8927049346409967]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0383,0.037953,0.607102,0.723253
2,0.038,0.038139,0.607102,0.723253


[Hyperopt] params={'batch_size': 8, 'epochs': 2, 'gamma': 4.0, 'label_smoothing': 0.10390261914513857, 'learning_rate': 5.3424417668195164e-05, 'use_focal_loss': True} => F1=0.6071
 83%|████████▎ | 25/30 [13:25:33<2:18:51, 1666.35s/trial, best loss: -0.8927049346409967]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0221,0.018048,0.885784,0.883226
2,0.0123,0.023999,0.889322,0.887108


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 4.0, 'label_smoothing': 0.03554493012139208, 'learning_rate': 2.414437389287209e-05, 'use_focal_loss': True} => F1=0.8893
 87%|████████▋ | 26/30 [13:39:25<1:34:24, 1416.05s/trial, best loss: -0.8927049346409967]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0445,0.038195,0.878527,0.874958
2,0.0219,0.043816,0.889242,0.887108


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 3.0, 'label_smoothing': 0.04145571816519898, 'learning_rate': 5.279633310022454e-05, 'use_focal_loss': True} => F1=0.8892
 90%|█████████ | 27/30 [13:54:00<1:02:41, 1253.73s/trial, best loss: -0.8927049346409967]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0225,0.018853,0.877801,0.874452
2,0.0138,0.023372,0.887859,0.88542


[Hyperopt] params={'batch_size': 8, 'epochs': 2, 'gamma': 4.0, 'label_smoothing': 0.07214507156523497, 'learning_rate': 1.0302550249527089e-05, 'use_focal_loss': True} => F1=0.8879
 93%|█████████▎| 28/30 [14:12:34<40:23, 1211.76s/trial, best loss: -0.8927049346409967]  

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.1511,0.150067,0.607102,0.723253
2,0.1494,0.150219,0.607102,0.723253


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 2.0, 'label_smoothing': 0.12099940322990729, 'learning_rate': 0.0004715680943388656, 'use_focal_loss': True} => F1=0.6071
 97%|█████████▋| 29/30 [14:27:38<18:39, 1119.60s/trial, best loss: -0.8927049346409967]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0122,0.009392,0.885956,0.884914
2,0.0065,0.011801,0.882968,0.880695


[Hyperopt] params={'batch_size': 8, 'epochs': 2, 'gamma': 5.0, 'label_smoothing': 0.13984736905248035, 'learning_rate': 4.293097221060447e-05, 'use_focal_loss': True} => F1=0.8860
100%|██████████| 30/30 [14:45:26<00:00, 1770.89s/trial, best loss: -0.8927049346409967]

Hyperopt best param indices: {'batch_size': np.int64(2), 'epochs': np.int64(0), 'gamma': np.float64(4.0), 'label_smoothing': np.float64(0.1414757766530963), 'learning_rate': np.float64(4.798455916219888e-05), 'use_focal_loss': np.int64(1)}


In [9]:
# ----------------------------------------------------------------------------
#  7.1 Interpret best param indices from Hyperopt
# ----------------------------------------------------------------------------

# Candidate values for the categorical dimensions. For these keys `best` holds
# a position into the list rather than the value itself.
# NOTE(review): the ordering must mirror the search-space definition, which
# lives in another cell — confirm whenever the space changes.
epochs_options = [2, 3, 4]
batch_options  = [4, 8, 16]
use_focal_options = [False, True]

# Keys whose stored value is an index, paired with the list it indexes into.
choice_tables = {
    "epochs":         epochs_options,
    "batch_size":     batch_options,
    "use_focal_loss": use_focal_options,
}

# Walk the keys in a fixed order so the resulting dict (and its printout)
# keeps a stable layout; every key without a lookup table is copied as-is.
final_params = {}
for param_name in (
    "learning_rate",
    "epochs",
    "batch_size",
    "use_focal_loss",
    "gamma",
    "label_smoothing",
):
    raw_value = best[param_name]
    if param_name in choice_tables:
        final_params[param_name] = choice_tables[param_name][raw_value]
    else:
        final_params[param_name] = raw_value

print("Interpreted best hyperparams:\n", final_params)


Interpreted best hyperparams:
 {'learning_rate': np.float64(4.798455916219888e-05), 'epochs': 2, 'batch_size': 16, 'use_focal_loss': True, 'gamma': np.float64(4.0), 'label_smoothing': np.float64(0.1414757766530963)}


In [10]:
# ----------------------------------------------------------------------------
#  8. TRAIN A FINAL MODEL USING THE BEST HYPERPARAMS
# ----------------------------------------------------------------------------

# Build a new model wrapper configured with the loss settings selected above.
best_model = CustomDebertaModel(
    model_name=model_name,
    num_labels=2,
    use_focal_loss=final_params["use_focal_loss"],
    gamma=final_params["gamma"],
    label_smoothing=final_params["label_smoothing"]
)
best_model.to(device)

# Evaluate, checkpoint and log once per epoch. With load_best_model_at_end the
# trainer restores the checkpoint with the highest dev F1 when training ends.
# `logging_steps` is intentionally not set: it only applies to the "steps"
# logging strategy and is ignored when logging per epoch.
training_args = TrainingArguments(
    output_dir="./final-sota-model",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=final_params["learning_rate"],
    num_train_epochs=final_params["epochs"],
    per_device_train_batch_size=final_params["batch_size"],
    per_device_eval_batch_size=8,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=1,
    report_to="none",
)

# `processing_class` replaces the deprecated `tokenizer` keyword of Trainer,
# which emits a FutureWarning and is scheduled for removal.
trainer = Trainer(
    model=best_model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["dev"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
results_dev = trainer.evaluate(encoded_dataset["dev"])
print("Final Dev Results:", results_dev)

# Classification report: per-class precision/recall/F1 on the dev split,
# computed from the argmax over the model's output scores.
preds_output = trainer.predict(encoded_dataset["dev"])
dev_preds = np.argmax(preds_output.predictions, axis=1)
dev_labels = preds_output.label_ids
print("\nDetailed Classification Report (Dev):")
print(classification_report(dev_labels, dev_preds, digits=4))


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0247,0.019016,0.878786,0.875802
2,0.0127,0.020458,0.890355,0.888458


Final Dev Results: {'eval_loss': 0.02045840583741665, 'eval_f1': 0.8903553648924198, 'eval_accuracy': 0.8884576442794465, 'eval_runtime': 26.1667, 'eval_samples_per_second': 226.471, 'eval_steps_per_second': 28.318, 'epoch': 2.0}

Detailed Classification Report (Dev):
              precision    recall  f1-score   support

           0     0.9439    0.8992    0.9210      4286
           1     0.7656    0.8604    0.8102      1640

    accuracy                         0.8885      5926
   macro avg     0.8548    0.8798    0.8656      5926
weighted avg     0.8946    0.8885    0.8904      5926



In [16]:
# ----------------------------------------------------------------------------
# 8.1 SAVE THE BEST MODEL
# ----------------------------------------------------------------------------

# torch.save does not create missing folders, so make sure the checkpoint's
# parent directory exists first (skipped when the path has no directory part).
checkpoint_dir = os.path.dirname(BEST_MODEL_PATH)
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)

# 1) Plain-dict copy of the wrapped Hugging Face model's config
hf_config_dict = trainer.model.model.config.to_dict()

# 2) Store everything in one dictionary
save_dict = {
    # Weights are copied to CPU so the checkpoint can be loaded on a machine
    # without the training device, without needing `map_location`.
    "model_state_dict": {
        name: tensor.cpu()
        for name, tensor in trainer.model.state_dict().items()
    },

    # The base Hugging Face model config
    "hf_config": hf_config_dict,

    # Custom hyperparameters for re-initializing the custom wrapper
    "hyperparams": {
        "model_name": model_name,
        "num_labels": trainer.model.num_labels,
        "use_focal_loss": trainer.model.use_focal_loss,
        "gamma": float(trainer.model.gamma),
        "label_smoothing": float(trainer.model.label_smoothing)
    }
}

torch.save(save_dict, BEST_MODEL_PATH)
print(f"Model checkpoint saved to {BEST_MODEL_PATH}")

Model checkpoint saved to data\taskC\best_deberta_model.pt


In [14]:
# ----------------------------------------------------------------------------
# 9.1 INFERENCE ON THE DEV SET (codebench debugging)
# ----------------------------------------------------------------------------

from torch.utils.data import DataLoader

# Destination CSV for the dev-set predictions.
DEV_OUTPUT_PATH = "dev_predictions.csv"

# Batch the dev split and put the model into evaluation mode.
dev_loader = DataLoader(encoded_dataset["dev"], batch_size=8)
trainer.model.eval()

# Gather one predicted class index per dev example; gradient tracking is
# switched off for the forward passes.
all_dev_preds = []
with torch.no_grad():
    for dev_batch in dev_loader:
        # Move every tensor of the batch onto the model's device.
        model_inputs = {name: tensor.to(device) for name, tensor in dev_batch.items()}
        batch_logits = trainer.model(**model_inputs)["logits"]
        # Highest-scoring class per row, brought back to plain Python ints.
        all_dev_preds += batch_logits.argmax(dim=1).cpu().tolist()

# Single-column CSV, one row per dev example, in dataset order.
dev_pred_df = pd.DataFrame({"prediction": all_dev_preds})
dev_pred_df.to_csv(DEV_OUTPUT_PATH, index=False)
print(f"Dev predictions saved to {DEV_OUTPUT_PATH}")




Dev predictions saved to dev_predictions.csv


In [15]:
# ----------------------------------------------------------------------------
# 9.2 INFERENCE ON THE TEST SET 
# ----------------------------------------------------------------------------

from torch.utils.data import DataLoader

# Batch the test split and put the model into evaluation mode.
test_loader = DataLoader(encoded_dataset["test"], batch_size=8)
trainer.model.eval()

# Accumulate the predicted class index of every test example, with gradient
# tracking disabled during the forward passes.
all_test_preds = []
with torch.no_grad():
    for test_batch in test_loader:
        # Every tensor in the batch is moved to the model's device first.
        on_device = {key: value.to(device) for key, value in test_batch.items()}
        predicted = trainer.model(**on_device)["logits"].argmax(dim=1)
        all_test_preds.extend(predicted.cpu().tolist())

# Write the predictions as a single-column CSV in dataset order.
test_pred_df = pd.DataFrame({"prediction": all_test_preds})
test_pred_df.to_csv(OUTPUT_PATH, index=False)
print(f"Test predictions saved to {OUTPUT_PATH}")

Test predictions saved to data\taskC\predictions.csv
