In [3]:
import os
import numpy as np
import pandas as pd
import torch
import random
import nltk

from tqdm import tqdm
from nltk.corpus import wordnet
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import f1_score, accuracy_score, classification_report

# Download necessary NLTK data (WordNet for synonyms)
nltk_data_dir = "/kaggle/working/nltk_data"
nltk.download("wordnet", download_dir=nltk_data_dir)
nltk.download("omw-1.4", download_dir=nltk_data_dir)
nltk.data.path.append(nltk_data_dir)

# ----------------------------------------------------------------------------
#  1.1 Check GPU availability
# ----------------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Using device: cuda


In [4]:
#!unzip /kaggle/working/nltk_data/corpora/omw-1.4.zip -d /kaggle/working/nltk_data/corpora/
#!unzip /kaggle/working/nltk_data/corpora/wordnet.zip -d /kaggle/working/nltk_data/corpora/

In [None]:
# ----------------------------------------------------------------------------
#  2. LOAD YOUR DATA
#     Adjust these file paths to your environment. 
#     The CSVs must contain columns:
#       train.csv: claim, evidence, label
#       dev.csv:   claim, evidence, label
#       test.csv:  claim, evidence, (no label)
# ----------------------------------------------------------------------------

TRAIN_PATH = "/kaggle/input/nlu-ed-task/train.csv"
DEV_PATH   = "/kaggle/input/nlu-ed-task/dev.csv"
TEST_PATH  = "/kaggle/input/nlu-ed-task/train.csv"

train_df = pd.read_csv(TRAIN_PATH)
dev_df   = pd.read_csv(DEV_PATH)
test_df  = pd.read_csv(TEST_PATH)

train_df.rename(columns={"Claim": "claim", "Evidence": "evidence"}, inplace=True)
dev_df.rename(columns={"Claim": "claim", "Evidence": "evidence"}, inplace=True)
test_df.rename(columns={"Claim": "claim", "Evidence": "evidence"}, inplace=True)

print("Train samples:", len(train_df))
print("Dev samples:", len(dev_df))
print("Test samples:", len(test_df))

# If labels are strings, map them to integer {0,1} or {0,1,2,...}.
# For ED, assume 2 classes: 0 = not evidence, 1 = relevant evidence
train_df["label"] = train_df["label"].astype(int)
dev_df["label"]   = dev_df["label"].astype(int)

train_df.head(3)


Train samples: 21508
Dev samples: 5926
Test samples: 21508


Unnamed: 0,claim,evidence,label
0,We should introduce school vouchers,"Among the many educational reform efforts, suc...",0
1,We should legalize insider trading,The U.S. Securities and Exchange Commission wa...,0
2,We should subsidize investigative journalism,"The film won an Emmy Award (1980), George Polk...",0


In [6]:
# ----------------------------------------------------------------------------
#  3. (OPTIONAL) DATA AUGMENTATION (Synonym Replacement)
#      - We'll replace 1 random word in claim/evidence with a WordNet synonym
#      - For demonstration, there's a 15% chance per example to create an
#        augmented copy.
#      - This can help if your data is small or you want more variety.
# ----------------------------------------------------------------------------

def synonym_replacement(sentence, n=1):
    """
    Replace 'n' words in 'sentence' with synonyms from WordNet, if possible.
    """
    words = sentence.split()
    if len(words) < 2:
        return sentence

    indices_to_replace = random.sample(range(len(words)), k=min(n, len(words)))
    new_words = words[:]
    for i in indices_to_replace:
        word = words[i]
        syns = wordnet.synsets(word)
        if not syns:
            continue
        # For simplicity, pick from the first synset's lemmas
        lemmas = syns[0].lemma_names()
        # Filter out lemmas that are the same as the original
        lemmas = [l for l in lemmas if l.lower() != word.lower()]
        if len(lemmas) == 0:
            continue
        new_words[i] = random.choice(lemmas)
    return " ".join(new_words)


def augment_dataframe(df, alpha=0.15):
    """
    For each row, with probability alpha, create an augmented copy.
    Return a new DataFrame with both original and augmented samples.
    """
    augmented_rows = []
    for _, row in df.iterrows():
        # Original row
        augmented_rows.append(row.to_dict())
        
        if random.random() < alpha:
            new_row = row.copy()
            # Randomly augment claim or evidence
            if random.random() < 0.5:
                new_row["claim"] = synonym_replacement(row["claim"], n=1)
            else:
                new_row["evidence"] = synonym_replacement(row["evidence"], n=1)
            augmented_rows.append(new_row.to_dict())
    return pd.DataFrame(augmented_rows)

# Let's do a random seed for reproducibility
random.seed(42)

# AUGMENT the training set (remove if undesired)
augmented_train_df = augment_dataframe(train_df, alpha=0.15)
print("Original train size:", len(train_df), 
      " => After augmentation:", len(augmented_train_df))

train_df = augmented_train_df.reset_index(drop=True)


Original train size: 21508  => After augmentation: 24819


In [7]:
# ----------------------------------------------------------------------------
#  4. CREATE HUGGING FACE DATASETS
# ----------------------------------------------------------------------------

train_dataset = Dataset.from_pandas(train_df)
dev_dataset   = Dataset.from_pandas(dev_df)
test_dataset  = Dataset.from_pandas(test_df)

dataset_dict = DatasetDict({
    "train": train_dataset,
    "dev":   dev_dataset,
    "test":  test_dataset
})
dataset_dict


DatasetDict({
    train: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 24819
    })
    dev: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 5926
    })
    test: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 21508
    })
})

In [8]:
# ----------------------------------------------------------------------------
#  5. TOKENIZATION
#     We'll use a powerful model: DeBERTa v3 (microsoft/deberta-v3-base)
#     which is known to outperform standard BERT on many tasks.
# ----------------------------------------------------------------------------

model_name = "microsoft/deberta-v3-base"

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(
        examples["claim"],
        examples["evidence"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

encoded_dataset = dataset_dict.map(tokenize_function, batched=True)


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/24819 [00:00<?, ? examples/s]

Map:   0%|          | 0/5926 [00:00<?, ? examples/s]

Map:   0%|          | 0/21508 [00:00<?, ? examples/s]

In [9]:
encoded_dataset["train"] = encoded_dataset["train"].rename_column("label", "labels")
encoded_dataset["dev"]   = encoded_dataset["dev"].rename_column("label", "labels")
# test split doesn't have label, so skip rename

# Now you can safely remove columns or set the format
encoded_dataset["train"] = encoded_dataset["train"].remove_columns(["claim", "evidence"])
encoded_dataset["dev"]   = encoded_dataset["dev"].remove_columns(["claim", "evidence"])

# If your test set still has "claim" & "evidence", remove them too:
encoded_dataset["test"]  = encoded_dataset["test"].remove_columns(["claim", "evidence"])

# Then set format for PyTorch
encoded_dataset["train"].set_format("torch")
encoded_dataset["dev"].set_format("torch")
encoded_dataset["test"].set_format("torch")


In [10]:
# ----------------------------------------------------------------------------
#  6. CUSTOM MODEL: Focal Loss or Label Smoothing
#     We'll override forward() to allow advanced loss functions.
# ----------------------------------------------------------------------------

import torch.nn as nn

class CustomDebertaModel(nn.Module):
    def __init__(self, model_name, num_labels=2, use_focal_loss=False, gamma=2.0, label_smoothing=0.0):
        super().__init__()
        self.num_labels = num_labels
        self.use_focal_loss = use_focal_loss
        self.gamma = gamma
        self.label_smoothing = label_smoothing
        
        # Load the pre-trained DeBERTa classification model
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        # DeBERTa forward pass (omit internal CE)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=None,
            **kwargs
        )
        logits = outputs.logits  # shape: (batch_size, num_labels)
        
        loss = None
        if labels is not None:
            if self.use_focal_loss:
                loss = self.focal_loss(logits, labels, self.gamma)
            else:
                loss = self.label_smoothing_loss(logits, labels, self.label_smoothing)
        
        return {"loss": loss, "logits": logits}

    def focal_loss(self, logits, targets, gamma=2.0):
        # Focal Loss
        ce = nn.CrossEntropyLoss(reduction='none')(logits, targets)
        pt = torch.exp(-ce)
        focal = (1 - pt)**gamma * ce
        return focal.mean()

    def label_smoothing_loss(self, logits, targets, smoothing=0.0):
        if smoothing == 0.0:
            return nn.CrossEntropyLoss()(logits, targets)

        log_probs = nn.LogSoftmax(dim=-1)(logits)
        n_class = logits.size(1)
        with torch.no_grad():
            true_dist = torch.zeros_like(log_probs)
            true_dist.fill_(smoothing / (n_class - 1))
            true_dist.scatter_(1, targets.unsqueeze(1), 1.0 - smoothing)
        return torch.mean(torch.sum(-true_dist * log_probs, dim=1))


In [11]:
# ----------------------------------------------------------------------------
#  7. HYPERPARAMETER SEARCH WITH HYPEROPT
#     We'll define:
#       - learning_rate
#       - epochs
#       - batch_size
#       - use_focal_loss
#       - gamma (for focal loss)
#       - label_smoothing
# ----------------------------------------------------------------------------

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"f1": f1, "accuracy": acc}


def objective(space):
    """
    Hyperopt Objective:
      1) Create a CustomDebertaModel with the candidate hyperparams
      2) Train on train_dataset, evaluate on dev_dataset
      3) Return negative F1 (since Hyperopt minimizes)
    """
    learning_rate = space["learning_rate"]
    epochs        = int(space["epochs"])
    batch_size    = int(space["batch_size"])
    use_focal_loss = space["use_focal_loss"]
    gamma          = space["gamma"]
    label_smoothing = space["label_smoothing"]

    # Build the model
    model = CustomDebertaModel(
        model_name=model_name,
        num_labels=2,
        use_focal_loss=use_focal_loss,
        gamma=gamma,
        label_smoothing=label_smoothing
    )
    model.to(device)

    training_args = TrainingArguments(
        output_dir="./sota-ed-checkpoints",
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=learning_rate,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=8,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        save_total_limit=1,
        report_to="none",  # Turn off W&B or any other tracking
        logging_steps=1
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["dev"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    metrics = trainer.evaluate(encoded_dataset["dev"])
    f1 = metrics["eval_f1"]

    print(f"[Hyperopt] params={space} => F1={f1:.4f}")
    return {"loss": -f1, "status": STATUS_OK}


# Define search space
search_space = {
    "learning_rate":   hp.loguniform("learning_rate", np.log(1e-5), np.log(5e-4)),
    "epochs":          hp.choice("epochs", [2, 3, 4]),
    "batch_size":      hp.choice("batch_size", [4, 8, 16]),
    "use_focal_loss":  hp.choice("use_focal_loss", [False, True]),
    "gamma":           hp.quniform("gamma", 1.0, 5.0, 0.5),    # relevant if focal_loss=True
    "label_smoothing": hp.uniform("label_smoothing", 0.0, 0.2)
}

max_evals = 10  # For demonstration; increase for better search
trials = Trials()

best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=max_evals,
    trials=trials
)

print("\nHyperopt best param indices:", best)


  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.6267,0.626217,0.607102,0.723253
2,0.6257,0.626089,0.607102,0.723253
3,0.6248,0.628621,0.607102,0.723253


[Hyperopt] params={'batch_size': 4, 'epochs': 3, 'gamma': 4.5, 'label_smoothing': 0.09456020674407445, 'learning_rate': 4.650364054074947e-05, 'use_focal_loss': False} => F1=0.6071
 10%|█         | 1/10 [36:55<5:32:19, 2215.53s/trial, best loss: -0.607102296650199]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.3381,0.304784,0.877063,0.873102
2,0.188,0.325829,0.892632,0.890651


[Hyperopt] params={'batch_size': 16, 'epochs': 2, 'gamma': 4.5, 'label_smoothing': 0.00011108738704290744, 'learning_rate': 2.379886141068789e-05, 'use_focal_loss': False} => F1=0.8926
 20%|██        | 2/10 [52:39<3:15:41, 1467.73s/trial, best loss: -0.8926320009887566]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.3895,0.368823,0.878019,0.87462
2,0.2814,0.406623,0.889338,0.886939


[Hyperopt] params={'batch_size': 8, 'epochs': 2, 'gamma': 1.5, 'label_smoothing': 0.025140874030732153, 'learning_rate': 1.257211962841882e-05, 'use_focal_loss': False} => F1=0.8893
 30%|███       | 3/10 [1:11:40<2:33:47, 1318.27s/trial, best loss: -0.8926320009887566]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0387,0.03797,0.607102,0.723253
2,0.038,0.03801,0.607102,0.723253
3,0.0378,0.037949,0.607102,0.723253


[Hyperopt] params={'batch_size': 4, 'epochs': 3, 'gamma': 4.0, 'label_smoothing': 0.14953429278875716, 'learning_rate': 0.0002599290684811281, 'use_focal_loss': True} => F1=0.6071
 40%|████      | 4/10 [1:48:43<2:47:33, 1675.57s/trial, best loss: -0.8926320009887566]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.2984,0.29707,0.607102,0.723253
2,0.2963,0.296987,0.607102,0.723253
3,0.2933,0.27547,0.707435,0.73861


[Hyperopt] params={'batch_size': 8, 'epochs': 3, 'gamma': 1.0, 'label_smoothing': 0.005405331684439041, 'learning_rate': 0.00010436157084074034, 'use_focal_loss': True} => F1=0.7074
 50%|█████     | 5/10 [2:16:51<2:20:00, 1680.15s/trial, best loss: -0.8926320009887566]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.2976,0.297114,0.607102,0.723253
2,0.296,0.298429,0.607102,0.723253


[Hyperopt] params={'batch_size': 8, 'epochs': 2, 'gamma': 1.0, 'label_smoothing': 0.042321244377780225, 'learning_rate': 8.501860704557743e-05, 'use_focal_loss': True} => F1=0.6071
 60%|██████    | 6/10 [2:35:50<1:39:43, 1495.95s/trial, best loss: -0.8926320009887566]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0525,0.049547,0.851188,0.845764
2,0.0326,0.044236,0.877285,0.873945
3,0.0184,0.063105,0.882228,0.879683


[Hyperopt] params={'batch_size': 8, 'epochs': 3, 'gamma': 3.0, 'label_smoothing': 0.06689669298932018, 'learning_rate': 2.681460711622723e-05, 'use_focal_loss': True} => F1=0.8822
 70%|███████   | 7/10 [3:03:57<1:17:55, 1558.44s/trial, best loss: -0.8926320009887566]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.0543,0.053571,0.607102,0.723253
2,0.0534,0.053597,0.607102,0.723253


[Hyperopt] params={'batch_size': 8, 'epochs': 2, 'gamma': 3.5, 'label_smoothing': 0.09892829430991, 'learning_rate': 0.00026964269961960493, 'use_focal_loss': True} => F1=0.6071
 80%|████████  | 8/10 [3:22:52<47:27, 1423.77s/trial, best loss: -0.8926320009887566]  

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.108,0.106574,0.607102,0.723253
2,0.1064,0.106598,0.607102,0.723253
3,0.1061,0.106527,0.607102,0.723253


[Hyperopt] params={'batch_size': 4, 'epochs': 3, 'gamma': 2.5, 'label_smoothing': 0.11402312832557485, 'learning_rate': 0.0002704088096216985, 'use_focal_loss': True} => F1=0.6071
 90%|█████████ | 9/10 [3:59:55<27:53, 1673.53s/trial, best loss: -0.8926320009887566]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.4221,0.398398,0.86438,0.864664
2,0.2886,0.441032,0.865231,0.860952
3,0.1939,0.482011,0.875046,0.872427


[Hyperopt] params={'batch_size': 8, 'epochs': 3, 'gamma': 2.0, 'label_smoothing': 0.022622711387814333, 'learning_rate': 5.689927283324451e-05, 'use_focal_loss': False} => F1=0.8750
100%|██████████| 10/10 [4:28:07<00:00, 1608.72s/trial, best loss: -0.8926320009887566]

Hyperopt best param indices: {'batch_size': 2, 'epochs': 0, 'gamma': 4.5, 'label_smoothing': 0.00011108738704290744, 'learning_rate': 2.379886141068789e-05, 'use_focal_loss': 0}


In [20]:
# ----------------------------------------------------------------------------
#  7.1 Interpret best param indices from Hyperopt
# ----------------------------------------------------------------------------

epochs_options = [2, 3, 4]
batch_options  = [4, 8, 16]
use_focal_options = [False, True]

final_params = {
    "learning_rate":    best["learning_rate"],
    "epochs":           epochs_options[best["epochs"]],
    "batch_size":       batch_options[ best["batch_size"] ],
    "use_focal_loss":   use_focal_options[ best["use_focal_loss"] ],
    "gamma":            best["gamma"],
    "label_smoothing":  best["label_smoothing"]
}

print("Interpreted best hyperparams:\n", final_params)


Interpreted best hyperparams:
 {'learning_rate': 2.379886141068789e-05, 'epochs': 2, 'batch_size': 16, 'use_focal_loss': False, 'gamma': 4.5, 'label_smoothing': 0.00011108738704290744}


In [None]:
# ----------------------------------------------------------------------------
#  8. TRAIN A FINAL MODEL USING THE BEST HYPERPARAMS
# ----------------------------------------------------------------------------

best_model = CustomDebertaModel(
    model_name=model_name,
    num_labels=2,
    use_focal_loss=final_params["use_focal_loss"],
    gamma=final_params["gamma"],
    label_smoothing=final_params["label_smoothing"]
)
best_model.to(device)

training_args = TrainingArguments(
    output_dir="./final-sota-model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=final_params["learning_rate"],
    num_train_epochs=final_params["epochs"],
    per_device_train_batch_size=final_params["batch_size"],
    per_device_eval_batch_size=8,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=1,
    report_to="none",  # Turn off W&B or any other tracking
    logging_steps=1
)

trainer = Trainer(
    model=best_model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["dev"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
results_dev = trainer.evaluate(encoded_dataset["dev"])
print("Final Dev Results:", results_dev)

# Optional: classification report
preds_output = trainer.predict(encoded_dataset["dev"])
dev_preds = np.argmax(preds_output.predictions, axis=1)
dev_labels = preds_output.label_ids
print("\nDetailed Classification Report (Dev):")
print(classification_report(dev_labels, dev_preds, digits=4))


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.3318,0.314324,0.871924,0.867195
2,0.1818,0.34678,0.889496,0.887108


Final Dev Results: {'eval_loss': 0.3467804491519928, 'eval_f1': 0.889495550249558, 'eval_accuracy': 0.8871076611542356, 'eval_runtime': 34.26, 'eval_samples_per_second': 172.972, 'eval_steps_per_second': 21.629, 'epoch': 2.0}

Detailed Classification Report (Dev):
              precision    recall  f1-score   support

           0     0.9493    0.8915    0.9195      4286
           1     0.7554    0.8756    0.8111      1640

    accuracy                         0.8871      5926
   macro avg     0.8524    0.8836    0.8653      5926
weighted avg     0.8956    0.8871    0.8895      5926



In [14]:
# ----------------------------------------------------------------------------
#  9. INFERENCE ON TEST SET
# ----------------------------------------------------------------------------

test_predictions = trainer.predict(encoded_dataset["test"])
test_preds = np.argmax(test_predictions.predictions, axis=1)

# Add predictions to the test_df
test_df["label"] = test_preds
test_df.head()


Unnamed: 0,claim,evidence,label
0,We should introduce school vouchers,"Among the many educational reform efforts, suc...",0
1,We should legalize insider trading,The U.S. Securities and Exchange Commission wa...,0
2,We should subsidize investigative journalism,"The film won an Emmy Award (1980), George Polk...",0
3,We should further exploit nuclear power,a 2001 survey by the European Commission found...,1
4,We should ban whaling,The US and several other nations are whaling u...,0


In [15]:
# ----------------------------------------------------------------------------
#  9.1 SAVE PREDICTIONS
# ----------------------------------------------------------------------------

OUTPUT_PATH = "test_predictions.csv"
test_df.to_csv(OUTPUT_PATH, index=False)
print(f"Test predictions saved to: {OUTPUT_PATH}")


Test predictions saved to: test_predictions.csv
