# Run `run.py` in Google Colab
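Clone the repository if needed, install its dependencies, and launch the training or evaluation script with parameterized arguments. Later cells evaluate the trained model on HANS and Breaking NLI, and fine-tune it on a train/dev/test split of HANS.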

In [1]:
import os
import subprocess
import sys
from pathlib import Path

# Repository that provides run.py and requirements.txt.
repo_url = "https://github.com/Chris0lsen/fp-dataset-artifacts.git"
repo_dir = Path("fp-dataset-artifacts")

# Enter the checkout unless the kernel already sits in a directory with the
# repository's name; clone first when no checkout exists yet.
already_inside_repo = Path.cwd().name == repo_dir.name
if not already_inside_repo:
    if not repo_dir.exists():
        clone_cmd = ["git", "clone", repo_url, repo_dir.name]
        subprocess.run(clone_cmd, check=True)
    os.chdir(repo_dir)
print(f"Working directory: {Path.cwd()}")

# Install with the interpreter that backs this kernel. Kept as the final
# expression so the CompletedProcess is displayed as the cell result.
pip_cmd = [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]
subprocess.run(pip_cmd, check=True)

Working directory: /content/fp-dataset-artifacts


CompletedProcess(args=['/usr/bin/python3', '-m', 'pip', 'install', '-r', 'requirements.txt'], returncode=0)

In [2]:
# Adjust these values as needed.
# They are read by the launcher cells below, which turn them into command-line
# flags for run.py.
task = "nli"  # forwarded as --task
dataset = "snli"  # forwarded as --dataset when non-empty
do_train = True  # adds --do_train to the training launch
do_eval = False  # adds --do_eval to the training launch
output_dir = "./trained_model"  # forwarded as --output_dir; created before the run
model_id = "google/electra-small-discriminator"  # forwarded as --model when non-empty
max_length = 128  # forwarded as --max_length; None omits the flag
max_train_samples = None  # None omits --max_train_samples
max_eval_samples = None  # None omits --max_eval_samples
# Each entry becomes "--<key> <value>" on the training command line; entries
# whose value is None are skipped.
training_arg_overrides = {
    "per_device_train_batch_size": 8,
    "num_train_epochs": 3.0
}
extra_args = []  # e.g., ["--resume_from_checkpoint", "./trained_model"]
disable_wandb = True  # Set to False if you have wandb configured
keep_intermediate_checkpoints = False  # When False, disable periodic checkpoint saves
save_to_drive = True  # When True, mount Google Drive and copy artifacts into it after a run; set to False to skip
drive_mount_point = "/content/drive"  # Leave as-is unless you mount elsewhere
drive_output_dir = "/content/drive/MyDrive/fp-trained-models"  # Destination folder in Drive


In [3]:
# Mount Google Drive up front when artifacts are to be copied there.
if save_to_drive:
    try:
        # Imported lazily: the module is only needed when saving to Drive.
        from google.colab import drive as gdrive
    except ImportError as exc:
        # Re-raise with a message that names the setting responsible.
        raise RuntimeError("save_to_drive=True requires running inside Google Colab") from exc
    print(f"Mounting Google Drive at {drive_mount_point} (you may be prompted to authorize)...")
    gdrive.mount(drive_mount_point, force_remount=False)

Mounting Google Drive at /content/drive (you may be prompted to authorize)...
Mounted at /content/drive


In [None]:
import os
import shlex
import shutil
import subprocess
import sys
from pathlib import Path

# Base invocation: run.py under the kernel's interpreter.
cli_args = [sys.executable, "run.py", "--task", task, "--output_dir", output_dir]

# Boolean switches: present only when enabled.
for enabled, flag in ((do_train, "--do_train"), (do_eval, "--do_eval")):
    if enabled:
        cli_args.append(flag)

# Options forwarded unchanged whenever they are truthy.
for flag, value in (("--dataset", dataset), ("--model", model_id)):
    if value:
        cli_args.extend([flag, value])

# Numeric options: None means "leave the flag off"; anything else is stringified.
for flag, value in (
    ("--max_length", max_length),
    ("--max_train_samples", max_train_samples),
    ("--max_eval_samples", max_eval_samples),
):
    if value is not None:
        cli_args.extend([flag, str(value)])

# Copy the overrides so the config dict is left untouched, then turn off
# checkpoint saving unless the user chose a save_strategy themselves.
effective_overrides = dict(training_arg_overrides)
if not keep_intermediate_checkpoints and "save_strategy" not in effective_overrides:
    effective_overrides["save_strategy"] = "no"

for key, value in effective_overrides.items():
    if value is not None:
        cli_args.extend([f"--{key}", str(value)])

cli_args += extra_args
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Environment for the child process: a copy of the kernel's environment,
# optionally with Weights & Biases switched off.
env = os.environ.copy()
if disable_wandb:
    env["WANDB_DISABLED"] = "true"

# Mount Drive before the run when artifacts will be copied there afterwards.
if save_to_drive:
    try:
        from google.colab import drive as gdrive
    except ImportError as exc:
        raise RuntimeError("save_to_drive=True requires running inside Google Colab") from exc
    print(f"Mounting Google Drive at {drive_mount_point} (you may be prompted to authorize)...")
    gdrive.mount(drive_mount_point, force_remount=False)

# Echo the command in a shell-quoted form that can be copy-pasted.
print("Running:", " ".join(shlex.quote(str(arg)) for arg in cli_args))
# check=False so the captured output can be printed before failing below.
# Output is captured, so it only appears once run.py has exited.
result = subprocess.run(cli_args, check=False, capture_output=True, text=True, env=env)
if result.stdout:
    print("\nstdout:\n", result.stdout)
if result.stderr:
    print("\nstderr:\n", result.stderr, file=sys.stderr)
# Fail the cell only after both streams have been shown.
if result.returncode != 0:
    raise RuntimeError(f"run.py exited with status {result.returncode}")

# Copy the run's output directory into Google Drive.
if save_to_drive:
    source_dir = Path(output_dir)
    if not source_dir.is_dir():
        raise FileNotFoundError(f"Expected output directory '{source_dir}' not found")
    dest_root = Path(drive_output_dir)
    dest_root.mkdir(parents=True, exist_ok=True)
    # Keep the output directory's own name underneath the Drive folder.
    dest_dir = dest_root / source_dir.name
    print(f"Copying artifacts to {dest_dir}...")
    # dirs_exist_ok=True allows copying into a destination left by an earlier run.
    shutil.copytree(source_dir, dest_dir, dirs_exist_ok=True)
    print("Artifacts copied to Google Drive.")


In [None]:
import os
import shlex
import subprocess
import sys
from pathlib import Path

# Choose the model to evaluate: the fine-tuned checkpoint directory when it
# exists, otherwise the base model id. Reassign eval_model_path after this
# block to override the choice.
fine_tuned_dir = Path(output_dir)
if fine_tuned_dir.exists():
    eval_model_path = fine_tuned_dir
else:
    # Keep the id as the original string. Wrapping it in Path() rewrote the
    # text (e.g. "" became "."), and a Path object is always truthy, so the
    # "--model" guard below could never skip the flag.
    eval_model_path = model_id or None

# Evaluation-only invocation of run.py: --do_train is never added here.
cli_args = [
    sys.executable,
    "run.py",
    "--task",
    task,
    "--output_dir",
    output_dir,
    "--do_eval",
]

if dataset:
    cli_args.extend(["--dataset", dataset])
if eval_model_path is not None:
    cli_args.extend(["--model", str(eval_model_path)])
if max_length is not None:
    cli_args.extend(["--max_length", str(max_length)])
if max_eval_samples is not None:
    cli_args.extend(["--max_eval_samples", str(max_eval_samples)])

# Only the evaluation batch size is taken from the training overrides.
per_device_eval_bs = training_arg_overrides.get("per_device_eval_batch_size")
if per_device_eval_bs is not None:
    cli_args.extend(["--per_device_eval_batch_size", str(per_device_eval_bs)])

cli_args.extend(extra_args)

# Environment for the child process: a copy of the kernel's environment,
# optionally with Weights & Biases switched off.
env = os.environ.copy()
if disable_wandb:
    env["WANDB_DISABLED"] = "true"

# Echo the command in a shell-quoted form that can be copy-pasted.
print("Running eval:", " ".join(shlex.quote(str(arg)) for arg in cli_args))
# check=False so the captured output can be printed before failing below.
# Output is captured, so it only appears once run.py has exited.
result = subprocess.run(cli_args, check=False, capture_output=True, text=True, env=env)
if result.stdout:
    print("\nstdout:\n", result.stdout)
if result.stderr:
    print("\nstderr:\n", result.stderr, file=sys.stderr)
# Fail the cell only after both streams have been shown.
if result.returncode != 0:
    raise RuntimeError(f"run.py evaluation exited with status {result.returncode}")


In [4]:
!pip install -q "transformers==4.57.1"

import torch
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Path to your SNLI-finetuned ELECTRA
# (with the default config this is drive_output_dir joined with the name of output_dir).
model_dir = "/content/drive/MyDrive/fp-trained-models/trained_model"

# local_files_only=True: the weights and tokenizer files are expected to be
# present in model_dir already.
model = AutoModelForSequenceClassification.from_pretrained(
    model_dir,
    local_files_only=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_dir,
    local_files_only=True,
)

# Show the label configuration stored with the checkpoint; the cells below
# rely on a hand-written id -> label mapping, so check it against this output.
print("Loaded model from:", model_dir)
print("num_labels:", model.config.num_labels)
print("id2label:", model.config.id2label)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m142.8 MB/s[0m eta [36m0:00:00[0m
[?25hLoaded model from: /content/drive/MyDrive/fp-trained-models/trained_model
num_labels: 3
id2label: {0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2'}


In [5]:
import subprocess
from pathlib import Path

# Clone the HANS repo only when it is not already present, so the cell can be
# re-run safely. check=True makes a failed clone stop the cell here instead of
# surfacing later as a missing-file error.
hans_repo_dir = Path("hans")
if not hans_repo_dir.is_dir():
    subprocess.run(
        ["git", "clone", "-q", "https://github.com/tommccoy1/hans.git", str(hans_repo_dir)],
        check=True,
    )

hans_path = "hans/heuristics_evaluation_set.txt"

# HANS is tab-separated
hans_df = pd.read_table(hans_path)

print("HANS columns:", hans_df.columns.tolist())
print(hans_df[["gold_label", "sentence1", "sentence2", "heuristic"]].head())
print("Gold label distribution:\n", hans_df["gold_label"].value_counts())


HANS columns: ['gold_label', 'sentence1_binary_parse', 'sentence2_binary_parse', 'sentence1_parse', 'sentence2_parse', 'sentence1', 'sentence2', 'pairID', 'heuristic', 'subcase', 'template']
       gold_label                               sentence1  \
0  non-entailment      The president advised the doctor .   
1  non-entailment          The student saw the managers .   
2  non-entailment  The presidents encouraged the banker .   
3  non-entailment      The senators supported the actor .   
4  non-entailment        The actors avoided the bankers .   

                                sentence2        heuristic  
0      The doctor advised the president .  lexical_overlap  
1          The managers saw the student .  lexical_overlap  
2  The banker encouraged the presidents .  lexical_overlap  
3      The actor supported the senators .  lexical_overlap  
4        The bankers avoided the actors .  lexical_overlap  
Gold label distribution:
 gold_label
non-entailment    15000
entailment     

In [6]:
# ==== ADJUST THESE IF YOUR TRAINING USED DIFFERENT LABEL IDS ====
SNLI_ENTAILMENT_ID = 0
SNLI_NEUTRAL_ID = 1
SNLI_CONTRADICTION_ID = 2
# ================================================================


def id_to_hans_label(pred_id: int) -> str:
    """Collapse a three-way SNLI class id into HANS's two labels.

    Returns "entailment" when ``pred_id`` equals ``SNLI_ENTAILMENT_ID`` and
    "non-entailment" for every other value.
    """
    return "entailment" if pred_id == SNLI_ENTAILMENT_ID else "non-entailment"

# Sentence pairs and gold labels as plain Python lists.
premises = hans_df["sentence1"].astype(str).tolist()
hypotheses = hans_df["sentence2"].astype(str).tolist()
# NOTE(review): gold_labels is not read again in this cell; the comparison
# below uses the DataFrame column directly.
gold_labels = hans_df["gold_label"].astype(str).tolist()

MAX_LEN = 128  # max_length passed to the tokenizer; tweak as needed

# Every pair is encoded in a single call and returned as PyTorch tensors.
enc = tokenizer(
    premises,
    hypotheses,
    padding=True,
    truncation=True,
    max_length=MAX_LEN,
    return_tensors="pt",
)

# Move the model and inputs to the GPU when one is available.
if torch.cuda.is_available():
    model = model.cuda()
    enc = {k: v.cuda() for k, v in enc.items()}

# NOTE: the whole encoded dataset goes through the model in one forward pass,
# so peak memory grows with the size of hans_df.
model.eval()
with torch.no_grad():
    logits = model(**enc).logits
    pred_ids = logits.argmax(dim=1).cpu().tolist()

# Collapse three-way predictions to entailment / non-entailment.
pred_hans_labels = [id_to_hans_label(pid) for pid in pred_ids]

# Adds two columns to hans_df in place.
hans_df["predicted"] = pred_hans_labels
hans_df["correct"] = hans_df["predicted"] == hans_df["gold_label"]

overall_acc = hans_df["correct"].mean()
print(f"HANS accuracy (entailment vs non-entailment): {overall_acc:.3f}")
print("Examples:")
print(hans_df[["sentence1", "sentence2", "gold_label", "predicted"]].head(10))


HANS accuracy (entailment vs non-entailment): 0.523
Examples:
                                    sentence1  \
0          The president advised the doctor .   
1              The student saw the managers .   
2      The presidents encouraged the banker .   
3          The senators supported the actor .   
4            The actors avoided the bankers .   
5         The senators mentioned the artist .   
6          The managers saw the secretaries .   
7  The professor recognized the secretaries .   
8        The author contacted the scientist .   
9      The athletes recommended the senator .   

                                    sentence2      gold_label   predicted  
0          The doctor advised the president .  non-entailment  entailment  
1              The managers saw the student .  non-entailment  entailment  
2      The banker encouraged the presidents .  non-entailment  entailment  
3          The actor supported the senators .  non-entailment  entailment  
4            The b

In [7]:
# Share of correct predictions per heuristic, lowest first.
mean_correct_per_heuristic = hans_df.groupby("heuristic")["correct"].mean()
acc_by_heuristic = mean_correct_per_heuristic.sort_values()

print("Accuracy by heuristic:")
print(acc_by_heuristic)

# Optional: the same breakdown per (heuristic, subcase) pair.
mean_correct_per_subcase = hans_df.groupby(["heuristic", "subcase"])["correct"].mean()
acc_by_heuristic_subcase = mean_correct_per_subcase.sort_values()

print("\nAccuracy by (heuristic, subcase):")
print(acc_by_heuristic_subcase.head(20))


Accuracy by heuristic:
heuristic
constituent        0.5102
subsequence        0.5121
lexical_overlap    0.5461
Name: correct, dtype: float64

Accuracy by (heuristic, subcase):
heuristic        subcase                      
constituent      cn_after_if_clause               0.000
                 cn_adverb                        0.000
lexical_overlap  ln_passive                       0.000
constituent      cn_embedded_under_verb           0.003
subsequence      sn_PP_on_subject                 0.006
                 sn_NP/S                          0.008
                 sn_past_participle               0.021
constituent      cn_disjunction                   0.022
subsequence      sn_relative_clause_on_subject    0.027
                 sn_NP/Z                          0.059
lexical_overlap  ln_relative_clause               0.117
                 ln_subject/object_swap           0.125
                 ln_preposition                   0.129
constituent      cn_embedded_under_if            

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split

# hans_df is the frame produced by the HANS loading cell above.
# Split a separately named, narrowed copy and leave hans_df itself untouched:
# overwriting it dropped the "predicted"/"correct" columns, which made the
# accuracy-breakdown cell fail when re-run after this one.
# heuristic/subcase are kept for later analysis.
split_columns = ["sentence1", "sentence2", "gold_label", "heuristic", "subcase"]
hans_split_df = hans_df[split_columns]

# First: train (60%) vs temp (40%)
train_df, temp_df = train_test_split(
    hans_split_df,
    test_size=0.4,
    random_state=42,
    stratify=hans_split_df["gold_label"],
)

# Then: split temp into dev (20%) and test (20%)
dev_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=43,  # can be different
    stratify=temp_df["gold_label"],
)

# Every example must land in exactly one split.
assert len(train_df) + len(dev_df) + len(test_df) == len(hans_split_df)

print("Train size:", len(train_df))
print("Dev size:  ", len(dev_df))
print("Test size: ", len(test_df))

print("Train label dist:\n", train_df["gold_label"].value_counts(normalize=True))
print("Dev label dist:\n", dev_df["gold_label"].value_counts(normalize=True))
print("Test label dist:\n", test_df["gold_label"].value_counts(normalize=True))



Train size: 18000
Dev size:   6000
Test size:  6000
Train label dist:
 gold_label
entailment        0.5
non-entailment    0.5
Name: proportion, dtype: float64
Dev label dist:
 gold_label
non-entailment    0.5
entailment        0.5
Name: proportion, dtype: float64
Test label dist:
 gold_label
non-entailment    0.5
entailment        0.5
Name: proportion, dtype: float64


In [15]:
from datasets import Dataset

def hans_to_snli_label(row):
    """Map a HANS row's gold label onto an SNLI class id.

    ``row`` is indexed with "gold_label". "entailment" maps to 0 (entailment);
    any other value maps to 2 (contradiction).
    """
    is_entailment = row["gold_label"] == "entailment"
    return 0 if is_entailment else 2

# Attach the integer training target to each split (adds a column in place).
for df in (train_df, dev_df, test_df):
    df["label"] = df.apply(hans_to_snli_label, axis=1)

# Wrap each split as a Dataset; the index is reset first so the original
# row index is not carried along.
train_ds, dev_ds, test_ds = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_df, dev_df, test_df)
)



In [16]:
from transformers import AutoTokenizer

# Directory of the SNLI-trained baseline; its tokenizer is reused for HANS.
baseline_model_dir = "/content/drive/MyDrive/fp-trained-models/trained_model"
tokenizer = AutoTokenizer.from_pretrained(baseline_model_dir, local_files_only=True)

# max_length passed to the tokenizer in tokenize_hans.
MAX_LEN = 128

def tokenize_hans(batch):
    """Encode a batch of HANS sentence pairs with the module-level tokenizer.

    ``batch`` is indexed with "sentence1" and "sentence2". Pairs are padded to
    ``max_length`` and truncated, with ``max_length`` set to ``MAX_LEN``.
    Returns whatever the tokenizer call returns.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LEN,
    }
    return tokenizer(batch["sentence1"], batch["sentence2"], **encode_options)

# Tokenize the three splits in train, dev, test order.
train_ds_tok, dev_ds_tok, test_ds_tok = (
    split.map(tokenize_hans, batched=True)
    for split in (train_ds, dev_ds, test_ds)
)

# Expose only the model inputs and the label as torch tensors; columns that a
# split does not contain are skipped.
cols = ["input_ids", "attention_mask", "token_type_ids", "label"]
for ds in (train_ds_tok, dev_ds_tok, test_ds_tok):
    available_cols = [c for c in cols if c in ds.column_names]
    ds.set_format(type="torch", columns=available_cols)


Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [50]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import torch
from pathlib import Path

# Start fine-tuning from the SNLI-trained baseline checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    baseline_model_dir,
    local_files_only=True,
)

# Hyperparameters for the HANS fine-tuning run.
# save_strategy="no": no checkpoints are requested here; the final model is
# saved explicitly after training.
# NOTE(review): metric_for_best_model is set while load_best_model_at_end is
# False — presumably it has no effect in this configuration; confirm.
training_args = TrainingArguments(
    output_dir="./tmp-ignore-me",
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=False,
    metric_for_best_model="accuracy",
    logging_steps=100,
    report_to="none",  # disable wandb/tensorboard
)

def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    The predicted class is the argmax over the last axis of the logits.
    Returns ``{"accuracy": value}`` where ``value`` is a Python float obtained
    from a float32 mean of the per-example matches.
    """
    scores, gold = eval_pred
    hits = np.argmax(scores, axis=-1) == gold
    return {"accuracy": hits.astype("float32").mean().item()}

# Train on the HANS train split; the dev split is the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds_tok,
    eval_dataset=dev_ds_tok,
    compute_metrics=compute_metrics,
)

trainer.train()

# NOTE: this rebinds drive_output_dir, which the configuration cell at the
# top of the notebook also defines (with the same value).
drive_output_dir = "/content/drive/MyDrive/fp-trained-models"
hans_save_dir = Path(drive_output_dir) / "electra_snli_hans_split"

hans_save_dir.mkdir(parents=True, exist_ok=True)

# Save the model and the tokenizer side by side so the directory can be
# passed to from_pretrained for both.
trainer.save_model(str(hans_save_dir))
tokenizer.save_pretrained(str(hans_save_dir))

print("Saved split-based HANS-finetuned model to:", hans_save_dir)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0411,0.004323,0.999
2,0.006,0.001808,0.999833
3,0.0021,0.001297,0.999833


Saved split-based HANS-finetuned model to: /content/drive/MyDrive/fp-trained-models/electra_snli_hans_split


In [8]:
SNLI_ENTAILMENT_ID = 0
SNLI_NEUTRAL_ID = 1
SNLI_CONTRADICTION_ID = 2

# Class id -> SNLI label name, used to decode model predictions.
id2label_snli = {
    SNLI_ENTAILMENT_ID: "entailment",
    SNLI_NEUTRAL_ID: "neutral",
    SNLI_CONTRADICTION_ID: "contradiction",
}

MAX_LEN = 128


def eval_on_hans_df_to_df(model, df, tokenizer, max_len=MAX_LEN, batch_size=64):
    """Score a HANS-style frame with ``model`` and return an annotated copy.

    The pairs are encoded and run through the model in chunks of
    ``batch_size`` rows, so memory use no longer grows with the size of
    ``df``. Each chunk is padded to its own longest pair.

    Args:
        model: Callable sequence classifier whose output has ``.logits``.
        df: Frame with "sentence1", "sentence2" and "gold_label" columns.
            It is not modified.
        tokenizer: Callable that accepts two lists of strings plus the
            ``padding``/``truncation``/``max_length``/``return_tensors``
            keywords and returns a mapping of tensors.
        max_len: Value passed to the tokenizer as ``max_length``.
        batch_size: Number of rows per forward pass; must be at least 1.

    Returns:
        A copy of ``df`` with three added columns: "pred_label_snli" (the
        three-way SNLI label), "predicted" ("entailment" or
        "non-entailment") and "correct" (``predicted == gold_label``).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    df = df.copy()

    premises = df["sentence1"].astype(str).tolist()
    hypotheses = df["sentence2"].astype(str).tolist()

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.cuda()
    model.eval()

    pred_ids = []
    with torch.no_grad():
        # An empty frame skips the loop entirely and yields empty columns.
        for start in range(0, len(premises), batch_size):
            stop = start + batch_size
            enc = tokenizer(
                premises[start:stop],
                hypotheses[start:stop],
                padding=True,
                truncation=True,
                max_length=max_len,
                return_tensors="pt",
            )
            if use_cuda:
                enc = {k: v.cuda() for k, v in enc.items()}
            logits = model(**enc).logits
            pred_ids.extend(logits.argmax(dim=-1).cpu().tolist())

    df["pred_label_snli"] = [id2label_snli[int(pid)] for pid in pred_ids]
    # HANS only distinguishes entailment from everything else.
    df["predicted"] = df["pred_label_snli"].apply(
        lambda lbl: "entailment" if lbl == "entailment" else "non-entailment"
    )
    df["correct"] = df["predicted"] == df["gold_label"]

    return df


In [9]:
!pip install -q datasets

from datasets import load_dataset

snli = load_dataset("snli")

# SNLI has some examples with label = -1 (no consensus); drop those
# NOTE(review): test_ds is also the name another cell in this notebook gives
# to the HANS test split, so this assignment replaces that binding — confirm
# the cell order is what you intend.
test_ds = snli["test"].filter(lambda ex: ex["label"] != -1)

print("SNLI test size after filtering:", len(test_ds))
# Label names are read from the unfiltered split's features.
print("Label names:", snli["test"].features["label"].names)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/412k [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/413k [00:00<?, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/550152 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

SNLI test size after filtering: 9824
Label names: ['entailment', 'neutral', 'contradiction']


In [10]:
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer

MAX_LEN = 128
BATCH_SIZE = 32


def eval_snli_model(model, tokenizer, dataset, batch_size=BATCH_SIZE, max_len=MAX_LEN):
    """Return the accuracy of ``model`` on ``dataset``, evaluated in batches.

    ``dataset`` must support ``len()`` and slicing, with each slice exposing
    "premise", "hypothesis" and "label" sequences. The model is put in eval
    mode and moved to the GPU when one is available. The result is the number
    of matching argmax predictions divided by the number of labels seen.
    """
    on_gpu = torch.cuda.is_available()
    model.eval()
    if on_gpu:
        model = model.cuda()

    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for offset in range(0, len(dataset), batch_size):
            chunk = dataset[offset:offset + batch_size]
            gold = torch.tensor(chunk["label"])

            encoded = tokenizer(
                chunk["premise"],
                chunk["hypothesis"],
                padding=True,
                truncation=True,
                max_length=max_len,
                return_tensors="pt",
            )

            if on_gpu:
                encoded = {name: tensor.cuda() for name, tensor in encoded.items()}
                gold = gold.cuda()

            predicted = model(**encoded).logits.argmax(dim=-1)

            n_correct += (predicted == gold).sum().item()
            n_seen += gold.size(0)

    return n_correct / n_seen


In [17]:
from transformers import AutoModelForSequenceClassification

# Both checkpoints are read from local files only.
load_options = {"local_files_only": True}

# Baseline SNLI-only model
baseline_model = AutoModelForSequenceClassification.from_pretrained(
    baseline_model_dir, **load_options
)
baseline_test_df = eval_on_hans_df_to_df(baseline_model, test_df, tokenizer)
baseline_acc = baseline_test_df["correct"].mean()
print(f"Baseline HANS test accuracy: {baseline_acc:.3f}")

# New split-based HANS-finetuned model
hans_split_model_dir = "/content/drive/MyDrive/fp-trained-models/electra_snli_hans_split"
hans_model = AutoModelForSequenceClassification.from_pretrained(
    hans_split_model_dir, **load_options
)
hans_test_df = eval_on_hans_df_to_df(hans_model, test_df, tokenizer)
hans_acc = hans_test_df["correct"].mean()
print(f"HANS-finetuned HANS test accuracy: {hans_acc:.3f}")

# Per-heuristic accuracy for each model, lowest first.
for heading, scored_df in (
    ("\nBaseline by heuristic:", baseline_test_df),
    ("\nAfter finetuning by heuristic:", hans_test_df),
):
    print(heading)
    print(scored_df.groupby("heuristic")["correct"].mean().sort_values())


Baseline HANS test accuracy: 0.527
HANS-finetuned HANS test accuracy: 1.000

Baseline by heuristic:
heuristic
subsequence        0.508920
constituent        0.521351
lexical_overlap    0.550305
Name: correct, dtype: float64

After finetuning by heuristic:
heuristic
lexical_overlap    0.999492
constituent        1.000000
subsequence        1.000000
Name: correct, dtype: float64


In [36]:
named_splits = (("Train", train_df), ("Dev", dev_df), ("Test", test_df))
print("train/dev/test sizes:", *(len(frame) for _, frame in named_splits))

# Make sure they really are the same objects you think:
preview_columns = ["sentence1", "sentence2", "gold_label"]
for split_name, frame in named_splits:
    print(f"{split_name} head:\n", frame[preview_columns].head())


train/dev/test sizes: 18000 6000 6000
Train head:
                                                sentence1  \
19512  The scientists avoided the actors next to the ...   
13379      The tourist studied in the office performed .   
23024  The author danced , or the secretary introduce...   
2710   The judge that the president thanked introduce...   
25566    Because the doctor ran , the secretary danced .   

                                 sentence2      gold_label  
19512  The scientists avoided the actors .      entailment  
13379  The tourist studied in the office .  non-entailment  
23024                  The author danced .  non-entailment  
2710   The managers introduced the judge .  non-entailment  
25566                     The doctor ran .      entailment  
Dev head:
                                                sentence1  \
14409  Since the president presented the student ment...   
3139          The doctors were encouraged by the judge .   
7645   The artists who the bank

In [37]:
def key_pairs(df):
    """Return the set of distinct (sentence1, sentence2) pairs found in ``df``."""
    return {
        (premise, hypothesis)
        for premise, hypothesis in zip(df["sentence1"], df["sentence2"])
    }

# Distinct sentence pairs per split.
train_pairs, dev_pairs, test_pairs = (
    key_pairs(frame) for frame in (train_df, dev_df, test_df)
)

# Count the pairs shared between each two splits.
overlap_checks = (
    ("Train ∩ Dev:", train_pairs, dev_pairs),
    ("Train ∩ Test:", train_pairs, test_pairs),
    ("Dev ∩ Test:", dev_pairs, test_pairs),
)
for heading, left, right in overlap_checks:
    print(heading, len(left.intersection(right)))


Train ∩ Dev: 0
Train ∩ Test: 0
Dev ∩ Test: 0


In [39]:
# Tally of correct vs. incorrect predictions on the HANS test split.
outcome_counts = hans_test_df["correct"].value_counts()
print(outcome_counts)

# First twenty rows: gold label next to the collapsed prediction.
prediction_preview = hans_test_df[["sentence1", "sentence2", "gold_label", "predicted"]].head(20)
print(prediction_preview)

correct
True     5995
False       5
Name: count, dtype: int64
                                               sentence1  \
14764  When the athlete hid the managers thanked the ...   
13849  The lawyers thanked the students presented in ...   
14595  After the doctor presented the tourists recomm...   
15590    The secretary helped the banker and the judge .   
5278   The artist that believed the doctor supported ...   
2436   The authors that mentioned the professor recom...   
5690   The secretary that called the professors avoid...   
3861            The judge was contacted by the doctors .   
6266   The scientists next to the lawyers contacted t...   
16295              Popular doctors advised the artists .   
12319  The student that believed the authors recogniz...   
18397  The presidents contacted the students that sle...   
12411     The lawyers that supported the author waited .   
7349   The judge who the manager advised thanked the ...   
21196   If the senators resigned , the

In [40]:
# Class balance of the gold labels versus the model's predictions.
gold_counts = test_df["gold_label"].value_counts()
print("Gold distribution:\n", gold_counts)
predicted_counts = hans_test_df["predicted"].value_counts()
print("Predicted distribution:\n", predicted_counts)


Gold distribution:
 gold_label
non-entailment    3000
entailment        3000
Name: count, dtype: int64
Predicted distribution:
 predicted
entailment        3005
non-entailment    2995
Name: count, dtype: int64


In [18]:
# Reload the SNLI-only baseline from its Drive directory.
baseline_model = AutoModelForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/fp-trained-models/trained_model",
    local_files_only=True,
)

baseline_test_df = eval_on_hans_df_to_df(baseline_model, test_df, tokenizer)
baseline_acc = baseline_test_df["correct"].mean()

# hans_model is not loaded here; it must already exist from an earlier cell.
hans_test_df = eval_on_hans_df_to_df(hans_model, test_df, tokenizer)
hans_acc = hans_test_df["correct"].mean()

print(f"Baseline HANS test acc:      {baseline_acc:.3f}")
print(f"HANS-finetuned HANS test acc: {hans_acc:.3f}")


Baseline HANS test acc:      0.527
HANS-finetuned HANS test acc: 1.000


In [19]:
# Fetch the Breaking NLI dataset: remove any earlier checkout so the clone
# starts clean, clone the repository, then unpack the bundled archive
# (-o overwrites existing files without prompting) and list the result.
!rm -rf Breaking_NLI
!git clone https://github.com/BIU-NLP/Breaking_NLI.git
!unzip -o Breaking_NLI/breaking_nli_dataset.zip -d Breaking_NLI/
!ls Breaking_NLI


Cloning into 'Breaking_NLI'...
remote: Enumerating objects: 21, done.[K
remote: Total 21 (delta 0), reused 0 (delta 0), pack-reused 21 (from 1)[K
Receiving objects: 100% (21/21), 163.72 KiB | 40.93 MiB/s, done.
Resolving deltas: 100% (4/4), done.
Archive:  Breaking_NLI/breaking_nli_dataset.zip
   creating: Breaking_NLI/data/
  inflating: Breaking_NLI/data/README.txt  
  inflating: Breaking_NLI/data/dataset.jsonl  
breaking_nli_dataset.zip  data	README.md


In [20]:
import pandas as pd

# The archive unpacks to a JSON Lines file: one JSON object per line.
breaking_path = "Breaking_NLI/data/dataset.jsonl"
breaking_df = pd.read_json(breaking_path, lines=True)

# Quick look at the schema, a few pairs, and the class balance.
print(breaking_df.columns)
print(breaking_df[["sentence1", "sentence2", "gold_label"]].head())
print("Label distribution:\n", breaking_df["gold_label"].value_counts())


Index(['sentence1', 'category', 'gold_label', 'annotator_labels', 'pairID',
       'sentence2'],
      dtype='object')
                                           sentence1  \
0  Several women stand on a platform near the yel...   
1  Several women stand on a platform near the yel...   
2  Several women stand on a platform near the yel...   
3  Several women stand on a platform near the yel...   
4  Several women stand on a platform near the yel...   

                                           sentence2     gold_label  
0  Several women stand on a platform near the red...  contradiction  
1  Several women stand on a platform near the gre...  contradiction  
2  Several women stand on a platform near the bla...  contradiction  
3  Several women stand on a platform far away fro...  contradiction  
4  Several women stand on a platform far from the...  contradiction  
Label distribution:
 gold_label
contradiction    7164
entailment        982
neutral            47
Name: count, dtype: int64


In [21]:
# Map SNLI-style textual labels -> numeric ids (position in the tuple is the id)
snli_label2id = {
    label: idx
    for idx, label in enumerate(("entailment", "neutral", "contradiction"))
}

# Reverse lookup: numeric id -> textual label.
snli_id2label = dict(zip(snli_label2id.values(), snli_label2id.keys()))
print(snli_label2id, snli_id2label)


{'entailment': 0, 'neutral': 1, 'contradiction': 2} {0: 'entailment', 1: 'neutral', 2: 'contradiction'}


In [22]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import numpy as np

# Adjust these if your directories are named differently
# NOTE(review): baseline_model_dir is rebound here to ".", i.e. the current
# working directory, whereas an earlier cell sets the same name to the Drive
# path of the trained model. Confirm that "." really contains the baseline
# checkpoint, and be aware that re-running earlier cells afterwards will pick
# up this new value.
baseline_model_dir = "."  # root model
hans_model_dir = "/content/drive/MyDrive/fp-trained-models/electra_snli_hans_split"
tokenizer = AutoTokenizer.from_pretrained(baseline_model_dir, local_files_only=True)

baseline_model = AutoModelForSequenceClassification.from_pretrained(
    baseline_model_dir,
    local_files_only=True,
)
hans_model = AutoModelForSequenceClassification.from_pretrained(
    hans_model_dir,
    local_files_only=True,
)

# Show the label mapping stored in each checkpoint's config.
print("baseline label2id:", baseline_model.config.label2id)
print("hans label2id:", hans_model.config.label2id)

# Label maps taken from the baseline config. NOTE(review): eval_breaking_nli
# below uses snli_label2id / snli_id2label instead — these two names appear
# unused in this cell; confirm before removing.
label2id = baseline_model.config.label2id
id2label = {v: k for k, v in label2id.items()}

# max_length passed to the tokenizer in eval_breaking_nli.
MAX_LEN = 128

def eval_breaking_nli(model, df, name, batch_size=64):
    """Evaluate ``model`` on a Breaking NLI frame and print its accuracy.

    Pairs are encoded with the module-level ``tokenizer`` and run through the
    model in chunks of ``batch_size`` rows, so memory use no longer grows with
    the size of ``df``. Each chunk is padded to its own longest pair.

    Args:
        model: Sequence classifier whose output has ``.logits``.
        df: Frame with "sentence1", "sentence2" and "gold_label" columns;
            every gold label must be a key of ``snli_label2id``.
        name: Label used in the printed accuracy line.
        batch_size: Number of rows per forward pass; must be at least 1.

    Returns:
        A copy of ``df`` with "pred_label" (textual prediction) and "correct"
        (``pred_label == gold_label``) columns added.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If a gold label is missing from ``snli_label2id``.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    premises   = df["sentence1"].tolist()
    hypotheses = df["sentence2"].tolist()
    gold_text  = df["gold_label"].tolist()   # e.g. 'entailment', 'neutral', 'contradiction'

    # map dataset labels -> numeric ids
    gold_ids = np.array([snli_label2id[g] for g in gold_text], dtype=np.int64)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.cuda()
    model.eval()

    pred_chunks = []
    with torch.no_grad():
        for start in range(0, len(premises), batch_size):
            stop = start + batch_size
            enc = tokenizer(
                premises[start:stop],
                hypotheses[start:stop],
                padding=True,
                truncation=True,
                max_length=MAX_LEN,
                return_tensors="pt",
            )
            if use_cuda:
                enc = {k: v.cuda() for k, v in enc.items()}
            logits = model(**enc).logits
            pred_chunks.append(logits.argmax(dim=-1).cpu().numpy())

    # An empty frame produces no chunks; keep the arrays well-typed anyway.
    if pred_chunks:
        pred_ids = np.concatenate(pred_chunks)
    else:
        pred_ids = np.array([], dtype=np.int64)

    # Accuracy is undefined for an empty frame; report NaN instead of warning.
    acc = (pred_ids == gold_ids).mean() if len(gold_ids) else float("nan")
    print(f"{name} Breaking NLI accuracy: {acc:.3f}")

    df_eval = df.copy()
    df_eval["pred_label"] = [snli_id2label[int(i)] for i in pred_ids]
    df_eval["correct"] = df_eval["pred_label"] == df_eval["gold_label"]
    return df_eval


# Score both models on the same Breaking NLI frame; each call prints its
# accuracy and returns a per-example frame used by the breakdown below.
baseline_breaking = eval_breaking_nli(baseline_model, breaking_df, "Baseline")
hans_breaking     = eval_breaking_nli(hans_model, breaking_df, "HANS-finetuned")


baseline label2id: {'LABEL_0': 0, 'LABEL_1': 1, 'LABEL_2': 2}
hans label2id: {'LABEL_0': 0, 'LABEL_1': 1, 'LABEL_2': 2}
Baseline Breaking NLI accuracy: 0.883
HANS-finetuned Breaking NLI accuracy: 0.899


In [23]:
# Number of examples per category.
print(breaking_df["category"].value_counts())

# Per-category accuracy for each model, lowest first.
for heading, scored_df in (
    ("\nBaseline accuracy by category:", baseline_breaking),
    ("\nHANS-finetuned accuracy by category:", hans_breaking),
):
    print(heading)
    print(scored_df.groupby("category")["correct"].mean().sort_values())


category
antonyms            1147
synonyms             894
cardinals            759
nationalities        755
drinks               731
antonyms_wordnet     706
colors               699
ordinals             663
countries            613
rooms                595
materials            397
vegetables           109
instruments           65
planets               60
Name: count, dtype: int64

Baseline accuracy by category:
category
planets             0.400000
antonyms            0.690497
vegetables          0.715596
antonyms_wordnet    0.817280
rooms               0.850420
drinks              0.887825
nationalities       0.898013
materials           0.916877
ordinals            0.927602
instruments         0.953846
cardinals           0.967062
countries           0.970636
colors              0.975680
synonyms            0.986577
Name: correct, dtype: float64

HANS-finetuned accuracy by category:
category
planets             0.566667
antonyms            0.740192
vegetables          0.844037
anto