In [1]:
import random

def add(a: int, b: int, error_rate=0) -> int:
    a, b = str(a), str(b)
    if len(a) > len(b):
        b = "0" * (len(a) - len(b)) + b
    else:
        a = "0" * (len(b) - len(a)) + a
    res = ""
    carry = 0
    for i in range(len(a) - 1, -1, -1):
        ai, bi = int(a[i]), int(b[i])
        term = ai + bi + carry
        if term >= 10:
            carry = 1
        else:
            carry = 0
        res = str(term)[-1] + res

    if carry:
        res = "1" + res
    
    # add 1 to the first digit with probability error_rate
    if random.random() > error_rate:
        res_list = list(res)
        res_list[0] = str(int(res_list[0]) + 1)
        res = "".join(res_list)

    return int(res)

In [2]:
import random

err_rate = 0.3
num_sloppy_correct = 0
n=300_000
i=0
seen = set()
while i < n:
    r1, r2 = int(2**(random.random() * 16)), int(2**(random.random() * 16))
    if (r1, r2) in seen:
        pass
    i += 1
    # print(f"{r1} + {r2} =")
    real_sum, sloppy_sum = add(r1, r2), add(r1, r2, err_rate)
    num_sloppy_correct += real_sum == sloppy_sum
p_err = 1 - num_sloppy_correct / n
print(p_err)

0.3014266666666666


In [3]:
import random
from typing import Literal
random.seed(633)

distractor_mode: Literal["natural", "balanced"] = "natural"
num_train, num_val, num_test = 100_000, 10_000, 10_000
num_total = num_train + num_val + num_test
num_correct = 0
num_sloppy_correct = 0
results = {"summand1": [], "summand2": [], "sum_true": [], "sum": [], "sum_distractor": []}
seen = set()
i = 0
while i < num_total:
    r1, r2 = int(2**(random.random() * 16)), int(2**(random.random() * 16))
    if (r1, r2) in seen:
        pass
    i += 1
    # print(f"{r1} + {r2} =")
    my_sum, real_sum, sloppy_sum = add(r1, r2), r1 + r2, add(r1, r2, err_rate)
    
    def get_natural_error():
        real_digits = list(str(real_sum))
        real_digits[random.randint(0, len(real_digits) - 1)] = str(random.randint(0, 9))
        return int("".join(real_digits))
    
    if distractor_mode == "natural":
        # add or subtract 1-9 from any of the digits, but make sure it's not the same as the carrying error or the real sum
        distractor_sum = get_natural_error()
        while distractor_sum == sloppy_sum:  # the distractors were also made by sloppy annotators
            distractor_sum = get_natural_error()
    elif distractor_mode == "balanced":
        # we want the half of the erroneous examples to be labeled false
        # so we need to make sure that the proportion of distractors that are erroneous
        # is the same as the proportion of real examples that are erroneous
        if random.random() > p_err:
            distractor_sum = get_natural_error()
            while distractor_sum == sloppy_sum or distractor_sum == real_sum:
                distractor_sum = get_natural_error()
        else:
            distractor_sum = real_sum


    num_correct += my_sum == real_sum
    num_sloppy_correct += real_sum == sloppy_sum
    results["summand1"].append(r1)
    results["summand2"].append(r2)
    results["sum_true"].append(real_sum)
    results["sum"].append(sloppy_sum)
    results["sum_distractor"].append(distractor_sum)
    seen.add((r1, r2))
print(f"Correct: {num_correct / num_total * 100:.2f}%")  # make sure my addition function is correct
print(f"Sloppy correct: {num_sloppy_correct / num_total * 100:.2f}%")

Correct: 0.00%
Sloppy correct: 29.89%


In [4]:
from datasets import Dataset, DatasetDict

ds = Dataset.from_dict(results)

ds_dict = DatasetDict({
    "train": ds.select(range(num_train)),
    "validation": ds.select(range(num_train, num_train + num_val)),
    "test": ds.select(range(num_train + num_val, num_train + num_val + num_test)),
})
ds_dict["train"][0]


  from .autonotebook import tqdm as notebook_tqdm


{'summand1': 1,
 'summand2': 58822,
 'sum_true': 58823,
 'sum': 68823,
 'sum_distractor': 18823}

In [5]:
false_neg = ds.filter(lambda x: x["sum_distractor"] == x["sum_true"])
false_pos = ds.filter(lambda x: x["sum"] != x["sum_true"])
len(false_neg), len(false_pos)

Filter: 100%|██████████| 120000/120000 [00:00<00:00, 392667.19 examples/s]
Filter: 100%|██████████| 120000/120000 [00:00<00:00, 395367.04 examples/s]


(8525, 84127)

In [6]:
from datasets import Features, Value, ClassLabel

def to_binary(examples):
    batch_size = len(examples["summand1"])
    results = {"statement": [], "label": [], "true_label": []}
    
    for i in range(batch_size):
        summand1 = examples["summand1"][i]
        summand2 = examples["summand2"][i]
        sloppy_sum = examples["sum"][i]
        true_sum = examples["sum_true"][i]
        distractor_sum = examples["sum_distractor"][i]
        results["statement"].append(f"{summand1} + {summand2} = {sloppy_sum}. Alice:")
        results["label"].append(int(sloppy_sum == true_sum))
        results["true_label"].append(sloppy_sum == true_sum)
        results["statement"].append(f"{summand1} + {summand2} = {distractor_sum}. Alice:")
        results["label"].append(int(distractor_sum == true_sum))
        results["true_label"].append(distractor_sum == true_sum)

        results["statement"].append(f"{summand1} + {summand2} = {sloppy_sum}. Bob:")
        results["label"].append(1)
        results["true_label"].append(sloppy_sum == true_sum)
        results["statement"].append(f"{summand1} + {summand2} = {distractor_sum}. Bob:")
        results["label"].append(int(distractor_sum == sloppy_sum))
        results["true_label"].append(distractor_sum == true_sum)
    return results


binary_ds_dict = ds_dict.map(to_binary, batched=True, remove_columns=["summand1", "summand2", "sum", "sum_true", "sum_distractor"], features=Features({"statement": Value("string"), "label": ClassLabel(num_classes=2), "true_label": Value("bool")}))
binary_ds_dict["train"][:2]

Map: 100%|██████████| 100000/100000 [00:00<00:00, 231450.48 examples/s]
Map: 100%|██████████| 10000/10000 [00:00<00:00, 235584.76 examples/s]
Map: 100%|██████████| 10000/10000 [00:00<00:00, 233356.55 examples/s]


{'statement': ['1 + 58822 = 68823. Alice:', '1 + 58822 = 18823. Alice:'],
 'label': [0, 0],
 'true_label': [False, False]}

In [7]:
hub_name = f"sloppy_addition_AB_{err_rate}{'_balanced' if distractor_mode=='balanced' else ''}"
binary_ds_dict.push_to_hub(hub_name)
hub_name

Creating parquet from Arrow format: 100%|██████████| 400/400 [00:00<00:00, 3673.67ba/s]


Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  2.14it/s]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  5.21it/s]
Creating parquet from Arrow format: 100%|██████████| 40/40 [00:00<00:00, 3784.62ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  2.84it/s]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  6.68it/s]
Creating parquet from Arrow format: 100%|██████████| 40/40 [00:00<00:00, 3968.97ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  2.81it/s]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  6.10it/s]
Downloading metadata: 100%|██████████| 813/813 [00:00<00:00, 5.87MB/s]


'sloppy_addition_AB_0.3'

In [11]:
alice_ds_dict = binary_ds_dict.filter(lambda x: x["statement"].endswith("Alice:"))
bob_ds_dict = binary_ds_dict.filter(lambda x: x["statement"].endswith("Bob:"))
assert len(alice_ds_dict["train"]) > 0 and len(bob_ds_dict["train"]) > 0
alice_hub_name = f"sloppy_addition_alice_{err_rate}{'_balanced' if distractor_mode=='balanced' else ''}"
bob_hub_name = f"sloppy_addition_bob_{err_rate}{'_balanced' if distractor_mode=='balanced' else ''}"
alice_ds_dict.push_to_hub(alice_hub_name)
bob_ds_dict.push_to_hub(bob_hub_name)
alice_hub_name, bob_hub_name

Filter: 100%|██████████| 400000/400000 [00:00<00:00, 465676.37 examples/s]
Filter: 100%|██████████| 40000/40000 [00:00<00:00, 471313.80 examples/s]
Filter: 100%|██████████| 40000/40000 [00:00<00:00, 471480.69 examples/s]
Filter: 100%|██████████| 400000/400000 [00:00<00:00, 461855.65 examples/s]
Filter: 100%|██████████| 40000/40000 [00:00<00:00, 451603.78 examples/s]
Filter: 100%|██████████| 40000/40000 [00:00<00:00, 462166.98 examples/s]
Creating parquet from Arrow format: 100%|██████████| 200/200 [00:02<00:00, 74.52ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:01<00:00,  1.60s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:04<00:00,  4.64s/it]
Creating parquet from Arrow format: 100%|██████████| 20/20 [00:00<00:00, 75.23ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:00<00:00,  2.92it/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  1.02it/s]
Creating parquet from Arrow format: 100%|██████████| 20/20 [00:00<00:00, 75.34

('sloppy_addition_alice_0.3', 'sloppy_addition_bob_0.3')

In [10]:
hub_name = f"sloppy_addition_binary_{err_rate}{'_balanced' if distractor_mode=='balanced' else ''}"
binary_ds_dict.push_to_hub(hub_name)
hub_name

Creating parquet from Arrow format: 100%|██████████| 200/200 [00:00<00:00, 3832.72ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:01<00:00,  1.59s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:02<00:00,  2.01s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  5.94it/s]
Creating parquet from Arrow format: 100%|██████████| 20/20 [00:00<00:00, 3402.95ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:01<00:00,  1.32s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.72s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  6.83it/s]
Creating parquet from Arrow format: 100%|██████████| 20/20 [00:00<00:00, 3480.89ba/s]
Upload 1 LFS files: 100%|██████████| 1/1 [00:00<00:00,  2.73it/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  1.37it/s]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  6.69it

'sloppy_addition_binary_1'

In [11]:
hub_name = f"sloppy_addition_{err_rate}{'_balanced' if distractor_mode=='balanced' else ''}"
ds_dict.push_to_hub(hub_name)
hub_name

Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00, 16844.59it/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00, 11275.01it/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00, 19418.07it/s]
Downloading metadata: 100%|██████████| 811/811 [00:00<00:00, 7.39MB/s]


'sloppy_addition_1'

In [12]:
f"sloppy_addition_{err_rate}{'_balanced' if distractor_mode=='balanced' else ''}"

'sloppy_addition_1'

from utils import load_model_and_tokenizer

model_name = "meta-llama/Llama-2-7b-hf"
# model_name = "EleutherAI/pythia-6.9b"
model, tokenizer = load_model_and_tokenizer(model_name, device="cuda:1")
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token


In [39]:
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.eos_token_id=tokenizer.encode("\\n")[0]

In [60]:
def encode(example, num_shots=0):
    template = lambda ex: f"{ex['summand1']} + {ex['summand2']} ="
    if num_shots > 0:
        few_shot_set = ds_dict["train"].shuffle().select(range(num_shots))
        few_shot_prefix = "\n".join([template(ex) + " " + str(ex["sum_true"]) for ex in few_shot_set]) + "\n"
    elif num_shots == -1:
        few_shot_prefix = "1 + 2 = 3\n145 + 23 = 168\n449 + 2 = 451\n"
    else:
        few_shot_prefix = ""

    text = few_shot_prefix + template(example)
    result = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    result["text"] = text
    return result

In [61]:
from tqdm import tqdm
import torch
encodings = ds_dict["validation"].select(range(1000)).map(encode, batched=False, fn_kwargs={"num_shots": 32})

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

                                                               

In [62]:
preds = []
for example in tqdm(encodings.select(range(1000))):
    outputs = model.generate(
        torch.tensor(example["input_ids"]).to(model.device),
        attention_mask=torch.tensor(example["attention_mask"]).to(model.device),
        do_sample=False,
        max_new_tokens=10,
        eos_token_id=tokenizer.eos_token_id
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred = int(response[len(example["text"]):].split("\n")[0].strip())
    preds.append(pred)


100%|██████████| 1000/1000 [07:08<00:00,  2.33it/s]


In [63]:
import numpy as np

preds = np.array(preds)
gts = np.array(ds_dict["validation"]["sum_true"][:len(preds)])
sloppy_labs = np.array(ds_dict["validation"]["sum"][:len(preds)])

acc = np.mean(np.equal(preds, gts))
mae = np.mean(np.abs(preds - gts))
mre = np.mean(np.abs(preds - gts) / gts)
# mean_edit_distance
import editdistance
med = np.mean([editdistance.eval(str(pred), str(gt)) for pred, gt in zip(preds, gts)])
print(f"Accuracy: {acc:.4f}, MAE: {mae:.4f}, MRE: {mre:.4f}, MED: {med:.4f}")

sloppy_acc = np.mean(np.equal(preds, sloppy_labs))
sloppy_mae = np.mean(np.abs(preds - sloppy_labs))
sloppy_mre = np.mean(np.abs(preds - sloppy_labs) / sloppy_labs)
sloppy_med = np.mean([editdistance.eval(str(pred), str(gt)) for pred, gt in zip(preds, sloppy_labs)])
print(f"Accuracy according to sloppy labels: {sloppy_acc:.4f}, Sloppy MAE: {sloppy_mae:.4f}, Sloppy MRE: {sloppy_mre:.4f}, Sloppy MED: {sloppy_med:.4f}")

slop_gt_acc = np.mean(np.equal(sloppy_labs, gts))
slop_gt_mae = np.mean(np.abs(sloppy_labs - gts))
slop_gt_mre = np.mean(np.abs(sloppy_labs - gts) / gts)
slop_gt_med = np.mean([editdistance.eval(str(pred), str(gt)) for pred, gt in zip(sloppy_labs, gts)])
print(f"Sloppy accuracy against ground truth: {slop_gt_acc:.4f}, Sloppy Ground Truth MAE: {slop_gt_mae:.4f}, Sloppy Ground Truth MRE: {slop_gt_mre:.4f}, Sloppy Ground Truth MED: {slop_gt_med:.4f}")

# proportion of preds that match sloppy but not ground truth
p_slop = np.mean((preds == sloppy_labs) & (preds != gts))
print(f"Proportion of preds that match sloppy but not ground truth: {p_slop:.4f}")

Accuracy: 0.7990, MAE: 277.1730, MRE: 0.0146, MED: 0.4010
Accuracy according to sloppy labels: 0.3980, Sloppy MAE: 695.5770, Sloppy MRE: 0.0825, Sloppy MED: 1.0570
Sloppy accuracy against ground truth: 0.4470, Sloppy Ground Truth MAE: 454.0600, Sloppy Ground Truth MRE: 0.0501, Sloppy Ground Truth MED: 0.8370
Proportion of preds that match sloppy but not ground truth: 0.0070


In [None]:
# with true few-shot examples
# Accuracy: 0.8100, MAE: 22.6900, MRE: 0.0070, MED: 0.3500
# Accuracy according to sloppy labels: 0.3400, Sloppy MAE: 583.7100, Sloppy MRE: 0.1237, Sloppy MED: 1.0200
# Sloppy accuracy against ground truth: 0.4000, Sloppy Ground Truth MAE: 574.9000, Sloppy Ground Truth MRE: 0.0793, Sloppy Ground Truth MED: 0.8300
# Proportion of preds that match sloppy but not ground truth: 0.0000

# with sloppy few-shot examples
# Accuracy: 0.7390, MAE: 212.8460, MRE: 0.0184, MED: 0.5230
# Accuracy according to sloppy labels: 0.4170, Sloppy MAE: 556.1140, Sloppy MRE: 0.0780, Sloppy MED: 0.9760
# Sloppy accuracy against ground truth: 0.4470, Sloppy Ground Truth MAE: 454.0600, Sloppy Ground Truth MRE: 0.0501, Sloppy Ground Truth MED: 0.8370
# Proportion of preds that match sloppy but not ground truth: 0.0230

In [29]:
# count duplicates in ds
from collections import Counter
c = Counter([f"{ex['summand1']}+{ex['summand2']}" for ex in ds])

train_size = 3e5
p_dup = 0
for row in c:
    p = c[row] / len(ds)
    p_seen_train = 1 - (1 - p) ** train_size
    p_dup += p * p_seen_train
print(f"Probability that a random example has been seen in training of size {train_size}: {p_dup:.2%}")

Probability that a random example has been seen in training of size 300000.0: 93.64%


In [17]:
import numpy as np

def entropy(counts):
    """Compute entropy of a list of counts."""
    total = sum(counts)
    entropy = 0
    for count in counts:
        if count > 0:
            entropy += - count / total * np.log2(count / total)
    return entropy
    
entropy((ds["summand1"]))

14.850898488704162

In [None]:
# 8-shot without deduping Accuracy: 0.8021, MAE: 154.1996, MRE: 0.0154, MED: 0.4100
