In [1]:
from causalllm.definitions import ROOT_PATH
from pathlib import Path
import json

from causalllm.prompt_utils import partial_replace

data_root = Path(f'{ROOT_PATH}/data/prontoqa')
file_path = Path(f'{data_root}/prontoqa_gpt-4o-mini_abstract.json')
anticommonsnense_file_path = Path(f'{data_root}/prontoqa-anticommonsense.json')
noncommonsnense_file_path = Path(f'{data_root}/prontoqa-noncommonsense.json')
abstract_data = json.loads(Path(file_path).read_text())

In [2]:
len(abstract_data)

4000

In [3]:
all_terms = set()
for datum in abstract_data:
    terms = set(datum['var2name_map'].keys())
    # terms = {term.replace('not', '').strip(' ') for term in terms}
    all_terms |= terms
all_terms = list(all_terms)
print(all_terms)

['mammals', 'mersenne prime', 'vertebrates', 'lepidocteran', 'lepidoctoperan', 'snake', 'lepidopteran (butterfly)', 'warm-blooded', 'carnivorous animal', 'protostome', 'snakes', 'small', 'lepidopteran', 'lepidopterans', 'verebrate', 'spider', 'insect', 'negative numbers', 'painted', 'true', 'feline/carnivore', 'tabby cat', 'Rex', 'negative', 'mammal', 'Mersenne prime', 'nematode', 'integer (same as {symbol_3})', 'Max', 'feline', 'felines', 'Wren', 'Polly', 'Mersenne primes', 'lady', 'sheep', 'Alex', 'butterfly', 'tabbies', 'butterflies', 'vertebrate', 'Lepidopteran', 'complex number', 'bilaterian', '8191', 'chordate', 'arthropod', 'cold-blooded', 'prime', 'multicellular', 'butterfly (as lepidopteran)', '131071', 'painted lady butterfly', 'arthropods', 'animals', 'lepido', 'herbivore', 'Stella', 'lepido-insect', 'non-negative', 'cordate', 'tabby', 'bony', 'natural number', 'real', 'painted lady (butterfly variant)', 'bilaterians', 'real numbers', 'herbivorous', 'lepidoopteran', 'insects

In [4]:
from causalllm.prompt_utils import partial_replace
import random
random.seed(0)
anticommonsense_samples = []
for datum in abstract_data:
    n_terms = len(datum['var2name_map'])
    # random choose n terms from all_terms
    terms = random.sample(list(all_terms), n_terms)
    var2name = {f'symbol_{i + 1}': term for i, term in enumerate(terms)}
    new_datum = {
        'truth': datum['truth'],
        'truth_norm': datum['truth_norm'],
        'raw_prompt': partial_replace(datum['abstract_raw_prompt'], var2name),
        'cot': datum['cot'],
        'fewshot': datum['fewshot'],
        'cot_fewshot': datum['cot_fewshot'],
        'answer_suffix': datum['answer_suffix'],
        'direct_answer_suffix': datum['direct_answer_suffix'],
        'thinking_answer_suffix': datum['thinking_answer_suffix'],
        'direct_response': datum['direct_response'],
        'reasoning_response': partial_replace(datum['abstract_reasoning'], var2name),
    }
    anticommonsense_samples.append(new_datum)
anticommonsnense_file_path.write_text(json.dumps(anticommonsense_samples, indent=4))
print(len(anticommonsense_samples))


4000


In [5]:
# Sample 4 letters randomly from the alphabet
import string
random_letters = lambda: ''.join(random.sample(string.ascii_lowercase, random.randint(3, 5)))
print(random_letters())

lfnz


In [7]:
noncommonsense_samples = []
random_letters = lambda: ''.join(random.sample(string.ascii_lowercase, random.randint(3, 5)))
for datum in abstract_data:
    n_terms = len(datum['var2name_map'])
    # random choose n terms from all_terms
    terms = random.sample(list(all_terms), n_terms)
    var2name = {}
    for i, term in enumerate(terms):
        random_name = random_letters()
        while random_name in datum['abstract_raw_prompt'] or random_name in datum['abstract_reasoning'] or 'not' in random_name or 'all' in random_name or 'each' in random_name or 'every' in random_name:
            random_name = random_letters()
        var2name[f'symbol_{i + 1}'] = random_name
    new_datum = {
        'truth': datum['truth'],
        'truth_norm': datum['truth_norm'],
        'raw_prompt': partial_replace(datum['abstract_raw_prompt'], var2name),
        'cot': datum['cot'],
        'fewshot': datum['fewshot'],
        'cot_fewshot': datum['cot_fewshot'],
        'answer_suffix': datum['answer_suffix'],
        'direct_answer_suffix': datum['direct_answer_suffix'],
        'thinking_answer_suffix': datum['thinking_answer_suffix'],
        'direct_response': datum['direct_response'],
        'reasoning_response': partial_replace(datum['abstract_reasoning'], var2name),
    }
    noncommonsense_samples.append(new_datum)
noncommonsnense_file_path.write_text(json.dumps(noncommonsense_samples, indent=4))
print(len(noncommonsense_samples))

4000


4000