In [None]:
import json
import os

import torch
from tqdm import tqdm  # For progress bars
from transformers import AutoModelForCausalLM, AutoTokenizer

# ------------------------------
# Load model and tokenizer
# ------------------------------
model_name = "meta-llama/Llama-3.2-1B"

# Never hardcode the access token in the notebook. Read it from the
# environment; when HF_TOKEN is unset this is None and the library falls back
# to whatever credentials it can find on its own (e.g. a cached login).
hf_token = os.environ.get("HF_TOKEN")

# Pick the device first so that the dtype can depend on it. Half precision is
# only used on GPU; on CPU the weights are kept in float32.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_dtype = torch.float16 if device.type == "cuda" else torch.float32

tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
if tokenizer.pad_token is None:
    # The tokenizer ships without a pad token; reuse EOS so padding works.
    tokenizer.pad_token = tokenizer.eos_token

# The model is placed explicitly with .to(device) below, so device_map="auto"
# is not passed: mixing automatic dispatch with a manual .to() is redundant
# and can conflict with the automatic placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=model_dtype,
    token=hf_token,
)
model.to(device)
model.eval()
model.config.pad_token_id = tokenizer.eos_token_id  # Set pad_token_id to suppress warnings

# ------------------------------
# Helper function to generate text
# ------------------------------
def generate_text(prompt, max_new_tokens=50, return_full_text=True):
    """Sample a continuation of ``prompt`` from the loaded causal LM.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. Decoding is
    stochastic (top-k / top-p sampling), so repeated calls can return
    different text unless the torch RNG is seeded by the caller.

    Args:
        prompt: Text to condition the model on.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt.
        return_full_text: When True (the default, matching the historical
            behaviour) the decoded string contains the prompt followed by the
            continuation. When False only the newly generated tokens are
            decoded, which is what answer-matching code usually wants so that
            the prompt itself cannot produce a match.

    Returns:
        The decoded text with special tokens removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    sequence = output_ids[0]
    if not return_full_text:
        # For a decoder-only model the returned sequence starts with the
        # prompt tokens; drop them and keep only what was generated.
        prompt_length = inputs["input_ids"].shape[-1]
        sequence = sequence[prompt_length:]
    generated = tokenizer.decode(sequence, skip_special_tokens=True)
    return generated

# ------------------------------
# Load dataset from JSON file
# ------------------------------
# Set to a positive integer to keep only the first N samples (faster
# iteration while developing); leave as None to use every sample in the file.
MAX_SAMPLES = None

with open("wikidata_counterfact.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

if MAX_SAMPLES is not None:
    # Slicing assumes the JSON top level is a list of samples.
    dataset = dataset[:MAX_SAMPLES]

# ------------------------------
# Stage 1: Few-shot main fact generation
# Combine all main prompts (each: original prompt + target_new) into a single call.
# ------------------------------
# The pieces carry no trailing whitespace; the single-space join below is the
# only separator, so the final prompt has no doubled or trailing spaces.
combined_blocks = ['Memorize the following facts:']
main_prompts = []  # Each element is the input for main fact generation
for i, item in enumerate(tqdm(dataset, desc="Building combined main prompt")):
    # A "fact" is the edit prompt completed with its new (counterfactual) target.
    main_prompt = item["prompt"] + " " + item["target_new"]
    main_prompts.append(main_prompt)
    block = f"Fact {i+1}: {main_prompt}."
    combined_blocks.append(block)
combined_prompt = " ".join(combined_blocks)
# Concatenate instead of passing two arguments to print(), which would insert
# a stray space in front of the prompt.
print('\n' + combined_prompt)

Building combined main prompt: 100%|██████████| 10/10 [00:00<00:00, 39125.97it/s]


 Memorize the following facts:  Fact 1: The name of the country of citizenship of Leonardo DiCaprio is Syria.  Fact 2: The name of the country which Academy Award for Best Picture is associated with is Wassoulou Empire.  Fact 3: The name of the spouse of Ron DeSantis is Carol Chu.  Fact 4: The name of the child of Kanye West is William Walker Scranton.  Fact 5: The names of the siblings of Janice Dickinson are Antoine-Jean-Matthieu Séguier.  Fact 6: The gender of Rowan Atkinson is neutral sex.  Fact 7: Big Mouth is followed by 1977–78 French Division 2.  Fact 8: The name of the anthem of Philippines is Hatikvah.  Fact 9: The name of the country of citizenship of Jerrod Carmichael is Terengganu.  Fact 10: The name of the composer of Vikram is Johnny Reine. 





In [None]:
# generate_text samples stochastically; seed torch so that the reported
# accuracy is reproducible from one run of this cell to the next.
EVAL_SEED = 0
torch.manual_seed(EVAL_SEED)


def _strip_prompt_echo(generated, full_prompt, eval_prompt):
    """Return the part of ``generated`` that follows the prompt.

    The generated text may begin with an echo of the prompt. Matching answers
    against that echo would count facts listed in the context as predictions,
    so only the continuation is kept. If the echo cannot be located the text
    is returned unchanged.
    """
    if generated.startswith(full_prompt):
        return generated[len(full_prompt):]
    # Decoding may not reproduce the context byte-for-byte; fall back to the
    # newline-separated eval prompt that ends the full prompt.
    marker = "\n" + eval_prompt
    marker_pos = generated.find(marker)
    if marker_pos != -1:
        return generated[marker_pos + len(marker):]
    return generated


def _answer_candidates(ground_truth):
    """Normalise a ``ground_truth`` entry to a flat list of non-empty strings.

    Accepts a single string, a list of strings, or a list of lists (in which
    case the first inner list is used).
    """
    if isinstance(ground_truth, str):
        # A bare string must not be iterated character by character.
        ground_truth = [ground_truth]
    elif isinstance(ground_truth, list) and len(ground_truth) > 0 and isinstance(ground_truth[0], list):
        ground_truth = ground_truth[0]
    # Drop blank entries: an empty string is a substring of every prediction.
    return [str(candidate) for candidate in ground_truth if str(candidate).strip()]


total_eval_items = 0
correct_eval_items = 0

print("\n--- Portability Evaluation ---")
for sample in dataset:
    print("\nSubject:", sample["subject"])
    portability = sample.get("portability", {})
    # Consider categories: Reasoning and Subject_Aliasing (if present)
    for category in ["Reasoning", "Subject_Aliasing"]:
        if category in portability:
            for eval_item in portability[category]:
                total_eval_items += 1
                # Forming a test prompt: combined context + additional eval prompt
                test_prompt = combined_prompt + "\n" + eval_item["prompt"]
                raw_output = generate_text(test_prompt, max_new_tokens=10)
                # Score only what the model produced, never the echoed context.
                prediction = _strip_prompt_echo(raw_output, test_prompt, eval_item["prompt"])
                candidates = _answer_candidates(eval_item["ground_truth"])
                # Case-insensitive substring match against any accepted answer.
                prediction_lower = prediction.lower()
                is_correct = any(candidate.lower() in prediction_lower for candidate in candidates)
                if is_correct:
                    correct_eval_items += 1
                print("  Eval Prompt:", eval_item["prompt"])
                print("  Prediction:", prediction)
                print("  Ground Truth:", candidates)
                print("  Correct:", is_correct)
                print("  -----")

# Guard against division by zero when no sample has portability items.
accuracy = correct_eval_items / total_eval_items if total_eval_items > 0 else 0
print("\nOverall Portability Accuracy:", accuracy)


--- Portability Evaluation ---

Subject: Leonardo DiCaprio
  Eval Prompt: The name of the currency in the country of citizenship of Leonardo DiCaprio is
  Prediction: Memorize the following facts:  Fact 1: The name of the country of citizenship of Leonardo DiCaprio is Syria.  Fact 2: The name of the country which Academy Award for Best Picture is associated with is Wassoulou Empire.  Fact 3: The name of the spouse of Ron DeSantis is Carol Chu.  Fact 4: The name of the child of Kanye West is William Walker Scranton.  Fact 5: The names of the siblings of Janice Dickinson are Antoine-Jean-Matthieu Séguier.  Fact 6: The gender of Rowan Atkinson is neutral sex.  Fact 7: Big Mouth is followed by 1977–78 French Division 2.  Fact 8: The name of the anthem of Philippines is Hatikvah.  Fact 9: The name of the country of citizenship of Jerrod Carmichael is Terengganu.  Fact 10: The name of the composer of Vikram is Johnny Reine. 
The name of the currency in the country of citizenship of Leonardo