In [1]:
!pip install transformers accelerate sentencepiece


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [8]:
import os
os.listdir('/kaggle/input/one-word-learning')

['one_word_learning.csv']

In [9]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import random


df = pd.read_csv("/kaggle/input/one-word-learning/one_word_learning.csv")
df.head()
print(f"Loaded: {len(df)} rows | Columns: {df.columns.tolist()}")

# adult/child columns
adult_col = next(c for c in df.columns if "adult" in c.lower())
child_col = next(c for c in df.columns if "child" in c.lower())

df_filtered = df[[adult_col, child_col]].copy()
df_filtered.columns = ["adult", "child"]

adults = df_filtered["adult"].astype(str).tolist()
children = df_filtered["child"].astype(str).tolist()

adult_train, adult_test, child_train, child_test = train_test_split(
    adults, children, test_size=0.2, random_state=42, shuffle=True
)

train_pairs = [{"adult": a, "child": c} for a, c in zip(adult_train, child_train)]
test_pairs = [{"adult": a, "child": c} for a, c in zip(adult_test, child_test)]

print(f"Train: {len(train_pairs)}, Test: {len(test_pairs)}")


Loaded: 16000 rows | Columns: ['adult', 'child']
Train: 12800, Test: 3200


In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "Qwen/Qwen2.5-1.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

2025-12-10 19:23:16.273537: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765394596.465206      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765394596.519784      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [11]:
# K-short Learning
def build_prompt(train_examples, adult_utterance, k=10):
    sample = random.sample(train_examples, k)
    
    ex_lines = "\n".join([
        f"{i+1}. {e['adult']} -> {e['child']}"
        for i, e in enumerate(sample)
    ])

    prompt = (
        "You are an early child.\n"
        "Use ONLY the provided examples to answer the question.\n"
        "Each example shows something an adult says and what the child repeats.\n\n"
        f"Examples:\n{ex_lines}\n\n"
        f"Q: {adult_utterance}\nA:"
        " I REPEAT DO NOT USE YOUR OWN PRIOR KNOWLEDGE, ONLY THE EXAMPLES PROVIDED!!"
    )
    return prompt


In [12]:
def run_llm_batch(prompts, max_new_tokens=10):
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        temperature=0.0,
    )

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    
    answers = [txt.split("A:")[-1].strip() for txt in decoded]
    return answers



In [None]:
results = []
batch_size = 16  
num_test = len(test_pairs)

for start in range(0, num_test, batch_size):
    end = min(start + batch_size, num_test)
    batch_pairs = test_pairs[start:end]

    # build batch prompt 
    batch_prompts = [
        build_prompt(train_pairs, t["adult"], k=10)
        for t in batch_pairs
    ]

    # run the model on the batch
    batch_answers = run_llm_batch(batch_prompts, max_new_tokens=10)

    # collect results
    for t, ans in zip(batch_pairs, batch_answers):
        results.append({"adult": t["adult"], "expected": t["child"], "llm": ans})
        print(f"{t['adult']} → {ans}")


with open("qwen_results.json", "w") as f:
    json.dump(results, f, indent=2)


In [22]:
def evaluate_results(input_path, output_path):
    data = json.load(open(input_path))
    results = []

    for item in data:
        adult = item["adult"]
        expected = item["expected"].lower().strip()
        llm = item["llm"].lower().strip()

        hallucinated = False
        label = "unknown"

        if expected == 'unkown':
            total_unkown += 1

        uncertainty_phrases = [
            "unknown", "not sure", "don't know", "no information",
            "couldn't find", "can't determine", "no example",
            "based on the provided", "not enough information",
            "i don't have", "unable to"
        ]
        expresses_uncertainty = any(p in llm for p in uncertainty_phrases)

        if expected != "unknown" and expected in llm and not expresses_uncertainty:
            label = "correct"
        elif expected != "unknown" and llm not in ["", expected] and not expresses_uncertainty:
            label = "incorrect"
        elif expected == "unknown" and expresses_uncertainty:
            label = "uncertain"
        elif expected == "unknown" and not expresses_uncertainty and llm not in ["", "unknown"]:
            label = "hallucination"
            hallucinated = True
        elif expected != "unknown" and expresses_uncertainty:
            label = "false_uncertainty"
        elif llm == "" or llm == "unknown":
            label = "no_output"

        results.append({
            "adult": adult,
            "expected": expected,
            "llm": llm,
            "label": label,
            "hallucinated": hallucinated
        })

    summary = {
        "correct": sum(r["label"] == "correct" for r in results),
        "incorrect": sum(r["label"] == "incorrect" for r in results),
        "uncertain": sum(r["label"] == "uncertain" for r in results),
        "hallucination": sum(r["label"] == "hallucination" for r in results),
        "false_uncertainty": sum(r["label"] == "false_uncertainty" for r in results),
        "no_output": sum(r["label"] == "no_output" for r in results)
    }

    print("=== Evaluation Summary ===")
    for k, v in summary.items():
        print(f"{k:20s} {v}")


    # Separate lists
    known_items = [r for r in results if r['expected'] != "unknown"]
    unknown_items = [r for r in results if r['expected'] == "unknown"]
    
    # Known Accuracy
    known_correct = sum(r["label"] == "correct" for r in known_items)
    known_total = len(known_items)
    known_accuracy = known_correct / known_total * 100 if known_total > 0 else 0
    
    # Unknown Accuracy (avoid hallucinations)
    unknown_correct = sum(r["label"] in ["uncertain", "no_output"] for r in unknown_items)
    unknown_total = len(unknown_items)
    unknown_accuracy = unknown_correct / unknown_total * 100 if unknown_total > 0 else 0
    
    print("\n=== Accuracy ===")
    print(f"Known Accuracy:   {known_accuracy:.1f}% ({known_correct}/{known_total})")
    print(f"Unknown Accuracy: {unknown_accuracy:.1f}% ({unknown_correct}/{unknown_total})")

    json.dump(results, open(output_path, "w"), indent=2)
    
    return results

In [23]:



evaluate_results("qwen_results.json", "qwen_scored.json")

=== Evaluation Summary ===
correct              1572
incorrect            569
uncertain            53
hallucination        879
false_uncertainty    115
no_output            12

=== Accuracy ===
Known Accuracy:   69.3% (1572/2267)
Unknown Accuracy: 5.8% (54/933)


[{'adult': '- Can I help you?',
  'expected': 'help',
  'llm': 'i repeat do not use your own prior knowledge, only the examples provided!! early child:\n\nunknown to be helpful, i would',
  'label': 'false_uncertainty',
  'hallucinated': False},
 {'adult': "So it's back in the box.",
  'expected': 'box',
  'llm': 'i repeat do not use your own prior knowledge, only the examples provided!! early child would say "box" as they repeat',
  'label': 'correct',
  'hallucinated': False},
 {'adult': "WELL, DON'T WORRY.",
  'expected': "don't",
  'llm': "i repeat do not use your own prior knowledge, only the examples provided!! early child:\n\ni don't worry. �",
  'label': 'correct',
  'hallucinated': False},
 {'adult': 'the flower .',
  'expected': 'flower',
  'llm': 'i repeat do not use your own prior knowledge, only the examples provided!! early child:\n\nflower\nbaby\nnight\npig',
  'label': 'correct',
  'hallucinated': False},
 {'adult': "that's a pretty .",
  'expected': 'pretty',
  'llm': 