In [None]:

import pandas as pd
import os

# Input CSV handed to try_read() below; a relative path, so it is resolved
# against the notebook's current working directory.
DATA_PATH = "One_word_learning_Known.csv"

In [None]:
# Load dataset
def try_read(path):
    """Load a delimited text file, trying comma, tab, then pipe as separator.

    ``pd.read_csv`` rarely raises when given the wrong separator; it usually
    returns a frame with each whole line packed into a single column. A parse
    is therefore accepted outright only when it yields more than one column.
    If no separator splits the file, the first parse that succeeded is
    returned, so genuine single-column files still load.

    Parameters
    ----------
    path : str or path-like
        Location of the file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file.

    Raises
    ------
    ValueError
        If the file cannot be parsed with any separator. The last underlying
        error (for example a missing file) is chained as the cause.
    """
    fallback = None    # first successful parse that produced a single column
    last_error = None  # kept so the final ValueError can explain what went wrong
    for sep in [",", "\t", "|"]:
        try:
            frame = pd.read_csv(path, sep=sep)
        except Exception as exc:
            last_error = exc
            continue
        if frame.shape[1] > 1:
            return frame
        if fallback is None:
            fallback = frame
    if fallback is not None:
        return fallback
    raise ValueError("Unable to load file with common separators (,, \\t, |).") from last_error

# Read the raw dataset once and report its size and column names.
df = try_read(path=DATA_PATH)
print(
    f"Loaded: {len(df):,} rows | Columns: {df.columns.tolist()}"
)

Loaded: 14,705 rows | Columns: ['adult', 'child', 'child_filtered', 'correctness_score']


In [None]:
# Select Adult - Child Columns
# The first column whose lower-cased name contains "adult" / "child" wins.
# Labels are stringified first so non-string column labels do not break .lower().
adult_col = next((c for c in df.columns if "adult" in str(c).lower()), None)
child_col = next((c for c in df.columns if "child" in str(c).lower()), None)

# Fail with a readable message instead of the opaque KeyError that
# df[[None, None]] would raise when a column is missing.
missing_roles = [
    role for role, col in (("adult", adult_col), ("child", child_col)) if col is None
]
if missing_roles:
    raise KeyError(
        f"No column name containing {missing_roles} found in {df.columns.tolist()}"
    )

# Work on a copy with normalized column names so later cells can rely on
# "adult" / "child" regardless of how the source file spelled them.
df_filtered = df[[adult_col, child_col]].copy()
df_filtered.columns = ["adult", "child"]
print(f"Filtered Columns :{len(df_filtered):,} rows | Columns: {df_filtered.columns.to_list()}")

Filtered Columns :14,705 rows | Columns: ['adult', 'child']


In [None]:
# 1) Words child has actually produced
# Missing values are dropped before the string conversion: astype(str) would
# otherwise turn NaN into the literal word "nan". The normalized df_filtered
# frame is used so this cell does not depend on the raw file's column names.
child_words = set(df_filtered["child"].dropna().astype(str).str.lower().unique()) - {"unknown"}
CDI_words=['light', 'dog', 'hi', 'home', 'kitchen', 'milk', 'girl', 'break', 'tooth', 'flower', 'bed', 'soft', 'duck', 'tv', 'block', 'ouch', 'kiss', 'blanket', 'dish', 'meow', 'ball', 'fast', 'jump', 'spoon', 'candy', 'please', 'sock', 'eye', 'bear', 'sun', 'boat', 'fish', 'feed', 'pants', 'kick', 'grandma', 'cookie', 'night', 'pattycake', 'moon', 'box', 'bread', 'chair', 'rock', 'mouth', 'hand', 'apple', 'clock', "don't", 'cat', 'head', 'out', 'phone', 'eyes', 'away', 'mommy', 'lamp', 'how', 'leg', 'bird', 'smile', 'broken', 'lion', 'banana', 'toe', 'nose', 'other', 'plant', 'help', 'push', 'bottle', 'outside', 'dark', 'sing', 'cow', 'truck', 'kitty', 'bus', 'radio', 'rain', 'daddy', 'cereal', 'shoe', 'couch', 'toast', 'mouse', 'juice', 'pretty', 'hurt', 'finish', 'star', 'car', 'me', 'today', 'keys', 'book', 'doll', 'bath', 'who', 'baby', 'wait', 'hair', 'stroller', 'table', 'ear', 'hat', 'some', 'foot', 'cup', 'coat', 'big', 'i', 'babysitter']

# 2) CDI words, lower-cased into a set for fast membership tests
CDI_words = set(w.lower() for w in CDI_words)

# 3) Candidate "unknown" tokens = not in child_words and not in CDI
# Example: from some larger English vocab or your adult corpus
from collections import Counter

# Whitespace-tokenize every adult utterance (missing utterances skipped).
adult_tokens = []
for sent in df_filtered["adult"].dropna().astype(str):
    adult_tokens.extend(t.lower() for t in sent.split())

counts = Counter(adult_tokens)
# Keep moderately frequent words (seen at least 3 times) that are purely
# alphabetic, that the child never said, and that are not in CDI.
unknown_token_candidates = [
    w for w, c in counts.items()
    if w.isalpha() and w not in child_words and w not in CDI_words and c >= 3
]


In [None]:

import random
import pandas as pd

# Question templates used to synthesize adult utterances about a word the
# child is not expected to know. Each template has exactly one "{}" slot,
# which is filled with the token.

# -- "where is it" questions
location_templates = [
    "Do you know where {} is?",
    "Can you tell me where {} is located?",
    "Do you remember where {} is?",
    "Where exactly is {}?",
    "Can you point out where {} is?",
    "Do you know the location of {}?",
    "Where can we find {}?",
]

# -- "what does it mean" questions
concept_templates = [
    "Do you know what {} means?",
    "Can you explain what {} is?",
    "Do you remember what {} is about?",
    "What does {} mean?",
    "Could you tell me what {} is?",
    "What is {} used for?",
    "How would you describe {}?",
]

# -- "what is this thing" questions
object_templates = [
    "Do you know what {} is?",
    "What do you think {} is for?",
    "Have you heard of {} before?",
    "Do you know what people use {} for?",
]

# -- "how does it work" questions
process_templates = [
    "Do you know how {} works?",
    "Can you tell me how {} happens?",
    "Do you understand how {} is done?",
]

# Single flat pool, in the order: location, concept, object, process.
all_templates = [
    *location_templates,
    *concept_templates,
    *object_templates,
    *process_templates,
]

def generate_template_unknowns(n=1000, templates=None, tokens=None, seed=None):
    """Build synthetic (adult question, "unknown") pairs from templates.

    Each pair fills one randomly chosen template with one randomly chosen
    token; the child response is always the literal string "unknown".

    Parameters
    ----------
    n : int
        Number of pairs to generate. Zero or a negative value yields an
        empty frame.
    templates : sequence of str, optional
        Format strings with a single ``{}`` slot. Defaults to the
        module-level ``all_templates``.
    tokens : sequence of str, optional
        Words to insert into the templates. Defaults to the module-level
        ``unknown_token_candidates``.
    seed : int, optional
        When given, sampling uses a private ``random.Random(seed)`` so the
        output is reproducible and the global RNG state is left untouched.
        When omitted, the global ``random`` module is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``adult``, ``child``, ``source`` (always present, even when
        the frame is empty); ``source`` is always "template".

    Raises
    ------
    IndexError
        If ``n`` is positive and either pool is empty.
    """
    if templates is None:
        templates = all_templates
    if tokens is None:
        tokens = unknown_token_candidates

    if n > 0:
        # Same exception type random.choice would raise, but with a message
        # that says which pool is empty.
        if len(templates) == 0:
            raise IndexError("Cannot generate template pairs: no templates available.")
        if len(tokens) == 0:
            raise IndexError("Cannot generate template pairs: no candidate tokens available.")

    rng = random if seed is None else random.Random(seed)

    pairs = []
    for _ in range(n):
        # Template is drawn before the token, so a given seed always yields
        # the same sequence of draws.
        tmpl = rng.choice(templates)
        token = rng.choice(tokens)
        pairs.append({
            "adult": tmpl.format(token),
            "child": "unknown",
            "source": "template"
        })
    # Explicit columns keep the schema stable when no pairs were generated.
    return pd.DataFrame(pairs, columns=["adult", "child", "source"])

# Seed the global RNG so the synthetic pairs (and the CSV written later) are
# identical on every Restart & Run All.
random.seed(42)
template_unknown_df = generate_template_unknowns(n=1000)


In [None]:
import spacy

# Load the spaCy pipeline named "en_core_web_sm" into the module-level `nlp`;
# generate_unknown_pairs_corpus() calls it to tokenize each adult utterance.
# NOTE(review): presumably the model package must be installed beforehand — confirm.
nlp = spacy.load("en_core_web_sm")

In [None]:
def generate_unknown_pairs_corpus(df, CDI_words, child_words, max_unknowns=4000, tokenizer=None):
    """Label adult utterances containing an unfamiliar word as "unknown".

    An utterance qualifies when at least one of its alphabetic tokens is in
    neither ``child_words`` nor ``CDI_words``. Rows are scanned in order and
    scanning stops once ``max_unknowns`` pairs have been collected.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``adult`` column of utterances. Missing utterances are
        skipped.
    CDI_words : collection of str
        Lower-cased words treated as known.
    child_words : collection of str
        Lower-cased words the child has produced.
    max_unknowns : int or None
        Maximum number of pairs to return; ``None`` disables the cap and
        zero or a negative value returns an empty frame.
    tokenizer : callable, optional
        Called with the lower-cased utterance and must return an iterable of
        tokens exposing ``.text`` and ``.is_alpha``. Defaults to the
        module-level spaCy pipeline ``nlp``.

    Returns
    -------
    pandas.DataFrame
        Columns ``adult`` (the original, un-lowercased utterance), ``child``
        (always "unknown") and ``source`` (always "corpus_loose"). The columns
        are present even when no row qualifies.
    """
    if tokenizer is None:
        tokenizer = nlp

    unknown_pairs = []
    for adult_raw in df["adult"]:
        # Check the cap before doing any work so max_unknowns=0 yields nothing.
        if max_unknowns is not None and len(unknown_pairs) >= max_unknowns:
            break

        # Without this, a missing utterance would be stringified to "nan"
        # and tokenized as if it were a real word.
        if pd.isna(adult_raw):
            continue

        doc = tokenizer(str(adult_raw).lower())

        # Short-circuits on the first alphabetic token unknown to both lists.
        has_unfamiliar_token = any(
            tok.is_alpha
            and tok.text.lower() not in child_words
            and tok.text.lower() not in CDI_words
            for tok in doc
        )

        if has_unfamiliar_token:
            unknown_pairs.append({
                "adult": adult_raw,
                "child": "unknown",
                "source": "corpus_loose"
            })

    # Explicit columns keep the schema stable when nothing matched.
    return pd.DataFrame(unknown_pairs, columns=["adult", "child", "source"])

# Scan the filtered corpus for utterances containing a word outside both
# vocabularies, stopping after 4000 matches, then report how many were kept.
corpus_unknown_df = generate_unknown_pairs_corpus(
    df=df_filtered,
    CDI_words=CDI_words,
    child_words=child_words,
    max_unknowns=4000,
)
print(len(corpus_unknown_df))

4000


In [None]:
# The first 11000 rows of the filtered corpus keep their real child response.
known_df = df_filtered.head(11000).copy()

# Stack the three sources in order: known rows, corpus-derived unknowns,
# template-generated unknowns. Only the shared adult/child columns are kept
# and the index is renumbered from zero.
one_word_learning_df = pd.concat(
    [
        frame[["adult", "child"]]
        for frame in (known_df, corpus_unknown_df, template_unknown_df)
    ],
    ignore_index=True,
)

# Persist the combined training set and report its size.
one_word_learning_df.to_csv("one_word_learning.csv", index=False)
print(len(one_word_learning_df))


16000
