In [None]:
import pandas as pd
import os

# Location of the input CSV. The path is relative, so it is resolved against
# the kernel's current working directory when the file is read.
DATA_PATH = "one_word_learning.csv"

In [None]:
# Load dataset
def try_read(path, separators=(",", "\t", "|")):
    """Read a delimited text file, trying several separators in turn.

    ``pd.read_csv`` rarely raises when handed the wrong separator; it simply
    returns a single-column frame. A parse is therefore only accepted straight
    away when it yields more than one column; otherwise the next separator is
    tried.

    Args:
        path: Path of the file to read.
        separators: Candidate separators, tried in order.

    Returns:
        The first parse that has more than one column. If no separator
        produces more than one column, the first successful parse is returned
        (the file is then treated as genuinely single-column).

    Raises:
        ValueError: If the file cannot be read with any separator. The last
            underlying error is chained as ``__cause__``.
    """
    single_column = None
    last_error = None
    for sep in separators:
        try:
            frame = pd.read_csv(path, sep=sep)
        except Exception as exc:  # keep trying, but remember why this one failed
            last_error = exc
            continue
        if frame.shape[1] > 1:
            return frame
        if single_column is None:
            single_column = frame
    if single_column is not None:
        return single_column
    raise ValueError("Unable to load file with common separators (,, \\t, |).") from last_error

# Read the dataset and report how many rows and which columns came back.
df = try_read(DATA_PATH)
print("Loaded: {:,} rows | Columns: {}".format(len(df), df.columns.tolist()))

Loaded: 16,000 rows | Columns: ['adult', 'child']


In [None]:
# Select Adult - Child Columns
# The first column whose lower-cased name contains "adult" (resp. "child") wins.
adult_col = next((c for c in df.columns if "adult" in c.lower()), None)
child_col = next((c for c in df.columns if "child" in c.lower()), None)

# Fail with an explicit message instead of letting df[[None, ...]] raise a
# cryptic "[None] not in index" KeyError further down.
if adult_col is None or child_col is None:
    raise KeyError(
        "Expected one column containing 'adult' and one containing 'child'; "
        f"found columns: {df.columns.tolist()}"
    )

# Work on a copy with canonical column names so later cells can rely on them.
df_filtered = df[[adult_col, child_col]].copy()
df_filtered.columns = ["adult", "child"]
print(f"Filtered Columns :{len(df_filtered):,} rows | Columns: {df_filtered.columns.to_list()}")

Filtered Columns :16,000 rows | Columns: ['adult', 'child']


In [None]:
from sklearn.model_selection import train_test_split

# Adult utterances are the prompts, child words the completions (both as str arrays).
prompts = df_filtered["adult"].astype(str).values
completions = df_filtered["child"].astype(str).values

# Stage 1: hold out 20% of the rows; the remaining 80% is the training split.
p_train, p_temp, c_train, c_temp = train_test_split(
    prompts,
    completions,
    shuffle=True,
    random_state=42,
    test_size=0.2,
)

# Stage 2: halve the hold-out -> 10% validation and 10% test of the full data.
p_val, p_test, c_val, c_test = train_test_split(
    p_temp,
    c_temp,
    shuffle=True,
    random_state=42,
    test_size=0.5,
)

print(*map(len, (p_train, p_val, p_test)))

12800 1600 1600


In [None]:
import torch
import random
import numpy as np

def set_seed(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    When CUDA is available, every CUDA device's generator is seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(42)

In [None]:
# Format each (adult, child) pair as a single training sequence for a decoder-only model.
# Template: "### Instruction:\n{adult}\n### Response:\n{child}{eos_token}"
import pandas as pd

def format_example(prompt, completion, eos_token="</s>"):
    """Render one prompt/completion pair as a single instruction-style string.

    The result is an "### Instruction:" header followed by the prompt on its
    own line, then a "### Response:" header followed by the completion with
    ``eos_token`` appended directly after it.
    """
    instruction_part = f"### Instruction:\n{prompt}\n"
    response_part = f"### Response:\n{completion}{eos_token}"
    return instruction_part + response_part

# Pair every prompt with its completion and render each pair with format_example.
train_texts = list(map(format_example, p_train, c_train))
val_texts = list(map(format_example, p_val, c_val))
test_texts = list(map(format_example, p_test, c_test))

# Show the start of one formatted example plus the size of each split.
print(train_texts[0][:200])
print("Train/Val/Test sizes:", *map(len, (train_texts, val_texts, test_texts)))

### Instruction:
a blue car .
### Response:
car</s>
Train/Val/Test sizes: 12800 1600 1600


In [None]:
# Tokenization
import torch
from transformers import AutoTokenizer

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1) Load the pretrained tokenizer identified by `model_name`.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# If the tokenizer ships without a pad token, reuse its EOS token so that
# padding="max_length" can be used in the tokenization helpers.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

max_length = 128  # per-example token budget used by tokenize_texts (pad/truncate target)

def tokenize_texts(texts):
    """Tokenize formatted examples into shifted next-token-prediction tensors.

    Relies on the module-level ``tokenizer``, ``max_length`` and ``device``.

    Args:
        texts: List of strings to tokenize.

    Returns:
        Dict of tensors moved to ``device``:
            "input_ids":      every token except the last one of each row,
            "attention_mask": the tokenizer's mask trimmed to the same length,
            "labels":         every token except the first one (the next-token
                              targets), with padded positions set to -100.
    """
    enc = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    input_ids = enc["input_ids"]          # (B, T)
    attention_mask = enc["attention_mask"]

    # Shift for next-token prediction
    inp = input_ids[:, :-1]
    labels = input_ids[:, 1:].clone()

    # Ignore padding in the loss. The mask is used rather than the pad token
    # id because the pad token may be aliased to EOS (see the tokenizer setup);
    # comparing ids would then also discard genuine EOS targets.
    labels[attention_mask[:, 1:] == 0] = -100

    # Trim attention mask to match inp length
    attention_mask = attention_mask[:, :-1]

    return {
        "input_ids": inp.to(device),
        "attention_mask": attention_mask.to(device),
        "labels": labels.to(device),
    }

# Tokenize each split; tokenize_texts returns tensors already moved to `device`.
train_data = tokenize_texts(train_texts)
val_data   = tokenize_texts(val_texts)
test_data  = tokenize_texts(test_texts)

# Sanity check: print the shape of every tensor in the training split.
for k, v in train_data.items():
    print(k, v.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

input_ids torch.Size([12800, 127])
attention_mask torch.Size([12800, 127])
labels torch.Size([12800, 127])


In [None]:
import pandas as pd
import torch


# Label vocabulary: every distinct lower-cased child word in the whole dataset
# (all splits), in sorted order, mapped to consecutive integer ids.
all_children = df_filtered["child"].astype(str).str.lower()
child_vocab = sorted(all_children.unique())
label2id = dict(zip(child_vocab, range(len(child_vocab))))
id2label = dict(enumerate(child_vocab))
num_labels = len(child_vocab)

print("Num labels:", num_labels)
print("First few labels:", child_vocab[:10])

# 2) Turn split child columns into label tensors
def labels_from_series(child_series):
    """Map each child word (lower-cased) to its id in ``label2id``.

    Returns a 1-D ``torch.long`` tensor with one id per element of
    ``child_series``.
    """
    ids = []
    for child in child_series:
        ids.append(label2id[str(child).lower()])
    return torch.tensor(ids, dtype=torch.long)

# Encode the child words of each split as integer label tensors.
y_train, y_val, y_test = (
    labels_from_series(pd.Series(words)) for words in (c_train, c_val, c_test)
)

print("y_train shape:", y_train.shape)


Num labels: 112
First few labels: ['apple', 'away', 'baby', 'babysitter', 'ball', 'banana', 'bath', 'bear', 'bed', 'big']
y_train shape: torch.Size([12800])


In [None]:
import numpy as np
import torch
# Per-class loss weights: every label starts at 1.0 and the "unknown" label,
# when present in the vocabulary, is down-weighted to 0.5.
class_weights = np.ones(num_labels, dtype=np.float32)

# Integer index of "unknown", or None when that label is absent
unknown_id = label2id.get("unknown")

if unknown_id is not None:
    class_weights[unknown_id] = 0.5

class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32, device=device)

# Indexing the tensor with None adds a dimension instead of selecting one
# element, so float(...) would fail; only report the weight when it exists.
if unknown_id is not None:
    print("Weight for 'unknown':", float(class_weights_tensor[unknown_id]))
else:
    print("No 'unknown' label in the vocabulary; all class weights are 1.0")


Weight for 'unknown': 0.5


In [None]:

max_len_nn = 32

def tokenize_adults(texts):
    """Tokenize adult utterances for the classifier.

    Every text is padded or truncated to ``max_len_nn`` tokens using the
    module-level ``tokenizer``. Returns the ``(input_ids, attention_mask)``
    pair of PyTorch tensors.
    """
    encoded = tokenizer(
        list(texts),
        return_tensors="pt",
        max_length=max_len_nn,
        truncation=True,
        padding="max_length",
    )
    ids, mask = encoded["input_ids"], encoded["attention_mask"]
    return ids, mask

# Tokenize the adult utterances of each split, in train / val / test order.
(X_train_ids, X_train_mask), (X_val_ids, X_val_mask), (X_test_ids, X_test_mask) = (
    tokenize_adults(split) for split in (p_train, p_val, p_test)
)

print("X_train_ids:", X_train_ids.shape)
print("X_val_ids:", X_val_ids.shape)
print("X_test_ids:", X_test_ids.shape)


X_train_ids: torch.Size([12800, 32])
X_val_ids: torch.Size([1600, 32])
X_test_ids: torch.Size([1600, 32])


In [None]:
from torch.utils.data import Dataset, DataLoader

class AdultChildDataset(Dataset):
    """Map-style dataset over pre-tokenized adult utterances and their labels.

    Args:
        input_ids: Tensor whose first dimension indexes the examples.
        attention_mask: Mask aligned row-for-row with ``input_ids``.
        labels: One label per example.

    Raises:
        ValueError: If ``attention_mask`` or ``labels`` does not have the same
            number of rows as ``input_ids``. Without this check a mismatch
            only surfaces as an IndexError part-way through an epoch, or
            silently ignores trailing rows.
    """

    def __init__(self, input_ids, attention_mask, labels):
        n_rows = input_ids.size(0)
        for name, values in (("attention_mask", attention_mask), ("labels", labels)):
            if len(values) != n_rows:
                raise ValueError(
                    f"{name} has {len(values)} rows but input_ids has {n_rows}"
                )
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        """Number of examples (rows of ``input_ids``)."""
        return self.input_ids.size(0)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict keyed like a model batch."""
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx],
        }

# One dataset per split, pairing tokenized adult utterances with label ids.
train_ds = AdultChildDataset(input_ids=X_train_ids, attention_mask=X_train_mask, labels=y_train)
val_ds = AdultChildDataset(input_ids=X_val_ids, attention_mask=X_val_mask, labels=y_val)
test_ds = AdultChildDataset(input_ids=X_test_ids, attention_mask=X_test_mask, labels=y_test)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader_nn = DataLoader(dataset=train_ds, shuffle=True, batch_size=64)
val_loader_nn = DataLoader(dataset=val_ds, shuffle=False, batch_size=64)
test_loader_nn = DataLoader(dataset=test_ds, shuffle=False, batch_size=64)


In [None]:
import torch.nn as nn
import torch.nn.functional as F

class ChildLexiconNN(nn.Module):
    """Mean-pooled embedding classifier over token ids.

    Token embeddings are averaged over the non-masked positions, layer-normed,
    passed through one hidden ReLU layer with dropout, and projected to
    ``num_labels`` logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding dimension.
        num_labels: Number of output classes.
        pad_id: Token id used as ``padding_idx`` of the embedding.
        hidden_dim: Width of the hidden layer (previously fixed at 256).
        dropout: Dropout probability after the hidden layer (previously fixed
            at 0.1).
    """

    def __init__(self, vocab_size, d_model=128, num_labels=10, pad_id=0,
                 hidden_dim=256, dropout=0.1):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model, padding_idx=pad_id)
        self.ln = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(d_model, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape (B, num_labels) for a batch of token ids.

        Args:
            input_ids: (B, T) token ids.
            attention_mask: (B, T) mask, non-zero where a token is real.
        """
        x = self.embed(input_ids)                 # (B, T, d_model)

        mask = attention_mask.unsqueeze(-1)       # (B, T, 1)
        x = x * mask                              # zero-out pads

        # clamp avoids a division by zero for rows whose mask is all zeros
        lengths = mask.sum(dim=1).clamp(min=1)    # (B, 1)
        pooled = x.sum(dim=1) / lengths           # (B, d_model)

        h = self.ln(pooled)
        h = self.dropout(F.relu(self.fc1(h)))
        logits = self.fc2(h)                      # (B, num_labels)
        return logits


In [None]:
# Build the classifier with an embedding row per tokenizer vocabulary entry.
child_model = ChildLexiconNN(
    vocab_size=tokenizer.vocab_size,
    num_labels=num_labels,
    pad_id=tokenizer.pad_token_id,
    d_model=128,
)
child_model = child_model.to(device)

# AdamW over all parameters; the loss applies the per-class weights.
optimizer_nn = torch.optim.AdamW(params=child_model.parameters(), lr=3e-4)
criterion_nn = nn.CrossEntropyLoss(weight=class_weights_tensor)


In [None]:
def train_child_epoch(model, loader, log_model=False):
    """Run one optimisation pass over ``loader``.

    Uses the module-level ``criterion_nn``, ``optimizer_nn`` and ``device``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        loader: Iterable of batches with "input_ids", "attention_mask" and
            "labels" entries.
        log_model: Print the model architecture before training. Off by
            default: printing it on every epoch buried the per-epoch metrics
            under repeated copies of the same text.

    Returns:
        ``(mean_loss, accuracy)`` averaged over all examples seen.
    """
    if log_model:
        print("Model Config:", model)
    model.train()
    total_loss, total_correct, total = 0.0, 0, 0
    for batch in loader:
        input_ids = batch["input_ids"].to(device)
        attn_mask = batch["attention_mask"].to(device)
        labels    = batch["labels"].to(device)

        logits = model(input_ids, attn_mask)
        loss = criterion_nn(logits, labels)

        optimizer_nn.zero_grad()
        loss.backward()
        optimizer_nn.step()

        # loss.item() is the batch mean; weight by batch size so the epoch
        # average stays correct when the last batch is smaller.
        total_loss += loss.item() * labels.size(0)
        preds = logits.argmax(dim=-1)
        total_correct += (preds == labels).sum().item()
        total += labels.size(0)

    return total_loss / total, total_correct / total

@torch.no_grad()
def eval_child(model, loader):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Uses the module-level ``criterion_nn`` and ``device``. Returns
    ``(mean_loss, accuracy)`` averaged over all examples in the loader.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    for batch in loader:
        ids, mask, targets = (
            batch[key].to(device) for key in ("input_ids", "attention_mask", "labels")
        )
        scores = model(ids, mask)
        batch_size = targets.size(0)

        # Weight the batch-mean loss by batch size before accumulating.
        loss_sum += criterion_nn(scores, targets).item() * batch_size
        n_correct += (scores.argmax(dim=-1) == targets).sum().item()
        n_seen += batch_size

    return loss_sum / n_seen, n_correct / n_seen

# Training with early stopping on validation loss: stop after `patience`
# consecutive epochs without an improvement of more than 1e-3.
num_epochs_nn = 50
patience = 3

best_val_loss = float("inf")
best_state = None
no_improve = 0

# Per-epoch metric history
train_hist_loss, val_hist_loss = [], []
train_hist_acc,  val_hist_acc  = [], []

for epoch in range(1, num_epochs_nn + 1):
    tr_loss, tr_acc = train_child_epoch(child_model, train_loader_nn)
    va_loss, va_acc = eval_child(child_model, val_loader_nn)

    train_hist_loss.append(tr_loss)
    val_hist_loss.append(va_loss)
    train_hist_acc.append(tr_acc)
    val_hist_acc.append(va_acc)

    print(f"Epoch {epoch}: "
          f"train_loss={tr_loss:.3f}, train_acc={tr_acc:.3f}, "
          f"val_loss={va_loss:.3f}, val_acc={va_acc:.3f}")

    if va_loss < best_val_loss - 1e-3:
        best_val_loss = va_loss
        # state_dict() hands back references to the live parameter tensors,
        # which later optimizer steps overwrite in place. Clone them so the
        # snapshot really holds this epoch's weights.
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in child_model.state_dict().items()
        }
        no_improve = 0
        print(f"  ↳ New best val_loss {best_val_loss:.3f}, saving state")
    else:
        no_improve += 1
        print(f"  ↳ No improvement ({no_improve}/{patience})")
        if no_improve >= patience:
            print("Early stopping triggered.")
            break

# restore best weights
if best_state is not None:
    child_model.load_state_dict(best_state)

print("Best val_loss:", best_val_loss)

Model Config: ChildLexiconNN(
  (embed): Embedding(50257, 128, padding_idx=50256)
  (ln): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc1): Linear(in_features=128, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=112, bias=True)
)
Epoch 1: train_loss=0.572, train_acc=0.725, val_loss=1.101, val_acc=0.586
  ↳ New best val_loss 1.101, saving state
Model Config: ChildLexiconNN(
  (embed): Embedding(50257, 128, padding_idx=50256)
  (ln): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc1): Linear(in_features=128, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=112, bias=True)
)
Epoch 2: train_loss=0.549, train_acc=0.728, val_loss=1.104, val_acc=0.587
  ↳ No improvement (1/3)
Model Config: ChildLexiconNN(
  (embed): Embedding(50257, 128, padding_idx=50256)
  (ln): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (dropout): Dr

In [None]:
@torch.no_grad()
def eval_child_known_unknown(model, loader):
    """Compute accuracy separately for "unknown" and all other gold labels.

    Uses the module-level ``device`` and ``id2label``. An example counts as
    "unknown" when its gold label string is "unknown"; it is correct when the
    predicted label string is also "unknown". Every other example is correct
    when the predicted id equals the gold id.

    Returns:
        ``(acc_known, acc_unknown)``; a group with no examples yields 0.0.
    """
    model.eval()
    hits = {"known": 0, "unknown": 0}
    seen = {"known": 0, "unknown": 0}

    for batch in loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        gold = batch["labels"].to(device)

        guess = model(ids, mask).argmax(dim=-1)

        for gold_id, guess_id in zip(gold.cpu().tolist(), guess.cpu().tolist()):
            gold_str, guess_str = id2label[gold_id], id2label[guess_id]
            if gold_str == "unknown":
                group = "unknown"
                is_hit = guess_str == "unknown"
            else:
                group = "known"
                is_hit = gold_id == guess_id
            seen[group] += 1
            if is_hit:
                hits[group] += 1

    acc_known = hits["known"] / seen["known"] if seen["known"] > 0 else 0.0
    acc_unknown = hits["unknown"] / seen["unknown"] if seen["unknown"] > 0 else 0.0
    return acc_known, acc_unknown

# Evaluate on the held-out test split and report both accuracies as percentages.
acc_known, acc_unknown = eval_child_known_unknown(child_model, test_loader_nn)
known_accuracy = "{}%".format(acc_known * 100)
unkown_accuracy = "{}%".format(acc_unknown * 100)  # name (sic) kept as in the notebook namespace
print("Child NN accuracy on known labels:", known_accuracy)
print("Child NN accuracy on 'unknown':   ", unkown_accuracy)
# End of Script

Child NN accuracy on known labels: 75.96491228070175%
Child NN accuracy on 'unknown':    26.521739130434785%
