# Topic Canonicalization Workflow
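Consolidate topics labelled `NOVEL` into the topic registry: match each candidate to its closest registry topic by embedding similarity, then review the proposed matches with an LLM.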

## Load Registry and Novel Candidates

In [5]:

# Load the topic registry. The file is read as UTF-8 explicitly rather than
# relying on the platform's default text encoding.
with open(REGISTRY_PATH, encoding='utf-8') as f:
    registry = json.load(f)

registry_version = registry.get('version', 'unknown')
registry_created = registry.get('created_at')
existing_topics = registry['topics']
print(f"Loaded {len(existing_topics)} registry topics (version {registry_version}, created {registry_created})")

# Collect every row routed to NOVEL that carries a proposed label, one frame
# per daily label file (files are visited in sorted filename order).
novel_frames = []
for path in sorted((DATA_DIR / 'daily_labels').glob('labels_*.parquet')):
    df = pl.read_parquet(path)
    novel_subset = df.filter((pl.col('topic_id') == 'NOVEL') & pl.col('novel_label').is_not_null())
    if len(novel_subset) > 0:
        novel_frames.append(novel_subset)

if not novel_frames:
    raise ValueError('No NOVEL topics found to canonicalize.')

# Keep exactly one row per novel_label. Sorting by created_at first and then
# keeping the first occurrence makes the retained row deterministic: it is the
# earliest row for that label, so the created_at carried forward really is the
# first time the label was seen. (A plain `unique` keeps an arbitrary row.)
novel_df = (
    pl.concat(novel_frames)
    .with_columns([
        pl.col('created_at').dt.convert_time_zone('Asia/Kolkata'),
    ])
    .sort('created_at', nulls_last=True)
    .unique(subset=['novel_label'], keep='first', maintain_order=True)
)
print(f'Found {len(novel_df)} unique novel candidates')


Loaded 32 registry topics (version 001, created 2025-10-27T00:00:00+05:30)
Found 91 unique novel candidates


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Model id and generation budget can be overridden through environment variables.
MODEL_ID = os.getenv("KAGGLE_MODEL_ID", "Qwen/Qwen2.5-3B-Instruct")
MAX_NEW_TOKENS = int(os.getenv("KAGGLE_MAX_NEW_TOKENS", "180"))

print(f"Loading local model {MODEL_ID}…")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    # 4-bit loading is requested through a quantization config; passing
    # `load_in_4bit=True` directly to `from_pretrained` is deprecated.
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    torch_dtype=torch.bfloat16,
)
MODEL = MODEL_ID
print(f"✓ Loaded local HF model {MODEL_ID}")

In [None]:
from contextlib import nullcontext

class TransformersLLMClient:
    """Minimal completion client around a local Hugging Face causal LM.

    The client formats a system/user prompt pair into a single text prompt,
    generates a continuation with ``model.generate`` and returns either the
    raw text or the parsed JSON payload.

    Args:
        model: Loaded causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        max_new_tokens: Upper bound on generated tokens per completion.
    """

    def __init__(self, model, tokenizer, max_new_tokens=MAX_NEW_TOKENS):
        self.model = model
        self.tokenizer = tokenizer
        self.max_new_tokens = max_new_tokens

    @staticmethod
    def _parse_json(text):
        """Parse ``text`` as JSON, falling back to the outermost ``{...}`` span.

        Returns ``{"error": text}`` when no valid JSON can be recovered.
        """
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            pass
        # Salvage an object that is wrapped in a code fence or surrounding prose.
        start, end = text.find('{'), text.rfind('}')
        if 0 <= start < end:
            try:
                return json.loads(text[start:end + 1])
            except json.JSONDecodeError:
                pass
        return {"error": text}

    def complete(self, system_prompt, user_prompt, temperature=0.2, response_format='json', use_cache=True):
        """Generate one completion for a system/user prompt pair.

        Args:
            system_prompt: Instructions placed at the start of the prompt.
            user_prompt: User content placed after the ``User:`` marker.
            temperature: Sampling temperature; sampling is used only when it
                is greater than zero, otherwise decoding is greedy.
            response_format: ``'json'`` to parse the reply as JSON; any other
                value returns the stripped reply text.
            use_cache: Accepted for interface compatibility; currently ignored.

        Returns:
            The parsed JSON value, ``{"error": <raw text>}`` when JSON parsing
            fails, or the raw reply string for non-JSON formats.
        """
        prompt = f"{system_prompt}\n\nUser: {user_prompt}\n\nAssistant:"
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        do_sample = temperature is not None and temperature > 0
        generate_kwargs = {
            "max_new_tokens": self.max_new_tokens,
            "do_sample": do_sample,
            "pad_token_id": self.tokenizer.eos_token_id,
        }
        if do_sample:
            # Temperature only applies to sampling; it is not forwarded for
            # greedy decoding.
            generate_kwargs["temperature"] = temperature

        # torch.autocast(device_type=...) supersedes the deprecated torch.cuda.amp.autocast().
        autocast_ctx = torch.autocast(device_type="cuda") if torch.cuda.is_available() else nullcontext()
        with torch.inference_mode(), autocast_ctx:
            outputs = self.model.generate(**inputs, **generate_kwargs)

        # The generated sequence starts with the prompt tokens; drop them so the
        # reply is recovered even when the prompt or the reply itself contains
        # the literal text "Assistant:".
        prompt_len = inputs["input_ids"].shape[-1]
        text = self.tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

        if response_format == 'json':
            return self._parse_json(text)
        return text

    def batch_complete(self, prompts, batch_size=4):
        """Run ``complete`` sequentially for each prompt dict and collect the results.

        Each item must provide ``system_prompt`` and ``user_prompt`` and may
        provide ``temperature`` (default 0.3) and ``response_format``
        (default ``'json'``). ``batch_size`` is accepted for interface
        compatibility; prompts are processed one at a time.
        """
        return [
            self.complete(
                item['system_prompt'],
                item['user_prompt'],
                item.get('temperature', 0.3),
                item.get('response_format', 'json'),
            )
            for item in prompts
        ]

# Bind the loaded model and tokenizer to a client instance.
llm = TransformersLLMClient(model=model, tokenizer=tokenizer)
print("✓ Initialized local transformers client")

## Summarize Novel Candidates

In [6]:

# Roll novel candidates up per label: row count, one example rationale and the
# created_at range, ordered by row count (largest first).
# NOTE(review): novel_df is de-duplicated on novel_label where it is built, so
# every group holds a single row here and review_count is 1 — confirm whether
# the counts were meant to be taken before de-duplication.
summary_aggs = [
    pl.len().alias('review_count'),
    pl.col('novel_rationale').first().alias('sample_rationale'),
    pl.col('created_at').min().alias('first_seen'),
    pl.col('created_at').max().alias('last_seen'),
]
novel_summary = (
    novel_df
    .group_by('novel_label')
    .agg(summary_aggs)
    .sort('review_count', descending=True)
)

print(f"✓ Aggregated {len(novel_summary)} novel topic candidates")
summary_path = DATA_DIR / 'novel_topic_summary.parquet'
novel_summary.write_parquet(summary_path)
print(f"  Saved summary to {summary_path}")

# Rich display when running under IPython, plain print otherwise.
summary_preview = novel_summary.head(10)
try:
    display(summary_preview)
except NameError:
    print(summary_preview)


✓ Aggregated 91 novel topic candidates
  Saved summary to ../data/novel_topic_summary.parquet


novel_label,review_count,sample_rationale,first_seen,last_seen
str,u32,str,"datetime[μs, Asia/Kolkata]","datetime[μs, Asia/Kolkata]"
"""Unidentifiable Issue""",1,"""The review text 'thinkyou' doe…",2025-09-28 07:17:34 IST,2025-09-28 07:17:34 IST
"""Unidentifiable Review""",1,"""The review does not provide an…",2025-09-29 17:14:33 IST,2025-09-29 17:14:33 IST
"""Emoticon in Review""",1,"""The review contains emoticons …",2025-09-28 17:57:28 IST,2025-09-28 17:57:28 IST
"""Emoji-only Review""",1,"""The review consists only of em…",2025-09-28 12:47:38 IST,2025-09-28 12:47:38 IST
"""Location Issue""",1,"""The review mentions issues spe…",2025-09-28 15:06:53 IST,2025-09-28 15:06:53 IST
"""Facility Issue""",1,"""The review mentions a lack of …",2025-09-28 07:15:35 IST,2025-09-28 07:15:35 IST
"""Negative Sentiment""",1,"""The review expresses a negativ…",2025-09-28 17:51:28 IST,2025-09-28 17:51:28 IST
"""Unrecognized Issue""",1,"""The review text is not related…",2025-09-28 20:33:13 IST,2025-09-28 20:33:13 IST
"""Cheating Experience""",1,"""The review mentions 'cheaters,…",2025-09-28 15:30:24 IST,2025-09-28 15:30:24 IST
"""Short Thank You Message""",1,"""The review is a short thank yo…",2025-09-29 17:52:35 IST,2025-09-29 17:52:35 IST


## Compute Embeddings and Similarities

In [7]:

# NOTE(review): this rebinds `model`, the same name the LLM-loading cell
# assigns; re-running the LLM client cell after this one would pick up the
# embedding model instead. Consider a distinct name once downstream use is confirmed.
model = SentenceTransformer(MODEL_NAME)
print(f"✓ Loaded embedding model {MODEL_NAME}")

# Registry side: embed "<name> <definition>" for every registry topic.
registry_corpus = [' '.join((topic['name'], topic['definition'])) for topic in existing_topics]
registry_matrix = model.encode(registry_corpus, show_progress_bar=True)

# Novel side: embed "<label> :: <rationale>", or the bare label when the
# rationale is empty or missing.
novel_rows = list(novel_df.iter_rows(named=True))
novel_texts = []
for row in novel_rows:
    if row['novel_rationale']:
        novel_texts.append(f"{row['novel_label']} :: {row['novel_rationale']}")
    else:
        novel_texts.append(row['novel_label'])
novel_matrix = model.encode(novel_texts, show_progress_bar=True)

similarities = cosine_similarity(novel_matrix, registry_matrix)

# For each novel candidate keep the single most similar registry topic.
best_indices = similarities.argmax(axis=1)
candidate_matches = []
for idx, (novel_row, best_idx) in enumerate(zip(novel_rows, best_indices)):
    registry_topic = existing_topics[int(best_idx)]
    candidate_matches.append({
        'novel_label': novel_row['novel_label'],
        'novel_rationale': novel_row['novel_rationale'],
        'first_seen': novel_row['created_at'],
        'best_topic_id': registry_topic['id'],
        'best_topic_name': registry_topic['name'],
        'similarity': float(similarities[idx][best_idx]),
    })

candidate_df = pl.DataFrame(candidate_matches).sort('similarity', descending=True)
print(f"✓ Candidate matches computed ({len(candidate_df)} rows)")

# Rich display when running under IPython, plain print otherwise.
candidate_preview = candidate_df.head(10)
try:
    display(candidate_preview)
except NameError:
    print(candidate_preview)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✓ Loaded embedding model all-MiniLM-L6-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

✓ Candidate matches computed (91 rows)


novel_label,novel_rationale,first_seen,best_topic_id,best_topic_name,similarity
str,str,"datetime[μs, Asia/Kolkata]",str,str,f64
"""Late delivery due to time""","""The review mentions that the o…",2025-09-28 06:58:59 IST,"""LATE_DELIVERY""","""Late Delivery""",0.737176
"""Negative Generic""","""The review provides a very bri…",2025-09-28 15:55:05 IST,"""NEGATIVE_GENERIC""","""Negative Generic""",0.710988
"""Generic Negative Feedback""","""The review provides very minim…",2025-09-28 14:19:36 IST,"""NEGATIVE_GENERIC""","""Negative Generic""",0.710497
"""General Positive Feedback""","""The review is very brief and d…",2025-09-28 08:39:22 IST,"""POSITIVE_EXPERIENCE""","""Positive Experience""",0.658837
"""Positive Feedback""","""The review is a generic positi…",2025-09-28 16:58:44 IST,"""POSITIVE_EXPERIENCE""","""Positive Experience""",0.651657
"""App Feedback""","""The review provides positive f…",2025-09-28 10:41:48 IST,"""GREAT_APP""","""Great App""",0.640818
"""Generic Positive Feedback""","""The review is very brief and d…",2025-09-28 00:43:13 IST,"""POSITIVE_EXPERIENCE""","""Positive Experience""",0.633143
"""General Negative Feedback""","""The review provides a very bri…",2025-09-28 19:13:15 IST,"""POSITIVE_EXPERIENCE""","""Positive Experience""",0.598888
"""Service Improvement Suggestion""","""The review suggests a general …",2025-09-28 14:05:50 IST,"""POSITIVE_EXPERIENCE""","""Positive Experience""",0.58318
"""Positive Cost Experience""","""The review highlights a positi…",2025-09-28 06:45:15 IST,"""POSITIVE_EXPERIENCE""","""Positive Experience""",0.569358


## Resolve Ambiguous Candidates with LLM

## Write Registry Update Draft

In [10]:

from datetime import timezone

# One UTC timestamp shared by every proposed update in this run.
# datetime.utcnow() is deprecated (Python 3.12+); take an aware "now" and drop
# the tzinfo so the serialized form stays "<naive ISO timestamp>Z" as before.
timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + 'Z'

# One proposal record per reviewed action row; optional columns fall back to None.
updates = [
    {
        'timestamp': timestamp,
        'novel_label': row['novel_label'],
        'target_topic_id': row['best_topic_id'],
        'target_topic_name': row['best_topic_name'],
        'similarity': row['similarity'],
        'action': row['action'],
        'notes': row.get('notes'),
        'llm_label': row.get('llm_label'),
        'first_seen': row.get('first_seen'),
    }
    for row in actions_df.iter_rows(named=True)
]

updates_path = DATA_DIR / 'registry_updates.json'
with updates_path.open('w', encoding='utf-8') as f:
    # default=str serializes values json cannot encode natively (e.g. datetimes).
    json.dump(updates, f, indent=2, default=str)
print(f'✓ Proposed updates written to {updates_path}')

review_path = DATA_DIR / 'registry_updates_preview.csv'
actions_df.write_csv(review_path)
print(f'  Preview CSV written to {review_path}')


✓ Proposed updates written to ../data/registry_updates.json
  Preview CSV written to ../data/registry_updates_preview.csv


## Manual Follow-up
- Review `data/registry_updates.json`
- Apply merges/new topics in `registry/topic_registry.json`
- Re-run topic routing if registry changes materially
