# Topic Canonicalization Workflow
Consolidate NOVEL topics into the registry using embeddings + LLM review.

In [7]:
import polars as pl
from pathlib import Path
import json
from datetime import datetime
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from utils.llm_client import LLMClient

DATA_DIR = Path('../data')
REGISTRY_PATH = Path('../registry/topic_registry.json')
MODEL_NAME = 'all-MiniLM-L6-v2'
SIM_THRESHOLD = 0.82
LLM_PROVIDER = 'megallm'
LLM_MODEL = 'gpt-4o-mini'

print('✓ Environment ready')


✓ Environment ready


## Load Registry and Novel Candidates

In [2]:

with open(REGISTRY_PATH) as f:
    registry = json.load(f)

registry_version = registry.get('version', 'unknown')
registry_created = registry.get('created_at')
existing_topics = registry['topics']
print(f"Loaded {len(existing_topics)} registry topics (version {registry_version}, created {registry_created})")

novel_frames = []
for path in sorted((DATA_DIR / 'daily_labels').glob('labels_*.parquet')):
    df = pl.read_parquet(path)
    novel_subset = df.filter((pl.col('topic_id') == 'NOVEL') & pl.col('novel_label').is_not_null())
    if len(novel_subset) > 0:
        novel_frames.append(novel_subset)

if not novel_frames:
    raise ValueError('No NOVEL topics found to canonicalize.')

novel_df = pl.concat(novel_frames).with_columns([
    pl.col('created_at').dt.convert_time_zone('Asia/Kolkata'),
]).unique(subset=['novel_label'])
print(f'Found {len(novel_df)} unique novel candidates')


Loaded 32 registry topics (version 001, created 2025-10-27T00:00:00+05:30)
Found 1028 unique novel candidates


## Summarize Novel Candidates

In [3]:

# Summarize novel topics by frequency and recency
novel_summary = novel_df.group_by('novel_label').agg([
    pl.len().alias('review_count'),
    pl.first('novel_rationale').alias('sample_rationale'),
    pl.min('created_at').alias('first_seen'),
    pl.max('created_at').alias('last_seen'),
]).sort('review_count', descending=True)

print(f"✓ Aggregated {len(novel_summary)} novel topic candidates")
summary_path = DATA_DIR / 'novel_topic_summary.parquet'
novel_summary.write_parquet(summary_path)
print(f"  Saved summary to {summary_path}")

try:
    display(novel_summary.head(10))
except NameError:
    print(novel_summary.head(10))


✓ Aggregated 1028 novel topic candidates
  Saved summary to ../data/novel_topic_summary.parquet


novel_label,review_count,sample_rationale,first_seen,last_seen
str,u32,str,"datetime[μs, Asia/Kolkata]","datetime[μs, Asia/Kolkata]"
"""Request for delivery""",1,"""The review is a simple request…",2025-10-25 16:29:17 IST,2025-10-25 16:29:17 IST
"""Unrelated Review Content""",1,"""The review does not relate to …",2025-10-11 17:24:26 IST,2025-10-11 17:24:26 IST
"""Disappointment with medicine o…",1,"""The review expresses disappoin…",2025-10-18 14:24:24 IST,2025-10-18 14:24:24 IST
"""Request for better offers""",1,"""The review suggests improving …",2025-09-29 17:57:36 IST,2025-09-29 17:57:36 IST
"""Awesome Offers""",1,"""The review mentions awesome of…",2025-10-05 09:18:24 IST,2025-10-05 09:18:24 IST
"""Poor Delivery Experience""",1,"""The term 'bakwaas delivery' in…",2025-10-06 10:11:32 IST,2025-10-06 10:11:32 IST
"""Positive feedback on deals""",1,"""The review mentions 'great dea…",2025-10-20 16:39:06 IST,2025-10-20 16:39:06 IST
"""App Deletion Experience""",1,"""User deleted the app due to di…",2025-10-25 14:27:35 IST,2025-10-25 14:27:35 IST
"""Lack of offers""",1,"""The review mentions a lack of …",2025-10-08 05:33:39 IST,2025-10-08 05:33:39 IST
"""Account Deletion Issue""",1,"""User unable to delete account …",2025-10-13 07:55:36 IST,2025-10-13 07:55:36 IST


## Compute Embeddings and Similarities

In [5]:
import time

model = SentenceTransformer(MODEL_NAME)
print(f"✓ Loaded embedding model {MODEL_NAME}")

# Build the registry corpus
registry_corpus = [topic['name'] + ' ' + topic['definition'] for topic in existing_topics]
print(f"[LOG] Constructed registry_corpus: {len(registry_corpus)} entries")

# Encode registry topics
start_time = time.time()
registry_matrix = model.encode(registry_corpus, show_progress_bar=True)
elapsed = time.time() - start_time
print(f"[LOG] Encoded registry_corpus into registry_matrix with shape {registry_matrix.shape} in {elapsed:.2f}s")

# Prepare novel texts
novel_texts = [
    f"{row['novel_label']} :: {row['novel_rationale']}" if row['novel_rationale'] else row['novel_label']
    for row in novel_df.iter_rows(named=True)
]
print(f"[LOG] Constructed novel_texts: {len(novel_texts)} entries")

# Encode novel texts
start_time = time.time()
novel_matrix = model.encode(novel_texts, show_progress_bar=True)
elapsed = time.time() - start_time
print(f"[LOG] Encoded novel_texts into novel_matrix with shape {novel_matrix.shape} in {elapsed:.2f}s")

# Compute cosine similarities
start_time = time.time()
similarities = cosine_similarity(novel_matrix, registry_matrix)
elapsed = time.time() - start_time
print(f"[LOG] Computed cosine_similarity matrix with shape {similarities.shape} in {elapsed:.2f}s")

# Build candidate matches
candidate_matches = []
print("[LOG] Beginning to iterate through novel rows for matching…")
for idx, novel_row in enumerate(novel_df.iter_rows(named=True)):
    sim_row = similarities[idx]
    best_idx = int(sim_row.argmax())
    best_score = float(sim_row[best_idx])
    registry_topic = existing_topics[best_idx]
    candidate = {
        'novel_label': novel_row['novel_label'],
        'novel_rationale': novel_row['novel_rationale'],
        'first_seen': novel_row['created_at'],
        'best_topic_id': registry_topic['id'],
        'best_topic_name': registry_topic['name'],
        'similarity': best_score
    }
    candidate_matches.append(candidate)

    # Detailed log per row (optional: comment out for huge datasets)
    print(f"[LOG][row {idx}] novel_label='{candidate['novel_label']}' → "
          f"best_topic_id='{candidate['best_topic_id']}', "
          f"best_topic_name='{candidate['best_topic_name']}', "
          f"similarity={candidate['similarity']:.4f}")

print(f"[LOG] Built {len(candidate_matches)} candidate matches")

candidate_df = pl.DataFrame(candidate_matches).sort('similarity', descending=True)
print(f"✓ Candidate matches computed ({len(candidate_df)} rows)")

try:
    display(candidate_df.head(10))
except NameError:
    print(candidate_df.head(10))


✓ Loaded embedding model all-MiniLM-L6-v2
[LOG] Constructed registry_corpus: 32 entries


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[LOG] Encoded registry_corpus into registry_matrix with shape (32, 384) in 0.20s
[LOG] Constructed novel_texts: 1028 entries


Batches:   0%|          | 0/33 [00:00<?, ?it/s]

[LOG] Encoded novel_texts into novel_matrix with shape (1028, 384) in 0.35s
[LOG] Computed cosine_similarity matrix with shape (1028, 32) in 0.00s
[LOG] Beginning to iterate through novel rows for matching…
[LOG][row 0] novel_label='Restaurant policy on offers' → best_topic_id='LIMITED_OPTIONS', best_topic_name='Limited Options', similarity=0.4673
[LOG][row 1] novel_label='Weird payment option' → best_topic_id='MONEY_ALREADY_EATEN', best_topic_name='Money Already Eaten', similarity=0.4296
[LOG][row 2] novel_label='Unclear review context' → best_topic_id='NO_RESPONSE_COMPLAINT', best_topic_name='No Response to Complaint', similarity=0.2047
[LOG][row 3] novel_label='Promotional Offer Mentioned' → best_topic_id='OVERPRICED_ITEMS', best_topic_name='Overpriced Items', similarity=0.2944
[LOG][row 4] novel_label='Service Issue' → best_topic_id='POSITIVE_EXPERIENCE', best_topic_name='Positive Experience', similarity=0.4307
[LOG][row 5] novel_label='Hunger Issue' → best_topic_id='POOR_QUALITY',

novel_label,novel_rationale,first_seen,best_topic_id,best_topic_name,similarity
str,str,"datetime[μs, Asia/Kolkata]",str,str,f64
"""Positive service feedback""","""The review expresses satisfact…",2025-10-04 16:13:01 IST,"""POSITIVE_EXPERIENCE""","""Positive Experience""",0.840523
"""Generic positive feedback""","""The review expresses a positiv…",2025-09-28 05:16:42 IST,"""POSITIVE_EXPERIENCE""","""Positive Experience""",0.827888
"""Best service feedback""","""The review expresses a positiv…",2025-10-20 04:11:57 IST,"""POSITIVE_EXPERIENCE""","""Positive Experience""",0.753112
"""Delivery Speed Concern""","""The review expresses dissatisf…",2025-10-15 09:30:44 IST,"""FAST_DELIVERY""","""Fast Delivery""",0.730695
"""Positive feedback about servic…","""The review expresses a positiv…",2025-10-04 16:00:59 IST,"""POSITIVE_EXPERIENCE""","""Positive Experience""",0.724916
"""Order not received""","""The review indicates waiting f…",2025-10-18 16:13:18 IST,"""NO_DELIVERY_YET""","""No Delivery Yet""",0.712767
"""Low Price Concern""","""The review mentions low price …",2025-09-30 16:34:57 IST,"""POOR_QUALITY""","""Poor Quality""",0.708216
"""Generic Positive Feedback""","""The review expresses a positiv…",2025-09-28 07:02:46 IST,"""POSITIVE_EXPERIENCE""","""Positive Experience""",0.69851
"""Slow Delivery""","""The review indicates a delay w…",2025-10-22 14:19:24 IST,"""FAST_DELIVERY""","""Fast Delivery""",0.6936
"""App feedback""","""The review provides generic fe…",2025-10-21 20:13:31 IST,"""GREAT_APP""","""Great App""",0.690503


## Resolve Ambiguous Candidates with LLM

In [10]:
import time

for size in [5, 10, 20, 50, 100]:
    start = time.time()
    llm.batch_complete(prompts[:size], batch_size=size)
    print(f"Batch {size}: {time.time()-start:.2f}s")


Processing: 100%|█████████████████████████████████| 1/1 [00:00<00:00,  3.97it/s]


Batch 5: 0.25s


Processing: 100%|█████████████████████████████████| 1/1 [00:00<00:00,  1.99it/s]


Batch 10: 0.50s


Processing: 100%|█████████████████████████████████| 1/1 [00:01<00:00,  1.01s/it]


Batch 20: 1.01s


Processing: 100%|█████████████████████████████████| 1/1 [00:02<00:00,  2.51s/it]


Batch 50: 2.51s


Processing: 100%|█████████████████████████████████| 1/1 [00:41<00:00, 41.02s/it]

Batch 100: 41.02s





In [12]:
import concurrent.futures
import polars as pl
import time

llm = LLMClient(provider=LLM_PROVIDER, model=LLM_MODEL)
print(f"✓ Ready LLM adjudicator ({LLM_PROVIDER}/{LLM_MODEL})")

actions, prompts = [], []

# Build prompt list
for row in candidate_df.iter_rows(named=True):
    if row['similarity'] >= SIM_THRESHOLD:
        actions.append({**row, 'action': 'merge', 'notes': 'auto-merge via embedding score'})
    else:
        prompts.append({
            'system_prompt': 'You are a taxonomy expert consolidating duplicate topics. Only reply with strict JSON.',
            'user_prompt': f"""Topic A: {row['novel_label']} ({row['novel_rationale']})
Topic B: {row['best_topic_name']}
Return JSON: {{"action":"merge|new","label":"<final>","notes":"<reason>"}}""",
            'temperature': 0.0,
            'response_format': 'json'
        })

# Function to handle one batch safely
def process_batch(batch, batch_id):
    try:
        start = time.time()
        result = llm.batch_complete(batch, batch_size=10)
        print(f"✓ Batch {batch_id} done in {time.time() - start:.2f}s ({len(batch)} prompts)")
        return result
    except Exception as e:
        print(f"⚠️ Batch {batch_id} failed: {e}")
        return [None] * len(batch)

# Run batches in parallel
responses = []
if prompts:
    batches = [prompts[i:i+10] for i in range(0, len(prompts), 10)]
    print(f"🚀 Total batches: {len(batches)} (running with max_workers=20)")

    start = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        future_to_id = {executor.submit(process_batch, b, i+1): i+1 for i, b in enumerate(batches)}
        for future in concurrent.futures.as_completed(future_to_id):
            batch_id = future_to_id[future]
            try:
                res = future.result()
                if res:
                    responses.extend(res)
            except Exception as e:
                print(f"⚠️ Batch {batch_id} error: {e}")
    print(f"✅ All batches completed in {time.time() - start:.2f}s")
else:
    responses = []

# Merge responses back to dataframe
for resp, row in zip(
    responses,
    (r for r in candidate_df.iter_rows(named=True) if r['similarity'] < SIM_THRESHOLD)
):
    if resp is None:
        actions.append({**row, 'action': 'review', 'notes': 'LLM response missing'})
        continue
    actions.append({
        **row,
        'action': resp.get('action', 'review'),
        'notes': resp.get('notes'),
        'llm_label': resp.get('label')
    })

actions_df = pl.DataFrame(actions)
print(f"✓ Proposed actions: {actions_df.height} rows")
try:
    display(actions_df.head(10))
except NameError:
    print(actions_df.head(10))


✓ Initialized megallm client with model gpt-4o-mini
✓ Ready LLM adjudicator (megallm/gpt-4o-mini)
🚀 Total batches: 103 (running with max_workers=20)



Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A

Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A


Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A



Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A





Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A




Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A






Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A









Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A








Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A











Processing:   0%|                                         | 0/1 [00:00<?, ?it/s]

✓ Batch 2 done in 27.25s (10 prompts)


Processing:   0%|                                         | 0/1 [00:00<?, ?it/s]

















Processing: 100%|█████████████████████████████████| 1/1 [00:32<00:00, 32.82s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 19 done in 32.87s (10 prompts)




















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

Processing: 100%|█████████████████████████████████| 1/1 [00:34<00:00, 34.29s/it][A[A


✓ Batch 3 done in 34.30s (10 prompts)




Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A













Processing: 100%|█████████████████████████████████| 1/1 [00:35<00:00, 35.44s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 15 done in 35.47s (10 prompts)
















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A







Processing: 100%|█████████████████████████████████| 1/1 [00:35<00:00, 35.73s/it][A[A[A[A[A[A[A[A


✓ Batch 9 done in 35.77s (10 prompts)










Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A


















Processing: 100%|█████████████████████████████████| 1/1 [00:36<00:00, 36.62s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 20 done in 36.66s (10 prompts)





















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A












Processing: 100%|█████████████████████████████████| 1/1 [00:36<00:00, 36.78s/it][A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 14 done in 36.82s (10 prompts)















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A
Processing: 100%|█████████████████████████████████| 1/1 [00:37<00:00, 37.10s/it][A


✓ Batch 1 done in 37.10s (10 prompts)



Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A





Processing: 100%|█████████████████████████████████| 1/1 [00:37<00:00, 37.16s/it][A[A[A[A[A[A


✓ Batch 7 done in 37.18s (10 prompts)








Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A




Processing: 100%|█████████████████████████████████| 1/1 [00:37<00:00, 37.30s/it][A[A[A[A[A


✓ Batch 6 done in 37.33s (10 prompts)







Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A










Processing: 100%|█████████████████████████████████| 1/1 [00:38<00:00, 38.35s/it][A[A[A[A[A[A[A[A[A[A[A


✓ Batch 12 done in 38.39s (10 prompts)













Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A



Processing: 100%|█████████████████████████████████| 1/1 [00:38<00:00, 38.83s/it][A[A[A[A


✓ Batch 5 done in 38.85s (10 prompts)






Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A











Processing: 100%|█████████████████████████████████| 1/1 [00:40<00:00, 40.01s/it][A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 13 done in 40.04s (10 prompts)














Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A















Processing: 100%|█████████████████████████████████| 1/1 [00:40<00:00, 40.53s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 17 done in 40.57s (10 prompts)


















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A






Processing: 100%|█████████████████████████████████| 1/1 [00:40<00:00, 40.59s/it][A[A[A[A[A[A[A


✓ Batch 8 done in 40.63s (10 prompts)









Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A








Processing: 100%|█████████████████████████████████| 1/1 [00:41<00:00, 41.00s/it][A[A[A[A[A[A[A[A[A


✓ Batch 10 done in 41.03s (10 prompts)











Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A
















Processing: 100%|█████████████████████████████████| 1/1 [00:42<00:00, 42.24s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 18 done in 42.28s (10 prompts)



















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


Processing: 100%|█████████████████████████████████| 1/1 [00:44<00:00, 44.02s/it][A[A[A


✓ Batch 4 done in 44.03s (10 prompts)





Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A









Processing: 100%|█████████████████████████████████| 1/1 [00:45<00:00, 45.12s/it][A[A[A[A[A[A[A[A[A[A


✓ Batch 11 done in 45.15s (10 prompts)












Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A


Error processing prompt: the JSON object must be str, bytes or bytearray, not NoneType

















Processing: 100%|█████████████████████████████████| 1/1 [00:47<00:00, 47.27s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 16 done in 47.31s (10 prompts)

















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















Processing: 100%|█████████████████████████████████| 1/1 [00:24<00:00, 24.17s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 22 done in 24.18s (10 prompts)




















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A





Processing: 100%|█████████████████████████████████| 1/1 [00:22<00:00, 22.30s/it][A[A[A[A[A[A


✓ Batch 29 done in 22.31s (10 prompts)








Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A


















Processing: 100%|█████████████████████████████████| 1/1 [00:24<00:00, 24.57s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 26 done in 24.57s (10 prompts)





















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A













Processing: 100%|█████████████████████████████████| 1/1 [00:26<00:00, 26.26s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 24 done in 26.26s (10 prompts)
















Processing: 100%|█████████████████████████████████| 1/1 [00:34<00:00, 34.60s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 21 done in 34.60s (10 prompts)


Processing:   0%|                                         | 0/1 [00:00<?, ?it/s]




Processing: 100%|█████████████████████████████████| 1/1 [00:25<00:00, 25.74s/it][A[A[A[A[A


✓ Batch 30 done in 25.74s (10 prompts)







Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A







Processing: 100%|█████████████████████████████████| 1/1 [00:28<00:00, 28.79s/it][A[A[A[A[A[A[A[A


✓ Batch 25 done in 28.79s (10 prompts)










Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A










Processing: 100%|█████████████████████████████████| 1/1 [00:26<00:00, 26.39s/it][A[A[A[A[A[A[A[A[A[A[A


✓ Batch 31 done in 26.39s (10 prompts)













Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A












Processing: 100%|█████████████████████████████████| 1/1 [00:28<00:00, 28.44s/it][A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 27 done in 28.44s (10 prompts)















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A



Processing: 100%|█████████████████████████████████| 1/1 [00:27<00:00, 27.30s/it][A[A[A[A


✓ Batch 32 done in 27.30s (10 prompts)






Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A
Processing: 100%|█████████████████████████████████| 1/1 [00:29<00:00, 29.50s/it][A


✓ Batch 28 done in 29.50s (10 prompts)



Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A















Processing: 100%|█████████████████████████████████| 1/1 [00:27<00:00, 27.74s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 34 done in 27.74s (10 prompts)


















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

Processing: 100%|█████████████████████████████████| 1/1 [00:35<00:00, 35.04s/it][A[A


✓ Batch 23 done in 35.04s (10 prompts)




Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A
















Processing: 100%|█████████████████████████████████| 1/1 [00:27<00:00, 27.46s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 37 done in 27.46s (10 prompts)



















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A






Processing: 100%|█████████████████████████████████| 1/1 [00:30<00:00, 30.15s/it][A[A[A[A[A[A[A


✓ Batch 35 done in 30.16s (10 prompts)









Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A








Processing: 100%|█████████████████████████████████| 1/1 [00:30<00:00, 30.52s/it][A[A[A[A[A[A[A[A[A


✓ Batch 36 done in 30.53s (10 prompts)











Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A


Processing: 100%|█████████████████████████████████| 1/1 [00:28<00:00, 28.36s/it][A[A[A


✓ Batch 38 done in 28.36s (10 prompts)





Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A














Processing: 100%|█████████████████████████████████| 1/1 [00:25<00:00, 25.24s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 40 done in 25.24s (10 prompts)

















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A









Processing: 100%|█████████████████████████████████| 1/1 [00:27<00:00, 27.62s/it][A[A[A[A[A[A[A[A[A[A


✓ Batch 39 done in 27.62s (10 prompts)












Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A











Processing: 100%|█████████████████████████████████| 1/1 [00:33<00:00, 33.83s/it][A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 33 done in 33.83s (10 prompts)














Processing: 100%|█████████████████████████████████| 1/1 [00:25<00:00, 25.81s/it][A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 45 done in 25.81s (10 prompts)


Processing:   0%|                                         | 0/1 [00:00<?, ?it/s]

















Processing: 100%|█████████████████████████████████| 1/1 [00:31<00:00, 31.85s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 41 done in 31.85s (10 prompts)




















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A












Processing: 100%|█████████████████████████████████| 1/1 [00:24<00:00, 24.28s/it][A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 49 done in 24.28s (10 prompts)















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A










Processing: 100%|█████████████████████████████████| 1/1 [00:26<00:00, 26.20s/it][A[A[A[A[A[A[A[A[A[A[A


✓ Batch 48 done in 26.20s (10 prompts)













Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A





Processing: 100%|█████████████████████████████████| 1/1 [00:32<00:00, 32.35s/it][A[A[A[A[A[A


✓ Batch 42 done in 32.35s (10 prompts)








Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A


















Processing: 100%|█████████████████████████████████| 1/1 [00:31<00:00, 31.06s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 43 done in 31.06s (10 prompts)





















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A













Processing: 100%|█████████████████████████████████| 1/1 [00:31<00:00, 31.92s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 44 done in 31.92s (10 prompts)
















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A

Processing: 100%|█████████████████████████████████| 1/1 [00:28<00:00, 28.72s/it][A[A


✓ Batch 53 done in 28.72s (10 prompts)




Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A






Processing: 100%|█████████████████████████████████| 1/1 [00:27<00:00, 27.47s/it][A[A[A[A[A[A[A


✓ Batch 55 done in 27.47s (10 prompts)









Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A
















Processing: 100%|█████████████████████████████████| 1/1 [00:29<00:00, 29.50s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 54 done in 29.50s (10 prompts)



















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A




Processing: 100%|█████████████████████████████████| 1/1 [00:37<00:00, 37.08s/it][A[A[A[A[A


✓ Batch 46 done in 37.09s (10 prompts)







Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A







Processing: 100%|█████████████████████████████████| 1/1 [00:36<00:00, 36.15s/it][A[A[A[A[A[A[A[A


✓ Batch 47 done in 36.16s (10 prompts)










Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A
Processing: 100%|█████████████████████████████████| 1/1 [00:34<00:00, 34.41s/it][A


✓ Batch 51 done in 34.41s (10 prompts)



Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A














Processing: 100%|█████████████████████████████████| 1/1 [00:29<00:00, 29.09s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 58 done in 29.09s (10 prompts)

















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A



Processing: 100%|█████████████████████████████████| 1/1 [00:38<00:00, 38.03s/it][A[A[A[A


✓ Batch 50 done in 38.03s (10 prompts)






Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A











Processing: 100%|█████████████████████████████████| 1/1 [00:30<00:00, 30.67s/it][A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 60 done in 30.67s (10 prompts)














Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A








Processing: 100%|█████████████████████████████████| 1/1 [00:33<00:00, 33.04s/it][A[A[A[A[A[A[A[A[A


✓ Batch 56 done in 33.04s (10 prompts)











Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A









Processing: 100%|█████████████████████████████████| 1/1 [00:34<00:00, 34.05s/it][A[A[A[A[A[A[A[A[A[A


✓ Batch 59 done in 34.05s (10 prompts)












Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A















Processing: 100%|█████████████████████████████████| 1/1 [00:41<00:00, 41.14s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 52 done in 41.14s (10 prompts)


















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


Processing: 100%|█████████████████████████████████| 1/1 [00:41<00:00, 41.53s/it][A[A[A


✓ Batch 57 done in 41.53s (10 prompts)





Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A












Processing: 100%|█████████████████████████████████| 1/1 [00:26<00:00, 26.23s/it][A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 63 done in 26.24s (10 prompts)















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A

Processing: 100%|█████████████████████████████████| 1/1 [00:20<00:00, 20.28s/it][A[A


✓ Batch 68 done in 20.28s (10 prompts)




Processing: 100%|█████████████████████████████████| 1/1 [00:32<00:00, 32.71s/it][A[A


✓ Batch 61 done in 32.71s (10 prompts)


Processing:   0%|                                         | 0/1 [00:00<?, ?it/s]





Processing: 100%|█████████████████████████████████| 1/1 [00:29<00:00, 29.35s/it][A[A[A[A[A[A


✓ Batch 65 done in 29.36s (10 prompts)








Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A










Processing: 100%|█████████████████████████████████| 1/1 [00:30<00:00, 30.38s/it][A[A[A[A[A[A[A[A[A[A[A


✓ Batch 64 done in 30.38s (10 prompts)













Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A
















Processing: 100%|█████████████████████████████████| 1/1 [00:22<00:00, 22.64s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 70 done in 22.64s (10 prompts)



















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















Processing: 100%|█████████████████████████████████| 1/1 [00:34<00:00, 34.88s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 62 done in 34.89s (10 prompts)




















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
Processing: 100%|█████████████████████████████████| 1/1 [00:23<00:00, 23.80s/it][A


✓ Batch 73 done in 23.81s (10 prompts)



Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A














Processing: 100%|█████████████████████████████████| 1/1 [00:24<00:00, 24.86s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 74 done in 24.86s (10 prompts)

















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A



Processing: 100%|█████████████████████████████████| 1/1 [00:22<00:00, 22.45s/it][A[A[A[A


✓ Batch 75 done in 22.45s (10 prompts)






Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A






Processing: 100%|█████████████████████████████████| 1/1 [00:28<00:00, 28.47s/it][A[A[A[A[A[A[A


✓ Batch 69 done in 28.47s (10 prompts)









Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A











Processing: 100%|█████████████████████████████████| 1/1 [00:25<00:00, 25.26s/it][A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 76 done in 25.26s (10 prompts)














Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A




Processing: 100%|█████████████████████████████████| 1/1 [00:30<00:00, 30.74s/it][A[A[A[A[A


✓ Batch 71 done in 30.74s (10 prompts)







Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A









Processing: 100%|█████████████████████████████████| 1/1 [00:24<00:00, 24.73s/it][A[A[A[A[A[A[A[A[A[A


✓ Batch 78 done in 24.73s (10 prompts)












Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A


















Processing: 100%|█████████████████████████████████| 1/1 [00:43<00:00, 43.18s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 66 done in 43.18s (10 prompts)





















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A













Processing: 100%|█████████████████████████████████| 1/1 [00:42<00:00, 42.80s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 67 done in 42.81s (10 prompts)
















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A







Processing: 100%|█████████████████████████████████| 1/1 [00:41<00:00, 41.82s/it][A[A[A[A[A[A[A[A


✓ Batch 72 done in 41.83s (10 prompts)










Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A

Processing: 100%|█████████████████████████████████| 1/1 [00:25<00:00, 25.98s/it][A[A


✓ Batch 82 done in 25.98s (10 prompts)




Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A












Processing: 100%|█████████████████████████████████| 1/1 [00:28<00:00, 28.70s/it][A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 81 done in 28.70s (10 prompts)















Processing: 100%|█████████████████████████████████| 1/1 [00:24<00:00, 24.54s/it][A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 83 done in 24.54s (10 prompts)


Processing:   0%|                                         | 0/1 [00:00<?, ?it/s]

















Processing: 100%|█████████████████████████████████| 1/1 [00:21<00:00, 21.31s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 87 done in 21.32s (10 prompts)




















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A















Processing: 100%|█████████████████████████████████| 1/1 [00:39<00:00, 39.32s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 79 done in 39.32s (10 prompts)


















Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


Processing: 100%|█████████████████████████████████| 1/1 [00:35<00:00, 35.77s/it][A[A[A


✓ Batch 80 done in 35.78s (10 prompts)





Processing:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A
















Processing: 100%|█████████████████████████████████| 1/1 [00:28<00:00, 28.49s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

Processing: 100%|█████████████████████████████████| 1/1 [00:25<00:00, 25.75s/it][A


✓ Batch 86 done in 28.49s (10 prompts)
✓ Batch 88 done in 25.75s (10 prompts)













Processing: 100%|█████████████████████████████████| 1/1 [00:29<00:00, 29.42s/it][A[A[A[A[A[A[A[A[A[A[A


✓ Batch 85 done in 29.42s (10 prompts)








Processing: 100%|█████████████████████████████████| 1/1 [00:29<00:00, 29.83s/it][A[A[A[A[A[A


✓ Batch 84 done in 29.83s (10 prompts)









Processing: 100%|█████████████████████████████████| 1/1 [00:26<00:00, 26.33s/it][A[A[A[A[A[A[A


✓ Batch 91 done in 26.33s (10 prompts)






Processing: 100%|█████████████████████████████████| 1/1 [00:27<00:00, 27.18s/it][A[A[A[A


✓ Batch 90 done in 27.18s (10 prompts)











Processing: 100%|█████████████████████████████████| 1/1 [00:50<00:00, 50.10s/it][A[A[A[A[A[A[A[A[A


✓ Batch 77 done in 50.10s (10 prompts)







Processing: 100%|█████████████████████████████████| 1/1 [00:28<00:00, 28.29s/it][A[A[A[A[A


✓ Batch 93 done in 28.30s (10 prompts)

















Processing: 100%|█████████████████████████████████| 1/1 [00:34<00:00, 34.49s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A










Processing: 100%|█████████████████████████████████| 1/1 [00:29<00:00, 29.60s/it][A[A[A[A[A[A[A[A[A[A


✓ Batch 89 done in 34.49s (10 prompts)
✓ Batch 94 done in 29.60s (10 prompts)














Processing: 100%|█████████████████████████████████| 1/1 [00:31<00:00, 31.56s/it][A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 92 done in 31.56s (10 prompts)
















Processing: 100%|█████████████████████████████████| 1/1 [00:25<00:00, 25.92s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 96 done in 25.93s (10 prompts)





Processing: 100%|█████████████████████████████████| 1/1 [00:13<00:00, 13.96s/it][A[A[A


✓ Batch 103 done in 13.96s (6 prompts)





















Processing: 100%|█████████████████████████████████| 1/1 [00:28<00:00, 28.79s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 95 done in 28.79s (10 prompts)










Processing: 100%|█████████████████████████████████| 1/1 [00:23<00:00, 23.43s/it][A[A[A[A[A[A[A[A


✓ Batch 97 done in 23.43s (10 prompts)




Processing: 100%|█████████████████████████████████| 1/1 [00:22<00:00, 22.61s/it][A[A


✓ Batch 98 done in 22.61s (10 prompts)


















Processing: 100%|█████████████████████████████████| 1/1 [00:22<00:00, 22.15s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 102 done in 22.16s (10 prompts)


Processing: 100%|█████████████████████████████████| 1/1 [00:27<00:00, 27.65s/it]


✓ Batch 100 done in 27.66s (10 prompts)















Processing: 100%|█████████████████████████████████| 1/1 [00:30<00:00, 30.14s/it][A[A[A[A[A[A[A[A[A[A[A[A[A


✓ Batch 99 done in 30.14s (10 prompts)




















Processing: 100%|█████████████████████████████████| 1/1 [00:33<00:00, 33.44s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

✓ Batch 101 done in 33.44s (10 prompts)
✅ All batches completed in 178.55s
✓ Proposed actions: 1028 rows





novel_label,novel_rationale,first_seen,best_topic_id,best_topic_name,similarity,action,notes,llm_label
str,str,"datetime[μs, Asia/Kolkata]",str,str,f64,str,str,str
"""Positive service feedback""","""The review expresses satisfact…",2025-10-04 16:13:01 IST,"""POSITIVE_EXPERIENCE""","""Positive Experience""",0.840523,"""merge""","""auto-merge via embedding score""",
"""Generic positive feedback""","""The review expresses a positiv…",2025-09-28 05:16:42 IST,"""POSITIVE_EXPERIENCE""","""Positive Experience""",0.827888,"""merge""","""auto-merge via embedding score""",
"""Best service feedback""","""The review expresses a positiv…",2025-10-20 04:11:57 IST,"""POSITIVE_EXPERIENCE""","""Positive Experience""",0.753112,"""merge""","""Both topics express positive s…","""General Positive Feedback"""
"""Delivery Speed Concern""","""The review expresses dissatisf…",2025-10-15 09:30:44 IST,"""FAST_DELIVERY""","""Fast Delivery""",0.730695,"""merge""","""Consolidated feedback related …","""Service Feedback"""
"""Positive feedback about servic…","""The review expresses a positiv…",2025-10-04 16:00:59 IST,"""POSITIVE_EXPERIENCE""","""Positive Experience""",0.724916,"""merge""","""Both topics address aspects of…","""Food Quality"""
"""Order not received""","""The review indicates waiting f…",2025-10-18 16:13:18 IST,"""NO_DELIVERY_YET""","""No Delivery Yet""",0.712767,"""merge""","""Both topics express a positive…","""Positive Feedback on Food Qual…"
"""Low Price Concern""","""The review mentions low price …",2025-09-30 16:34:57 IST,"""POOR_QUALITY""","""Poor Quality""",0.708216,"""new""","""This topic highlights excellen…","""Positive Offers Feedback"""
"""Generic Positive Feedback""","""The review expresses a positiv…",2025-09-28 07:02:46 IST,"""POSITIVE_EXPERIENCE""","""Positive Experience""",0.69851,"""merge""","""Both topics express a general …","""App inquiry"""
"""Slow Delivery""","""The review indicates a delay w…",2025-10-22 14:19:24 IST,"""FAST_DELIVERY""","""Fast Delivery""",0.6936,"""merge""","""Both topics express positive s…","""Positive Feedback and Experien…"
"""App feedback""","""The review provides generic fe…",2025-10-21 20:13:31 IST,"""GREAT_APP""","""Great App""",0.690503,"""new""","""The review expresses frustrati…","""App usability concern"""


## Write Registry Update Draft

In [13]:

updates = []
timestamp = datetime.utcnow().isoformat() + 'Z'
for row in actions_df.iter_rows(named=True):
    updates.append({
        'timestamp': timestamp,
        'novel_label': row['novel_label'],
        'target_topic_id': row['best_topic_id'],
        'target_topic_name': row['best_topic_name'],
        'similarity': row['similarity'],
        'action': row['action'],
        'notes': row.get('notes'),
        'llm_label': row.get('llm_label'),
        'first_seen': row.get('first_seen'),
    })

updates_path = DATA_DIR / 'registry_updates.json'
with updates_path.open('w') as f:
    json.dump(updates, f, indent=2, default=str)
print(f'✓ Proposed updates written to {updates_path}')

review_path = DATA_DIR / 'registry_updates_preview.csv'
actions_df.write_csv(review_path)
print(f'  Preview CSV written to {review_path}')


✓ Proposed updates written to ../data/registry_updates.json
  Preview CSV written to ../data/registry_updates_preview.csv


## Manual Follow-up
- Review `data/registry_updates.json`
- Apply merges/new topics in `registry/topic_registry.json`
- Re-run topic routing if registry changes materially
