# Installation and imports

In [None]:
%pip install cerebras-cloud-sdk sentence_transformers huggingface_hub -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/96.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import csv, os, json, numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from huggingface_hub import InferenceClient
from sklearn.metrics import f1_score, precision_score, recall_score
from google.colab import userdata
import time
from transformers import pipeline
from cerebras.cloud.sdk import Cerebras

# Shared sentence-embedding model; later cells call encoder.encode(...) on reviews
# and on taxonomy labels.
encoder = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Secrets retrieval

In [None]:
from google.colab import userdata

# Read the Cerebras API key from the Colab secret named CEREB_API_KEY.
# NOTE: the variable keeps its historical name `hf_key`, but the value is a
# Cerebras key (it is passed straight to the Cerebras client below).
hf_key = userdata.get('CEREB_API_KEY')

# Chat-completions client and the model name used by every LLM call in this notebook.
# The sentence encoder is not re-created here: `encoder` is already built in the
# import cell, and loading the same model a second time only costs time and memory.
client = Cerebras(api_key=hf_key)
model = "llama-3.3-70b"

In [None]:
# Smoke test: send a single chat message through the Cerebras client to confirm
# that the API key and the model name are accepted.

test = client.chat.completions.create(
    model=model,
    messages=[
        {"role": "user", "content": "Hello, how are you?"}
    ]
)

In [None]:
# Display the text of the first choice returned by the smoke-test request.
test.choices[0].message.content.strip()

"Hello. I'm doing well, thanks for asking. I'm a large language model, so I don't have feelings or emotions like humans do, but I'm functioning properly and ready to help with any questions or tasks you might have. How about you? How's your day going so far?"

# Data Preparation

In [None]:
def load_and_prepare_data(train_file):
    """Load the labelled training sheet and derive the taxonomy and per-review labels.

    Args:
        train_file: Path of an Excel workbook readable by ``pd.read_excel``. The
            sheet must contain the columns 'Core Item', 'Level 1 (PARENT)' and
            'Level 2 (CHILD)'.

    Returns:
        A ``(train_df, taxonomy, train_grouped)`` tuple:
            * ``train_df`` - the cleaned frame: 'Core Item' lower-cased and
              stripped, 'Unnamed*' columns removed, rows with any missing
              value dropped.
            * ``taxonomy`` - dict mapping each Level 1 value to the list of
              distinct Level 2 values seen with it.
            * ``train_grouped`` - dict mapping each review text to the list of
              ``(Level 1, Level 2)`` tuples from its rows, in row order.
    """
    train_df = pd.read_excel(train_file)
    train_df['Core Item'] = train_df['Core Item'].str.lower().str.strip()

    # str(col) keeps the filter safe when a header is not a string (e.g. a number).
    unnamed_cols = [col for col in train_df.columns if 'Unnamed' in str(col)]
    # Non-inplace drop/dropna: the frame returned by read_excel is never mutated.
    train_df = train_df.drop(columns=unnamed_cols, errors='ignore').dropna()

    # Level 1 -> distinct Level 2 values.
    taxonomy = {
        level1: list(rows['Level 2 (CHILD)'].unique())
        for level1, rows in train_df.groupby('Level 1 (PARENT)')
    }

    # Review -> all (Level 1, Level 2) pairs. Iterating the groups directly avoids
    # groupby.apply (which emits a pandas warning when the function touches the
    # grouping column) and the slow per-row iterrows() loop.
    train_grouped = {
        review: list(zip(rows['Level 1 (PARENT)'], rows['Level 2 (CHILD)']))
        for review, rows in train_df.groupby('Core Item')
    }
    return train_df, taxonomy, train_grouped

In [None]:
# Colab upload location of the labelled training workbook.
train_file = "/content/bodywash-train.xlsx"
# train_df: cleaned rows; taxonomy: Level 1 -> Level 2 list; train_grouped: review -> label pairs.
train_df, taxonomy, train_grouped = load_and_prepare_data(train_file)

  train_grouped = train_df.groupby('Core Item').apply(


# Prompt Generation

In [None]:
def create_prompt(review, taxonomy, train_grouped, num_examples=8):
    """Assemble the few-shot classification prompt for one review.

    Args:
        review: Review text to classify.
        taxonomy: Dict mapping each Level 1 label to a list of Level 2 labels.
        train_grouped: Dict mapping example review text to its list of
            (Level 1, Level 2) pairs; the first ``num_examples`` keys are used.
        num_examples: Maximum number of worked examples to include.

    Returns:
        The prompt string: task line, taxonomy listing, examples, step-by-step
        instructions, the review itself and the JSON output contract, separated
        by blank lines.
    """
    # Taxonomy listing: one "<Level 1>: <Level 2>, <Level 2>, ..." line per parent.
    taxonomy_lines = [
        f"{parent}: {', '.join(children)}\n" for parent, children in taxonomy.items()
    ]
    taxonomy_str = "Valid Level 1 → Level 2 pairs (use ONLY these):\n" + "".join(taxonomy_lines)

    # Worked examples: each review followed by its label pairs and a blank line.
    example_parts = ["Examples (learn from these mappings):\n"]
    for example_review in list(train_grouped)[:num_examples]:
        example_parts.append(f"Review: {example_review}\nClassifications (exact pairs only):\n")
        example_parts.extend(
            f"- Level 1: {parent}, Level 2: {child}\n"
            for parent, child in train_grouped[example_review]
        )
        example_parts.append("\n")
    example_str = "".join(example_parts)

    cot_instruction = "".join([
        "Step-by-step: 1. Extract key phrases from the review (e.g., 'smells great' → scent). ",
        "2. Map phrases to closest Level 1 (e.g., scent → Fragrance). ",
        "3. For each matched Level 1, pick the best Level 2 (e.g., pleasant → Positive Scent). ",
        "4. Only include if strongly matched; avoid weak/guessed pairs. Use ONLY valid taxonomy pairs. No duplicates.",
    ])
    output_instruction = "".join([
        "Output ONLY JSON (no text before/after). Format:\n",
        '{"pairs": [{"Level 1": "<level1>", "Level 2": "<level2>"}, ...]}\n',
        'If no matches, {"pairs": []}. Ensure valid JSON.',
    ])

    sections = [
        "Classify this bodywash review into Level 1 → Level 2 factors.",
        taxonomy_str,
        example_str,
        cot_instruction,
        f"Review: {review}",
        output_instruction,
    ]
    return "\n\n".join(sections)

In [None]:
# Create a small holdout set for evaluation (10% of the distinct training reviews).
# train_grouped is built from a groupby, so its keys are in sorted order; slicing
# the first 10% directly would hold out only the alphabetically-first reviews.
# Shuffle with a fixed seed first so the split is representative and reproducible.
HOLDOUT_SEED = 42
all_reviews = list(train_grouped.keys())
shuffled_idx = np.random.default_rng(HOLDOUT_SEED).permutation(len(all_reviews))
all_reviews = [all_reviews[i] for i in shuffled_idx]

holdout_size = max(1, len(all_reviews) // 10)
holdout_reviews = all_reviews[:holdout_size]
train_reviews = all_reviews[holdout_size:]
# Embeddings of the retrieval pool only (holdout reviews are excluded from it).
train_embeddings = encoder.encode(train_reviews, show_progress_bar=True)

Batches:   0%|          | 0/78 [00:00<?, ?it/s]

# Pass the prompt + reviews to LLM

In [None]:
import json

def _load_json_object(text):
    """Parse an LLM reply as JSON, tolerating prose around the JSON object.

    Tries the whole reply first, then the span from the first ``{`` to the last
    ``}``. Returns the parsed value, or ``None`` when neither attempt parses.
    """
    import re  # local import: the notebook's import cell does not import `re`

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    match = re.search(r"\{.*\}", text, re.DOTALL)
    if match is None:
        return None
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        return None


def _valid_pairs(payload, taxonomy):
    """Return the de-duplicated (Level 1, Level 2) pairs in ``payload`` that exist in ``taxonomy``."""
    if not isinstance(payload, dict):
        return []
    entries = payload.get("pairs", [])
    if not isinstance(entries, list):
        return []
    pairs = []
    for entry in entries:
        # Skip malformed entries individually instead of discarding the whole reply.
        if not isinstance(entry, dict):
            continue
        level1, level2 = entry.get("Level 1"), entry.get("Level 2")
        try:
            allowed = taxonomy.get(level1, ())
        except TypeError:  # unhashable value (e.g. a list) where a label was expected
            continue
        pair = (level1, level2)
        if level2 in allowed and pair not in pairs:
            pairs.append(pair)
    return pairs


def classify_review(review, taxonomy, train_grouped, train_reviews, train_embeddings,
                    cache=None, top_k=20, num_examples=8):
    """Classify one review with the LLM, using its most similar training reviews as examples.

    Args:
        review: Review text to classify.
        taxonomy: Dict mapping Level 1 labels to lists of valid Level 2 labels.
        train_grouped: Dict mapping training review text to its (Level 1, Level 2) pairs.
        train_reviews: List of training review texts, aligned with ``train_embeddings``.
        train_embeddings: Embeddings of ``train_reviews`` (one row per review).
        cache: Optional dict used to memoise predictions by review text.
        top_k: Size of the nearest-neighbour candidate pool.
        num_examples: Number of examples placed in the prompt; the effective count
            is ``min(top_k, num_examples)``.

    Returns:
        List of (Level 1, Level 2) tuples that are valid in ``taxonomy``, without
        duplicates. Returns ``[]`` (and caches nothing, so a later call can retry)
        when the API call fails or the reply contains no parseable JSON.
    """
    if cache is not None and review in cache:
        return cache[review]

    review_embedding = encoder.encode([review])
    similarities = cosine_similarity(review_embedding, train_embeddings)[0]
    ranked = np.argsort(similarities)  # ascending: most similar reviews come last
    n_examples = min(top_k, num_examples)
    # Guard the zero case explicitly: ranked[-0:] would select every review.
    nearest = ranked[-n_examples:] if n_examples > 0 else ranked[:0]
    examples = {train_reviews[i]: train_grouped[train_reviews[i]] for i in nearest}
    prompt = create_prompt(review, taxonomy, examples, num_examples)

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,  # Deterministic
            max_tokens=512
        )
        output = response.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort: one failed request must not abort a long batch run.
        print(f"Error: {e}")
        return []

    payload = _load_json_object(output)
    if payload is None:
        print(f"Error: could not parse JSON from model output: {output[:200]!r}")
        return []

    preds = _valid_pairs(payload, taxonomy)
    if cache is not None:
        cache[review] = preds
    return preds

# Evaluate the retrieved answers

In [None]:
def evaluate_holdout(holdout_reviews, train_grouped, taxonomy, train_reviews, train_embeddings, cache=None):
    """Score classify_review on the holdout reviews with macro-averaged metrics.

    Every review is turned into a 0/1 vector over all (Level 1, Level 2) pairs of
    the taxonomy, once for the gold labels in ``train_grouped`` and once for the
    predictions; the vectors are passed to f1_score / precision_score /
    recall_score with ``average='macro'``.

    Returns:
        ``(f1, cache)`` - the macro F1 and the prediction cache that was used
        (a new dict when ``cache`` is None).
    """
    cache = {} if cache is None else cache

    label_space = [(parent, child) for parent, children in taxonomy.items() for child in children]
    total = len(holdout_reviews)

    y_true, y_pred = [], []
    for idx, review in enumerate(holdout_reviews):
        if idx % 10 == 0:
            print(f"scoring {idx+1}/{total} reviews")
        gold = train_grouped[review]
        predicted = classify_review(review, taxonomy, train_grouped, train_reviews, train_embeddings, cache)

        y_true.append([int(pair in gold) for pair in label_space])
        y_pred.append([int(pair in predicted) for pair in label_space])

    macro = {"average": 'macro', "zero_division": 0}
    f1 = f1_score(y_true, y_pred, **macro)
    precision = precision_score(y_true, y_pred, **macro)
    recall = recall_score(y_true, y_pred, **macro)
    print(f"Holdout Metrics - F1: {f1:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}")

    return f1, cache

In [None]:
# Evaluate the few-shot classifier on the holdout reviews.
# No cache is passed, so evaluate_holdout creates one and returns it as `cache`.
f1, cache = evaluate_holdout(
    holdout_reviews,
    train_grouped,
    taxonomy,
    train_reviews,
    train_embeddings
)

scoring 1/276 reviews
scoring 11/276 reviews
scoring 21/276 reviews
scoring 31/276 reviews
scoring 41/276 reviews
scoring 51/276 reviews
scoring 61/276 reviews
scoring 71/276 reviews
scoring 81/276 reviews
scoring 91/276 reviews
scoring 101/276 reviews
scoring 111/276 reviews
scoring 121/276 reviews
scoring 131/276 reviews
scoring 141/276 reviews
scoring 151/276 reviews
scoring 161/276 reviews
scoring 171/276 reviews
scoring 181/276 reviews
scoring 191/276 reviews
scoring 201/276 reviews
scoring 211/276 reviews
scoring 221/276 reviews
scoring 231/276 reviews
scoring 241/276 reviews
scoring 251/276 reviews
scoring 261/276 reviews
scoring 271/276 reviews
Holdout Metrics - F1: 0.359, Precision: 0.395, Recall: 0.367


## Predictions for test set

In [None]:
# Colab upload location of the unlabelled test workbook.
test_file = "/content/bodywash-test.xlsx"
test_df = pd.read_excel(test_file)

# Normalise review text the same way as the training data (lower-case, stripped).
# NOTE(review): astype(str) may turn empty cells into the literal string 'nan'
# instead of leaving them missing - confirm the test sheet has no blank reviews.
test_df['Core Item'] = test_df['Core Item'].astype(str).str.lower().str.strip()
test_df.head()

Unnamed: 0,Core Item,Level 1,Level 2
0,100 % for men to attract hot and beautiful wom...,,
1,3/5-not enough for the whole body,,
2,4/5-not bad. a tad drying for the skin,,
3,5 star-nice clean wash everytime.,,
4,8 out of 5 thumbs up-i like the way it scrubs ...,,


In [None]:
# Test embeddings
# NOTE(review): classify_review encodes each review itself and takes no precomputed
# embedding argument, so these vectors are not consumed by the classifier.

test_reviews = test_df['Core Item'].tolist()
test_embeddings = encoder.encode(test_reviews, show_progress_bar=True)

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
# Predictions keyed by review text; identical review texts share one entry.
test_predictions = {}
# Fresh cache for the test run (rebinding `cache` discards the holdout cache).
cache = {}

# NOTE(review): this run uses top_k=10 / num_examples=5, while the holdout evaluation
# called classify_review with its defaults (top_k=20, num_examples=8), so the holdout
# metrics were measured under a different retrieval setting.
# Only the review text is iterated: classify_review embeds the review itself, so the
# precomputed test embeddings are not needed in this loop.
for i, review in enumerate(test_reviews):
    test_predictions[review] = classify_review(
        review=review,
        taxonomy=taxonomy,
        train_grouped=train_grouped,
        train_reviews=train_reviews,
        train_embeddings=train_embeddings,
        cache=cache,
        top_k=10,
        num_examples=5
    )
    if i % 10 == 0:
        print(f"Processed {i+1}/{len(test_reviews)} reviews")


Processed 1/216 reviews
Processed 11/216 reviews
Processed 21/216 reviews
Processed 31/216 reviews
Processed 41/216 reviews
Processed 51/216 reviews
Processed 61/216 reviews
Processed 71/216 reviews
Processed 81/216 reviews
Processed 91/216 reviews
Processed 101/216 reviews
Processed 111/216 reviews
Processed 121/216 reviews
Processed 131/216 reviews
Processed 141/216 reviews
Processed 151/216 reviews
Processed 161/216 reviews
Processed 171/216 reviews
Processed 181/216 reviews
Processed 191/216 reviews
Processed 201/216 reviews
Processed 211/216 reviews


In [None]:
# Save
# One output row per predicted (Level 1, Level 2) pair; a review with no predicted
# pair still gets a single row with both label columns empty.

rows = [
    {"Core Item": review, "Level 1 (PARENT)": l1, "Level 2 (CHILD)": l2}
    for review, pairs in test_predictions.items()
    for (l1, l2) in (pairs if pairs else [(None, None)])
]

pred_df = pd.DataFrame(rows)
pred_df.to_excel("/content/bodywash-test-predictions.xlsx", index=False)


# Hybrid Pipeline approach (Obsolete)

In [None]:
def encode_label_embeddings(taxonomy):
    """Embed every taxonomy label with the shared ``encoder``.

    Returns:
        A dict with one entry under the key 'level1' mapping each Level 1 label to
        its embedding, plus one entry per Level 1 label mapping each of its Level 2
        labels to that label's embedding.
    """
    parents = list(taxonomy.keys())
    label_embeds = {'level1': dict(zip(parents, encoder.encode(parents)))}
    for parent, children in taxonomy.items():
        # One encode call per parent: embeds all of its Level 2 labels at once.
        label_embeds[parent] = dict(zip(children, encoder.encode(children)))
    return label_embeds

In [None]:
def predict_level1_llm(review, taxonomy):
    """Ask the LLM which Level 1 categories describe ``review``.

    Args:
        review: Review text.
        taxonomy: Dict whose keys are the Level 1 options offered to the model.

    Returns:
        List of Level 1 values taken from the "Level 1" field of the model's JSON
        reply. A bare string value is wrapped into a one-element list. Returns
        ``[]`` when the request fails, no JSON object is found, the JSON is
        invalid, or the field is missing or has another type.
    """
    # Local import: the notebook imports `re` only in a later cell, so relying on the
    # global name made this function fail (and silently return []) on a fresh kernel.
    import re

    level1_prompt = (
        f"Given this review: \"{review}\", which Level 1 categories best describe it?\n"
        f"Options: {list(taxonomy.keys())}\n"
        "Respond only as JSON: {\"Level 1\": [\"...\"]} (no explanations, no code)."
    )
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": level1_prompt}],
            temperature=0.0,  # deterministic, matching classify_review
        )
        text = response.choices[0].message.content.strip()
        # The model may wrap the JSON in prose; take the outermost {...} span.
        match = re.search(r"\{.*\}", text, re.DOTALL)
        if match is None:
            return []
        parsed = json.loads(match.group(0))
    except Exception as e:
        # Best-effort: report the failure and fall back to "no categories".
        print(f"Error predicting Level 1: {e}")
        return []

    if not isinstance(parsed, dict):
        return []
    level1 = parsed.get("Level 1", [])
    if isinstance(level1, str):
        # A lone string would otherwise be iterated character by character downstream.
        return [level1]
    return level1 if isinstance(level1, list) else []


In [None]:
def refine_level2_with_embeddings(review, level1_preds, taxonomy, label_embeds, top_n=3, sim_threshold=0.45):
    """Pick Level 2 labels for each predicted Level 1 by embedding similarity.

    For every Level 1 in ``level1_preds`` that is a key of ``taxonomy``, the review
    embedding is compared (cosine similarity) with each Level 2 label embedding in
    ``label_embeds[level1]``. Labels scoring above ``sim_threshold`` are kept, best
    first, capped at ``top_n`` per Level 1.

    Returns:
        List of (Level 1, Level 2) tuples, grouped in the order of ``level1_preds``.
    """
    query_vec = encoder.encode([review])[0]
    selected = []
    for parent in level1_preds:
        if parent in taxonomy:
            scores = {}
            for child, child_vec in label_embeds[parent].items():
                scores[child] = cosine_similarity([query_vec], [child_vec])[0][0]
            # Rank first, then drop everything at or below the threshold.
            ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
            passing = [child for child, score in ranked if score > sim_threshold]
            selected.extend((parent, child) for child in passing[:top_n])
    return selected

In [None]:
def classify_review_hybrid(review, taxonomy, label_embeds, cache=None):
    """Two-stage classification: LLM for Level 1, embedding similarity for Level 2.

    Args:
        review: Review text to classify.
        taxonomy: Dict mapping Level 1 labels to their Level 2 labels.
        label_embeds: Label embeddings as produced by encode_label_embeddings.
        cache: Optional dict memoising results by review text.

    Returns:
        List of (Level 1, Level 2) tuples.
    """
    use_cache = cache is not None
    if use_cache and review in cache:
        return cache[review]

    preds = refine_level2_with_embeddings(
        review,
        predict_level1_llm(review, taxonomy),
        taxonomy,
        label_embeds,
    )

    if use_cache:
        cache[review] = preds
    return preds

In [None]:
# Smoke-test the hybrid classifier on one training review.
# label_embeds has to exist before this call; build it here so the cell also runs
# on a fresh kernel (Restart & Run All) instead of depending on a later cell.
label_embeds = encode_label_embeddings(taxonomy)
# No cache is passed: the global `cache` holds predictions of the few-shot pipeline
# keyed by review text, and the hybrid classifier would read from / write into it.
classify_review_hybrid(review=train_df['Core Item'].iloc[1], taxonomy=taxonomy, label_embeds=label_embeds)

[('Fragrance', 'Fragrance Variety'),
 ('Fragrance', 'Fragrance Type'),
 ('Convenience', 'Product Application'),
 ('Convenience', 'Multi-Purpose'),
 ('Price', 'Affordability'),
 ('Price', 'Value Justification'),
 ('Peer Recommendation', 'Wife / Girlfriend'),
 ('Peer Recommendation', 'Family')]

In [None]:
# Embed every taxonomy label once; the hybrid functions look Level 2 embeddings up
# in this structure by Level 1 label.
label_embeds = encode_label_embeddings(taxonomy)

In [None]:
import re

def evaluate_holdout_hybrid(holdout_reviews, train_grouped, taxonomy, label_embeds, cache=None):
    """Score classify_review_hybrid on the holdout reviews with macro-averaged metrics.

    Args:
        holdout_reviews: Review texts to evaluate; each must be a key of ``train_grouped``.
        train_grouped: Dict mapping review text to its gold (Level 1, Level 2) pairs.
        taxonomy: Dict mapping Level 1 labels to their Level 2 labels; defines the
            label space of the indicator vectors.
        label_embeds: Label embeddings passed through to classify_review_hybrid.
        cache: Optional prediction cache to reuse; a new dict is created when omitted.

    Returns:
        ``(f1, precision, recall, cache)`` - macro-averaged scores and the cache used.
    """
    if cache is None:
        cache = {}
    true_labels, pred_labels = [], []

    all_pairs = [(l1, l2) for l1 in taxonomy for l2 in taxonomy[l1]]
    total = len(holdout_reviews)

    for i, review in enumerate(holdout_reviews):
        # Report every 10th review (as evaluate_holdout does) rather than one line
        # per review, which floods the cell output on a few hundred reviews.
        if i % 10 == 0:
            print(f'Scoring {i+1} out of {total}...')
        true_pairs = train_grouped[review]
        pred_pairs = classify_review_hybrid(review, taxonomy, label_embeds, cache)

        # 0/1 indicator vectors over the full (Level 1, Level 2) label space.
        true_labels.append([1 if pair in true_pairs else 0 for pair in all_pairs])
        pred_labels.append([1 if pair in pred_pairs else 0 for pair in all_pairs])

    f1 = f1_score(true_labels, pred_labels, average='macro', zero_division=0)
    precision = precision_score(true_labels, pred_labels, average='macro', zero_division=0)
    recall = recall_score(true_labels, pred_labels, average='macro', zero_division=0)

    print(f"Hybrid Holdout Metrics — F1: {f1:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}")
    return f1, precision, recall, cache


# ---- Run evaluation ----
# Rebinds the globals f1 and cache with the hybrid pipeline's results.
f1, precision, recall, cache = evaluate_holdout_hybrid(holdout_reviews, train_grouped, taxonomy, label_embeds)

Scoring 1 out of 276...
Scoring 2 out of 276...
Scoring 3 out of 276...
Scoring 4 out of 276...
Scoring 5 out of 276...
Scoring 6 out of 276...
Scoring 7 out of 276...
Scoring 8 out of 276...
Scoring 9 out of 276...
Scoring 10 out of 276...
Scoring 11 out of 276...
Scoring 12 out of 276...
Scoring 13 out of 276...
Scoring 14 out of 276...
Scoring 15 out of 276...
Scoring 16 out of 276...
Scoring 17 out of 276...
Scoring 18 out of 276...
Scoring 19 out of 276...
Scoring 20 out of 276...
Scoring 21 out of 276...
Scoring 22 out of 276...
Scoring 23 out of 276...
Scoring 24 out of 276...
Scoring 25 out of 276...
Scoring 26 out of 276...
Scoring 27 out of 276...
Scoring 28 out of 276...
Scoring 29 out of 276...
Scoring 30 out of 276...
Scoring 31 out of 276...
Scoring 32 out of 276...
Scoring 33 out of 276...
Scoring 34 out of 276...
Scoring 35 out of 276...
Scoring 36 out of 276...
Scoring 37 out of 276...
Scoring 38 out of 276...
Scoring 39 out of 276...
Scoring 40 out of 276...
Scoring 4