<a href="https://colab.research.google.com/github/Ctrl-Vibe/review-filtering-ml/blob/main/Filtering_Reviews_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas scikit-learn transformers datasets torch gradio python-dotenv apify_client

# train_model

In [3]:
from google.colab import files

# upload the JSON file
uploaded = files.upload()

Saving reviews_labeled_subset.csv to reviews_labeled_subset.csv


In [None]:
import transformers as _tf
print("[INFO] transformers version:", _tf.__version__)

# src/train_model.py
import os, sys
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, logging as hf_logging
)
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ---- CONFIG ----
DATA_PATH = "reviews_labeled_subset.csv"       # in /content after upload
LABEL_LIST = ["relevant", "spam", "ad", "irrelevant", "rant_no_visit"]

TEXT_COL      = "text"         # review text
LABEL_COL     = "label"
CATEGORY_COL  = "category"
RATING_COL    = "rating"
NAME_COL      = "name_shop"
DESC_COL      = "description"

MODEL_NAME = "distilbert-base-uncased"
OUTPUT_DIR = "checkpoints"
MAX_LEN = 256
EPOCHS = 3
BATCH = 16
# ----------------

def _safe(x):
    if pd.isna(x): return ""
    s = str(x).strip()
    return "" if s.lower() == "nan" else s

def _short_words(s, n=40):
    """Keep description concise to save tokens."""
    s = _safe(s)
    if not s: return ""
    return " ".join(s.split()[:n])

def _build_combined_row(row):
    # Build the exact text the model will see.
    # Order matters; keep it consistent for train & predict.
    rating_part = f"{int(row[RATING_COL])} stars" if _safe(row[RATING_COL]) else ""
    parts = [
        _safe(row[CATEGORY_COL]),
        rating_part,
        _safe(row[NAME_COL]),
        _short_words(row[DESC_COL], n=40),
        _safe(row["review_text"]),   # renamed below
    ]
    # Join only non-empty parts
    return " | ".join([p for p in parts if p])

def main():
    print("[INFO] Python:", sys.executable)
    print("[INFO] CWD:", os.getcwd())
    print(f"[INFO] Looking for dataset at: {DATA_PATH}")

    if not os.path.exists(DATA_PATH):
        raise FileNotFoundError(f"Cannot find {DATA_PATH}. Are you in the project root?")

    # Verbose HF logs
    hf_logging.set_verbosity_info()
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    label2id = {l: i for i, l in enumerate(LABEL_LIST)}
    id2label = {i: l for l, i in label2id.items()}

    # ---------- CSV LOAD BLOCK ----------
    print("[INFO] Loading CSV…")
    df = pd.read_csv(DATA_PATH)

    # check necessary columns
    required = [TEXT_COL, LABEL_COL, CATEGORY_COL, RATING_COL, NAME_COL, DESC_COL]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"CSV missing columns {missing}. Found: {list(df.columns)}")

    # rename to standard names we use below
    df = df.rename(columns={
        TEXT_COL: "review_text",
        LABEL_COL: "label",
        CATEGORY_COL: "category",
        RATING_COL: "rating",
        NAME_COL: "name_shop",
        DESC_COL: "description",
    })

    # keep only rows with valid labels
    df = df[df["label"].isin(LABEL_LIST)].copy()
    if df.empty:
        raise ValueError("No rows after filtering by LABEL_LIST – check label values/spelling.")

    # build combined text with category + rating + name + description + review text
    print("[INFO] Building combined_text with extra columns…")
    df["combined_text"] = df.apply(_build_combined_row, axis=1)

    df["label_id"] = df["label"].map(label2id)
    label_counts = df["label"].value_counts().to_dict()
    print("[INFO] Label counts:", label_counts)
    # ------------------------------------------------------------

    print("[INFO] Splitting train/test…")
    try:
        train_texts, test_texts, train_labels, test_labels = train_test_split(
            df["combined_text"], df["label_id"],
            test_size=0.2, random_state=42, stratify=df["label_id"]
        )
    except ValueError:
        print("[WARN] Stratified split failed (class counts too small). Using random split.")
        train_texts, test_texts, train_labels, test_labels = train_test_split(
            df["combined_text"], df["label_id"], test_size=0.2, random_state=42
        )

    train_df = pd.DataFrame({"text": train_texts, "labels": train_labels})
    test_df  = pd.DataFrame({"text": test_texts,  "labels": test_labels})

    print("[INFO] Building HF datasets…")
    train_ds = Dataset.from_pandas(train_df, preserve_index=False)
    test_ds  = Dataset.from_pandas(test_df,  preserve_index=False)

    print(f"[INFO] Loading tokenizer/model: {MODEL_NAME} (first time may download ~250MB)…")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    def tokenize(batch):
        return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=MAX_LEN)

    print("[INFO] Tokenizing…")
    train_ds = train_ds.map(tokenize, batched=True)
    test_ds  = test_ds.map(tokenize,  batched=True)

    # Trainer expects tensors and no raw text column
    train_ds = train_ds.remove_columns(["text"])
    test_ds  = test_ds.remove_columns(["text"])
    train_ds.set_format(type="torch")
    test_ds.set_format(type="torch")

    print("[INFO] Loading model…")
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=len(LABEL_LIST), id2label=id2label, label2id=label2id
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=1)
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, preds, average="weighted", zero_division=0
        )
        acc = accuracy_score(labels, preds)
        return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

    print("[INFO] Starting training…")
    args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        do_eval=True,                     # simple + robust across versions
        learning_rate=2e-5,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH,
        per_device_eval_batch_size=BATCH,
        weight_decay=0.01,
        logging_steps=50,
        report_to="none",
        save_total_limit=2
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    print("[INFO] Evaluating…")
    metrics = trainer.evaluate()
    print("[RESULT] Final metrics:", metrics)

    save_dir = os.path.join(OUTPUT_DIR, "best")
    print(f"[INFO] Saving model to: {save_dir}")
    trainer.save_model(save_dir)
    tokenizer.save_pretrained(save_dir)
    print("[DONE] Training complete.")

main()


[INFO] transformers version: 4.55.4
[INFO] Python: /usr/bin/python3
[INFO] CWD: /content
[INFO] Looking for dataset at: reviews_labeled_subset.csv
[INFO] Loading CSV…
[INFO] Building combined_text with extra columns…
[INFO] Label counts: {'relevant': 4400, 'spam': 288, 'irrelevant': 108, 'rant_no_visit': 104, 'ad': 100}
[INFO] Splitting train/test…
[INFO] Building HF datasets…
[INFO] Loading tokenizer/model: distilbert-base-uncased (first time may download ~250MB)…


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.55.4",
  "vocab_size": 30522
}



vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/tokenizer_config.json
loading file chat_template.jinja from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  

[INFO] Tokenizing…


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "relevant",
    "1": "spam",
    "2": "ad",
    "3": "irrelevant",
    "4": "rant_no_visit"
  },
  "initializer_range": 0.02,
  "label2id": {
    "ad": 2,
    "irrelevant": 3,
    "rant_no_visit": 4,
    "relevant": 0,
    "spam": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.55.4",
  "vocab_size": 30522
}



[INFO] Loading model…


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/12040accade4e8a0f71eabdb258fecc2e7e948be/model.safetensors
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification 

[INFO] Starting training…


***** Running training *****
  Num examples = 4,000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 750
  Number of trainable parameters = 66,957,317


Step,Training Loss


In [None]:
from google.colab import userdata
userdata.get('GEMINI_API_KEY')

'AIzaSyCOMlMxFzQTFDRRTJKzNoW24tTW52620OI'

# Scrape data

In [None]:
import json
import os
from apify_client import ApifyClient
from dotenv import load_dotenv

from google.colab import userdata


# Load environment variables
load_dotenv()

# Initialize the ApifyClient with your Apify API token
client = ApifyClient(userdata.get('GEMINI_API_KEY'))

google_url = "https://www.google.com/maps/place/Marina+Bay+Sands+Singapore/@1.2837575,103.8565316,17z/data=!3m1!5s0x31da19042de382df:0x5bbfe003fe5e690!4m11!3m10!1s0x31da19ee4cc09203:0x26c9afefa555dd7!5m2!4m1!1i2!8m2!3d1.2837575!4d103.8591065!9m1!1b1!16zL20vMGRkOTAz?entry=ttu&g_ep=EgoyMDI1MDgyNS4wIKXMDSoASAFQAw%3D%3D"

# Step 1: Run the Google Maps Reviews Scraper for top-ranking reviews
top_reviews_input = {
    "startUrls": [
        {
            "url": google_url
        }
    ],
    "maxReviews": 1000,
    "language": "en",
    "sort": "mostRelevant"  # Default sorting, often prioritizes higher ratings
}

# Run the Reviews Scraper for top reviews
top_reviews_run = client.actor("compass/google-maps-reviews-scraper").call(run_input=top_reviews_input)

# Fetch top reviews data and filter out null/empty text
top_reviews_data = []
for item in client.dataset(top_reviews_run["defaultDatasetId"]).iterate_items():
    text = item.get("text")
    if text and text.strip():  # Check if text exists and is not empty
        top_reviews_data.append({
            "text": text,
            "rating": item.get("stars"),
            "reviewId": item.get("reviewId", text)  # Use text as fallback if reviewId is missing
        })

# Step 2: Run the Google Maps Reviews Scraper for low-ranking reviews
low_reviews_input = {
    "startUrls": [
        {
            "url": google_url
        }
    ],
    "maxReviews": 1000,
    "language": "en",
    "sort": "lowestRating"  # Sort by lowest rating
}

# Run the Reviews Scraper for low reviews
low_reviews_run = client.actor("compass/google-maps-reviews-scraper").call(run_input=low_reviews_input)

# Fetch low reviews data and filter out null/empty text
low_reviews_data = []
for item in client.dataset(low_reviews_run["defaultDatasetId"]).iterate_items():
    text = item.get("text")
    if text and text.strip():  # Check if text exists and is not empty
        low_reviews_data.append({
            "text": text,
            "rating": item.get("stars"),
            "reviewId": item.get("reviewId", text)  # Use text as fallback if reviewId is missing
        })

# Combine reviews and remove duplicates based on reviewId or text
combined_reviews_data = []
seen_review_ids = set()

# Add top reviews
for review in top_reviews_data:
    review_id = review["reviewId"]
    if review_id not in seen_review_ids:
        seen_review_ids.add(review_id)
        combined_reviews_data.append({
            "text": review["text"],
            "rating": review["rating"]
        })

# Add low reviews, skipping duplicates
for review in low_reviews_data:
    review_id = review["reviewId"]
    if review_id not in seen_review_ids:
        seen_review_ids.add(review_id)
        combined_reviews_data.append({
            "text": review["text"],
            "rating": review["rating"]
        })

# Try to extract placeId from the first review item (if available)
place_id = None
for item in client.dataset(top_reviews_run["defaultDatasetId"]).iterate_items():
    place_id = item.get("placeId")
    break

# Step 3: Run the Google Places Crawler
places_input = {
    "startUrls": [
        {
            "url": google_url
        }
    ],
    "language": "en",
}

# Add placeId to input only if it was extracted
if place_id:
    places_input["placeId"] = place_id

# Run the Google Places Crawler
places_run = client.actor("compass/crawler-google-places").call(run_input=places_input)

# Fetch place data
place_data = {}
for item in client.dataset(places_run["defaultDatasetId"]).iterate_items():
    place_data = {
        "name_shop": item.get("title"),
        "description": item.get("description"),
        "category": str(item.get("categories", []))  # Convert categories list to string
    }
    break  # Only need the first item for place details

# Step 4: Combine the data to match the desired format
combined_data = []
for review in combined_reviews_data:
    combined_data.append({
        "text": review["text"],
        "rating": review["rating"],
        "name_shop": place_data.get("name_shop"),
        "description": place_data.get("description"),
        "category": place_data.get("category")
    })

# Step 5: Save to JSON file
output_file = "testing_data.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(combined_data, f, indent=4, ensure_ascii=False)

print(f"Data saved to {output_file}")
print("💾 Check your data here:")
print(f"Top Reviews: https://console.apify.com/storage/datasets/{top_reviews_run['defaultDatasetId']}")
print(f"Low Reviews: https://console.apify.com/storage/datasets/{low_reviews_run['defaultDatasetId']}")
print(f"Place Details: https://console.apify.com/storage/datasets/{places_run['defaultDatasetId']}")

# 📚 Want to learn more 📖? Go to → https://docs.apify.com/api/client/python/docs/quick-start

In [None]:
from google.colab import files
uploaded = "testing_data.json"

# Predict


In [5]:
from google.colab import files

# upload the JSON file
uploaded = files.upload()

Saving mbs_data.json to mbs_data.json


In [2]:
# src/predict.py
import ast
import json
from typing import Any, Dict, List, Tuple

import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ---- CONFIG ----
MODEL_DIR = "checkpoints/best"
LABEL_LIST = ["relevant", "spam", "ad", "irrelevant", "rant_no_visit"]
MAX_LEN = 256
# ----------------

# Load tokenizer & model once
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model.eval()

# --- NEW: imports for similarity gate ---
from sentence_transformers import SentenceTransformer, util

# --- NEW: load lightweight embedding model once ---
# all-MiniLM-L6-v2 is fast and good for semantic similarity (384-dim)
_embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

SIM_THRESHOLD = 0.22  # tweakable; 0.18-0.30 usually works well

def build_business_context(row: dict) -> str:
    """
    Business context used for similarity:
      category | name_shop | short_description
    Keep it short so it's focused.
    """
    cat  = _primary_category(row.get("category"))
    name = _safe(row.get("name_shop"))
    desc = _short_words(row.get("description"), n=30)
    parts = [p for p in [cat, name, desc] if p]
    return " | ".join(parts) if parts else cat

def cosine_sim(a_text: str, b_text: str) -> float:
    """
    Return cosine similarity between two texts using SBERT embeddings.
    """
    if not a_text or not b_text:
        return 0.0
    a = _embed_model.encode(a_text, convert_to_tensor=True, normalize_embeddings=True)
    b = _embed_model.encode(b_text, convert_to_tensor=True, normalize_embeddings=True)
    return float(util.cos_sim(a, b).item())

def apply_similarity_gate(row: dict, model_label: str) -> tuple[str, float]:
    """
    If the text is semantically far from the business context,
    override 'relevant' -> 'irrelevant'. Return (final_label, sim).
    """
    context = build_business_context(row)
    review  = _safe(row.get("text"))
    sim = cosine_sim(review, context)

    if model_label == "relevant" and sim < SIM_THRESHOLD:
        return "irrelevant", sim
    return model_label, sim

# ---------- helpers ----------
def _safe(x: Any) -> str:
    if x is None:
        return ""
    s = str(x).strip()
    return "" if s.lower() == "nan" else s

def _short_words(s: Any, n: int = 40) -> str:
    s = _safe(s)
    return " ".join(s.split()[:n]) if s else ""

def _primary_category(cat_val: Any) -> str:
    """
    Normalize category:
    - list -> first element
    - stringified list "['Cafe', 'Coffee shop']" -> parse and take first
    - else -> string itself
    """
    if cat_val is None:
        return "unknown"
    if isinstance(cat_val, list):
        return str(cat_val[0]) if cat_val else "unknown"
    s = str(cat_val).strip()
    try:
        parsed = ast.literal_eval(s)
        if isinstance(parsed, list) and parsed:
            return str(parsed[0])
    except Exception:
        pass
    return s

def build_input_text(row: Dict[str, Any]) -> str:
    """
    Build the SAME input format as training.
    We include as many fields as available:
      category | {rating} stars | name_shop | description(short) | text
    Missing fields are skipped.
    """
    parts: List[str] = []
    # category (required)
    parts.append(_primary_category(row.get("category")))
    # rating (optional)
    rating = _safe(row.get("rating"))
    if rating:
        parts.append(f"{rating} stars")
    # name_shop (optional)
    name = _safe(row.get("name_shop"))
    if name:
        parts.append(name)
    # description (optional, shortened)
    desc = _short_words(row.get("description"), n=40)
    if desc:
        parts.append(desc)
    # review text (required)
    parts.append(_safe(row.get("text")))
    # join non-empty
    return " | ".join([p for p in parts if p])

def predict_text_chunked(input_text: str,
                         max_len: int = 256,
                         stride: int = 64) -> Tuple[str, Dict[str, float]]:
    """
    Sliding-window classification for long texts.
    We tokenize once, then slice input_ids/attention_mask into overlapping windows.
    Aggregate probs across windows.
    """
    # tokenize without truncation to get full sequence
    toks = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=False,
        padding=False
    )

    input_ids = toks["input_ids"][0]
    attn = toks["attention_mask"][0]
    n = input_ids.size(0)

    # windows
    windows: List[Tuple[torch.Tensor, torch.Tensor]] = []
    start = 0
    while start < n:
        end = min(start + max_len, n)
        win_ids = input_ids[start:end]
        win_attn = attn[start:end]
        # pad to max_len if needed
        if win_ids.size(0) < max_len:
            pad = max_len - win_ids.size(0)
            win_ids = torch.cat([win_ids, torch.zeros(pad, dtype=torch.long)], dim=0)
            win_attn = torch.cat([win_attn, torch.zeros(pad, dtype=torch.long)], dim=0)
        windows.append((win_ids.unsqueeze(0), win_attn.unsqueeze(0)))
        if end == n:
            break
        start = end - stride  # overlap

    # run all windows
    probs_all = []
    with torch.no_grad():
        for ids, am in windows:
            out = model(input_ids=ids, attention_mask=am)
            probs = torch.nn.functional.softmax(out.logits, dim=-1)[0]
            probs_all.append(probs)

    # aggregate: mean (robust), and also keep max as a signal
    probs_stack = torch.stack(probs_all, dim=0)  # [num_windows, num_labels]
    probs_mean = probs_stack.mean(dim=0)
    probs_max = probs_stack.max(dim=0).values

    # choose label by mean probability
    pred_idx = int(torch.argmax(probs_mean).item())
    pred_label = LABEL_LIST[pred_idx]
    # return dict from mean (you can change to max if preferred)
    probs_dict = {lab: float(probs_mean[i]) for i, lab in enumerate(LABEL_LIST)}
    return pred_label, probs_dict

def _add_prob_columns(df: pd.DataFrame, probs_dicts: List[Dict[str, float]]) -> pd.DataFrame:
    """Add one column per label: prob_<label>."""
    for lab in LABEL_LIST:
        df[f"prob_{lab}"] = [d.get(lab, 0.0) for d in probs_dicts]
    return df

# ---------- CSV pipeline ----------
# ---------- CSV pipeline ----------
def predict_csv(input_csv: str, output_csv: str) -> None:
    df = pd.read_csv(input_csv)

    # Need at least these; others are optional but used if present.
    for req in ["category", "text"]:
        if req not in df.columns:
            raise ValueError(f"CSV must contain '{req}' column. Found: {list(df.columns)}")

    # Build the combined classifier input for each row
    combined_series = df.apply(lambda r: build_input_text(r.to_dict()), axis=1)

    preds_raw: List[str] = []
    preds_final: List[str] = []
    sims: List[float] = []
    probs_list: List[Dict[str, float]] = []

    for i, combined in enumerate(combined_series):
        row = df.iloc[i].to_dict()

        # 1) Classifier (chunked for long text)
        pl, pr = predict_text_chunked(combined, max_len=MAX_LEN, stride=64)

        # 2) Similarity gate
        final, sim = apply_similarity_gate(row, pl)

        preds_raw.append(pl)
        preds_final.append(final)
        sims.append(sim)
        probs_list.append(pr)

    # Assemble output table
    df_out = df.copy()
    df_out["pred_label"]  = preds_raw       # raw model prediction
    df_out["final_label"] = preds_final     # after similarity gate
    df_out["similarity"]  = sims            # 0..1

    # Add prob_<label> columns
    for lab in LABEL_LIST:
        df_out[f"prob_{lab}"] = [d.get(lab, 0.0) for d in probs_list]

    df_out.to_csv(output_csv, index=False)
    print(f"[DONE] Saved predictions (with probabilities & similarity) to {output_csv}")

# ---------- JSON pipeline ----------
def predict_json(input_json: str, output_json: str, output_csv: str = None) -> None:
    """
    Read an array of objects, add:
      - prediction (raw model)
      - final_label (after similarity gate)
      - similarity (0..1)
      - probs: {label: prob, ...}
      - prob_<label> fields (flattened)
    Write back to JSON, and optionally to CSV for easy viewing.
    """
    with open(input_json, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, list):
        raise ValueError("Top-level JSON must be a list of objects.")

    for obj in data:
        combined = build_input_text(obj)

        # 1) Classifier (chunked)
        pl, pr = predict_text_chunked(combined, max_len=MAX_LEN, stride=64)

        # 2) Similarity gate
        final, sim = apply_similarity_gate(obj, pl)

        # annotate object
        obj["prediction"]  = pl
        obj["final_label"] = final
        obj["similarity"]  = sim
        obj["probs"]       = pr
        for lab in LABEL_LIST:
            obj[f"prob_{lab}"] = pr.get(lab, 0.0)

    # save JSON
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"[DONE] Saved predictions (with probabilities & similarity) to {output_json}")

    # optional flat CSV view
    if output_csv:
        df = pd.DataFrame(data)
        df.to_csv(output_csv, index=False)
        print(f"[DONE] Also wrote a tabular view to {output_csv}")

# ---------- Examples ----------
if __name__ == "__main__":
    # EXAMPLE: JSON → JSON (+ CSV)
    predict_json("mbs_data.json", "mbs_with_preds.json", "mbs_with_preds.csv")

    # EXAMPLE: CSV → CSV
    # predict_csv("starbucks_nus_data.csv", "starbucks_reviews_with_preds.csv")
    pass


OSError: checkpoints/best is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`

# Count

In [12]:
import pandas as pd
import json

# Load your predictions JSON
with open("mbs_with_preds.json", "r", encoding="utf-8") as f:
    data = json.load(f)

df = pd.DataFrame(data)

# Count the number of each prediction
label_counts = df["prediction"].value_counts()

print("Label distribution:")
print(label_counts)

Label distribution:
prediction
relevant      335
spam           27
irrelevant      1
Name: count, dtype: int64
