In [None]:
!pip -q install transformers accelerate sentencepiece

import pandas as pd
import numpy as np
from pathlib import Path

# Colab-only: mount Google Drive so the project folders below are reachable.
from google.colab import drive
drive.mount("/content/drive")

# Project folders on the mounted Drive: SILVER holds the input CSVs that
# read_silver() loads; GOLD receives the enriched CSV written at the end.
BASE = Path("/content/drive/MyDrive/dadosfera")
SILVER = BASE / "cdm_silver"
GOLD = BASE / "cdm_gold"
# Create the output folder (and any missing parents); no error if it exists.
GOLD.mkdir(parents=True, exist_ok=True)

# -------------------------
# Load Silver
# -------------------------
def read_silver(name: str) -> pd.DataFrame:
    """Load one Silver-layer table from ``SILVER / "<name>.csv"``.

    Args:
        name: Table name without the ``.csv`` extension.

    Returns:
        The CSV contents as a DataFrame. ``low_memory=False`` makes pandas
        parse the file in a single pass instead of in chunks, which avoids
        mixed-dtype inference within a column.
    """
    csv_path = SILVER / f"{name}.csv"
    return pd.read_csv(csv_path, low_memory=False)

# Load the five Silver tables that feed the Gold base, in a fixed order.
order, order_item, product, review, customer = (
    read_silver(table)
    for table in ("order", "order_item", "product", "review", "customer")
)

# -------------------------
# Build base (joins)
# -------------------------
# Item-level base table: start from order items and left-join each related
# table so every item row is kept even when a lookup finds no match.
# Colliding column names keep the left-hand name; the right-hand copy gets
# the suffix given for that join.
# NOTE(review): review is joined on order_id, so an order's review text is
# repeated on each of its item rows (and multiplied again if an order has
# several reviews) — confirm this fan-out is intended downstream.
gold_base = order_item.merge(order, on="order_id", how="left", suffixes=("", "_o"))
gold_base = gold_base.merge(product, on="product_id", how="left")
gold_base = gold_base.merge(review, on="order_id", how="left", suffixes=("", "_r"))
gold_base = gold_base.merge(customer, on="customer_id", how="left", suffixes=("", "_c"))

# Main columns to keep (adjust as needed). Names that are absent from the
# joined frame are skipped instead of raising a KeyError.
cols = [
    "order_id",
    "order_item_id",
    "product_id",
    "seller_id",
    "customer_id",
    "status",
    "delivery_delay_days",
    "item_price",
    "freight_value",
    "total_item_value",
    "category_en",
    "volume_cm3",
    "score",
    "comment_title",
    "comment_message",
]
# .copy() detaches the selection from the joined frame.
gold_base = gold_base.loc[:, [c for c in cols if c in gold_base.columns]].copy()

# -------------------------
# Filter reviews with text
# -------------------------
# Keep only rows whose review message is present and not blank after
# stripping whitespace; .copy() detaches the result from gold_base.
gold_text = gold_base.loc[
    lambda df: df["comment_message"].notna()
    & df["comment_message"].astype(str).str.strip().ne("")
].copy()

# -------------------------
# Sample 100 reviews (keeps the run fast)
# -------------------------
# Cap the number of reviews sent through the models; smaller inputs are
# left untouched.
SAMPLE_SIZE = 100
if SAMPLE_SIZE < len(gold_text):
    # Fixed seed keeps the sample reproducible across runs.
    gold_text = gold_text.sample(n=SAMPLE_SIZE, random_state=42)
    gold_text = gold_text.reset_index(drop=True)

print("Reviews com texto:", len(gold_text))

# =========================================================
# Transformers pipelines (fast models)
# =========================================================
from transformers import pipeline

# NOTE(review): all three checkpoints below appear to be English-language
# models — confirm the review text is in English before trusting the output.

# Sentiment classifier.
sent = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Zero-shot classifier, reused for topics, severity and expectation mismatch.
zshot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Short-summary generator.
summ = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

# Candidate labels passed to the zero-shot classifier in llm_features().

# Review topics (scored with multi_label=True, so several can apply).
TOPICS = [
    "delivery delay",
    "product quality",
    "wrong item",
    "missing parts",
    "packaging",
    "customer service",
    "refund/return",
    "price/value",
    "damaged product",
]

# Complaint severity levels (single best label is kept).
SEVERITY = ["low", "medium", "high"]

# llm_features() compares the winning label against the exact string
# "expectation mismatch", so keep the two in sync.
MISMATCH = ["expectation mismatch", "no mismatch"]

def safe_text(x, maxlen=900):
    """Normalize an arbitrary cell value into a single-line, length-capped string.

    Missing values (``None`` and pandas/NumPy missing scalars such as ``NaN``)
    become ``""`` rather than being stringified to ``"nan"``. Any other value
    is converted with ``str()``, every run of whitespace (including newlines
    and tabs) is collapsed to a single space, and the result is cut to at most
    ``maxlen`` characters.

    Args:
        x: Value to normalize, typically a DataFrame cell.
        maxlen: Maximum number of characters to keep.

    Returns:
        The cleaned text, possibly empty.
    """
    # pd.isna returns an array for list-likes, so only probe scalar values.
    if x is None or (pd.api.types.is_scalar(x) and pd.isna(x)):
        return ""
    # split() with no argument drops leading/trailing whitespace and splits
    # on any whitespace run, so the join yields single-spaced text.
    return " ".join(str(x).split())[:maxlen]

def map_sentiment(lbl):
    """Translate a sentiment-pipeline label into a lowercase sentiment name.

    Only the exact string ``"POSITIVE"`` maps to ``"positive"``; every other
    value (including differently-cased labels) maps to ``"negative"``.
    """
    if lbl == "POSITIVE":
        return "positive"
    return "negative"

def llm_features(row):
    """Derive model-based review features for one row.

    Reads ``row["comment_message"]`` (and ``row["delivery_delay_days"]`` when
    present) through ``row.get`` and runs the module-level ``sent``, ``zshot``
    and ``summ`` pipelines on the cleaned text.

    Args:
        row: Mapping-like object exposing ``.get`` (e.g. a ``pd.Series`` from
            ``iterrows`` or a plain dict).

    Returns:
        A dict with the keys ``sentiment_label``, ``sentiment_score``,
        ``topics_top3``, ``complaint_reason``, ``complaint_severity``,
        ``expectation_mismatch``, ``mismatch_reason`` and
        ``experience_summary``. When the review text is missing or blank no
        model is called and neutral placeholders are returned
        (``None`` / ``NaN`` / ``""`` / ``False`` / ``"none"``).
    """
    import warnings

    raw = row.get("comment_message", "")
    is_missing = pd.api.types.is_scalar(raw) and pd.isna(raw)
    if is_missing or not str(raw).strip():
        # Nothing to classify: skip every model call instead of feeding the
        # pipelines an empty string or the literal text "nan".
        return {
            "sentiment_label": None,
            "sentiment_score": float("nan"),
            "topics_top3": "",
            "complaint_reason": "none",
            "complaint_severity": None,
            "expectation_mismatch": False,
            "mismatch_reason": "",
            "experience_summary": "",
        }

    text = safe_text(raw, 800)
    # Character window shared by the three classification calls.
    snippet = text[:512]

    # 1) Sentiment: signed score, positive > 0 and negative < 0.
    s = sent(snippet)[0]
    sentiment_label = map_sentiment(s["label"])
    sign = 1 if sentiment_label == "positive" else -1
    sentiment_score = float(s["score"]) * sign

    # 2) Topics: keep the three highest-scoring labels.
    zs_topics = zshot(snippet, TOPICS, multi_label=True)
    ranked = sorted(
        zip(zs_topics["labels"], zs_topics["scores"]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_topics = [label for label, _ in ranked[:3]]

    # 3) Severity: single best label.
    # NOTE(review): this is computed for positive reviews too, while
    # complaint_reason is forced to "none" for them — confirm that is intended.
    complaint_severity = zshot(snippet, SEVERITY, multi_label=False)["labels"][0]

    # 4) Expectation mismatch: true when that label wins.
    zs_mm = zshot(snippet, MISMATCH, multi_label=False)
    expectation_mismatch = zs_mm["labels"][0] == "expectation mismatch"

    # 5) Complaint reason: top topic for negative reviews, otherwise "none".
    complaint_reason = top_topics[0] if sentiment_label == "negative" else "none"

    # 6) Short summary. Best-effort: a failure must not abort the whole
    # enrichment run, but it is reported instead of vanishing silently.
    # (text is already capped at 800 characters, so no further slicing.)
    try:
        summary = summ(text, max_length=40, min_length=10, do_sample=False)[0]["summary_text"]
    except Exception as exc:
        warnings.warn(
            f"Summarization failed; leaving experience_summary empty: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        summary = ""

    # 7) Mismatch reason: only for negative reviews flagged as a mismatch.
    mismatch_reason = ""
    if expectation_mismatch and sentiment_label == "negative":
        try:
            delay = float(row.get("delivery_delay_days", None))
        except (TypeError, ValueError):
            # Missing or non-numeric delay: treat as "no known delay".
            delay = float("nan")
        # NaN fails both comparisons; infinity is excluded so int() cannot overflow.
        if 0 < delay < float("inf"):
            mismatch_reason = f"Delivery delay: {int(delay)} days"
        else:
            mismatch_reason = "Review indicates unmet expectations"

    return {
        "sentiment_label": sentiment_label,
        "sentiment_score": sentiment_score,
        "topics_top3": ", ".join(top_topics),
        "complaint_reason": complaint_reason,
        "complaint_severity": complaint_severity,
        "expectation_mismatch": expectation_mismatch,
        "mismatch_reason": mismatch_reason,
        "experience_summary": summary,
    }

# =========================================================
# Enrichment loop
# =========================================================
# One output record per sampled review: the row's original columns plus the
# model-derived features (feature keys win on any name clash).
enriched = [
    {**row.to_dict(), **llm_features(row)}
    for _, row in gold_text.iterrows()
]
gold_llm = pd.DataFrame(enriched)

# Write the enriched table to the Gold folder, without the index column.
out_path = GOLD / "gold_order_review_llm_features.csv"
gold_llm.to_csv(out_path, index=False)
print("Saved:", out_path)
gold_llm.head(3)
