# In [None]:
import os
import ast
import json
import re
import pandas as pd
from tqdm import tqdm
from textblob import TextBlob
from sentence_transformers import SentenceTransformer
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
# --- Pipeline configuration -------------------------------------------------
# Directory scanned for input ``.csv`` files.
INPUT_DIR = "../Dataset/dataset/splits"
# Directory that receives one ``<name>_labeled.csv`` per input file.
OUT_DIR = "./out/sentiment_labels"
# Model identifier passed to SentenceTransformer.
MODEL_NAME = "all-MiniLM-L6-v2"
# Batch size passed to ``embedder.encode``.
BATCH_SIZE = 64
# GaussianMixture settings: number of components, iteration cap, seed and
# covariance type. The labelling code below assumes three clusters
# (negative / mixed / positive).
N_COMPONENTS = 3
MAX_ITER = 200
RANDOM_STATE = 42
COV_TYPE = "diag"
# Create the output directory up front; an existing directory is not an error.
os.makedirs(OUT_DIR, exist_ok=True)
def string_to_json(s):
    """Best-effort parse of a stringified list/dict into a Python object.

    Three strategies are tried in order and the first success wins:

    1. strict JSON (``json.loads``);
    2. a Python literal (``ast.literal_eval``);
    3. JSON again, after rewriting single-quoted tokens to double quotes and
       ``None``/``True``/``False`` to ``null``/``true``/``false``.

    Returns ``[]`` when ``s`` is not a string, is blank, or cannot be parsed
    by any strategy.
    """
    if not (isinstance(s, str) and s.strip()):
        return []

    # Strategy 1: the text is already valid JSON.
    try:
        parsed = json.loads(s)
    except json.JSONDecodeError:
        pass
    else:
        return parsed

    # Strategy 2: the text is a Python repr (single quotes, None, True, ...).
    try:
        parsed = ast.literal_eval(s)
    except Exception:
        pass
    else:
        return parsed

    # Strategy 3: patch a Python/JSON hybrid into JSON and retry.
    repaired = re.sub(r"(?<!\w)'([^']*?)'(?!\w)", r'"\1"', s)
    for pattern, json_token in (
        (r"\bNone\b", "null"),
        (r"\bTrue\b", "true"),
        (r"\bFalse\b", "false"),
    ):
        repaired = re.sub(pattern, json_token, repaired)

    try:
        parsed = json.loads(repaired)
    except Exception:
        parsed = []
    return parsed


def extract_text(conv):
    """Join the ``content`` of every message in a conversation into one string.

    Args:
        conv: A list of message dicts, a stringified list (parsed with
            ``string_to_json``), or any other value.

    Returns:
        The message contents joined by single spaces. List items that are not
        dicts are skipped. A message whose ``content`` is missing or ``None``
        contributes an empty string; any other non-string content is
        converted with ``str``. If ``conv`` is not a list (after parsing),
        ``str(conv)`` is returned.
    """
    if isinstance(conv, str):
        conv = string_to_json(conv)
    if not isinstance(conv, list):
        return str(conv)

    parts = []
    for message in conv:
        if not isinstance(message, dict):
            continue
        content = message.get("content", "")
        # ``str.join`` only accepts strings: a null or numeric content would
        # otherwise raise TypeError and abort the whole file.
        if content is None:
            content = ""
        elif not isinstance(content, str):
            content = str(content)
        parts.append(content)
    return " ".join(parts)


def parse_moderation(moderation):
    """Count the moderation categories set on the first moderation entry.

    ``moderation`` may be an already-parsed object or its stringified form
    (parsed with ``string_to_json``). Each value under
    ``moderation[0]["categories"]`` is converted with ``int`` and the results
    are summed. Any failure along the way (wrong shape, missing key,
    non-numeric value) yields ``0``.
    """
    parsed = string_to_json(moderation) if isinstance(moderation, str) else moderation
    try:
        total = 0
        for flag in parsed[0]["categories"].values():
            total += int(flag)
    except Exception:
        return 0
    return total



# Load the sentence-embedding model once; it is reused for every input file.
# (The emoji in the two messages below were mojibake -- UTF-8 bytes decoded
# as cp1252 -- and have been restored.)
print("🚀 Loading embedding model...")
embedder = SentenceTransformer(MODEL_NAME)


# Collect the CSV files to process. ``os.listdir`` returns entries in
# arbitrary order, so sort them to make the run order reproducible.
files = sorted(f for f in os.listdir(INPUT_DIR) if f.endswith(".csv"))
print(f"📂 Found {len(files)} CSVs in {INPUT_DIR}\n")

def smart_consensus(r):
    """Combine text polarity and moderation score into one sentiment label.

    Args:
        r: A row exposing ``sentiment_score`` and ``moderation_score``.

    Returns:
        ``"positive"``, ``"negative"`` or ``"mixed"``.
    """
    # Clearly positive text AND a moderation score of at most 1 -> positive
    if r.sentiment_score > 0.3 and r.moderation_score <= 1:
        return "positive"
    # Clearly negative text OR a moderation score of 2 or more -> negative
    if r.sentiment_score < -0.3 or r.moderation_score >= 2:
        return "negative"
    # Everything else -> mixed / neutral zone
    return "mixed"


def _rank_clusters(cluster_means, low_label, high_label, mid_label="mixed"):
    """Map cluster ids to labels by ascending mean, for any cluster count.

    The cluster with the lowest mean gets ``low_label``, the highest gets
    ``high_label`` and anything in between gets ``mid_label``. A single
    cluster is labelled ``mid_label``. With exactly three clusters this is
    the same assignment as indexing positions 0, 1 and 2 of the sorted means.
    """
    ordered = cluster_means.sort_values().index
    if len(ordered) == 1:
        return {ordered[0]: mid_label}
    labels = {cluster_id: mid_label for cluster_id in ordered}
    labels[ordered[0]] = low_label
    labels[ordered[-1]] = high_label
    return labels


for fname in tqdm(files, desc="Processing files"):
    fpath = os.path.join(INPUT_DIR, fname)
    df = pd.read_csv(fpath, usecols=["conversation", "openai_moderation"])

    # Parse both columns and drop rows where either one is empty/unparseable.
    df["conversation"] = df["conversation"].apply(string_to_json)
    df["openai_moderation"] = df["openai_moderation"].apply(string_to_json)
    valid = df["conversation"].astype(bool) & df["openai_moderation"].astype(bool)
    # Explicit copy: new columns are added below, and assigning to a filtered
    # slice triggers pandas' SettingWithCopyWarning.
    df = df[valid].copy()
    if df.empty:
        continue

    # Extract clean text and moderation scores
    df["text"] = df["conversation"].apply(extract_text)
    df["moderation_score"] = df["openai_moderation"].apply(parse_moderation)

    # Embed, scale and cluster (EM).
    # NOTE(review): the cluster-derived labels (sentiment_text, sentiment_mod)
    # are not used by smart_consensus and are not written to the output CSV.
    if len(df) < max(N_COMPONENTS, 2):
        # GaussianMixture refuses to fit fewer samples than components; a tiny
        # file should still be labelled, so put every row in one cluster.
        clusters = [0] * len(df)
    else:
        embeddings = embedder.encode(
            df["text"].tolist(), batch_size=BATCH_SIZE, show_progress_bar=False
        )
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(embeddings)
        gmm = GaussianMixture(
            n_components=N_COMPONENTS,
            covariance_type=COV_TYPE,
            max_iter=MAX_ITER,
            random_state=RANDOM_STATE,
        )
        gmm.fit(X_scaled)
        clusters = gmm.predict(X_scaled)

    # Sentiment polarity using TextBlob
    sentiments = [TextBlob(t).sentiment.polarity for t in df["text"]]
    df["sentiment_score"] = sentiments
    df["cluster"] = clusters

    # Label clusters by average polarity (lowest mean -> negative). Ranking
    # instead of fixed positions 0/1/2 avoids an IndexError when predict()
    # leaves one of the components without any rows.
    means = df.groupby("cluster")["sentiment_score"].mean()
    mapping = _rank_clusters(means, low_label="negative", high_label="positive")
    df["sentiment_text"] = df["cluster"].map(mapping)

    # Label clusters by average moderation score (lowest mean -> positive).
    mods = df.groupby("cluster")["moderation_score"].mean()
    map2 = _rank_clusters(mods, low_label="positive", high_label="negative")
    df["sentiment_mod"] = df["cluster"].map(map2)

    df["final_sentiment"] = df.apply(smart_consensus, axis=1)

    # Save file (conversation + moderation + final sentiment)
    out_path = os.path.join(OUT_DIR, f"{os.path.splitext(fname)[0]}_labeled.csv")
    df[["conversation", "openai_moderation", "final_sentiment"]].to_csv(out_path, index=False)