In [1]:
# Combined dataset builder for Compatika V1
# Supports: EmpatheticDialogues, GoEmotions, PersonaChat (CSV or JSONL)
# Output: compatika_combined.txt (USER / COMPATIKA pairs)
# Run in Jupyter. Edit dataset path variables below.

import os
import pandas as pd
import json
import random
import re
from pathlib import Path
from typing import List, Tuple

# ========== CONFIG - set these paths ==========
empathetic_path = r"C:\Users\aman\Desktop\v1 dataset\rawdata\empathetic_dialogues\train.csv"   # or .jsonl
goemotions_path = r"C:\Users\aman\Desktop\v1 dataset\rawdata\goemotions_1\goemotions_merged_clean.csv"            # or .jsonl
persona_path = r"C:\Users\aman\Desktop\v1 dataset\rawdata\PersonaChat\personality.csv"                  # or .jsonl

output_txt = "compatika_combined.txt"
TARGET_SIZE_MB = None  # set to e.g. 40 to attempt ~40MB output, or None to just write all samples

# ========== Templates / mappings ==========
# Emotion -> templates (extend these lists to increase variety)
TEMPLATES = {
    "sadness": [
        "That sounds really painful, and it’s okay to feel this way.",
        "I’m sorry you’re feeling this. I’m here with you.",
        "It makes sense that this feels heavy.",
        "That sadness sounds real, and you deserve space to feel it.",
        "It sounds like your heart is carrying a lot right now.",
        "I hear how much this affected you.",
        "It’s understandable you’d feel sad about that.",
        "I’m here with you as these emotions come up."
    ],

    "anger": [
        "It makes sense to feel angry about that.",
        "I hear your frustration.",
        "Your anger is valid.",
        "It sounds like something really bothered you.",
        "That reaction shows how much this mattered to you.",
        "I can feel the intensity in what you shared.",
        "It’s okay to feel upset.",
        "That frustration you’re feeling is real."
    ],

    "anxiety": [
        "Feeling anxious can be exhausting.",
        "It’s okay to feel nervous.",
        "That tension sounds heavy.",
        "You’re not alone—anxiety can feel overwhelming.",
        "It makes sense your body reacted that way.",
        "That sounds stressful.",
        "I hear your worry.",
        "You’re carrying a lot of tension."
    ],

    "fear": [
        "That sounds scary, and I’m here with you.",
        "It’s understandable to feel afraid.",
        "That fear sounds real.",
        "You’re safe here.",
        "It makes sense that you’re scared.",
        "That moment sounded really frightening.",
        "Fear can be overwhelming.",
        "You’re not alone in that feeling."
    ],

    "joy": [
        "That’s wonderful to hear.",
        "I’m glad you’re feeling joyful.",
        "That’s such a warm moment.",
        "It sounds like something truly uplifting happened.",
        "I can hear the happiness in your words.",
        "That joy sounds genuine.",
        "That feels like a bright spot.",
        "It’s lovely to hear something positive."
    ],

    "love": [
        "That’s a warm and meaningful feeling.",
        "It sounds meaningful to feel that way.",
        "That affection seems deep.",
        "That sounds like a heartfelt connection.",
        "It’s a beautiful emotion.",
        "I can feel the warmth in that.",
        "Love can carry many layers.",
        "It sounds like something very important to you."
    ],

    "neutral": [
        "Thanks for sharing that.",
        "I hear you.",
        "That sounds meaningful.",
        "Thanks for telling me that.",
        "I’m listening.",
        "That seems important.",
        "Thanks for opening up.",
        "I hear what you’re saying."
    ],

    "default": [
        "Thank you for sharing that.",
        "I’m listening.",
        "That sounds meaningful.",
        "Thanks for telling me.",
        "I hear you.",
        "That’s important.",
        "I’m here with you.",
        "I’m here with whatever you’re feeling."
    ]
}



# Map label names in GoEmotions to the canonical keys above (adjust as needed)
GOEMO_TO_CANON = {
    "admiration":"neutral","amusement":"joy","anger":"anger","annoyance":"anger",
    "approval":"neutral","caring":"love","confusion":"neutral","curiosity":"neutral",
    "desire":"neutral","disappointment":"sadness","disapproval":"neutral","disgust":"disgust" if "disgust" in TEMPLATES else "default",
    "embarrassment":"neutral","excitement":"joy","fear":"fear","gratitude":"neutral",
    "grief":"sadness","joy":"joy","love":"love","nervousness":"anxiety",
    "optimism":"joy","pride":"joy","realization":"neutral","relief":"joy",
    "remorse":"sadness","sadness":"sadness","surprise":"neutral","neutral":"neutral"
}
# If TEMPLATES lacks a key, GOEMO_TO_CANON will fallback later

# ========== Helper functions ==========
def load_any_csv_or_jsonl(path: str) -> pd.DataFrame:
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"File not found: {path}")
    if p.suffix.lower() in [".csv", ".tsv"]:
        sep = "\t" if p.suffix.lower()==".tsv" else ","
        df = pd.read_csv(path, sep=sep, dtype=str, keep_default_na=False)
        return df
    elif p.suffix.lower() in [".jsonl", ".ndjson", ".json"]:
        # try json lines
        try:
            df = pd.read_json(path, lines=True)
            return df
        except Exception as e:
            # try loading as plain json
            with open(path, "r", encoding="utf-8") as f:
                j = json.load(f)
            # If it's a dict with 'data' key, try flattening
            if isinstance(j, dict) and "data" in j:
                df = pd.json_normalize(j["data"])
                return df
            # fallback: normalize the object list
            df = pd.json_normalize(j)
            return df
    else:
        # try CSV by default
        df = pd.read_csv(path, dtype=str, keep_default_na=False)
        return df

def pretty_head(df: pd.DataFrame, n=10):
    print("Shape:", df.shape)
    print("Columns:", list(df.columns))
    display(df.head(n))

def safe_get_cols(df, candidates):
    """Return first matching column name from candidates"""
    cols = [c.lower() for c in df.columns]
    for cand in candidates:
        if cand.lower() in cols:
            return df.columns[cols.index(cand.lower())]
    return None

def split_sentences(text: str) -> List[str]:
    if text is None: return []
    s = str(text).strip()
    if not s:
        return []
    # naive split by punctuation and newlines
    parts = re.split(r'(?<=[.!?])\s+|\n+', s)
    clean = [p.strip() for p in parts if len(p.strip())>2]
    return clean

def choose_template_for_emotions(emotions: List[str]) -> str:
    # emotions: list of raw labels like ["sadness","anxiety"]
    for pref in ["sadness","anxiety","fear","anger","joy","love","neutral"]:
        if pref in emotions and pref in TEMPLATES:
            return random.choice(TEMPLATES[pref])
    # try mapping via GOEMO_TO_CANON if raw labels are different
    for e in emotions:
        canon = GOEMO_TO_CANON.get(e, None)
        if canon and canon in TEMPLATES:
            return random.choice(TEMPLATES[canon])
    # fallback
    return random.choice(TEMPLATES.get("default", list(TEMPLATES.values())[0]))

# ========== Load & inspect datasets ==========
all_samples = []  # will hold tuples (user_text, compatika_reply, source)

# ---- EmpatheticDialogues ----
if os.path.exists(empathetic_path):
    edf = load_any_csv_or_jsonl(empathetic_path)
    print("EmpatheticDialogues loaded:")
    pretty_head(edf, n=3)
    # try to find columns: context/emotion, prompt, utterance, speaker
    # common col names: 'context','utterance','emotion','speaker_idx','conv_id','prompt'
    user_col = safe_get_cols(edf, ["context","context_text","user","prompt"])
    utt_col = safe_get_cols(edf, ["utterance","response","text","reply"])
    emotion_col = safe_get_cols(edf, ["emotion","label","emotions"])
    # If this dataset is multi-turn with conv_id and utterance_idx, we want user->assistant pairs.
    # Heuristic: if 'speaker' or 'speaker_idx' exists, use it.
    speaker_col = safe_get_cols(edf, ["speaker","speaker_idx","role"])
    if 'conv_id' in edf.columns and 'utterance_idx' in edf.columns and speaker_col:
        # group by conv and pair user -> assistant replies (speaker idx convention varies; we will assume 0=person,1=assistant)
        for conv_id, group in edf.groupby("conv_id"):
            grp = group.sort_values(by="utterance_idx")
            # iterate; whenever we find a user turn followed by assistant turn, pair them
            prev = None
            for _, row in grp.iterrows():
                role = str(row.get(speaker_col, "")).strip()
                text = str(row.get(utt_col, row.get(user_col, ""))).strip()
                if not text:
                    continue
                if prev and prev['role'].lower() != role.lower():
                    # prev and current are different roles -> make pair prev.text -> current.text
                    all_samples.append((prev['text'], text, "empathetic"))
                prev = {"role": role, "text": text}
    else:
        # fallback: if we have context + utterance columns where context is user and utterance is reply
        if user_col and utt_col:
            for _, row in edf.iterrows():
                u = str(row.get(user_col,"")).strip()
                r = str(row.get(utt_col,"")).strip()
                if u and r:
                    all_samples.append((u, r, "empathetic"))
        else:
            # as last resort, try to use any 'text' and 'response' like columns
            for col in edf.columns:
                if "utter" in col.lower() or "resp" in col.lower():
                    utt_col = col
                    break
            # skip if still nothing
    print(f"Collected {len([s for s in all_samples if s[2]=='empathetic'])} empathetic pairs so far.")
else:
    print("EmpatheticDialogues path not found, skipping.")

# ---- GoEmotions ----
if os.path.exists(goemotions_path):
    gdf = load_any_csv_or_jsonl(goemotions_path)
    print("\nGoEmotions loaded:")
    pretty_head(gdf, n=3)
    # Common formats: columns 'text', 'labels' (comma separated), or many one-hot columns per emotion
    if "text" in gdf.columns and "labels" in gdf.columns:
        for _, row in gdf.iterrows():
            text = str(row["text"]).strip()
            labels_raw = str(row["labels"]).strip()
            if not text:
                continue
            labels = [l.strip() for l in labels_raw.split(",") if l.strip()]
            # choose template based on labels
            reply = choose_template_for_emotions(labels)
            all_samples.append((text, reply, "goemotions"))
    else:
        # check for one-hot emotion columns
        emotion_cols = [c for c in gdf.columns if c.lower() in GOEMO_TO_CANON.keys()]
        if emotion_cols:
            for _, row in gdf.iterrows():
                text = str(row.get("text","")).strip()
                if not text:
                    continue
                labels = [c for c in emotion_cols if str(row.get(c,"")) not in ["0","False","","0.0"]]
                labels = [c for c in labels if c]
                reply = choose_template_for_emotions(labels if labels else ["neutral"])
                all_samples.append((text, reply, "goemotions"))
        else:
            # fallback: if single 'label' column exists
            label_col = safe_get_cols(gdf, ["label","emotion","labels"])
            if label_col and "text" in gdf.columns:
                for _, row in gdf.iterrows():
                    text = str(row["text"]).strip()
                    labels = [l.strip() for l in str(row.get(label_col,"")).split(",") if l.strip()]
                    reply = choose_template_for_emotions(labels)
                    all_samples.append((text, reply, "goemotions"))
    print(f"Collected {len([s for s in all_samples if s[2]=='goemotions'])} goemotions pairs so far.")
else:
    print("GoEmotions path not found, skipping.")

# ---- PersonaChat (convert to empathetic probes) ----
if os.path.exists(persona_path):
    pdf = load_any_csv_or_jsonl(persona_path)
    print("\nPersona dataset loaded:")
    pretty_head(pdf, n=3)
    text_col = safe_get_cols(pdf, ["chat","utterance","text","message","persona_dialog","dialog"])
    if text_col is None:
        # fallback to longest string column
        object_cols = [c for c in pdf.columns if pdf[c].dtype == object]
        if object_cols:
            lengths = {c: pdf[c].astype(str).map(len).mean() for c in object_cols}
            text_col = max(lengths, key=lengths.get)
    print("Using persona text column:", text_col)
    # split each cell into sentences and convert to probes/reflections
    for _, row in pdf.iterrows():
        cell = str(row.get(text_col,"")).strip()
        if not cell:
            continue
        sents = split_sentences(cell)
        for s in sents:
            # optionally ignore pure persona facts (short facts), but we include them and probe feelings
            reply = random.choice(TEMPLATES.get("neutral", TEMPLATES.get("default")))
            # pick better mapping: if sentence contains emotion keywords, pick support
            # simple check:
            lower = s.lower()
            if any(k in lower for k in ["sad","lonely","depress","hurt","angry","scared","nervous","anxious"]):
                # pick sadness/anxiety support
                reply = choose_template_for_emotions(["sadness"])
            all_samples.append((s, reply, "persona"))
    print(f"Collected {len([s for s in all_samples if s[2]=='persona'])} persona-derived pairs so far.")
else:
    print("Persona path not found, skipping.")

# ========== Merge, dedupe, shuffle ==========
print("\nTotal raw samples collected:", len(all_samples))
# Normalize whitespace and simple cleaning
def normalize_text(t: str) -> str:
    t = re.sub(r'\s+', ' ', t).strip()
    return t

norm_pairs = []
seen = set()
for u, r, src in all_samples:
    u2 = normalize_text(u)
    r2 = normalize_text(r)
    key = (u2.lower(), r2.lower())
    if key in seen:
        continue
    seen.add(key)
    norm_pairs.append((u2, r2, src))

print("After dedupe:", len(norm_pairs))

random.shuffle(norm_pairs)

# Optionally expand until target size (MB) by repeating with small paraphrase variations
if TARGET_SIZE_MB is not None:
    target_bytes = int(TARGET_SIZE_MB * 1024 * 1024)
    est = 0
    out_lines = []
    idx = 0
    # small paraphrase function: randomly swap templates for same source
    def paraphrase_reply(reply):
        # naive: just return reply (you can add simple synonyms or template variations here)
        return reply
    while est < target_bytes:
        u, r, s = norm_pairs[idx % len(norm_pairs)]
        r2 = paraphrase_reply(r)
        sample = f"USER: {u}\nCOMPATIKA: {r2}\n\n"
        out_lines.append(sample)
        est += len(sample.encode("utf-8"))
        idx += 1
    print(f"Expanded to {len(out_lines)} lines to reach ~{TARGET_SIZE_MB} MB")
    Path(output_txt).write_text("".join(out_lines), encoding="utf-8")
else:
    # write all normalized pairs to TXT
    with open(output_txt, "w", encoding="utf-8") as f:
        for u, r, s in norm_pairs:
            f.write(f"USER: {u}\nCOMPATIKA: {r}\n\n")
    print(f"Wrote {len(norm_pairs)} samples to {output_txt} (size: {os.path.getsize(output_txt)/(1024*1024):.2f} MB)")

# ========== Done ==========
print("Done. Sample output preview:")
with open(output_txt, "r", encoding="utf-8") as f:
    preview = "".join([next(f) for _ in range(20)]) if os.path.getsize(output_txt)>0 else ""
print(preview)


EmpatheticDialogues loaded:
Shape: (76673, 8)
Columns: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags']


Unnamed: 0,conv_id,utterance_idx,context,prompt,speaker_idx,utterance,selfeval,tags
0,hit:0_conv:1,1,sentimental,I remember going to the fireworks with my best...,1,I remember going to see the fireworks with my ...,5|5|5_2|2|5,
1,hit:0_conv:1,2,sentimental,I remember going to the fireworks with my best...,0,Was this a friend you were in love with_comma_...,5|5|5_2|2|5,
2,hit:0_conv:1,3,sentimental,I remember going to the fireworks with my best...,1,This was a best friend. I miss her.,5|5|5_2|2|5,


Collected 58829 empathetic pairs so far.

GoEmotions loaded:
Shape: (211225, 38)
Columns: ['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear', 'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral', 'clean_text']


Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,clean_text
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381039.0,1,False,0,...,0,0,0,0,0,0,1,0,0,that game hurt
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084169.0,37,True,0,...,0,0,0,0,0,0,0,0,0,sexuality shouldn t be a grouping category it ...
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546427744.0,37,False,0,...,0,0,0,0,0,0,0,0,1,you do right if you don t care then fuck em


Collected 211225 goemotions pairs so far.

Persona dataset loaded:
Shape: (8939, 3)
Columns: ['Unnamed: 0', 'Persona', 'chat']


Unnamed: 0.1,Unnamed: 0,Persona,chat
0,0,i like to remodel homes. i like to go hunting...,"hi , how are you doing ? i am getting ready to..."
1,1,my mom is my best friend. i have four sisters...,"hi , how are you doing today ?\ni am spending ..."
2,2,i had a gig at local theater last night. i wo...,"we all live in a yellow submarine , a yellow s..."


Using persona text column: chat
Collected 210777 persona-derived pairs so far.

Total raw samples collected: 480831
After dedupe: 422648
Wrote 422648 samples to compatika_combined.txt (size: 43.74 MB)
Done. Sample output preview:
USER: Just give me 20 million dollars and let me retire early. gg wp life
COMPATIKA: Thanks for opening up.

USER: i stay home with the kids , my wife is a baker .
COMPATIKA: I’m listening.

USER: that is cool what is your favorite band ?
COMPATIKA: I’m listening.

USER: do you like hiking
COMPATIKA: Thanks for telling me that.

USER: That's the funniest mental image I've had in a very long time!
COMPATIKA: It sounds like something truly uplifting happened.

USER: Don't you even dare try to put any blame on [NAME]
COMPATIKA: I hear your frustration.

USER: [NAME] has no touch..That is why [NAME] kept dropping the ball..
COMPATIKA: Thanks for telling me that.

