In [None]:
import os, sys, json, gc, warnings
from pathlib import Path

# Core data tools
import numpy as np
import pandas as pd

# PyTorch
import torch
import torch.nn.functional as F

# Hugging Face
import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    default_data_collator,
)
from datasets import Dataset
from evaluate import load as load_metric  # optional

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score

# Audio
import librosa
import soundfile as sf
from pydub import AudioSegment
from pydub.utils import which
import music21

# UI
import streamlit as st

# pydub ↔ ffmpeg (use system/conda ffmpeg if available)
AudioSegment.converter = which("ffmpeg") or AudioSegment.converter

# Quiet a few noisy-but-harmless warnings
warnings.filterwarnings("ignore", message=".*Triton.*")
warnings.filterwarnings("ignore", message=".*weight_norm is deprecated.*")
warnings.filterwarnings("ignore", message="pkg_resources is deprecated.*")

# Version sanity checks (comment out if you prefer flexibility)
assert torch.__version__.startswith("2.1.0"), f"Torch pin expected 2.1.0, got {torch.__version__}"
assert transformers.__version__.startswith("4.37."), f"Transformers pin expected 4.37.x, got {transformers.__version__}"
import numpy  # keep separate to check the exact package version below
assert numpy.__version__ == "1.26.4", f"Numpy pin expected 1.26.4, got {numpy.__version__}"


In [2]:
data_path = "data/raw"

# Load GoEmotions parts
geo1 = pd.read_csv(os.path.join(data_path, "goemotions_1.csv"))
geo2 = pd.read_csv(os.path.join(data_path, "goemotions_2.csv"))
geo3 = pd.read_csv(os.path.join(data_path, "goemotions_3.csv"))

# Preview each
print(f"GoEmotions 1: {geo1.shape}\n", geo1.head(2), "\n")
print(f"GoEmotions 2: {geo2.shape}\n", geo2.head(2), "\n")
print(f"GoEmotions 3: {geo3.shape}\n", geo3.head(2), "\n")

# Confirm columns are the same
print("Column consistency check:", geo1.columns.equals(geo2.columns) and geo1.columns.equals(geo3.columns))
print("Columns:", geo1.columns.tolist())

# Combine all three
goemotions_df = pd.concat([geo1, geo2, geo3], ignore_index=True)
print(" Combined GoEmotions shape:", goemotions_df.shape)
print(goemotions_df[['text'] + list(goemotions_df.columns[-10:])].sample(3))

GoEmotions 1: (70000, 37)
                                                 text       id       author  \
0                                    That game hurt.  eew5j0j        Brdd9   
1   >sexuality shouldn’t be a grouping category I...  eemcysk  TheGreen888   

          subreddit    link_id   parent_id   created_utc  rater_id  \
0               nrl  t3_ajis4z  t1_eew18eq  1.548381e+09         1   
1  unpopularopinion  t3_ai4q37   t3_ai4q37  1.548084e+09        37   

   example_very_unclear  admiration  ...  love  nervousness  optimism  pride  \
0                 False           0  ...     0            0         0      0   
1                  True           0  ...     0            0         0      0   

   realization  relief  remorse  sadness  surprise  neutral  
0            0       0        0        1         0        0  
1            0       0        0        0         0        0  

[2 rows x 37 columns] 

GoEmotions 2: (70000, 37)
                              text       id      

In [None]:
# --- GoEmotions cleanup ---
# Remove unclear examples
goemotions_df = goemotions_df[goemotions_df["example_very_unclear"] == False].reset_index(drop=True)

# Emotion columns are after the first 10 metadata columns
emotion_columns = goemotions_df.columns[10:]

# Keep rows with at least one emotion tag
goemotions_df = goemotions_df[goemotions_df[emotion_columns].sum(axis=1) > 0]

# Pick the first tagged emotion as a single label
goemotions_df["label"] = goemotions_df[emotion_columns].idxmax(axis=1)

# Keep only text and label
goemotions_clean = goemotions_df[["text", "label"]].dropna().reset_index(drop=True)

# Quick peek
print("Cleaned GoEmotions sample:")
print(goemotions_clean.sample(5))
print("Shape:", goemotions_clean.shape)

# Save
goemotions_clean.to_csv("data/raw/goemotions_df.csv", index=False)

# --- Dreaddit cleanup ---
df_train = pd.read_csv("data/raw/dreaddit-train.csv")
df_test = pd.read_csv("data/raw/dreaddit-test.csv")

# Merge and keep only text/label
dreaddit_full = pd.concat([df_train, df_test], ignore_index=True)[["text", "label"]]
dreaddit_full.dropna(subset=["text", "label"], inplace=True)

# Map numeric to readable labels
dreaddit_full["label"] = dreaddit_full["label"].map({1: "distress", 0: "no_distress"})

# Quick peek
print("Cleaned Dreaddit sample:")
print(dreaddit_full.sample(5))
print("Shape:", dreaddit_full.shape)

# Save
dreaddit_full.to_csv("data/raw/dreaddit_df.csv", index=False)


✅ Cleaned GoEmotions sample:
                                                     text      label
35665            out of curiosity, What are you watching?  curiosity
107764  [NAME] has continued to show how selfish she i...   surprise
193067                                          Source? 😂  curiosity
125576  This is so true!! I'm vaping mango right now! ...  annoyance
172973                 The best ever you are the [NAME]!!  annoyance
Shape: (197283, 2)
✅ Cleaned Dreaddit sample:
                                                   text        label
2471  I live in BC and im gonna be homeless soon, I'...  no_distress
3482  I then confronted my parents and finally the w...  no_distress
2902  I feel like a completely different person. My ...     distress
2720  I'm in need of quick assistance to make a purc...  no_distress
1861  I've stayed up at night hearing them quietly t...     distress
Shape: (3553, 2)


In [None]:
# Load cleaned dataset
goemotions_df = pd.read_csv("data/raw/goemotions_df.csv")
goemotions_df['label'] = goemotions_df['label'].astype('category')
goemotions_df['label_id'] = goemotions_df['label'].cat.codes

label2id = dict(enumerate(goemotions_df['label'].cat.categories))
id2label = {v: k for k, v in label2id.items()}

# Create Hugging Face Dataset
goemotions_dataset = Dataset.from_pandas(
    goemotions_df[['text', 'label_id']].rename(columns={'label_id': 'label'})
)

# Model + tokenizer
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(label2id),
    id2label=label2id,
    label2id=id2label
).to("cuda")

print("Using device:", model.device)

# Tokenization
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

tokenized_dataset = goemotions_dataset.map(tokenize, batched=True, remove_columns=["text"])
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)

# Metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")
    }

# Training args
training_args = TrainingArguments(
    output_dir="./results_goemotions_v2",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs_goemotions_v2",
    do_train=True,
    do_eval=True,
    logging_steps=50,
    save_steps=500
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

# Save trained model + tokenizer
save_path = "./saved_models/goemotions_distilbert_v3"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model and tokenizer saved to: {save_path}")


In [5]:
save_path = "./saved_models/goemotions_distilbert_v3"
loaded_tokenizer = AutoTokenizer.from_pretrained(save_path)

loaded_model = AutoModelForSequenceClassification.from_pretrained(save_path)


In [None]:
# Load and prep Dreaddit dataset
dreaddit_df = pd.read_csv("data/raw/dreaddit_df.csv")[['text', 'label']].dropna()
dreaddit_df["label"] = dreaddit_df["label"].map({"no_distress": 0, "distress": 1})

# Convert to Hugging Face Dataset
dreaddit_dataset = Dataset.from_pandas(dreaddit_df)

# Tokenizer
checkpoint = "distilbert-base-uncased"
tokenizer_dreaddit = AutoTokenizer.from_pretrained(checkpoint)

def tokenize(batch):
    return tokenizer_dreaddit(batch["text"], padding=True, truncation=True)

tokenized_dreaddit = dreaddit_dataset.map(tokenize, batched=True, remove_columns=["text"])
tokenized_dreaddit = tokenized_dreaddit.train_test_split(test_size=0.1)

# Binary classifier
model_dreaddit = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=2
).to("cuda")

# Metrics
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds)
    }

# Training setup
training_args_dreaddit = TrainingArguments(
    output_dir="./results_dreaddit",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs_dreaddit"
)

trainer_dreaddit = Trainer(
    model=model_dreaddit,
    args=training_args_dreaddit,
    train_dataset=tokenized_dreaddit["train"],
    eval_dataset=tokenized_dreaddit["test"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer_dreaddit
)

# Train
trainer_dreaddit.train()

# Save model + tokenizer
save_path = "saved_models/distress_classifier"
trainer_dreaddit.save_model(save_path)
tokenizer_dreaddit.save_pretrained(save_path)
print(f"Distress model saved to: {save_path}")

# Evaluate
metrics_dreaddit = trainer_dreaddit.evaluate()
print("Distress Model Evaluation Results:")
for k, v in metrics_dreaddit.items():
    print(f"{k}: {v:.4f}")

# Quick inference demo
sample_text = ["I'm feeling overwhelmed and anxious."]
inputs = tokenizer_dreaddit(sample_text, return_tensors="pt").to(model_dreaddit.device)
outputs = model_dreaddit(**inputs)
pred = torch.argmax(outputs.logits, dim=1)
print(f"Predicted label: {'distress' if pred.item() == 1 else 'no_distress'}")


In [None]:
# Load saved model + tokenizer
save_path = "./saved_models/goemotions_distilbert_v3"
loaded_model = AutoModelForSequenceClassification.from_pretrained(save_path).to("cuda")
loaded_tokenizer = AutoTokenizer.from_pretrained(save_path)

# Load dataset and map labels to ids
goemotions_df = pd.read_csv("data/raw/goemotions_df.csv")
goemotions_df["label"] = goemotions_df["label"].astype("category")
goemotions_df["label_id"] = goemotions_df["label"].cat.codes

goemotions_dataset = Dataset.from_pandas(
    goemotions_df[["text", "label_id"]].rename(columns={"label_id": "label"})
)

# Tokenize
def tokenize(batch):
    return loaded_tokenizer(batch["text"], padding=True, truncation=True)

tokenized = goemotions_dataset.map(tokenize, batched=True, remove_columns=["text"])
tokenized = tokenized.train_test_split(test_size=0.1)

# Metrics
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")
    }

# Eval setup
eval_args = TrainingArguments(
    output_dir="./eval_results",
    per_device_eval_batch_size=16,
    do_eval=True,
    logging_dir="./logs_eval"
)

trainer_eval = Trainer(
    model=loaded_model,
    args=eval_args,
    eval_dataset=tokenized["test"],
    compute_metrics=compute_metrics,
    tokenizer=loaded_tokenizer
)

# Evaluate
results = trainer_eval.evaluate()

# Report
print("GoEmotions Model Evaluation (Reloaded):")
for k, v in results.items():
    print(f"{k}: {v:.4f}")


In [None]:

warnings.filterwarnings("ignore", message=".*Triton.*")

# Paths and basic setup
CSV_PATH = r"C:/SonicAid_clean/data/raw/goemotions_df.csv"  # columns: text,label
RUN_DIR  = r"C:/SonicAid_clean/part1_emotion/roberta-goemotion-run"
SAVE_DIR = r"C:/SonicAid_clean/saved_models/roberta-goemotion-final"
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(SAVE_DIR, exist_ok=True)

SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# 1) Load and de-duplicate by text (keep majority label)
df = pd.read_csv(CSV_PATH).dropna(subset=["text","label"]).reset_index(drop=True)
df["label"] = df["label"].astype(str).str.strip()

def majority(s):
    vc = s.value_counts()
    return vc.index[0]

df_unique = df.groupby("text", as_index=False)["label"].apply(majority)
print(f"Original rows: {len(df)}, unique by text: {len(df_unique)}")

# Sample half per label for a quicker run (still stratified)
df_half = (
    df_unique.groupby("label", group_keys=False)
    .apply(lambda g: g.sample(frac=0.5, random_state=SEED) if len(g) > 1 else g)
    .reset_index(drop=True)
)
print(f"Using half of unique set: {len(df_half)} rows")

# 2) Encode labels and split
le = LabelEncoder()
y_all = le.fit_transform(df_half["label"])
num_labels = len(le.classes_)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_half["text"], y_all, test_size=0.1, random_state=SEED, stratify=y_all
)

# 3) Tokenizer / model
ckpt = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForSequenceClassification.from_pretrained(ckpt, num_labels=num_labels)

# Store label maps in config for consistent reloads
id2label = {int(i): lab for i, lab in enumerate(le.classes_.tolist())}
label2id = {lab: int(i) for i, lab in id2label.items()}
model.config.id2label = id2label
model.config.label2id = label2id
model.config.problem_type = "single_label_classification"

# 4) Build HF datasets
train_ds = Dataset.from_dict({"text": train_texts.tolist(), "label": train_labels})
val_ds   = Dataset.from_dict({"text": val_texts.tolist(),   "label": val_labels})

def tok_fn(batch):
    return tokenizer(
        batch["text"],
        padding=False,          # dynamic padding via collator
        truncation=True,
        max_length=48           # p99 ≈ 37; 48 is safe
    )

train_tok = train_ds.map(tok_fn, batched=True)
val_tok   = val_ds.map(tok_fn,   batched=True)

# Keep only what the model needs and set torch format
keep = ["input_ids", "attention_mask", "label"]
train_tok = train_tok.remove_columns([c for c in train_tok.column_names if c not in keep]).with_format("torch")
val_tok   = val_tok.remove_columns([c for c in val_tok.column_names   if c not in keep]).with_format("torch")

# Dynamic padding (pad to multiple of 8 to help tensor cores)
collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

# 5) Class‑weighted loss to handle imbalance
cls_weights = compute_class_weight(class_weight="balanced",
                                   classes=np.arange(num_labels),
                                   y=train_labels)
cls_weights = torch.tensor(cls_weights, dtype=torch.float)

class WeightedTrainer(Trainer):
    def __init__(self, class_weights=None, **kwargs):
        super().__init__(**kwargs)
        self.class_weights = class_weights
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fn = torch.nn.CrossEntropyLoss(
            weight=self.class_weights.to(model.device) if self.class_weights is not None else None
        )
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

# 6) Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_weighted": f1_score(labels, preds, average="weighted", zero_division=0),
        "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
    }

# 7) Training args (lean but solid)
training_args = TrainingArguments(
    output_dir=RUN_DIR,
    per_device_train_batch_size=16,   # shorter seq → higher batch ok
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,    # effective train batch ≈ 32
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.05,
    warmup_ratio=0.06,
    lr_scheduler_type="cosine",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",
    greater_is_better=True,
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    optim="adamw_torch_fused",
    group_by_length=True,
    dataloader_num_workers=2,
    report_to="none",
    save_total_limit=2,
    seed=42,
)

# 8) Train, evaluate, save
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
    class_weights=cls_weights,
)

print("Starting training…")
trainer.train()

print("Evaluating best checkpoint…")
metrics = trainer.evaluate()
for k, v in metrics.items():
    if isinstance(v, float):
        print(f"{k}: {v:.4f}")

print(f"Saving to: {SAVE_DIR}")
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

with open(os.path.join(SAVE_DIR, "label_classes.json"), "w", encoding="utf-8") as f:
    json.dump(list(le.classes_), f, ensure_ascii=False, indent=2)

print("Done.")


Original rows: 197283, unique by text: 57080
Using half of unique set: 28537 rows


  .apply(lambda g: g.sample(frac=0.5, random_state=SEED) if len(g) > 1 else g)
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25683 [00:00<?, ? examples/s]

Map:   0%|          | 0/2854 [00:00<?, ? examples/s]

GPU: NVIDIA GeForce GTX 1650 Ti
Starting training…


  0%|          | 0/4015 [00:00<?, ?it/s]

{'loss': 3.2992, 'learning_rate': 4.149377593360996e-06, 'epoch': 0.06}
{'loss': 3.2958, 'learning_rate': 8.298755186721992e-06, 'epoch': 0.12}
{'loss': 3.3018, 'learning_rate': 1.2448132780082988e-05, 'epoch': 0.19}
{'loss': 3.2682, 'learning_rate': 1.6597510373443984e-05, 'epoch': 0.25}
{'loss': 3.1788, 'learning_rate': 1.9999719360404577e-05, 'epoch': 0.31}
{'loss': 2.8435, 'learning_rate': 1.998794179447606e-05, 'epoch': 0.37}
{'loss': 2.495, 'learning_rate': 1.9958864102458794e-05, 'epoch': 0.44}
{'loss': 2.4618, 'learning_rate': 1.9912536649858366e-05, 'epoch': 0.5}
{'loss': 2.3064, 'learning_rate': 1.984903968051158e-05, 'epoch': 0.56}
{'loss': 2.2653, 'learning_rate': 1.9768483177596008e-05, 'epoch': 0.62}
{'loss': 2.1955, 'learning_rate': 1.9671006673127994e-05, 'epoch': 0.68}
{'loss': 2.0541, 'learning_rate': 1.955677900627908e-05, 'epoch': 0.75}
{'loss': 2.0724, 'learning_rate': 1.9425998030929474e-05, 'epoch': 0.81}
{'loss': 2.1001, 'learning_rate': 1.9278890272965097e-05, 

  0%|          | 0/45 [00:00<?, ?it/s]

{'eval_loss': 2.0359244346618652, 'eval_accuracy': 0.3342676944639103, 'eval_f1_weighted': 0.29151657677233767, 'eval_f1_macro': 0.31101580615351104, 'eval_runtime': 74.4456, 'eval_samples_per_second': 38.337, 'eval_steps_per_second': 0.604, 'epoch': 1.0}
{'loss': 1.8118, 'learning_rate': 1.8742293060529445e-05, 'epoch': 1.06}
{'loss': 1.8834, 'learning_rate': 1.8532702115065525e-05, 'epoch': 1.12}
{'loss': 1.793, 'learning_rate': 1.8308331665925486e-05, 'epoch': 1.18}
{'loss': 1.8543, 'learning_rate': 1.8069570345436236e-05, 'epoch': 1.25}
{'loss': 1.8725, 'learning_rate': 1.781683171237041e-05, 'epoch': 1.31}
{'loss': 1.8837, 'learning_rate': 1.7550553535620684e-05, 'epoch': 1.37}
{'loss': 1.7438, 'learning_rate': 1.7271197035939767e-05, 'epoch': 1.43}
{'loss': 1.9548, 'learning_rate': 1.697924608705937e-05, 'epoch': 1.49}
{'loss': 1.7632, 'learning_rate': 1.6675206377571925e-05, 'epoch': 1.56}
{'loss': 1.7338, 'learning_rate': 1.6359604535026768e-05, 'epoch': 1.62}
{'loss': 1.8384, 

  0%|          | 0/45 [00:00<?, ?it/s]

{'eval_loss': 1.9112491607666016, 'eval_accuracy': 0.38507358093903293, 'eval_f1_weighted': 0.36198189782394063, 'eval_f1_macro': 0.36123166984131244, 'eval_runtime': 92.9724, 'eval_samples_per_second': 30.697, 'eval_steps_per_second': 0.484, 'epoch': 2.0}
{'loss': 1.518, 'learning_rate': 1.3874848907576422e-05, 'epoch': 2.05}
{'loss': 1.5943, 'learning_rate': 1.3487905000400103e-05, 'epoch': 2.12}
{'loss': 1.5887, 'learning_rate': 1.3094919688744852e-05, 'epoch': 2.18}
{'loss': 1.5181, 'learning_rate': 1.2696573662945321e-05, 'epoch': 2.24}
{'loss': 1.5666, 'learning_rate': 1.2293556898635646e-05, 'epoch': 2.3}
{'loss': 1.5224, 'learning_rate': 1.1886567461641791e-05, 'epoch': 2.37}
{'loss': 1.5225, 'learning_rate': 1.1476310298860886e-05, 'epoch': 2.43}
{'loss': 1.6085, 'learning_rate': 1.1063496017221868e-05, 'epoch': 2.49}
{'loss': 1.5728, 'learning_rate': 1.064883965284234e-05, 'epoch': 2.55}
{'loss': 1.5631, 'learning_rate': 1.0233059432513654e-05, 'epoch': 2.62}
{'loss': 1.5546,

  0%|          | 0/45 [00:00<?, ?it/s]

{'eval_loss': 1.8608595132827759, 'eval_accuracy': 0.3931324456902593, 'eval_f1_weighted': 0.376677507484159, 'eval_f1_macro': 0.3739792335467585, 'eval_runtime': 76.663, 'eval_samples_per_second': 37.228, 'eval_steps_per_second': 0.587, 'epoch': 3.0}
{'loss': 1.3861, 'learning_rate': 7.351555423824361e-06, 'epoch': 3.05}
{'loss': 1.3596, 'learning_rate': 6.952612308085057e-06, 'epoch': 3.11}
{'loss': 1.3252, 'learning_rate': 6.558947576260705e-06, 'epoch': 3.18}
{'loss': 1.3764, 'learning_rate': 6.171243095510463e-06, 'epoch': 3.24}
{'loss': 1.278, 'learning_rate': 5.790170409235387e-06, 'epoch': 3.3}
{'loss': 1.3932, 'learning_rate': 5.416389573897269e-06, 'epoch': 3.36}
{'loss': 1.2705, 'learning_rate': 5.050548015734069e-06, 'epoch': 3.42}
{'loss': 1.3623, 'learning_rate': 4.6932794093521215e-06, 'epoch': 3.49}
{'loss': 1.3584, 'learning_rate': 4.345202580137597e-06, 'epoch': 3.55}
{'loss': 1.3409, 'learning_rate': 4.0069204323882826e-06, 'epoch': 3.61}
{'loss': 1.3834, 'learning_r

  0%|          | 0/45 [00:00<?, ?it/s]

{'eval_loss': 1.882143497467041, 'eval_accuracy': 0.3959355290819902, 'eval_f1_weighted': 0.38248937520401444, 'eval_f1_macro': 0.3729371664626534, 'eval_runtime': 73.4421, 'eval_samples_per_second': 38.861, 'eval_steps_per_second': 0.613, 'epoch': 4.0}
{'loss': 1.2699, 'learning_rate': 1.960028696330596e-06, 'epoch': 4.05}
{'loss': 1.2605, 'learning_rate': 1.719567778395449e-06, 'epoch': 4.11}
{'loss': 1.2403, 'learning_rate': 1.493449407148182e-06, 'epoch': 4.17}
{'loss': 1.1478, 'learning_rate': 1.282065242493713e-06, 'epoch': 4.23}
{'loss': 1.1927, 'learning_rate': 1.0857814231998664e-06, 'epoch': 4.3}
{'loss': 1.2776, 'learning_rate': 9.049379327079543e-07, 'epoch': 4.36}
{'loss': 1.2024, 'learning_rate': 7.39848010247064e-07, 'epoch': 4.42}
{'loss': 1.135, 'learning_rate': 5.907976082719958e-07, 'epoch': 4.48}
{'loss': 1.2842, 'learning_rate': 4.5804489716467895e-07, 'epoch': 4.55}
{'loss': 1.2247, 'learning_rate': 3.418198180569332e-07, 'epoch': 4.61}
{'loss': 1.2678, 'learning_

  0%|          | 0/45 [00:00<?, ?it/s]

{'eval_loss': 1.8837237358093262, 'eval_accuracy': 0.39558514365802383, 'eval_f1_weighted': 0.38009256548285597, 'eval_f1_macro': 0.37563646567185055, 'eval_runtime': 83.947, 'eval_samples_per_second': 33.998, 'eval_steps_per_second': 0.536, 'epoch': 5.0}
{'train_runtime': 6326.1584, 'train_samples_per_second': 20.299, 'train_steps_per_second': 0.635, 'train_loss': 1.7006105881402382, 'epoch': 5.0}
Evaluating best checkpoint…


  0%|          | 0/45 [00:00<?, ?it/s]

eval_loss: 1.8821
eval_accuracy: 0.3959
eval_f1_weighted: 0.3825
eval_f1_macro: 0.3729
eval_runtime: 81.1000
eval_samples_per_second: 35.1910
eval_steps_per_second: 0.5550
epoch: 5.0000
Saving to: C:/SonicAid_clean/saved_models/roberta-goemotion-final
Done.


In [None]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
df = pd.read_csv(r"C:/SonicAid_clean/data/raw/goemotions_df.csv").dropna(subset=["text"])
lens = [len(tokenizer(t, truncation=False)["input_ids"]) for t in df["text"].tolist()[:5000]]  # sample
print("p90,p95,p99:", np.percentile(lens, [90,95,99]))




p90,p95,p99: [31. 33. 37.]


In [None]:

# Paths
GOEMO_PATH = r"C:/SonicAid_clean/saved_models/roberta-goemotion-final"
DISTRESS_PATH = r"C:/SonicAid_clean/saved_models/distress_classifier"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def _norm(s: str) -> str:
    return str(s).strip().lower().replace(" ", "_").replace("-", "_")

# Load GoEmotions model/tokenizer (try fast tokenizer first)
goemotions_model = AutoModelForSequenceClassification.from_pretrained(GOEMO_PATH).to(device).eval()
try:
    goemotions_tokenizer = AutoTokenizer.from_pretrained(GOEMO_PATH, use_fast=True, local_files_only=True)
except Exception:
    goemotions_tokenizer = AutoTokenizer.from_pretrained(GOEMO_PATH, use_fast=False, local_files_only=True)

# Load distress model/tokenizer
distress_model = AutoModelForSequenceClassification.from_pretrained(DISTRESS_PATH).to(device).eval()
try:
    distress_tokenizer = AutoTokenizer.from_pretrained(DISTRESS_PATH, use_fast=True, local_files_only=True)
except Exception:
    distress_tokenizer = AutoTokenizer.from_pretrained(DISTRESS_PATH, use_fast=False, local_files_only=True)

# Resolve id2label (prefer saved label_classes.json)
id2label = None
label_json = os.path.join(GOEMO_PATH, "label_classes.json")
if os.path.isfile(label_json):
    with open(label_json, "r", encoding="utf-8") as f:
        classes = json.load(f)
    id2label = {i: classes[i] for i in range(len(classes))}
else:
    cfg = goemotions_model.config
    if isinstance(cfg.id2label, dict) and len(cfg.id2label) == cfg.num_labels:
        names = [cfg.id2label.get(str(i), cfg.id2label.get(i, f"label_{i}")) for i in range(cfg.num_labels)]
        id2label = {i: names[i] for i in range(len(names))}

assert id2label is not None, "Could not resolve id2label for GoEmotions model."
label_names = [id2label[i] for i in range(len(id2label))]
label_names_norm = [_norm(x) for x in label_names]

# Fine → coarse buckets (for music prompts)
FINE_TO_COARSE = {
    "joy": {"amusement","excitement","joy","optimism","pride","approval"},
    "sadness": {"sadness","disappointment","grief","remorse"},
    "anger": {"anger","annoyance","disgust"},
    "fear": {"fear","nervousness"},
    "love": {"love","caring","admiration"},
    "gratitude": {"gratitude"},
    "neutral": {"neutral","relief","realization","curiosity","confusion","surprise","desire","embarrassment","disapproval"},
}
FINE_TO_COARSE = {k: {_norm(v) for v in vals} for k, vals in FINE_TO_COARSE.items()}

def fine_to_coarse(label_str: str) -> str:
    n = _norm(label_str)
    for coarse, fines in FINE_TO_COARSE.items():
        if n in fines:
            return coarse
    return "neutral"

# Distress vs non‑distress groups (normalized)
distress_emotions = {_norm(x) for x in {"anger","fear","grief","remorse","sadness","disappointment","nervousness","disgust","disapproval"}}
non_distress_emotions = {_norm(x) for x in {"joy","love","gratitude","optimism","amusement","excitement","surprise","relief","admiration","approval","caring"}}

@torch.no_grad()
def hybrid_predict(text: str, topk: int = 5, max_length: int = 96):
    # 1) Distress classifier
    di = distress_tokenizer([text], return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)
    dlogits = distress_model(**di).logits
    dprobs = torch.softmax(dlogits, dim=1)
    is_distress = int(torch.argmax(dprobs, dim=1).item())
    distress_conf = float(dprobs[0, is_distress].item())

    # 2) GoEmotions classifier
    gi = goemotions_tokenizer([text], return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)
    logits = goemotions_model(**gi).logits
    probs = torch.softmax(logits, dim=1).detach().cpu().numpy()[0]

    # 3) Top‑k fine labels
    k = min(topk, len(probs))
    top_idx = np.argsort(probs)[::-1][:k]
    top_emotions = [(label_names[i], float(probs[i])) for i in top_idx]
    top_norm = [(_norm(label_names[i]), float(probs[i])) for i in top_idx]

    # 4) Filter by distress flag (ensure non‑empty)
    if is_distress:
        filtered = [(lab, p) for lab, p in top_norm if lab in distress_emotions]
    else:
        filtered = [(lab, p) for lab, p in top_norm if lab in non_distress_emotions]
    if not filtered:
        filtered = [top_norm[0]]

    # 5) Pick final label and coarse bucket
    chosen_fine_norm, chosen_prob = filtered[0]
    try:
        idx = label_names_norm.index(chosen_fine_norm)
        chosen_fine = label_names[idx]
    except ValueError:
        chosen_fine = chosen_fine_norm
    chosen_coarse = fine_to_coarse(chosen_fine)

    return {
        "text": text,
        "distress": bool(is_distress),
        "distress_conf": round(distress_conf, 4),
        "top_emotions": [(label_names[i], round(float(probs[i]), 4)) for i in top_idx],
        "filtered_emotions": [(lab, round(p,4)) for lab, p in filtered],
        "chosen_emotion_fine": chosen_fine,
        "chosen_emotion_coarse": chosen_coarse,
        "chosen_prob": round(float(chosen_prob), 4),
    }

# Quick demo
if __name__ == "__main__":
    sample = "I feel so hopeless and tired of everything."
    out = hybrid_predict(sample)
    print(out)


{'text': 'I feel so hopeless and tired of everything.', 'distress': True, 'distress_conf': 0.9791, 'top_emotions': [('sadness', 0.5292), ('disappointment', 0.321), ('annoyance', 0.022), ('nervousness', 0.0165), ('disgust', 0.0155)], 'filtered_emotions': [('sadness', 0.5292), ('disappointment', 0.321), ('nervousness', 0.0165), ('disgust', 0.0155)], 'chosen_emotion_fine': 'sadness', 'chosen_emotion_coarse': 'sadness', 'chosen_prob': 0.5292}


: 

In [None]:

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Distress (binary) model
distress_model = AutoModelForSequenceClassification.from_pretrained("saved_models/distress_classifier").to(device)
distress_tokenizer = AutoTokenizer.from_pretrained("saved_models/distress_classifier")

# GoEmotions (multi-class, DistilBERT)
goemotions_model = AutoModelForSequenceClassification.from_pretrained("saved_models/goemotions_distilbert_v3").to(device)
goemotions_tokenizer = AutoTokenizer.from_pretrained("saved_models/goemotions_distilbert_v3")

# Label map for GoEmotions
id2emotion = {
    0: "amusement", 1: "anger", 2: "annoyance", 3: "approval", 4: "caring", 5: "confusion", 6: "curiosity",
    7: "desire", 8: "disappointment", 9: "disapproval", 10: "disgust", 11: "embarrassment", 12: "excitement",
    13: "fear", 14: "gratitude", 15: "grief", 16: "joy", 17: "love", 18: "nervousness", 19: "neutral",
    20: "optimism", 21: "pride", 22: "realization", 23: "relief", 24: "remorse", 25: "sadness", 26: "surprise"
}

# Simple buckets for filtering
distress_emotions = {"anger", "fear", "grief", "remorse", "sadness", "disappointment", "nervousness"}
non_distress_emotions = {"joy", "love", "gratitude", "optimism", "amusement", "excitement", "surprise"}

def hybrid_predict(text):
    # Distress prediction
    dist_input = distress_tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        dist_output = distress_model(**dist_input)
        is_distress = torch.argmax(dist_output.logits, dim=1).item()

    # GoEmotions prediction
    goemo_input = goemotions_tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        goemo_output = goemotions_model(**goemo_input)
        probs = torch.softmax(goemo_output.logits, dim=1).cpu().numpy()[0]

    # Top‑5 emotions
    top_indices = np.argsort(probs)[::-1][:5]
    top_emotions = [(id2emotion[i], round(probs[i], 4)) for i in top_indices]

    # Filter by distress flag
    if is_distress:
        filtered = [(emo, prob) for emo, prob in top_emotions if emo in distress_emotions]
    else:
        filtered = [(emo, prob) for emo, prob in top_emotions if emo in non_distress_emotions]

    return {
        "text": text,
        "distress": bool(is_distress),
        "top_emotions": top_emotions,
        "filtered_emotions": filtered
    }

# Example
sample = "Everything feels overwhelming and out of control lately."
result = hybrid_predict(sample)

print("\nText:", result["text"])
print("Detected Distress:" if result["distress"] else "🙂 No Distress Detected")
print("\nTop Emotions:", result["top_emotions"])
print("Filtered Emotions:", result["filtered_emotions"])



📌 Text: Everything feels overwhelming and out of control lately.
😔 Detected Distress:

Top Emotions: [('disappointment', 0.3716), ('sadness', 0.1356), ('annoyance', 0.1336), ('nervousness', 0.1001), ('neutral', 0.0719)]
🎯 Filtered Emotions: [('disappointment', 0.3716), ('sadness', 0.1356), ('nervousness', 0.1001)]
