In [None]:
# Mount Google Drive at /content/drive; the dataset and model directories used
# by the cells below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install/upgrade the Hugging Face stack and scikit-learn quietly (-q).
# Only scikit-learn is version-pinned; the other packages float to the latest release (-U).
!pip -q install -U transformers datasets evaluate accelerate scikit-learn==1.6


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m60.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m90.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from datasets import load_from_disk
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer)
import torch, numpy as np, json, os
from sklearn.metrics import f1_score, precision_score, recall_score

# Input dataset (saved with `datasets`) and output model directory, both on the mounted Drive.
DATASET_DIR = "/content/drive/MyDrive/emotional_ai/data/text/goemotions"
OUTPUT_DIR  = "/content/drive/MyDrive/emotional_ai/models/text_bert_goemotions"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Use the GPU when one is available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# Load the dataset from disk and read the class names from the `labels` feature
# of the train split (a sequence feature whose inner feature carries `.names`).
ds = load_from_disk(DATASET_DIR)
label_names = ds['train'].features['labels'].feature.names
num_labels  = len(label_names)
print("num_labels:", num_labels)

# Base checkpoint; its tokenizer is used by `preprocess` below and saved with the model.
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def to_multihot(labels, num_labels):
    """Build a multi-hot target vector.

    Args:
        labels: iterable of integer class indices that are active for one example.
        num_labels: length of the returned vector.

    Returns:
        A float32 NumPy array of length ``num_labels`` holding 1.0 at every
        index listed in ``labels`` and 0.0 everywhere else.
    """
    multihot = np.zeros((num_labels,), dtype=np.float32)
    active = list(labels)
    if active:  # nothing to set for an example without labels
        multihot[active] = 1.0
    return multihot

def preprocess(batch):
    """Tokenize a batch of examples and attach multi-hot label vectors.

    Args:
        batch: mapping with a ``"text"`` column and a ``"labels"`` column, where
            each ``"labels"`` entry is a collection of class indices.

    Returns:
        The tokenizer output (truncated to 128 tokens, unpadded) with its
        ``"labels"`` entry set to one multi-hot vector per example.
    """
    tokenized = tokenizer(batch["text"], truncation=True, max_length=128)
    multihot_rows = []
    for label_ids in batch["labels"]:
        multihot_rows.append(to_multihot(label_ids, num_labels))
    tokenized["labels"] = multihot_rows
    return tokenized

# Tokenize every split in batches. All original columns (taken from the train split)
# are dropped, so only the fields returned by `preprocess` remain.
encoded = ds.map(preprocess, batched=True, remove_columns=ds["train"].column_names)

# Sequence-classification head with one output per label; the
# "multi_label_classification" problem type is requested explicitly.
# The classifier weights are freshly initialized (see the load warning in the cell output).
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    problem_type="multi_label_classification",
).to(device)

# metrics
def compute_metrics(eval_pred, threshold=0.5):
    """Compute micro- and macro-averaged F1, precision and recall.

    Args:
        eval_pred: pair ``(logits, labels)`` as NumPy arrays.
        threshold: probability cut-off; a label is predicted when its sigmoid
            probability is greater than or equal to this value.

    Returns:
        Dict with keys ``f1_*``, ``precision_*`` and ``recall_*`` for both the
        ``micro`` and ``macro`` averages. Undefined ratios are reported as 0.
    """
    logits, labels = eval_pred
    # Element-wise sigmoid, then binarize at the threshold.
    preds = (1 / (1 + np.exp(-logits)) >= threshold).astype(int)

    plan = (
        ("f1_micro", f1_score, "micro"),
        ("precision_micro", precision_score, "micro"),
        ("recall_micro", recall_score, "micro"),
        ("f1_macro", f1_score, "macro"),
        ("precision_macro", precision_score, "macro"),
        ("recall_macro", recall_score, "macro"),
    )
    return {
        key: scorer(labels, preds, average=avg, zero_division=0)
        for key, scorer, avg in plan
    }

# collator: ensure labels are float32
class DataCollatorForMultiLabel(DataCollatorWithPadding):
    """Padding collator that always emits ``labels`` as a float32 tensor."""

    def __call__(self, features):
        """Pad ``features`` via the parent collator, then rebuild ``labels`` as float32.

        Args:
            features: list of per-example dicts, each containing a ``"labels"`` entry.

        Returns:
            The padded batch with ``batch["labels"]`` replaced by a float32 tensor
            built from the per-example label vectors.
        """
        padded = super().__call__(features)
        label_rows = [example["labels"] for example in features]
        padded["labels"] = torch.tensor(label_rows, dtype=torch.float32)
        return padded

# Collator instance shared by the Trainer and the sanity check below.
data_collator = DataCollatorForMultiLabel(tokenizer=tokenizer)

# Training configuration: evaluate and checkpoint once per epoch, and reload the
# checkpoint with the highest validation `f1_micro` when training finishes.
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.06,
    lr_scheduler_type="linear",
    load_best_model_at_end=True,
    metric_for_best_model="f1_micro",   # key produced by compute_metrics
    greater_is_better=True,
    fp16=torch.cuda.is_available(),     # mixed precision only when a GPU is present
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

# Sanity check: collate two training examples and confirm the label dtype
# before starting the (long) training run.
b = data_collator([encoded["train"][0], encoded["train"][1]])
print("labels dtype:", b["labels"].dtype)  # should be torch.float32

trainer.train()

# Evaluate once on the held-out test split (metrics use the fixed 0.5 threshold
# from compute_metrics).
test_metrics = trainer.evaluate(encoded["test"])
print("Test metrics:", test_metrics)

# Persist model, tokenizer and the ordered label names to OUTPUT_DIR.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
with open(os.path.join(OUTPUT_DIR, "label_names.json"), "w") as f:
    json.dump(label_names, f, indent=2)

print("Saved to:", OUTPUT_DIR)


Device: cuda
num_labels: 28


Map:   0%|          | 0/39069 [00:00<?, ? examples/s]

Map:   0%|          | 0/2171 [00:00<?, ? examples/s]

Map:   0%|          | 0/2170 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


labels dtype: torch.float32


Epoch,Training Loss,Validation Loss,F1 Micro,Precision Micro,Recall Micro,F1 Macro,Precision Macro,Recall Macro
1,0.1059,0.103807,0.461744,0.735094,0.336583,0.154957,0.226817,0.137481
2,0.092,0.092666,0.51189,0.743133,0.390406,0.259235,0.421699,0.224149
3,0.0777,0.0918,0.545233,0.727391,0.436037,0.344521,0.518891,0.289225


Test metrics: {'eval_loss': 0.09370981156826019, 'eval_f1_micro': 0.5195435785384802, 'eval_precision_micro': 0.6885456885456885, 'eval_recall_micro': 0.4171539961013645, 'eval_f1_macro': 0.3225448384128381, 'eval_precision_macro': 0.48861651919730853, 'eval_recall_macro': 0.28005166960420586, 'eval_runtime': 2.9021, 'eval_samples_per_second': 748.072, 'eval_steps_per_second': 23.431, 'epoch': 3.0}
Saved to: /content/drive/MyDrive/emotional_ai/models/text_bert_goemotions


In [None]:
# Weighted fine-tune + threshold tuning (self-healing)
import os, json, numpy as np, torch
from datasets import load_from_disk
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score, precision_score, recall_score
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer)
from transformers.trainer_callback import EarlyStoppingCallback

# Paths and device.
# Each setting reuses the value already defined in the kernel (e.g. by the previous
# cell) and only falls back to the literal default when the name is missing.
DATASET_DIR = globals().get("DATASET_DIR", "/content/drive/MyDrive/emotional_ai/data/text/goemotions")
OUTPUT_DIR  = globals().get("OUTPUT_DIR",  "/content/drive/MyDrive/emotional_ai/models/text_bert_goemotions")
MODEL_NAME  = globals().get("MODEL_NAME",  "bert-base-uncased")
os.makedirs(OUTPUT_DIR, exist_ok=True)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load dataset and labels.
# NOTE: `or` also triggers the reload when an existing `ds` is falsy (e.g. empty).
ds = globals().get("ds", None) or load_from_disk(DATASET_DIR)
label_names = ds['train'].features['labels'].feature.names
num_labels  = len(label_names)

# Tokenizer: reuse the one already in the kernel, otherwise load it for MODEL_NAME.
tokenizer = globals().get("tokenizer", None) or AutoTokenizer.from_pretrained(MODEL_NAME)

# Preprocess
def to_multihot(labels, K, dtype=np.float32):
    """Return a length-``K`` vector of ``dtype`` with 1.0 at each index in ``labels``.

    Args:
        labels: iterable of integer class indices active for one example.
        K: number of classes (length of the output vector).
        dtype: NumPy dtype of the output; defaults to float32.

    Returns:
        NumPy array of zeros with the listed positions set to 1.0.
    """
    multihot = np.zeros(K, dtype=dtype)
    active = [idx for idx in labels]
    if active:  # skip the assignment when the example has no labels
        multihot[active] = 1.0
    return multihot

def preprocess(batch):
    """Tokenize ``batch["text"]`` and convert ``batch["labels"]`` to multi-hot vectors.

    Args:
        batch: mapping with ``"text"`` (strings) and ``"labels"`` (one collection
            of class indices per example).

    Returns:
        Tokenizer output (truncated to 128 tokens, no padding) whose ``"labels"``
        entry holds one multi-hot vector of length ``num_labels`` per example.
    """
    label_vectors = [to_multihot(label_ids, num_labels) for label_ids in batch["labels"]]
    tokenized = tokenizer(batch["text"], truncation=True, max_length=128)
    tokenized["labels"] = label_vectors
    return tokenized

# Reuse the tokenized dataset from the kernel when present; otherwise tokenize
# every split now, dropping the original columns so only `preprocess` outputs remain.
encoded = globals().get("encoded", None) or ds.map(
    preprocess, batched=True, remove_columns=ds["train"].column_names
)

# Collator to force float32 labels
class DataCollatorForMultiLabel(DataCollatorWithPadding):
    """``DataCollatorWithPadding`` variant that guarantees float32 ``labels``."""

    def __call__(self, features):
        """Collate ``features`` and overwrite ``labels`` with a float32 tensor.

        Args:
            features: list of example dicts; each must contain ``"labels"``.

        Returns:
            The batch produced by the parent collator, with ``"labels"`` rebuilt
            from the raw per-example vectors as ``torch.float32``.
        """
        collated = super().__call__(features)
        raw_labels = [item["labels"] for item in features]
        collated["labels"] = torch.tensor(raw_labels, dtype=torch.float32)
        return collated

data_collator = DataCollatorForMultiLabel(tokenizer=tokenizer)

# Model: resume from OUTPUT_DIR if it already holds a saved model (detected via
# config.json), else start from the base checkpoint with a fresh multi-label head.
# NOTE(review): after the previous cell has run, OUTPUT_DIR contains the baseline
# fine-tuned model, so this cell continues training from it rather than from MODEL_NAME.
if os.path.exists(os.path.join(OUTPUT_DIR, "config.json")):
    model = AutoModelForSequenceClassification.from_pretrained(OUTPUT_DIR).to(device)
else:
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=num_labels, problem_type="multi_label_classification"
    ).to(device)

# Metrics at threshold=0.5 (for monitoring during training)
def compute_metrics(eval_pred, threshold=0.5):
    """Score predictions at a fixed probability threshold.

    Args:
        eval_pred: pair ``(logits, labels)`` as NumPy arrays.
        threshold: a label counts as predicted when its sigmoid probability is
            greater than or equal to this value.

    Returns:
        Dict with ``f1``, ``precision`` and ``recall`` under both ``micro`` and
        ``macro`` averaging; undefined ratios are reported as 0.
    """
    logits, labels = eval_pred
    probabilities = 1 / (1 + np.exp(-logits))
    preds = (probabilities >= threshold).astype(int)

    scorers = (
        ("f1_micro", f1_score, "micro"),
        ("precision_micro", precision_score, "micro"),
        ("recall_micro", recall_score, "micro"),
        ("f1_macro", f1_score, "macro"),
        ("precision_macro", precision_score, "macro"),
        ("recall_macro", recall_score, "macro"),
    )
    results = {}
    for key, scorer, averaging in scorers:
        results[key] = scorer(labels, preds, average=averaging, zero_division=0)
    return results

# Class weights for BCEWithLogits (pos_weight).
# Y stacks the multi-hot training labels; per class, the weight is the ratio of
# negative to positive examples, so rarer classes get larger positive weights.
Y = np.array(encoded["train"]["labels"], dtype=np.float32)
pos = Y.sum(axis=0)
neg = Y.shape[0] - pos
# The 1e-3 terms avoid division by zero for a class with no positive examples.
pos_weight = torch.tensor((neg + 1e-3) / (pos + 1e-3), dtype=torch.float32)
pos_weight = torch.clamp(pos_weight, max=10.0)  # cap extremes for stability

# Weighted Trainer (uses pos_weight)
class WeightedTrainer(Trainer):
    """Trainer whose loss is BCE-with-logits with an optional per-class ``pos_weight``.

    Args:
        pos_weight: tensor with one positive-class weight per label, or ``None``
            for an unweighted loss. It is accepted as the first parameter for
            backward compatibility; pass it by keyword so it cannot be confused
            with the positional arguments forwarded to ``Trainer``.
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, pos_weight=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.pos_weight = pos_weight

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the (optionally weighted) multi-label BCE loss for one batch.

        Args:
            model: the model being trained.
            inputs: batch dict; ``"labels"`` is removed from it before the forward
                pass so the model does not compute its own loss.
            return_outputs: when true, also return the model outputs.
            **kwargs: extra arguments supplied by the training loop; ignored here.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # The constructor default is pos_weight=None; previously that crashed on
        # `.to(...)`. None now selects the plain, unweighted BCE loss.
        weight = None if self.pos_weight is None else self.pos_weight.to(logits.device)
        loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=weight)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Same schedule as the baseline run, but with up to 5 epochs and at most two
# checkpoints kept on disk. Checkpoints are written to the same OUTPUT_DIR.
args_w = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    num_train_epochs=5,                 # early stopping will cut it if no gains
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.06,
    lr_scheduler_type="linear",
    load_best_model_at_end=True,
    metric_for_best_model="f1_micro",   # measured at threshold 0.5 by compute_metrics
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none",
    save_total_limit=2,
)

# pos_weight is passed by keyword so it reaches WeightedTrainer.__init__ rather
# than being forwarded to Trainer. Early stopping uses a patience of 1 evaluation
# (in the recorded run, training ended after epoch 2 when validation f1_micro dropped).
weighted_trainer = WeightedTrainer(
    model=model,
    args=args_w,
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
    pos_weight=pos_weight,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

print("Training...")
weighted_trainer.train()

# Threshold tuning (global + per-class), evaluation, and save

# 1 Collect logits/labels
def collect_logits_labels(model_eval, ds_split, batch_size=64, collate_fn=None):
    """Run ``model_eval`` over ``ds_split`` and gather logits and labels as NumPy arrays.

    Args:
        model_eval: model to run; called as ``model_eval(**inputs)`` and expected to
            return an object with a ``.logits`` tensor.
        ds_split: dataset whose examples contain a ``"labels"`` entry.
        batch_size: number of examples per forward pass.
        collate_fn: callable turning a list of examples into a batch dict of
            tensors. Defaults to the notebook-level ``data_collator``.

    Returns:
        ``(logits, labels)``: two arrays stacked over all batches, in dataset order
        (the loader does not shuffle).
    """
    if collate_fn is None:
        collate_fn = data_collator

    # Use the device of the model that was passed in. The previous version read
    # the unrelated global `model`, which breaks when the two differ or when no
    # global `model` exists.
    target_device = getattr(model_eval, "device", None)
    if target_device is None:
        target_device = next(model_eval.parameters()).device

    dl = DataLoader(ds_split, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    all_logits, all_labels = [], []
    with torch.no_grad():
        for batch in dl:
            labels = batch.pop("labels")  # labels stay on CPU; only inputs are moved
            inputs = {k: v.to(target_device) for k, v in batch.items()}
            logits = model_eval(**inputs).logits
            all_logits.append(logits.cpu().numpy())
            all_labels.append(labels.numpy())
    return np.vstack(all_logits), np.vstack(all_labels)

# Put the trained model in eval mode and collect raw logits for validation and test.
model_eval = weighted_trainer.model.eval()
val_logits, val_labels   = collect_logits_labels(model_eval, encoded["validation"])
test_logits, test_labels = collect_logits_labels(model_eval, encoded["test"])

# Element-wise sigmoid turns logits into independent per-label probabilities.
sigmoid   = lambda x: 1.0 / (1.0 + np.exp(-x))
val_probs  = sigmoid(val_logits)
test_probs = sigmoid(test_logits)

# 2 Global threshold sweep (wider grid)
# Try 25 evenly spaced thresholds in [0.30, 0.90] and keep the one with the best
# validation micro-F1. Ties keep the lowest threshold because the comparison is strict.
grid = np.linspace(0.30, 0.90, 25)
best_f1_g, BEST_T = -1.0, 0.5
for t in grid:
    preds = (val_probs >= t).astype(int)
    f1 = f1_score(val_labels, preds, average="micro", zero_division=0)
    if f1 > best_f1_g:
        best_f1_g, BEST_T = f1, float(t)

# 3 Per-class thresholds
def best_thresholds_per_class(val_probs, val_labels, grid=np.linspace(0.10, 0.90, 33)):
    """Pick, for every class, the threshold that maximizes its binary F1.

    Args:
        val_probs: array ``[N, C]`` of per-class probabilities.
        val_labels: array ``[N, C]`` of 0/1 ground-truth labels.
        grid: candidate thresholds, tried in order; on ties the earliest wins.

    Returns:
        float32 array of length ``C``. A class for which no candidate reaches an
        F1 above zero (for example, a class without positives in ``val_labels``)
        keeps the neutral default of 0.5.
    """
    C = val_probs.shape[1]
    per_class_t = np.full(C, 0.5, dtype=np.float32)
    for i in range(C):
        # Start from F1 == 0 rather than -1. With -1 the first grid value always
        # replaced the 0.5 default, so uninformative classes silently got the
        # lowest threshold and were over-predicted at inference time.
        best_f1, best_t = 0.0, 0.5
        col, y = val_probs[:, i], val_labels[:, i]
        for t in grid:
            f1 = f1_score(y, (col >= t).astype(int), zero_division=0)
            if f1 > best_f1:
                best_f1, best_t = f1, float(t)
        per_class_t[i] = best_t
    return per_class_t

# Per-class thresholds tuned on the validation split.
CLS_T = best_thresholds_per_class(val_probs, val_labels)

# 4 Evaluate (validation + test) for both strategies
# NOTE: the validation scores below are computed on the same split the thresholds
# were tuned on, so they are optimistic; the test scores are the unbiased estimate.
from sklearn.metrics import precision_score, recall_score

# Global threshold: one cut-off (BEST_T) applied to every class.
val_preds_g  = (val_probs  >= BEST_T).astype(int)
test_preds_g = (test_probs >= BEST_T).astype(int)
val_f1_g     = f1_score(val_labels,  val_preds_g,  average="micro", zero_division=0)
test_metrics_g = {
    "test_f1_micro": f1_score(test_labels, test_preds_g, average="micro", zero_division=0),
    "test_p_micro":  precision_score(test_labels, test_preds_g, average="micro", zero_division=0),
    "test_r_micro":  recall_score(test_labels,  test_preds_g, average="micro", zero_division=0),
    "best_t": BEST_T,
    "strategy": "global"
}

# Per-class thresholds: CLS_T (length C) broadcasts across the rows of the [N, C] probabilities.
val_preds_pc  = (val_probs  >= CLS_T).astype(int)
test_preds_pc = (test_probs >= CLS_T).astype(int)
val_f1_pc     = f1_score(val_labels,  val_preds_pc,  average="micro", zero_division=0)
test_metrics_pc = {
    "test_f1_micro": f1_score(test_labels, test_preds_pc, average="micro", zero_division=0),
    "test_p_micro":  precision_score(test_labels, test_preds_pc, average="micro", zero_division=0),
    "test_r_micro":  recall_score(test_labels,  test_preds_pc, average="micro", zero_division=0),
    "strategy": "per-class"
}

print({"val_micro_f1_global": val_f1_g, "best_global_t": BEST_T})
print({"val_micro_f1_per_class": val_f1_pc})
print({"test_global": test_metrics_g})
print({"test_per_class": test_metrics_pc})

# 5 Pick strategy by validation micro-F1 (safer than peeking at test)
# bool(...) converts the NumPy boolean so the value is JSON-serializable below.
use_per_class = bool(val_f1_pc > val_f1_g)
print("Selected strategy:", "per-class" if use_per_class else "global")

# 6 Save model + labels + thresholds
weighted_trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

with open(os.path.join(OUTPUT_DIR, "label_names.json"), "w") as f:
    json.dump(label_names, f, indent=2)

# inference_config.json stores both strategies plus the flag selecting one.
# NOTE(review): prob_floor (0.40) is NOT applied in the metrics printed above, but
# the inference helper and the per-class export cell raise every threshold to at
# least this value, so tuned thresholds below 0.40 are overridden there.
with open(os.path.join(OUTPUT_DIR, "inference_config.json"), "w") as f:
    json.dump(
        {
            "use_per_class": use_per_class,
            "threshold": float(BEST_T),
            "per_class_thresholds": CLS_T.tolist(),
            "prob_floor": 0.40,   # <-- small probability floor default
        },
        f, indent=2
    )

print("Saved to:", OUTPUT_DIR)




Training...


Epoch,Training Loss,Validation Loss,F1 Micro,Precision Micro,Recall Micro,F1 Macro,Precision Macro,Recall Macro
1,0.1017,0.49603,0.537378,0.467282,0.632215,0.472945,0.422907,0.569791
2,0.0916,0.545018,0.534095,0.480087,0.601794,0.468513,0.461119,0.529728


{'val_micro_f1_global': 0.5460035523978686, 'best_global_t': 0.625}
{'val_micro_f1_per_class': 0.566296883254094}
{'test_global': {'test_f1_micro': 0.5273209549071618, 'test_p_micro': 0.4825242718446602, 'test_r_micro': 0.5812865497076023, 'best_t': 0.625, 'strategy': 'global'}}
{'test_per_class': {'test_f1_micro': 0.5384344146685472, 'test_p_micro': 0.4914708722240103, 'test_r_micro': 0.5953216374269006, 'strategy': 'per-class'}}
Selected strategy: per-class
Saved to: /content/drive/MyDrive/emotional_ai/models/text_bert_goemotions


In [None]:
# Inference helper (uses per-class thresholds + top_k + prob floor)
import os, json, re, numpy as np, torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load artifacts saved by the training cell: tokenizer, model (moved to `device`
# and switched to eval mode) and the ordered label names.
# Relies on OUTPUT_DIR and `device` already being defined in the kernel.
inf_tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
inf_model     = AutoModelForSequenceClassification.from_pretrained(OUTPUT_DIR).to(device).eval()
with open(os.path.join(OUTPUT_DIR, "label_names.json")) as f:
    inf_label_names = json.load(f)

# Same normalization used in data prep (lowercase & strip non-alnum)
# Matches every character that is neither an ASCII letter, a digit, nor whitespace.
_non_alnum = re.compile(r"[^a-zA-Z0-9\s]")
def normalize_text(t: str) -> str:
    """Lowercase ``t``, drop characters matched by ``_non_alnum``, and trim both ends.

    Punctuation and non-ASCII letters are removed outright (not replaced by a
    space), so ``"i'm"`` becomes ``"im"``.
    """
    lowered = t.lower()
    return _non_alnum.sub("", lowered).strip()

def _load_thresholds_and_floor():
    cfg_path = os.path.join(OUTPUT_DIR, "inference_config.json")
    thr = 0.5
    per_class = None
    prob_floor = None
    try:
        with open(cfg_path) as f:
            cfg = json.load(f)
        if cfg.get("use_per_class") and cfg.get("per_class_thresholds"):
            per_class = np.array(cfg["per_class_thresholds"], dtype=np.float32)
            thr = per_class                     # vector
        else:
            thr = float(cfg.get("threshold", 0.5))  # scalar
        if "prob_floor" in cfg:
            prob_floor = float(cfg["prob_floor"])
    except:
        pass
    return thr, prob_floor

@torch.no_grad()
def predict_emotions(texts, top_k=3, prob_floor=None, threshold=None):
    """
    Predict emotion labels for one text or a collection of texts.

    - texts: a single string or an iterable of strings. Each text is passed through
      normalize_text before tokenization. An empty collection returns [].
    - top_k: keep only the K highest labels among those that pass threshold; set None to return all passing labels.
    - prob_floor: minimal probability required (scalar). If None, uses value from inference_config.json if present.
    - threshold: overrides saved threshold(s). Can be scalar or per-class vector (len == num_labels);
      lists and tuples are accepted as well as NumPy arrays.

    Returns a list with one entry per input text; each entry is a list of
    {"label": str, "score": float} dicts sorted by descending score.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to score; avoid sending an empty batch to the tokenizer/model.
        return []

    texts = [normalize_text(t) for t in texts]
    enc = inf_tokenizer(texts, return_tensors="pt", truncation=True, max_length=128, padding=True).to(device)
    probs = torch.sigmoid(inf_model(**enc).logits).cpu().numpy()  # [B, C]

    saved_thr, saved_floor = _load_thresholds_and_floor()
    thr   = threshold if threshold is not None else saved_thr   # scalar or vector
    floor = saved_floor if prob_floor is None else prob_floor   # scalar or None

    # Effective threshold = max(threshold, prob_floor).
    # Normalize the threshold once: previously a list/tuple vector fell into the
    # scalar branch and failed on float(thr).
    if np.ndim(thr) == 0:
        thr = float(thr)
        eff_thr = thr if floor is None else max(thr, float(floor))
    else:
        thr = np.asarray(thr)
        eff_thr = thr if floor is None else np.maximum(thr, floor)

    outputs = []
    for row in probs:
        # The comparison broadcasts for both the scalar and the per-class case.
        passed = np.where(row >= eff_thr)[0]

        # Sort the passing labels by score, highest first.
        order = np.argsort(row[passed])[::-1]
        idxs = passed[order]
        if top_k is not None:
            idxs = idxs[:top_k]

        outputs.append([{ "label": inf_label_names[i], "score": float(row[i]) } for i in idxs])
    return outputs

# Quick smoke test: three sentences, at most 3 labels each, with an explicit
# 0.40 probability floor (overrides the floor stored in inference_config.json).
print(predict_emotions([
    "i'm so happy you made it!",
    "this is frustrating and makes me angry.",
    "i feel anxious about the exam."
], top_k=3, prob_floor=0.40))


[[{'label': 'joy', 'score': 0.9910314083099365}], [{'label': 'anger', 'score': 0.9869417548179626}], [{'label': 'nervousness', 'score': 0.9581727385520935}, {'label': 'fear', 'score': 0.9337173104286194}]]


In [None]:
# Per-class metrics export (test set)
import os, json, numpy as np, pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score

# Load thresholds and config once
# (this cell relies on OUTPUT_DIR, label_names, collect_logits_labels,
# weighted_trainer and encoded defined by earlier cells).
with open(os.path.join(OUTPUT_DIR, "inference_config.json")) as f:
    cfg = json.load(f)

use_per_class = bool(cfg.get("use_per_class", False))
prob_floor    = cfg.get("prob_floor", None)

# Always work with a length-C threshold vector so the per-class loop below can
# report the threshold actually used for each label.
if use_per_class and cfg.get("per_class_thresholds"):
    thr_vec = np.array(cfg["per_class_thresholds"], dtype=np.float32)   # vector [C]
else:
    thr_scalar = float(cfg.get("threshold", 0.5))
    thr_vec = np.full(len(label_names), thr_scalar, dtype=np.float32)   # broadcast to [C]

# Apply probability floor if present (elementwise max)
# This mirrors predict_emotions; as a result these numbers can differ from the
# test metrics printed by the training cell, which did not apply the floor.
if prob_floor is not None:
    thr_vec = np.maximum(thr_vec, float(prob_floor))

# Get test logits/labels
test_logits, test_labels = collect_logits_labels(weighted_trainer.model.eval(), encoded["test"])
test_probs = 1.0 / (1.0 + np.exp(-test_logits))  # sigmoid

# Predictions under the effective thresholds
preds = (test_probs >= thr_vec).astype(int)

# One row per label: binary precision/recall/F1 plus true and predicted positive counts.
rows = []
for i, name in enumerate(label_names):
    y_true = test_labels[:, i]
    y_pred = preds[:, i]
    rows.append({
        "label": name,
        "threshold_used": float(thr_vec[i]),
        "support_true": int(y_true.sum()),
        "support_pred": int(y_pred.sum()),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall":    recall_score(y_true, y_pred, zero_division=0),
        "f1":        f1_score(y_true, y_pred, zero_division=0),
    })

# Most frequent labels first; ties broken by higher F1.
df = pd.DataFrame(rows).sort_values(["support_true", "f1"], ascending=[False, False])

# Micro-avg summary row at top
micro_f1 = f1_score(test_labels, preds, average="micro", zero_division=0)
micro_p  = precision_score(test_labels, preds, average="micro", zero_division=0)
micro_r  = recall_score(test_labels, preds, average="micro", zero_division=0)
summary = pd.DataFrame([{
    "label": "__micro__",
    "threshold_used": np.nan,   # no single threshold applies to the aggregate row
    "support_true": int(test_labels.sum()),
    "support_pred": int(preds.sum()),
    "precision": micro_p, "recall": micro_r, "f1": micro_f1
}])

df_out = pd.concat([summary, df], ignore_index=True)

# Write the table next to the model artifacts.
csv_path = os.path.join(OUTPUT_DIR, "per_class_metrics_test.csv")
df_out.to_csv(csv_path, index=False)
print("Per-class metrics saved to:", csv_path)


Per-class metrics saved to: /content/drive/MyDrive/emotional_ai/models/text_bert_goemotions/per_class_metrics_test.csv
