# AI Diary
This notebook demonstrates:
1. Dataset loading and preprocessing
2. A domain-specific BERT baseline (ClinicalBERT)
3. A transformer-based prototype (RoBERTa)
4. Model evaluation and comparison

**NOTE:**
This is an INTERIM prototype (IPD).
Models are trained lightly (one epoch on a 5,000-sample subset) for feasibility, not optimisation.


In [None]:
# ======================================================
# AI Diary Project — Emotion Detection (IPD)


# Core system utilities
import os
import glob
from zipfile import ZipFile

# Data handling
import pandas as pd
import numpy as np

# Visualisation
import matplotlib.pyplot as plt

# Machine learning utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Deep learning / NLP
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

# Hugging Face dataset helper
from datasets import Dataset

# Disable external logging tools: set the Weights & Biases "disabled"
# environment flag up front, before any Trainer is constructed further down
# (the TrainingArguments in this notebook also pass report_to="none").
os.environ["WANDB_DISABLED"] = "true"


In [None]:
# ======================================================
# Dataset Upload and Extraction


from google.colab import files

# Open the Colab upload widget; the result is keyed by uploaded filename.
uploaded = files.upload()

# Fail with a clear message instead of an opaque IndexError when the upload
# was cancelled or returned nothing.
if not uploaded:
    raise FileNotFoundError(
        "No file was uploaded; expected a zip archive of the dataset."
    )

# Only the first uploaded file is treated as the archive.
zip_path = next(iter(uploaded))
os.makedirs("goemotions", exist_ok=True)

# Unpack everything into ./goemotions; later cells search this folder.
with ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall("goemotions")


In [None]:
# ======================================================
# Locate and Load Dataset Files


def find_file(pattern):
    """Return one path matching a glob pattern, chosen deterministically.

    Args:
        pattern: Glob pattern. ``**`` matches nested directories because the
            search runs with ``recursive=True``.

    Returns:
        The lexicographically smallest matching path, so the same file is
        picked on every run even when several files match.

    Raises:
        FileNotFoundError: If nothing matches the pattern.
    """
    # glob returns matches in filesystem-dependent order; sort before picking
    # one so the choice is reproducible across machines and runs.
    matches = sorted(glob.glob(pattern, recursive=True))
    if not matches:
        raise FileNotFoundError(f"No file: {pattern}")
    return matches[0]

# Resolve both dataset artefacts inside the extracted folder first.
train_path = find_file("goemotions/**/train.tsv")
emotions_path = find_file("goemotions/**/emotions.txt")

# Tab-separated file read with explicitly supplied column names.
column_names = ["text", "labels", "id"]
train_df = pd.read_csv(train_path, sep="\t", names=column_names)

# One emotion name per line; lines that are blank after stripping are dropped.
with open(emotions_path, encoding="utf-8") as handle:
    stripped_lines = (raw_line.strip() for raw_line in handle)
    emotions = [name for name in stripped_lines if name]

print("Samples:", len(train_df))
print("Emotion:", len(emotions))


In [None]:
# ======================================================
# Dataset Subsampling (IPD Justification)


# Target size of the reduced training subset used for this prototype.
SUBSET_SIZE = 5000

# Cap the request at the number of available rows: sampling without
# replacement raises ValueError when asked for more rows than exist.
train_small = (
    train_df
    .sample(min(SUBSET_SIZE, len(train_df)), random_state=42)
    .reset_index(drop=True)
)

# Number of emotion classes = number of names loaded from emotions.txt.
NUM_EMOTIONS = len(emotions)

print("Subset size:", len(train_small))


In [None]:
# ======================================================
# Label Encoding (Multi-label → Multi-hot)


def encode_labels(label_string, num_labels=None):
    """Convert a comma-separated string of label ids into a multi-hot vector.

    Args:
        label_string: Label ids such as ``"3,10"``. Any value is accepted
            because it is converted with ``str`` first.
        num_labels: Length of the returned vector. Defaults to the
            module-level ``NUM_EMOTIONS``.

    Returns:
        A 1-D float NumPy array of length ``num_labels`` holding 1 at every
        listed id and 0 elsewhere.

    Raises:
        IndexError: If an id is not smaller than ``num_labels``.
    """
    if num_labels is None:
        num_labels = NUM_EMOTIONS

    vector = np.zeros(num_labels)
    for token in str(label_string).split(","):
        # Tolerate whitespace around ids ("3, 10"); without stripping,
        # " 10" fails the digit check and the label is silently lost.
        token = token.strip()
        # Skip non-numeric tokens (e.g. "nan" or ""). isdecimal() is used
        # because isdigit() also accepts characters such as "²" that int()
        # cannot parse.
        if not token.isdecimal():
            continue
        index = int(token)
        if index >= num_labels:
            raise IndexError(
                f"label id {index} is out of range for {num_labels} labels"
            )
        vector[index] = 1
    return vector

# Add a "label_vector" column: each row's raw "labels" value is passed through
# encode_labels to obtain its multi-hot vector.
train_small["label_vector"] = train_small["labels"].apply(encode_labels)


In [None]:
# ======================================================
# Train/Test Split


# Stack the per-row label vectors into a single array so it can be split
# row-for-row alongside the text column.
label_matrix = np.stack(train_small["label_vector"])
texts = train_small["text"]

# 80/20 split with a fixed seed so the partition is identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    texts, label_matrix, test_size=0.2, random_state=42
)


In [None]:
# ======================================================
# Evaluation Metric

def compute_metrics(eval_pred, threshold=0.3):
    """Compute macro and micro F1 for multi-label predictions.

    Args:
        eval_pred: Pair ``(logits, labels)``; it is unpacked into exactly
            those two values.
        threshold: Probability at or above which a label counts as
            predicted. Defaults to 0.3, the deliberately lower cut-off
            chosen for this interim prototype.

    Returns:
        Dict with keys ``"macro_f1"`` and ``"micro_f1"``.
    """
    logits, labels = eval_pred

    # Independent sigmoid per label turns each logit into a probability.
    probs = torch.sigmoid(torch.as_tensor(logits)).numpy()

    # Binarise: a label is predicted when its probability reaches the cut-off.
    predictions = (probs >= threshold).astype(int)

    # zero_division=0 scores labels with no positives as 0 instead of warning.
    return {
        "macro_f1": f1_score(labels, predictions, average="macro", zero_division=0),
        "micro_f1": f1_score(labels, predictions, average="micro", zero_division=0),
    }


In [None]:
# ======================================================
# Baseline Model — Domain-Specific BERT (ClinicalBERT)


CLINICAL_BERT = "emilyalsentzer/Bio_ClinicalBERT"

clinical_tokenizer = AutoTokenizer.from_pretrained(CLINICAL_BERT)


def tokenize_clinical(batch):
    """Tokenise a batch's "text" entries with the ClinicalBERT tokenizer.

    Sequences are truncated and padded to a fixed length of 128.
    """
    encode_kwargs = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return clinical_tokenizer(batch["text"], **encode_kwargs)


def _clinical_dataset(texts, label_matrix):
    """Build a tokenised, torch-formatted Dataset from texts and label rows."""
    dataset = Dataset.from_dict({
        "text": texts.tolist(),
        "labels": label_matrix.tolist()
    })
    dataset = dataset.map(tokenize_clinical, batched=True)
    dataset.set_format("torch")
    return dataset


# Same pipeline for both partitions produced by the train/test split.
clinical_train_ds = _clinical_dataset(X_train, y_train)
clinical_test_ds = _clinical_dataset(X_test, y_test)


In [None]:
# ======================================================
# ClinicalBERT Model Configuration

# Seed PyTorch before the model is built. Any randomly initialised weights
# (such as a freshly added classification head) are drawn here, before a
# Trainer exists to apply its own seed, so without this call repeated runs
# start from different weights and the model comparison is not repeatable.
torch.manual_seed(42)

clinical_model = AutoModelForSequenceClassification.from_pretrained(
    CLINICAL_BERT,
    num_labels=NUM_EMOTIONS,
    problem_type="multi_label_classification"
)

# Light, single-epoch run; no checkpoints are saved and no external
# reporting integration is enabled.
clinical_args = TrainingArguments(
    output_dir="clinicalbert_results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    save_strategy="no",
    report_to="none"
)

clinical_trainer = Trainer(
    model=clinical_model,
    args=clinical_args,
    train_dataset=clinical_train_ds,
    eval_dataset=clinical_test_ds,
    compute_metrics=compute_metrics
)

# Train, then score on the held-out split with compute_metrics.
clinical_trainer.train()
clinical_metrics = clinical_trainer.evaluate()

print("ClinicalBERT Performance:", clinical_metrics)


In [None]:
# ======================================================
# Tokenisation using RoBERTa


roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")


def tokenize_roberta(batch):
    """Tokenise a batch's "text" entries with the RoBERTa tokenizer.

    Sequences are truncated and padded to a fixed length of 128.
    """
    texts = batch["text"]
    return roberta_tokenizer(
        texts, max_length=128, truncation=True, padding="max_length"
    )


# Raw columns for each partition, keyed by split name.
raw_splits = {
    "train": {"text": X_train.tolist(), "labels": y_train.tolist()},
    "test": {"text": X_test.tolist(), "labels": y_test.tolist()},
}

# Tokenise each split and expose its columns as torch tensors.
roberta_splits = {}
for split_name, columns in raw_splits.items():
    split_ds = Dataset.from_dict(columns).map(tokenize_roberta, batched=True)
    split_ds.set_format("torch")
    roberta_splits[split_name] = split_ds

train_ds = roberta_splits["train"]
test_ds = roberta_splits["test"]



In [None]:
# ======================================================
# RoBERTa Model Configuration

# Seed PyTorch before the model is built. Any randomly initialised weights
# (such as a freshly added classification head) are drawn here, before a
# Trainer exists to apply its own seed, so without this call repeated runs
# start from different weights and the model comparison is not repeatable.
torch.manual_seed(42)

roberta_model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=NUM_EMOTIONS,
    problem_type="multi_label_classification"
)

# Light, single-epoch run; no checkpoints are saved and no external
# reporting integration is enabled.
roberta_args = TrainingArguments(
    output_dir="roberta_results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    save_strategy="no",
    report_to="none"
)

roberta_trainer = Trainer(
    model=roberta_model,
    args=roberta_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics
)

# Train, then score on the held-out split with compute_metrics.
roberta_trainer.train()

roberta_metrics = roberta_trainer.evaluate()

print("RoBERTa Performance:", roberta_metrics)


In [None]:
# ======================================================
# Model Performance Comparison (Interim Results)


models = ["ClinicalBERT", "RoBERTa"]

# Evaluation results in the same order as the model names above.
results_by_model = (clinical_metrics, roberta_metrics)

macro_f1 = [result["eval_macro_f1"] for result in results_by_model]
micro_f1 = [result["eval_micro_f1"] for result in results_by_model]

x = np.arange(len(models))
width = 0.35

plt.figure(figsize=(8, 5))

# Two bars per model, shifted half a bar-width left and right of the tick.
bar_series = (
    (-width/2, macro_f1, "Macro F1 (Class-balanced)"),
    (width/2, micro_f1, "Micro F1 (Overall)"),
)
for offset, scores, legend_label in bar_series:
    plt.bar(x + offset, scores, width, label=legend_label)

plt.xticks(x, models)
plt.ylabel("F1 Score")
plt.xlabel("Model")
plt.title("Performance Comparison (ClinicalBERT and RoBERTa)")

plt.legend()
plt.tight_layout()
plt.show()
