<a href="https://colab.research.google.com/github/AbdulaAlShyed-2212592042/AbdulaAlShyed-2212592042/blob/main/EMAPTHY.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ===============================
# EmpathAI: Emotion-Aware Chatbot
# Google Colab + Drive Integration
# ===============================

# 1. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# 2. Install required packages
!pip install transformers datasets torch scikit-learn requests --quiet


Mounted at /content/drive


In [2]:
# ===============================
# Part 2: Load and Preprocess GoEmotions Dataset
# ===============================

import os
import numpy as np
from datasets import load_dataset

# 1. Set a folder in Google Drive to cache dataset
CACHE_DIR = "/content/drive/MyDrive/EmpathAI/cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# 2. Load GoEmotions dataset with cache_dir pointing to Drive
dataset = load_dataset("go_emotions", cache_dir=CACHE_DIR)

# 3. Emotion labels
emotion_labels = [
    "admiration","amusement","anger","annoyance","approval","caring","confusion",
    "curiosity","desire","disappointment","disapproval","disgust","embarrassment",
    "excitement","fear","gratitude","grief","joy","love","nervousness","optimism",
    "pride","realization","relief","remorse","sadness","surprise","neutral"
]

# 4. Preprocessing function
from transformers import AutoTokenizer

MODEL_NAME = "roberta-base"
MAX_LEN = 128
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

NUM_LABELS = len(emotion_labels)

def preprocess(batch):
    tokens = tokenizer(batch['text'], padding='max_length', truncation=True, max_length=MAX_LEN)
    labels = np.zeros(NUM_LABELS, dtype=int)
    for idx in batch['labels']:
        labels[idx] = 1
    tokens['labels'] = labels.tolist()
    return tokens

# Apply preprocessing
dataset = dataset.map(preprocess, batched=False)

# Format dataset as PyTorch tensors
dataset.set_format(type='torch', columns=['input_ids','attention_mask','labels'])

print("Dataset ready. Train samples:", len(dataset['train']))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

simplified/train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

simplified/validation-00000-of-00001.par(…):   0%|          | 0.00/350k [00:00<?, ?B/s]

simplified/test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Dataset ready. Train samples: 43410


In [5]:
# ===============================
# Part 3: DataLoaders and Model Setup (Fixed)
# ===============================

import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, get_scheduler
from torch.optim import AdamW  # Correct import for AdamW

# 1. Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# 2. DataLoaders
BATCH_SIZE = 16

train_loader = DataLoader(dataset['train'], batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset['validation'], batch_size=BATCH_SIZE)

# 3. Model initialization
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification"
).to(device)

# 4. Optimizer & Scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)  # torch.optim.AdamW
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# 5. Loss function
loss_fn = torch.nn.BCEWithLogitsLoss()

print("Model and DataLoaders ready for training.")


Using device: cuda


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and DataLoaders ready for training.


In [6]:
# ===============================
# Part 4: Training and Saving Model
# ===============================

from tqdm import tqdm
from sklearn.metrics import f1_score
import os

# 1. Create folder in Google Drive to save model
MODEL_SAVE_PATH = "/content/drive/MyDrive/EmpathAI/emotion_model"
os.makedirs(MODEL_SAVE_PATH, exist_ok=True)

# 2. Training loop
progress_bar = tqdm(range(num_epochs * len(train_loader)), desc="Training Progress")

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device).float()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        progress_bar.update(1)

    # -------------------------------
    # Validation at the end of each epoch
    # -------------------------------
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device).float()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.sigmoid(outputs.logits).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    # Threshold for multi-label
    all_preds_bin = (np.array(all_preds) >= 0.5).astype(int)
    f1 = f1_score(all_labels, all_preds_bin, average='micro')
    print(f"\nEpoch {epoch+1} — Validation F1 (micro): {f1:.4f}")

# 3. Save trained model and tokenizer to Google Drive
model.save_pretrained(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)
print(f"Model and tokenizer saved to {MODEL_SAVE_PATH}")


Training Progress:  33%|███▎      | 2715/8142 [16:05<16:25:54, 10.90s/it]


Epoch 1 — Validation F1 (micro): 0.5318


Training Progress:  67%|██████▋   | 5429/8142 [32:25<8:12:05, 10.88s/it]


Epoch 2 — Validation F1 (micro): 0.5691


Training Progress: 100%|██████████| 8142/8142 [48:09<00:00,  2.88it/s]


Epoch 3 — Validation F1 (micro): 0.5809
Model and tokenizer saved to /content/drive/MyDrive/EmpathAI/emotion_model


In [13]:
# 5. Load DialoGPT chat model
chat_model_name = "microsoft/DialoGPT-medium"
chat_tokenizer = AutoTokenizer.from_pretrained(chat_model_name)
chat_model = AutoModelForCausalLM.from_pretrained(chat_model_name).to(device)

# Ensure pad_token is set
chat_tokenizer.pad_token = chat_tokenizer.eos_token

# 6. Simple, short chatbot response
def get_simple_chat_response(user_text, detected_emotions):
    emotions_str = ", ".join(detected_emotions)
    context_text = f"[Emotion: {emotions_str}] {user_text}"

    # Tokenize with attention mask
    inputs = chat_tokenizer(context_text, return_tensors="pt", padding=True, truncation=True).to(device)

    outputs = chat_model.generate(
        **inputs,
        max_length=60,
        pad_token_id=chat_tokenizer.eos_token_id,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        temperature=0.7
    )

    response = chat_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# 7. Custom input example
user_input = input("Enter your message: ")
detected_emotions = predict_emotions_labels(user_input)
response = get_simple_chat_response(user_input, detected_emotions)

print("\nUser Input:", user_input)
print("Detected Emotions:", detected_emotions)
print("Chatbot Response:", response)


Enter your message: HAPPY

User Input: HAPPY
Detected Emotions: ['neutral']
Chatbot Response: [Emotion: neutral] HAPPY : HAPPY!
