In [1]:
# Cell 1: Install required libraries
!pip install datasets==2.14.6 fsspec==2023.10.0 scikit-learn pandas numpy torch nltk
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Cell 2: Import libraries and load GoEmotions dataset
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.metrics import hamming_loss, f1_score
from sklearn.preprocessing import MultiLabelBinarizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from nltk.tokenize import word_tokenize
import os

# Function to load dataset with fallback
def load_goemotions_dataset():
    try:
        # Attempt to load dataset directly
        dataset = load_dataset("go_emotions", "simplified")
        print("Dataset loaded successfully from Hugging Face.")
        return dataset
    except Exception as e:
        print(f"Error loading dataset: {str(e)}")
        print("Attempting manual download as fallback...")

        # Fallback: Download dataset files manually
        try:
            # URLs for GoEmotions simplified dataset (based on public availability)
            base_url = "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/"
            splits = ["train.tsv", "dev.tsv", "test.tsv"]
            split_names = ["train", "validation", "test"]
            dataset_dict = {}

            for split, split_name in zip(splits, split_names):
                url = base_url + split
                df = pd.read_csv(url, sep='\t', header=None,
                                names=['text', 'labels', 'id'])
                # Convert labels from comma-separated string to list of integers
                df['labels'] = df['labels'].apply(lambda x: [int(i) for i in x.split(',')])
                dataset_dict[split_name] = df

            print("Dataset loaded manually from GitHub.")
            return dataset_dict
        except Exception as e:
            raise Exception(f"Failed to load dataset manually: {str(e)}")

# Load dataset
dataset = load_goemotions_dataset()

# Extract splits
if isinstance(dataset, dict) and 'train' in dataset:
    # For manually loaded dataset (pandas DataFrames)
    train_data = dataset['train']
    val_data = dataset['validation']
    test_data = dataset['test']
else:
    # For Hugging Face dataset
    train_data = dataset['train']
    val_data = dataset['validation']
    test_data = dataset['test']

# Get emotion labels
emotions = train_data.features['labels'].feature.names if hasattr(train_data, 'features') else [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
    'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
    'relief', 'remorse', 'sadness', 'surprise', 'neutral'
]
num_labels = len(emotions)
print(f"Number of emotions: {num_labels}")
print(f"Emotions: {emotions}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/350k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/347k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

Dataset loaded successfully from Hugging Face.
Number of emotions: 28
Emotions: ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


In [3]:
# Cell 3: Build vocabulary from training text
def build_vocab(texts, max_vocab_size=10000):
    word_counts = Counter()
    for text in texts:
        tokens = word_tokenize(text.lower())
        word_counts.update(tokens)

    # Create vocabulary with top max_vocab_size words
    vocab = {word: idx + 2 for idx, (word, _) in enumerate(word_counts.most_common(max_vocab_size))}
    vocab['<PAD>'] = 0  # Padding token
    vocab['<UNK>'] = 1  # Unknown token
    return vocab

# Build vocabulary
train_texts = train_data['text'] if isinstance(train_data, pd.DataFrame) else train_data['text']
vocab = build_vocab(train_texts)
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 10002


In [4]:
# Cell 4: Preprocess text and labels
def text_to_indices(text, vocab, max_length=128):
    tokens = word_tokenize(text.lower())[:max_length]
    indices = [vocab.get(token, vocab['<UNK>']) for token in tokens]
    if len(indices) < max_length:
        indices += [vocab['<PAD>']] * (max_length - len(indices))
    return indices

# Convert labels to multilabel format
mlb = MultiLabelBinarizer(classes=range(num_labels))
train_labels = mlb.fit_transform(train_data['labels'])
val_labels = mlb.transform(val_data['labels'])
test_labels = mlb.transform(test_data['labels'])

# Compute class weights for handling imbalance
class_counts = np.sum(train_labels, axis=0)
total_samples = len(train_labels)
class_weights = total_samples / (num_labels * class_counts)
class_weights = torch.FloatTensor(class_weights)

In [5]:
# Cell 5: Define custom dataset
class GoEmotionsDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_length=128):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        indices = text_to_indices(text, self.vocab, self.max_length)
        return {
            'input_ids': torch.LongTensor(indices),
            'labels': torch.FloatTensor(self.labels[idx])
        }

# Create datasets
train_dataset = GoEmotionsDataset(train_data['text'], train_labels, vocab)
val_dataset = GoEmotionsDataset(val_data['text'], val_labels, vocab)
test_dataset = GoEmotionsDataset(test_data['text'], test_labels, vocab)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

In [6]:
# Cell 6: Define custom LSTM model
class EmotionLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_labels, num_layers=2, dropout=0.3):
        super(EmotionLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                           batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, num_labels)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids):
        embedded = self.embedding(input_ids)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = lstm_out[:, -1, :]  # Take the last time step
        out = self.dropout(lstm_out)
        out = self.fc(out)
        return out

# Initialize model
embedding_dim = 100
hidden_dim = 128
model = EmotionLSTM(vocab_size, embedding_dim, hidden_dim, num_labels)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
class_weights = class_weights.to(device)

In [7]:
# Cell 7: Train the model
criterion = nn.BCEWithLogitsLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=5):
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Validation
        model.eval()
        val_preds, val_labels = [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(input_ids)
                preds = (torch.sigmoid(outputs) > 0.5).float()
                val_preds.append(preds.cpu().numpy())
                val_labels.append(labels.cpu().numpy())

        val_preds = np.concatenate(val_preds, axis=0)
        val_labels = np.concatenate(val_labels, axis=0)
        val_hamming = hamming_loss(val_labels, val_preds)
        val_f1 = f1_score(val_labels, val_preds, average='micro')

        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Train Loss: {train_loss/len(train_loader):.4f}")
        print(f"Val Hamming Loss: {val_hamming:.4f}, Val F1 Score: {val_f1:.4f}")

# Train
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=5)

Epoch 1/5
Train Loss: 0.1912
Val Hamming Loss: 0.0420, Val F1 Score: 0.0000
Epoch 2/5
Train Loss: 0.1717
Val Hamming Loss: 0.0420, Val F1 Score: 0.0000
Epoch 3/5
Train Loss: 0.1713
Val Hamming Loss: 0.0420, Val F1 Score: 0.0000
Epoch 4/5
Train Loss: 0.1712
Val Hamming Loss: 0.0420, Val F1 Score: 0.0000
Epoch 5/5
Train Loss: 0.1713
Val Hamming Loss: 0.0420, Val F1 Score: 0.0000


In [8]:
# Cell 8: Evaluate on test set
def evaluate_model(model, test_loader):
    model.eval()
    test_preds, test_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids)
            preds = (torch.sigmoid(outputs) > 0.5).float()
            test_preds.append(preds.cpu().numpy())
            test_labels.append(labels.cpu().numpy())

    test_preds = np.concatenate(test_preds, axis=0)
    test_labels = np.concatenate(test_labels, axis=0)
    hamming = hamming_loss(test_labels, test_preds)
    f1 = f1_score(test_labels, test_preds, average='micro')
    return {'hamming_loss': hamming, 'f1_score': f1}

eval_results = evaluate_model(model, test_loader)
print(f"Test Set Evaluation Results: {eval_results}")

Test Set Evaluation Results: {'hamming_loss': 0.04165021453578668, 'f1_score': 0.0}


In [9]:
# Cell 9: Test on real-world text
def predict_emotions(texts, model, vocab, max_length=128):
    model.eval()
    indices = [text_to_indices(text, vocab, max_length) for text in texts]
    input_ids = torch.LongTensor(indices).to(device)

    with torch.no_grad():
        outputs = model(input_ids)
        preds = (torch.sigmoid(outputs) > 0.5).float().cpu().numpy()

    predicted_labels = mlb.inverse_transform(preds)
    return [[emotions[idx] for idx in pred] for pred in predicted_labels]

# Example real-world texts
real_world_texts = [
    "I absolutely love this product! It's amazing and makes me so happy!",
    "This service is terrible, I'm so frustrated and disappointed.",
    "Wow, what a surprise! I'm thrilled but also a bit nervous."
]

# Predict emotions
predictions = predict_emotions(real_world_texts, model, vocab)
for text, pred in zip(real_world_texts, predictions):
    print(f"Text: {text}")
    print(f"Predicted Emotions: {pred}\n")

Text: I absolutely love this product! It's amazing and makes me so happy!
Predicted Emotions: []

Text: This service is terrible, I'm so frustrated and disappointed.
Predicted Emotions: []

Text: Wow, what a surprise! I'm thrilled but also a bit nervous.
Predicted Emotions: []



In [10]:
# Cell 10: Save model and vocabulary
torch.save(model.state_dict(), "emotion_model.pth")
torch.save(vocab, "vocab.pt")
print("Model and vocabulary saved successfully.")

Model and vocabulary saved successfully.
