In [None]:
%pip install transformers
%pip install datasets

In [None]:
from datasets import load_dataset

# Fetch the 'go_emotions' dataset, then read the sentiment names off the
# 'labels' feature of its train split. Later cells index this list by label id.
goEmotionsDataset = load_dataset('go_emotions')
train_features = goEmotionsDataset['train'].features
sentimentLabels = train_features['labels'].feature.names

In [None]:
from transformers import RobertaForSequenceClassification

# Start from the pretrained 'roberta-base' checkpoint, with a classification
# head sized to one output per sentiment label and set up for multi-label use.
label_count = len(sentimentLabels)
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    problem_type='multi_label_classification',
    num_labels=label_count,
)

In [None]:
import numpy as np

# Convert dataset sentiment keys into one hot encoding
def one_hot_encode(example, num_labels=28):
    """Replace an example's list of label ids with a multi-hot vector.

    Args:
        example: Mapping whose 'labels' entry is an iterable of integer label
            ids. It may hold several ids, or none.
        num_labels: Length of the output vector. Defaults to 28, the size
            this function has always produced; pass a different value to
            reuse it with another label set.

    Returns:
        A dict {'labels': vector}, where vector is a float32 NumPy array of
        length num_labels holding 1.0 at every listed id and 0.0 elsewhere.

    Raises:
        IndexError: If a label id falls outside [0, num_labels).
    """
    vector = np.zeros(num_labels, dtype=np.float32)

    for label_index in example['labels']:
        # Reject negatives explicitly: NumPy would otherwise wrap them around
        # and silently set the wrong position.
        if not 0 <= label_index < num_labels:
            raise IndexError(
                f"label id {label_index} is outside [0, {num_labels})"
            )
        vector[label_index] = 1.0

    return {'labels': vector}

# Apply the encoding to every split, leaving the source dataset untouched.
dataset = goEmotionsDataset.map(one_hot_encode)

# Show the first training example before and after encoding as a sanity check.
first_before = goEmotionsDataset['train'][0]
first_after = dataset['train'][0]
print("Original ID:", first_before['labels'])
print("New Vector:", first_after['labels'])

In [None]:
from transformers import RobertaTokenizer
from datasets import Sequence, Value

# Tokenize and format for PyTorch
def tokenize_text(rows):
    """Tokenize the 'text' field of a batch with the notebook-level tokenizer.

    Args:
        rows: Batch mapping whose 'text' entry is handed to the tokenizer.

    Returns:
        The tokenizer's output for those texts, requested with padding to
        'max_length', truncation enabled and max_length=128.
    """
    encode_options = {'padding': 'max_length', 'truncation': True, 'max_length': 128}
    return tokenizer(rows['text'], **encode_options)

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize in batches, then store the label vectors as float32 sequences.
tokenized_dataset = (
    dataset
    .map(tokenize_text, batched=True)
    .cast_column("labels", Sequence(Value("float32")))
)

# Expose only the model inputs and targets, returned as torch tensors.
torch_columns = ['input_ids', 'attention_mask', 'labels']
tokenized_dataset.set_format(type='torch', columns=torch_columns)


In [None]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score

# Set up F1 Score
def compute_metrics(eval_pred, threshold=0.5):
    """Compute micro-averaged F1, precision and recall for multi-label output.

    Args:
        eval_pred: Pair that unpacks into (logits, labels). logits must be a
            NumPy array; labels are the matching multi-hot targets
            (presumably both shaped (num_examples, num_labels) -- confirm
            against what the Trainer passes in).
        threshold: A label counts as predicted when its sigmoid probability
            is strictly greater than this value. Defaults to 0.5.

    Returns:
        Dict with the keys 'f1', 'precision' and 'recall'.
    """
    logits, labels = eval_pred

    # Element-wise sigmoid turns each logit into an independent probability.
    probs = 1 / (1 + np.exp(-logits))
    predictions = (probs > threshold).astype(int)

    # zero_division=0 yields the same 0.0 score sklearn falls back to when a
    # metric is undefined (e.g. nothing predicted positive), without the
    # warning it would otherwise emit.
    scoring = {'average': 'micro', 'zero_division': 0}
    return {
        'f1': f1_score(labels, predictions, **scoring),
        'precision': precision_score(labels, predictions, **scoring),
        'recall': recall_score(labels, predictions, **scoring),
    }


In [None]:
from transformers import TrainingArguments, Trainer
from google.colab import drive
import os

# Fine-tuning configuration. Evaluation and checkpointing both happen once per
# epoch, and the best checkpoint is reloaded when training ends.
training_args = TrainingArguments(
    output_dir="./roberta-sentiment-model",
    # Optimisation
    num_train_epochs=3,              # Full passes over the training split
    learning_rate=2e-5,              # Optimiser step size
    weight_decay=0.01,               # Regularisation strength
    # Batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation / checkpoint schedule
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, arguments, data splits and metric function into a Trainer.
train_split = tokenized_dataset['train']
validation_split = tokenized_dataset['validation']

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_split,
    eval_dataset=validation_split,
)

# Run fine-tuning.
trainer.train()

# Mount Google Drive and store the result there.
drive.mount('/content/drive')
model_path = "/content/drive/My Drive/RedditSentimentAnalysis/my_emotion_model"

# The model and the tokenizer are written by separate calls into the same
# directory, so it can later be loaded as a single unit.
for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(model_path)

print(f"Model successfully saved to: {model_path}")

In [None]:
from torch import topk
from transformers import pipeline

# Build an inference pipeline from the fine-tuned model and tokenizer.
# Both are loaded from `model_path`, the Drive directory the training cell
# saved them to; no local "./my_emotion_model" directory is created anywhere
# in this notebook, so loading from that relative path would fail.
emotion_classifier = pipeline(
    "text-classification",
    model=model_path,
    tokenizer=model_path,
)

# Smoke test: request the ten highest-scoring sentiments for one sentence.
label_names = goEmotionsDataset['train'].features['labels'].feature.names
result = emotion_classifier("I HATE AI", top_k=10)

for prediction in result:
    # Labels are expected in the form "LABEL_<id>"; drop the prefix to get the
    # integer id, then look up the readable name.
    sentiment_id = int(prediction['label'].replace("LABEL_", ""))
    sentiment_name = label_names[sentiment_id]
    print(f"{sentiment_name}, Score: {prediction['score']}")
