In [49]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd


In [50]:
df = pd.read_csv('emotion_dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Emotion,Text,Clean_Text
0,0,neutral,Why ?,
1,1,joy,Sage Act upgrade on my to do list for tommorow.,Sage Act upgrade list tommorow
2,2,sadness,ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...,WAY HOMEGIRL BABY FUNERAL MAN HATE FUNERALS SH...
3,3,joy,Such an eye ! The true hazel eye-and so brill...,eye true hazel eyeand brilliant Regular feat...
4,4,joy,@Iluvmiasantos ugh babe.. hugggzzz for u .! b...,ugh babe hugggzzz u babe naamazed nga ako e...


In [51]:
df.columns = df.columns.str.strip()

In [52]:
unique_labels = df["Emotion"].unique()
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}
df["label_id"] = df["Emotion"].map(label2id)

In [53]:
df["Final_Text"] = df["Clean_Text"].fillna(df["Text"])
df

Unnamed: 0.1,Unnamed: 0,Emotion,Text,Clean_Text,label_id,Final_Text
0,0,neutral,Why ?,,0,Why ?
1,1,joy,Sage Act upgrade on my to do list for tommorow.,Sage Act upgrade list tommorow,1,Sage Act upgrade list tommorow
2,2,sadness,ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...,WAY HOMEGIRL BABY FUNERAL MAN HATE FUNERALS SH...,2,WAY HOMEGIRL BABY FUNERAL MAN HATE FUNERALS SH...
3,3,joy,Such an eye ! The true hazel eye-and so brill...,eye true hazel eyeand brilliant Regular feat...,1,eye true hazel eyeand brilliant Regular feat...
4,4,joy,@Iluvmiasantos ugh babe.. hugggzzz for u .! b...,ugh babe hugggzzz u babe naamazed nga ako e...,1,ugh babe hugggzzz u babe naamazed nga ako e...
...,...,...,...,...,...,...
34787,34787,surprise,@MichelGW have you gift! Hope you like it! It'...,gift Hope like it hand wear Itll warm Lol,4,gift Hope like it hand wear Itll warm Lol
34788,34788,joy,The world didnt give it to me..so the world MO...,world didnt meso world DEFINITELY cnt away,1,world didnt meso world DEFINITELY cnt away
34789,34789,anger,A man robbed me today .,man robbed today,5,man robbed today
34790,34790,fear,"Youu call it JEALOUSY, I call it of #Losing YO...",Youu JEALOUSY #Losing YOU,3,Youu JEALOUSY #Losing YOU


In [54]:
print(df["Emotion"].value_counts())


Emotion
joy         11045
sadness      6722
fear         5410
anger        4297
surprise     4062
neutral      2254
disgust       856
shame         146
Name: count, dtype: int64


In [55]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["Final_Text"], df["label_id"],
    test_size=0.3, random_state=42, stratify=df["label_id"]
)

In [56]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encodings = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": encodings["input_ids"].squeeze(),
            "attention_mask": encodings["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }
    
train_dataset = EmotionDataset(train_texts, train_labels, tokenizer)
val_dataset = EmotionDataset(val_texts, val_labels, tokenizer)

In [57]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [58]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

In [59]:
training_args = TrainingArguments(
    output_dir="./bert_results",
    do_eval=True,                # instead of evaluation_strategy
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./bert_logs",
    logging_steps=10,
    save_total_limit=1
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)



  trainer = Trainer(


In [60]:
model.save_pretrained("./emotion_model")
tokenizer.save_pretrained("./emotion_model")

('./emotion_model\\tokenizer_config.json',
 './emotion_model\\special_tokens_map.json',
 './emotion_model\\vocab.txt',
 './emotion_model\\added_tokens.json')

In [61]:
import numpy as np

def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    probs = probs.detach().cpu().numpy()[0]
    pred_id = np.argmax(probs)
    return {
        "text": text,
        "predicted_label": id2label[pred_id],
        "confidence": float(probs[pred_id]),
        "all_probs": {id2label[i]: float(p) for i, p in enumerate(probs)}
    }


print(predict("I am  sad today!"))

{'text': 'I am  sad today!', 'predicted_label': 'fear', 'confidence': 0.19364005327224731, 'all_probs': {'neutral': 0.13711442053318024, 'joy': 0.1423381119966507, 'sadness': 0.09443894028663635, 'fear': 0.19364005327224731, 'surprise': 0.09870859235525131, 'anger': 0.05217939615249634, 'shame': 0.14698702096939087, 'disgust': 0.13459345698356628}}


In [66]:
print(predict("I am  sad today!"))

{'text': 'I am  sad today!', 'predicted_label': 'fear', 'confidence': 0.19364005327224731, 'all_probs': {'neutral': 0.13711442053318024, 'joy': 0.1423381119966507, 'sadness': 0.09443894028663635, 'fear': 0.19364005327224731, 'surprise': 0.09870859235525131, 'anger': 0.05217939615249634, 'shame': 0.14698702096939087, 'disgust': 0.13459345698356628}}
