<a href="https://colab.research.google.com/github/FadQode/teks-emo-analysis/blob/main/roberta_can_read_emotion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get install git

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.11).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [None]:
!git clone https://github.com/FadQode/teks-emo-analysis

Cloning into 'teks-emo-analysis'...
remote: Enumerating objects: 17, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 17 (delta 3), reused 10 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (17/17), 397.94 KiB | 1.46 MiB/s, done.
Resolving deltas: 100% (3/3), done.


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pandas as pd
import numpy as np
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
LABEL_MAPPING = {
    'anger': 4,
    'fear': 5,
    'joy': 2,
    'love': 3,
    'sadness': 0,
    'surprise': 1
}


MODEL_NAME = "roberta-base"
BATCH_SIZE = 32
EPOCHS = 20
MAX_LENGTH = 256
LEARNING_RATE = 1e-5

In [None]:
class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels.values.astype(int)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)
        }

In [None]:
train_data = pd.read_csv("/content/teks-emo-analysis/dataset/emotion_train.csv")
val_data = pd.read_csv("/content/teks-emo-analysis/dataset/emotion_val.csv")
test_data = pd.read_csv("/content/teks-emo-analysis/dataset/emotion_test.csv")

In [None]:
# Check for missing labels in your train_data
print(train_data["emotion"].isnull().sum())  # Check if there are NaNs

train_data.info()

0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6784 entries, 0 to 6783
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     6784 non-null   object
 1   emotion  6784 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 106.1+ KB


In [None]:
print(train_data['emotion'].value_counts())

emotion
1    1651
3    1118
4    1108
0    1016
5     962
2     929
Name: count, dtype: int64


In [None]:
print("Label range in train data:", train_data['emotion'].min(), "-", train_data['emotion'].max())

Label range in train data: 0 - 5


In [None]:
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

In [None]:
train_dataset = EmotionDataset(train_data['text'], train_data['emotion'], tokenizer)
val_dataset = EmotionDataset(val_data['text'], val_data['emotion'], tokenizer)
test_dataset = EmotionDataset(test_data['text'], test_data['emotion'], tokenizer)


In [None]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

In [None]:
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(LABEL_MAPPING))
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
num_training_steps = len(train_loader) * EPOCHS
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)




In [None]:
def train_model():
    model.train()
    total_loss, total_correct = 0, 0
    for batch in tqdm(train_loader):
        batch = {key: val.to(device) for key, val in batch.items()}

        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["label"]
        )

        loss = outputs.loss
        logits = outputs.logits

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        total_correct += (logits.argmax(dim=1) == batch["label"]).sum().item()

    avg_loss = total_loss / len(train_loader)
    accuracy = total_correct / len(train_dataset)
    print(f"Training loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

In [None]:
def evaluate_model():
    model.eval()
    total_loss, total_correct = 0, 0
    with torch.no_grad():
        for batch in tqdm(val_loader):
            batch = {key: val.to(device) for key, val in batch.items()}

            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["label"]
            )

            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()
            total_correct += (logits.argmax(dim=1) == batch["label"]).sum().item()

    avg_loss = total_loss / len(val_loader)
    accuracy = total_correct / len(val_dataset)
    print(f"Validation loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")


In [None]:
def evaluate_model():
    model.eval()
    all_labels, all_preds = [], []
    with torch.no_grad():
        for batch in tqdm(val_loader):
            batch = {key: val.to(device) for key, val in batch.items()}
            outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
            logits = outputs.logits
            preds = logits.argmax(dim=1).cpu().numpy()
            labels = batch["label"].cpu().numpy()
            all_labels.extend(labels)
            all_preds.extend(preds)

    report = classification_report(all_labels, all_preds, target_names=list(LABEL_MAPPING.keys()))
    print(report)

In [None]:
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    train_model()
    evaluate_model()


Epoch 1/20


100%|██████████| 424/424 [02:30<00:00,  2.81it/s]


Training loss: 0.6841, Accuracy: 0.7619


100%|██████████| 97/97 [00:10<00:00,  9.18it/s]


Validation loss: 0.6555, Accuracy: 0.7828
Epoch 2/20


100%|██████████| 424/424 [02:30<00:00,  2.82it/s]


Training loss: 0.4867, Accuracy: 0.8404


100%|██████████| 97/97 [00:10<00:00,  9.11it/s]


Validation loss: 0.5803, Accuracy: 0.8119
Epoch 3/20


100%|██████████| 424/424 [02:30<00:00,  2.82it/s]


Training loss: 0.3711, Accuracy: 0.8796


100%|██████████| 97/97 [00:10<00:00,  9.11it/s]


Validation loss: 0.5716, Accuracy: 0.8216
Epoch 4/20


100%|██████████| 424/424 [02:30<00:00,  2.82it/s]


Training loss: 0.3304, Accuracy: 0.8909


100%|██████████| 97/97 [00:10<00:00,  9.07it/s]


Validation loss: 0.5347, Accuracy: 0.8371
Epoch 5/20


100%|██████████| 424/424 [02:30<00:00,  2.82it/s]


Training loss: 0.2739, Accuracy: 0.9091


100%|██████████| 97/97 [00:10<00:00,  9.16it/s]


Validation loss: 0.5399, Accuracy: 0.8416
Epoch 6/20


100%|██████████| 424/424 [02:30<00:00,  2.82it/s]


Training loss: 0.2429, Accuracy: 0.9203


100%|██████████| 97/97 [00:10<00:00,  9.08it/s]


Validation loss: 0.5776, Accuracy: 0.8365
Epoch 7/20


100%|██████████| 424/424 [02:30<00:00,  2.82it/s]


Training loss: 0.2159, Accuracy: 0.9291


100%|██████████| 97/97 [00:10<00:00,  9.09it/s]


Validation loss: 0.5697, Accuracy: 0.8442
Epoch 8/20


100%|██████████| 424/424 [02:30<00:00,  2.82it/s]


Training loss: 0.1851, Accuracy: 0.9406


100%|██████████| 97/97 [00:10<00:00,  9.10it/s]


Validation loss: 0.5970, Accuracy: 0.8390
Epoch 9/20


100%|██████████| 424/424 [02:30<00:00,  2.82it/s]


Training loss: 0.1571, Accuracy: 0.9481


100%|██████████| 97/97 [00:10<00:00,  9.05it/s]


Validation loss: 0.6181, Accuracy: 0.8197
Epoch 10/20


100%|██████████| 424/424 [02:30<00:00,  2.82it/s]


Training loss: 0.1294, Accuracy: 0.9595


100%|██████████| 97/97 [00:10<00:00,  9.14it/s]


Validation loss: 0.6771, Accuracy: 0.8300
Epoch 11/20


100%|██████████| 424/424 [02:30<00:00,  2.82it/s]


Training loss: 0.1100, Accuracy: 0.9645


100%|██████████| 97/97 [00:10<00:00,  9.08it/s]


Validation loss: 0.6857, Accuracy: 0.8339
Epoch 12/20


100%|██████████| 424/424 [02:30<00:00,  2.82it/s]


Training loss: 0.0939, Accuracy: 0.9707


100%|██████████| 97/97 [00:10<00:00,  9.17it/s]


Validation loss: 0.7081, Accuracy: 0.8326
Epoch 13/20


100%|██████████| 424/424 [02:30<00:00,  2.82it/s]


Training loss: 0.0797, Accuracy: 0.9752


100%|██████████| 97/97 [00:11<00:00,  8.64it/s]


Validation loss: 0.7407, Accuracy: 0.8300
Epoch 14/20


100%|██████████| 424/424 [02:33<00:00,  2.77it/s]


Training loss: 0.0657, Accuracy: 0.9808


100%|██████████| 97/97 [00:10<00:00,  9.08it/s]


Validation loss: 0.7320, Accuracy: 0.8300
Epoch 15/20


100%|██████████| 424/424 [02:30<00:00,  2.81it/s]


Training loss: 0.0577, Accuracy: 0.9830


100%|██████████| 97/97 [00:10<00:00,  9.15it/s]


Validation loss: 0.7352, Accuracy: 0.8358
Epoch 16/20


100%|██████████| 424/424 [02:30<00:00,  2.82it/s]


Training loss: 0.0530, Accuracy: 0.9828


100%|██████████| 97/97 [00:10<00:00,  9.13it/s]


Validation loss: 0.7606, Accuracy: 0.8345
Epoch 17/20


100%|██████████| 424/424 [02:30<00:00,  2.82it/s]


Training loss: 0.0464, Accuracy: 0.9864


100%|██████████| 97/97 [00:10<00:00,  9.14it/s]


Validation loss: 0.7672, Accuracy: 0.8339
Epoch 18/20


100%|██████████| 424/424 [02:30<00:00,  2.82it/s]


Training loss: 0.0425, Accuracy: 0.9881


100%|██████████| 97/97 [00:10<00:00,  9.11it/s]


Validation loss: 0.7645, Accuracy: 0.8306
Epoch 19/20


100%|██████████| 424/424 [02:30<00:00,  2.82it/s]


Training loss: 0.0363, Accuracy: 0.9915


100%|██████████| 97/97 [00:10<00:00,  9.18it/s]


Validation loss: 0.7667, Accuracy: 0.8352
Epoch 20/20


100%|██████████| 424/424 [02:30<00:00,  2.82it/s]


Training loss: 0.0369, Accuracy: 0.9897


100%|██████████| 97/97 [00:10<00:00,  9.13it/s]

Validation loss: 0.7667, Accuracy: 0.8352





In [None]:
model.save_pretrained("./roberta_emotion_model")
tokenizer.save_pretrained("./roberta_emotion_model")

('./roberta_emotion_model/tokenizer_config.json',
 './roberta_emotion_model/special_tokens_map.json',
 './roberta_emotion_model/vocab.json',
 './roberta_emotion_model/merges.txt',
 './roberta_emotion_model/added_tokens.json')

In [None]:
print(test_data["emotion"].isnull().sum())  # Should be 0
test_data.info()

0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1547 entries, 0 to 1546
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     1547 non-null   object
 1   emotion  1547 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 24.3+ KB


In [None]:
def predict_emotion_batch(texts):
    """Predict emotions for a batch of texts."""
    encoding = tokenizer(
        texts,
        max_length=128,
        padding=True,  # Batch padding
        truncation=True,
        return_tensors="pt"
    )

    input_ids = encoding["input_ids"].to('cuda' if torch.cuda.is_available() else 'cpu')
    attention_mask = encoding["attention_mask"].to('cuda' if torch.cuda.is_available() else 'cpu')

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1).cpu().numpy()

    return predictions


In [None]:
all_texts = test_data['text'].tolist()
true_labels = test_data['emotion'].values

# Predict in batches to avoid memory issues
BATCH_SIZE = 16  # Can be adjusted depending on your hardware
predicted_labels = []

for i in range(0, len(all_texts), BATCH_SIZE):
    batch_texts = all_texts[i:i + BATCH_SIZE]
    batch_predictions = predict_emotion_batch(batch_texts)
    predicted_labels.extend(batch_predictions)

# Step 5: Calculate accuracy
accuracy = (np.array(predicted_labels) == true_labels).mean()
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 84.16%


In [None]:
MODEL_DIR = "roberta-base"
# Load the model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained(MODEL_DIR)
model_input = RobertaForSequenceClassification.from_pretrained(MODEL_DIR)
model_input.eval()  # Set to evaluation mode


LABEL_MAPPING = {
    0: 'sadness',
    1: 'surprise',
    2: 'joy',
    3: 'love',
    4: 'anger',
    5: 'fear'
}

def predict_emotion(diary_text):
    """Predicts the emotion for a given diary text."""
    encoding = tokenizer(
        diary_text,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]

    with torch.no_grad():
        outputs = model_input(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()
        predicted_emotion = LABEL_MAPPING[predicted_class]

    return predicted_emotion


# Input a diary text and predict its emotion
diary_text = input("Enter your diary text: ")
predicted_emotion = predict_emotion(diary_text)
print(f"The predicted emotion is: {predicted_emotion}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Enter your diary text: Diary  Tanggal: 14 Desember 2024  Hari ini aku sedih, uangku tambah banyak.namun aku bingung mau menghabiskan bagaimana. kemudian temanku datang dan dipalak   End of Diary.
The predicted emotion is: sadness


In [None]:
import shutil

# Specify the folder to zip
folder_to_zip = 'roberta_emotion_model'

# Create a zip file
shutil.make_archive('roberta_model', 'zip', folder_to_zip)

'/content/teks-emo-analysis/roberta_model.zip'

In [None]:
!git rm -r roberta_emotion_model

rm 'roberta_emotion_model/merges.txt'
rm 'roberta_emotion_model/model.safetensors'
rm 'roberta_emotion_model/special_tokens_map.json'
rm 'roberta_emotion_model/tokenizer_config.json'
rm 'roberta_emotion_model/vocab.json'


In [None]:
from google.colab import files

# Download the zip file
files.download('roberta_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!du -sh roberta_model.zip

405M	roberta_model.zip


In [None]:
from google.colab import drive
drive.mount('/content/drive')
shutil.move('roberta_model.zip', '/content/drive/MyDrive/')

Mounted at /content/drive


'/content/drive/MyDrive/roberta_model.zip'

In [3]:
%pip install -U -q datasets trl bitsandbytes transformers accelerate peft wandb gradio

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.4/336.4 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.2/57.2 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━