## Importing essential libraries

In [114]:
from datasets import load_dataset, DatasetDict , Dataset
import pandas as pd
import ast
import os
import datasets
from tqdm import tqdm
import time
from transformers import GPT2Tokenizer,GPT2LMHeadModel
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader,Dataset,random_split
os.environ['CUDA_VISIBLE_DEVICES'] = ''
import os
os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_cache"

In [115]:
import torch

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    try:
        device = torch.device('mps')  # For Apple Silicon (M1/M2)
    except Exception:
        device = torch.device('cpu')


## GPT2 before finetuning

## Loading the data

In [116]:
data_sample = load_dataset("rasyosef/amharic-sentiment")

In [117]:
data_sample

DatasetDict({
    train: Dataset({
        features: ['clean_tweet', 'label'],
        num_rows: 2223
    })
    dev: Dataset({
        features: ['clean_tweet', 'label'],
        num_rows: 279
    })
    test: Dataset({
        features: ['clean_tweet', 'label'],
        num_rows: 279
    })
})

## Select only the features we want to work with

In [118]:
label_map = {"negative": 0, "positive": 1}

updated_data = [
    {
        'clean_tweet': item['clean_tweet'],
        'label': label_map[item['label']] if isinstance(item['label'], str) else item['label']
    }
    for item in data_sample['train']
]
df = pd.DataFrame(updated_data)
df.to_csv('data.csv', index=False)

In [119]:
df.head()

Unnamed: 0,clean_tweet,label
0,አሜን እውነት ብለሀል መሪዬ ዶር ዐብይ ብዙዎች ከአንተጋር መሆናችንን እን...,1
1,እወድሽ ነበረ ዛሬስ ጠላሁሽ\n እንደ ሰፌድ ቆሎ ያም ያም ዘገነሽ፧\nወይ ጉድ,0
2,😂😆😜🤡 አጋር ፓርቲም ሆኖ የመኖር ፍላጎት አሳይታለች....አሮጊቷ 🙌🏽🤣,0
3,ከፍተኛ ሊግ ለ | መከላከያ እና ሀምበሪቾ ወደ ሠንጠረዡ አናት ሲጠጉ ሶዶ...,1
4,ይሄ ሽማግሌ ግን አይበቃውም? ንስሃ አይገባም🤦‍♂️,0


In [120]:
device

device(type='cuda')

## Loading the tokenizer for the GPT2 model

In [121]:
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [122]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [123]:
BATCH_SIZE = 8

## Dataset Preparation

In [124]:
class BERTDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.data = df
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['clean_tweet']
        label = int(self.data.iloc[idx]['label'])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

## Split the data into a train and test set

In [125]:
dataset = BERTDataset(df, tokenizer)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [126]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

In [127]:
num_epochs = 10

In [128]:
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Add pad token if missing
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.resize_token_embeddings(len(tokenizer))  # Adjust for new pad token
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [129]:
# It compares the model's predicted class probabilities with the actual class labels and calculates how "wrong" the model is.
criterion = nn.CrossEntropyLoss(ignore_index = tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=5e-4) # adjusting the learning rate for each parameters
tokenizer.pad_token = tokenizer.eos_token


In [130]:
# A dataframe used to store and track the results of model training over multiple epochs
results = pd.DataFrame(columns = ['epoch', 'transformer', 'batch_size', 'gpu','training_loss', 'validation_loss', 'epoch_duration_sec'])

## Training and evaluating the model with custom data

In [131]:
num_epochs = 10
for epoch in range(num_epochs):
    start = time.time()
    model.train()
    total_train_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)

    # Validation
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    end = time.time()

    print(f"Epoch {epoch+1} completed in {end - start:.2f}s")
    print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")


Epoch 1 Training: 100%|██████████| 223/223 [00:49<00:00,  4.55it/s]
Epoch 1 Validation: 100%|██████████| 56/56 [00:03<00:00, 15.53it/s]


Epoch 1 completed in 52.62s
Train Loss: 0.7235 | Val Loss: 0.6917


Epoch 2 Training: 100%|██████████| 223/223 [00:49<00:00,  4.47it/s]
Epoch 2 Validation: 100%|██████████| 56/56 [00:03<00:00, 16.66it/s]


Epoch 2 completed in 53.22s
Train Loss: 0.7166 | Val Loss: 0.7145


Epoch 3 Training: 100%|██████████| 223/223 [00:49<00:00,  4.47it/s]
Epoch 3 Validation: 100%|██████████| 56/56 [00:03<00:00, 16.34it/s]


Epoch 3 completed in 53.32s
Train Loss: 0.7356 | Val Loss: 0.7808


Epoch 4 Training: 100%|██████████| 223/223 [00:49<00:00,  4.51it/s]
Epoch 4 Validation: 100%|██████████| 56/56 [00:03<00:00, 16.39it/s]


Epoch 4 completed in 52.92s
Train Loss: 0.7184 | Val Loss: 0.6961


Epoch 5 Training: 100%|██████████| 223/223 [00:49<00:00,  4.49it/s]
Epoch 5 Validation: 100%|██████████| 56/56 [00:03<00:00, 16.56it/s]


Epoch 5 completed in 53.10s
Train Loss: 0.7392 | Val Loss: 0.7980


Epoch 6 Training: 100%|██████████| 223/223 [00:49<00:00,  4.50it/s]
Epoch 6 Validation: 100%|██████████| 56/56 [00:03<00:00, 16.37it/s]


Epoch 6 completed in 53.01s
Train Loss: 0.7344 | Val Loss: 0.7025


Epoch 7 Training: 100%|██████████| 223/223 [00:49<00:00,  4.49it/s]
Epoch 7 Validation: 100%|██████████| 56/56 [00:03<00:00, 16.38it/s]


Epoch 7 completed in 53.09s
Train Loss: 0.7118 | Val Loss: 0.6942


Epoch 8 Training: 100%|██████████| 223/223 [00:49<00:00,  4.49it/s]
Epoch 8 Validation: 100%|██████████| 56/56 [00:03<00:00, 16.45it/s]


Epoch 8 completed in 53.03s
Train Loss: 0.7241 | Val Loss: 0.7535


Epoch 9 Training: 100%|██████████| 223/223 [00:49<00:00,  4.49it/s]
Epoch 9 Validation: 100%|██████████| 56/56 [00:03<00:00, 16.58it/s]


Epoch 9 completed in 53.04s
Train Loss: 0.7239 | Val Loss: 0.6909


Epoch 10 Training: 100%|██████████| 223/223 [00:49<00:00,  4.50it/s]
Epoch 10 Validation: 100%|██████████| 56/56 [00:03<00:00, 16.46it/s]

Epoch 10 completed in 53.00s
Train Loss: 0.7035 | Val Loss: 0.8041





In [134]:
# Save the model
torch.save(model.state_dict(), "bert_amharic_sentiment.pt")


## Making a prediction

In [137]:
def predict(text):
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(tokenizer))

    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        return_attention_mask=True
    )

    input_ids = encoding["input_ids"].to(model.device)
    attention_mask = encoding["attention_mask"].to(model.device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=-1).item()

    return "positive" if prediction == 1 else "negative"


In [142]:
print(predict("ጊዜው ይፈትናል አንድ ቀን ደስታ አንድ ቀን መከፋት ግን ሁሉም ያልፋል ጥሩ ጊዜ ይመጣል ብዬ አስባለሁ መልካም ቅዳሜ ይሁን"))
print(predict("አስተያየት የለኝም በጣም አሳዛኝ ነው"))


positive
negative
