In [35]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score, classification_report

### 1. Load pre-trained BanglaBERT and Tokenizer

In [36]:
# Hub identifier of the checkpoint; the tokenizer and the model weights
# are both loaded from it.
CHECKPOINT = "sagorsarker/bangla-bert-base"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
# num_labels=2 requests a two-class classification head on top of the encoder.
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 2. Load and Preprocess Dataset

In [37]:
# Read the labelled headlines and recode the sentiment column from
# {1, -1} to {1, 0}; any value outside that mapping becomes NaN.
df = pd.read_csv("../training_data/labeled_data.csv").assign(
  sentiment=lambda frame: frame['sentiment'].map({1: 1, -1: 0})
)
df.head()

Unnamed: 0,date_published,url,title,sample_type,newspaper,sentiment
0,2024-06-16 18:13:20,https://www.prothomalo.com/politics/e5tc3bxkma,মিয়ানমারের সঙ্গে গায়ে পড়ে যুদ্ধ বাধানোর ইচ্ছা ...,headline,prothom_alo,1
1,2024-06-13 23:45:10,https://www.prothomalo.com/bangladesh/qbn6uzf2qz,প্রধানমন্ত্রী শেখ হাসিনা এবার\nদ্বিপক্ষীয় সফরে...,headline,prothom_alo,1
2,2024-06-02 18:07:55,https://www.prothomalo.com/politics/rrlne6snmk,বিতর্কিতদের নিয়ে প্রশ্ন উঠছে আওয়ামী লীগে,headline,prothom_alo,0
3,2024-06-25 00:57:56,https://www.prothomalo.com/bangladesh/capital/...,আওয়ামী লীগের ৭৫তম প্রতিষ্ঠাবার্ষিকীতে হাতিরঝিল...,headline,prothom_alo,1
4,2024-05-19 09:45:00,https://www.prothomalo.com/bangladesh/district...,ঝালকাঠিতে উপজেলা নির্বাচন নিয়ে বিভক্ত আওয়ামী লীগ,headline,prothom_alo,0


In [38]:
# Show every row whose sentiment is missing after the recoding step.
df.loc[df['sentiment'].isna()]

Unnamed: 0,date_published,url,title,sample_type,newspaper,sentiment


In [39]:
# Plain Python lists of headline strings and their class labels.
texts, labels = df['title'].to_list(), df['sentiment'].to_list()

### 3. Tokenize the headlines

In [40]:
# Tokenize every headline in one call: sequences are truncated to at most
# 128 tokens and padded to a common length.
encodings = tokenizer(texts, max_length=128, padding=True, truncation=True)

# Bundle ids, attention masks and labels as parallel tensors.
dataset = TensorDataset(
  *(torch.tensor(encodings[field]) for field in ('input_ids', 'attention_mask')),
  torch.tensor(labels),
)

### 4. Create a DataLoader

In [41]:
# Sequential (unshuffled) batches of 16 examples for evaluation.
dataloader = DataLoader(dataset=dataset, batch_size=16, shuffle=False)

### 5. Evaluate the Model Without Fine-Tuning

In [42]:
from tqdm import tqdm

# Baseline pass: run the model over every batch in inference mode and
# collect the predicted class and the reference label for each example.
model.eval()
all_predictions, all_labels = [], []

with torch.no_grad():  # no gradients are needed for evaluation
  for batch in tqdm(dataloader, desc="Evaluating", unit="batch"):
    # Batch-local names: unpacking into `labels` here would overwrite the
    # notebook-level `labels` list and break a re-run of the tokenization cell.
    batch_input_ids, batch_attention_mask, batch_labels = batch
    outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
    logits = outputs.logits
    # The predicted class is the index of the largest logit.
    predictions = torch.argmax(logits, dim=-1)
    all_predictions.extend(predictions.cpu().numpy())
    all_labels.extend(batch_labels.cpu().numpy())

Evaluating: 100%|██████████| 285/285 [06:43<00:00,  1.42s/batch]


### 6. Calculate And Display Metrics

In [43]:
# Fraction of examples whose predicted class equals the reference label.
accuracy = accuracy_score(y_true=all_labels, y_pred=all_predictions)
baseline_summary = f"Accuracy without fine tuning: {accuracy * 100:.2f}%"
print(baseline_summary)

Accuracy without fine tuning: 46.72%


### 7. Detailed Classification Report

In [44]:
# Per-class precision / recall / F1 for the baseline predictions.
baseline_report = classification_report(
  all_labels,
  all_predictions,
  target_names=['Negative', 'Positive'],
)
print(baseline_report)

              precision    recall  f1-score   support

    Negative       0.46      0.85      0.60      2118
    Positive       0.51      0.14      0.22      2439

    accuracy                           0.47      4557
   macro avg       0.48      0.49      0.41      4557
weighted avg       0.49      0.47      0.39      4557



## Train the model

In [45]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import torch
import accelerate

### 1. Split into Train, Validation and Test Sets

In [46]:

# Stage 1: keep 80% of the headlines for training and hold out 20%.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
  df['title'].tolist(),
  df['sentiment'].tolist(),
  random_state=42,
  test_size=0.2,
)

# Stage 2: cut the held-out part in half -> validation and test sets.
val_texts, test_texts, val_labels, test_labels = train_test_split(
  temp_texts,
  temp_labels,
  random_state=42,
  test_size=0.5,
)

### 2. Tokenize

In [47]:

tokenizer = AutoTokenizer.from_pretrained("sagorsarker/bangla-bert-base")

# Tokenize each split with identical settings (truncate to at most 128
# tokens, pad within the split), in train -> validation -> test order.
train_encodings, val_encodings, test_encodings = (
  tokenizer(split_texts, truncation=True, padding=True, max_length=128)
  for split_texts in (train_texts, val_texts, test_texts)
)

### 3. Prepare Data for PyTorch

In [48]:
class SentimentDataset(torch.utils.data.Dataset):
  """Map-style dataset pairing tokenizer encodings with class labels.

  Args:
    encodings: mapping from field name to an indexable collection holding
      one entry per example (it must support ``.items()``).
    labels: indexable collection of labels, one per example.

  Indexing returns a dict holding one tensor per encoding field plus a
  ``'labels'`` tensor for the requested example.
  """

  def __init__(self, encodings, labels):
    self.labels = labels
    self.encodings = encodings

  def __len__(self):
    # The number of examples is defined by the label collection.
    return len(self.labels)

  def __getitem__(self, idx):
    sample = {}
    for field, column in self.encodings.items():
      sample[field] = torch.tensor(column[idx])
    # The label is added last, under the key 'labels'.
    sample['labels'] = torch.tensor(self.labels[idx])
    return sample

# Wrap each split's encodings and labels in a SentimentDataset.
train_dataset, val_dataset, test_dataset = (
  SentimentDataset(split_encodings, split_labels)
  for split_encodings, split_labels in (
    (train_encodings, train_labels),
    (val_encodings, val_labels),
    (test_encodings, test_labels),
  )
)

### 4. Define a compute_metrics Function for Evaluation

In [49]:

def compute_metrics(pred):
  """Summarise a prediction object as accuracy and macro-averaged scores.

  Args:
    pred: object exposing ``predictions`` (per-class scores; the class is
      taken as the argmax over the last axis) and ``label_ids``
      (reference labels).

  Returns:
    dict with keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
    ``'f1'``; the last three are read from the report's ``'macro avg'`` row.
  """
  predicted_classes = torch.tensor(pred.predictions).argmax(dim=-1).numpy()
  report = classification_report(
    pred.label_ids,
    predicted_classes,
    target_names=["Negative", "Positive"],
    output_dict=True,
  )

  macro_row = report['macro avg']
  metrics = {'accuracy': report['accuracy']}
  # (key in the report row, key in the returned dict)
  for report_key, metric_name in (
    ('precision', 'precision'),
    ('recall', 'recall'),
    ('f1-score', 'f1'),
  ):
    metrics[metric_name] = macro_row[report_key]
  return metrics

### 5. Fine-Tune the model

In [50]:
from transformers import Trainer, TrainingArguments

# Notes:
# - learning_rate is not passed, so the TrainingArguments default applies.
# - report_to is not passed either; add report_to="none" to switch off
#   experiment-tracker integrations such as W&B.
training_args = TrainingArguments(
  # Where checkpoints and logs are written.
  output_dir='./results',
  logging_dir='./logs',
  # Optimisation schedule.
  num_train_epochs=5,
  per_device_train_batch_size=16,
  per_device_eval_batch_size=16,
  warmup_steps=500,
  weight_decay=0.01,
  max_grad_norm=0.5,
  # Log every 15 steps; evaluate and checkpoint once per epoch.
  logging_steps=15,
  eval_strategy="epoch",
  save_strategy="epoch",
  load_best_model_at_end=True,
  disable_tqdm=False,
)

trainer = Trainer(
  args=training_args,
  model=model,
  compute_metrics=compute_metrics,
  train_dataset=train_dataset,
  eval_dataset=val_dataset,
)

# Run fine-tuning; as the last expression, the returned TrainOutput is
# displayed as the cell result.
trainer.train()

  0%|          | 0/1140 [00:00<?, ?it/s]

{'loss': 0.7245, 'grad_norm': 4.642988204956055, 'learning_rate': 1.5e-06, 'epoch': 0.07}
{'loss': 0.6821, 'grad_norm': 3.7675275802612305, 'learning_rate': 3e-06, 'epoch': 0.13}
{'loss': 0.6557, 'grad_norm': 4.110259532928467, 'learning_rate': 4.5e-06, 'epoch': 0.2}
{'loss': 0.6311, 'grad_norm': 5.932863712310791, 'learning_rate': 6e-06, 'epoch': 0.26}
{'loss': 0.579, 'grad_norm': 5.716725826263428, 'learning_rate': 7.5e-06, 'epoch': 0.33}
{'loss': 0.5103, 'grad_norm': 6.16485595703125, 'learning_rate': 9e-06, 'epoch': 0.39}
{'loss': 0.4597, 'grad_norm': 9.783946990966797, 'learning_rate': 1.05e-05, 'epoch': 0.46}
{'loss': 0.4641, 'grad_norm': 11.104312896728516, 'learning_rate': 1.2e-05, 'epoch': 0.53}
{'loss': 0.4909, 'grad_norm': 10.016702651977539, 'learning_rate': 1.3500000000000001e-05, 'epoch': 0.59}
{'loss': 0.4628, 'grad_norm': 11.178995132446289, 'learning_rate': 1.5e-05, 'epoch': 0.66}
{'loss': 0.4938, 'grad_norm': 12.908051490783691, 'learning_rate': 1.65e-05, 'epoch': 0.7

  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.4166795015335083, 'eval_accuracy': 0.793859649122807, 'eval_precision': 0.7927216713199191, 'eval_recall': 0.7949479075723183, 'eval_f1': 0.7930794778713215, 'eval_runtime': 32.9105, 'eval_samples_per_second': 13.856, 'eval_steps_per_second': 0.881, 'epoch': 1.0}
{'loss': 0.3846, 'grad_norm': 7.922717094421387, 'learning_rate': 2.4e-05, 'epoch': 1.05}
{'loss': 0.3455, 'grad_norm': 20.084924697875977, 'learning_rate': 2.5500000000000003e-05, 'epoch': 1.12}
{'loss': 0.4041, 'grad_norm': 12.209802627563477, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.18}
{'loss': 0.2911, 'grad_norm': 5.304938316345215, 'learning_rate': 2.8499999999999998e-05, 'epoch': 1.25}
{'loss': 0.356, 'grad_norm': 14.357898712158203, 'learning_rate': 3e-05, 'epoch': 1.32}
{'loss': 0.2668, 'grad_norm': 10.258506774902344, 'learning_rate': 3.15e-05, 'epoch': 1.38}
{'loss': 0.3775, 'grad_norm': 14.778487205505371, 'learning_rate': 3.3e-05, 'epoch': 1.45}
{'loss': 0.4615, 'grad_norm': 11.409733772

  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.4683535397052765, 'eval_accuracy': 0.7850877192982456, 'eval_precision': 0.8076544450025114, 'eval_recall': 0.7971014492753623, 'eval_f1': 0.7843867605905626, 'eval_runtime': 34.2715, 'eval_samples_per_second': 13.306, 'eval_steps_per_second': 0.846, 'epoch': 2.0}
{'loss': 0.3664, 'grad_norm': 7.8909711837768555, 'learning_rate': 4.6500000000000005e-05, 'epoch': 2.04}
{'loss': 0.2526, 'grad_norm': 0.9803512096405029, 'learning_rate': 4.8e-05, 'epoch': 2.11}
{'loss': 0.2547, 'grad_norm': 27.86567497253418, 'learning_rate': 4.9500000000000004e-05, 'epoch': 2.17}
{'loss': 0.3168, 'grad_norm': 19.144290924072266, 'learning_rate': 4.921875e-05, 'epoch': 2.24}
{'loss': 0.2177, 'grad_norm': 16.90492820739746, 'learning_rate': 4.8046875e-05, 'epoch': 2.3}
{'loss': 0.2676, 'grad_norm': 14.057941436767578, 'learning_rate': 4.6875e-05, 'epoch': 2.37}
{'loss': 0.2674, 'grad_norm': 19.980239868164062, 'learning_rate': 4.5703125e-05, 'epoch': 2.43}
{'loss': 0.2929, 'grad_norm': 8.536

  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.49421483278274536, 'eval_accuracy': 0.8201754385964912, 'eval_precision': 0.8261952493773161, 'eval_recall': 0.8125254641755428, 'eval_f1': 0.8155776515151516, 'eval_runtime': 42.6682, 'eval_samples_per_second': 10.687, 'eval_steps_per_second': 0.68, 'epoch': 3.0}
{'loss': 0.1934, 'grad_norm': 11.739583969116211, 'learning_rate': 3.5156250000000004e-05, 'epoch': 3.03}
{'loss': 0.125, 'grad_norm': 1.624236822128296, 'learning_rate': 3.3984375000000004e-05, 'epoch': 3.09}
{'loss': 0.1715, 'grad_norm': 14.453299522399902, 'learning_rate': 3.2812500000000005e-05, 'epoch': 3.16}
{'loss': 0.1835, 'grad_norm': 11.616009712219238, 'learning_rate': 3.1640625e-05, 'epoch': 3.22}
{'loss': 0.2182, 'grad_norm': 1.4407237768173218, 'learning_rate': 3.0468750000000002e-05, 'epoch': 3.29}
{'loss': 0.2221, 'grad_norm': 27.32814598083496, 'learning_rate': 2.9296875000000002e-05, 'epoch': 3.36}
{'loss': 0.124, 'grad_norm': 0.2729629576206207, 'learning_rate': 2.8125000000000003e-05, 'epoc

  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.6631172299385071, 'eval_accuracy': 0.8201754385964912, 'eval_precision': 0.8213399026155237, 'eval_recall': 0.823933414818695, 'eval_f1': 0.8199537750385206, 'eval_runtime': 32.1712, 'eval_samples_per_second': 14.174, 'eval_steps_per_second': 0.901, 'epoch': 4.0}
{'loss': 0.0978, 'grad_norm': 0.35816287994384766, 'learning_rate': 1.7578125000000002e-05, 'epoch': 4.01}
{'loss': 0.0594, 'grad_norm': 5.227641582489014, 'learning_rate': 1.6406250000000002e-05, 'epoch': 4.08}
{'loss': 0.0434, 'grad_norm': 5.600642681121826, 'learning_rate': 1.5234375000000001e-05, 'epoch': 4.14}
{'loss': 0.0868, 'grad_norm': 24.080400466918945, 'learning_rate': 1.4062500000000001e-05, 'epoch': 4.21}
{'loss': 0.119, 'grad_norm': 29.9935245513916, 'learning_rate': 1.2890625e-05, 'epoch': 4.28}
{'loss': 0.0996, 'grad_norm': 0.03621614724397659, 'learning_rate': 1.171875e-05, 'epoch': 4.34}
{'loss': 0.03, 'grad_norm': 0.02660064399242401, 'learning_rate': 1.0546875e-05, 'epoch': 4.41}
{'loss': 0

  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 1.0611993074417114, 'eval_accuracy': 0.8201754385964912, 'eval_precision': 0.818695409058481, 'eval_recall': 0.820674000349223, 'eval_f1': 0.8192854933498299, 'eval_runtime': 34.7182, 'eval_samples_per_second': 13.134, 'eval_steps_per_second': 0.835, 'epoch': 5.0}
{'train_runtime': 7092.3174, 'train_samples_per_second': 2.57, 'train_steps_per_second': 0.161, 'train_loss': 0.28652230610972956, 'epoch': 5.0}


TrainOutput(global_step=1140, training_loss=0.28652230610972956, metrics={'train_runtime': 7092.3174, 'train_samples_per_second': 2.57, 'train_steps_per_second': 0.161, 'total_flos': 374624920620000.0, 'train_loss': 0.28652230610972956, 'epoch': 5.0})

### Evaluate the Fine-Tuned Model on the Test Set

In [51]:
# Score the trained model on the test split.
test_results = trainer.evaluate(test_dataset)
print(test_results)

  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.4020768404006958, 'eval_accuracy': 0.8355263157894737, 'eval_precision': 0.8336118938878632, 'eval_recall': 0.8339169000933706, 'eval_f1': 0.8337602745385875, 'eval_runtime': 37.5809, 'eval_samples_per_second': 12.134, 'eval_steps_per_second': 0.772, 'epoch': 5.0}


### Save the Fine-Tuned Model

In [52]:
# Write the model and the tokenizer side by side in the same directory.
SAVE_DIR = "./fine_tuned_banglabert"

model.save_pretrained(SAVE_DIR)
# Last expression: the tuple of written tokenizer files is shown as the cell output.
tokenizer.save_pretrained(SAVE_DIR)

('./fine_tuned_banglabert\\tokenizer_config.json',
 './fine_tuned_banglabert\\special_tokens_map.json',
 './fine_tuned_banglabert\\vocab.txt',
 './fine_tuned_banglabert\\added_tokens.json',
 './fine_tuned_banglabert\\tokenizer.json')