In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score, classification_report

### 1. Load Pre-trained BanglaBERT and Tokenizer

In [2]:
# Tokenizer and encoder are pulled from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained("sagorsarker/bangla-bert-base")

# num_labels=2 requests a two-class classification head on top of the encoder.
model = AutoModelForSequenceClassification.from_pretrained(
  "sagorsarker/bangla-bert-base",
  num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 2. Load and Preprocess Dataset

In [3]:
df = pd.read_csv("../training_data/labeled_data.csv")

# Re-encode the raw {-1, 1} annotations as the {0, 1} class ids used by the classifier.
sentiment_ids = df['sentiment'].map({1: 1, -1: 0})

# Series.map turns every value missing from the mapping into NaN (and the column
# into float). Fail loudly here instead of letting NaN labels reach the model.
unmapped_values = df.loc[sentiment_ids.isna(), 'sentiment'].unique().tolist()
assert not unmapped_values, f"Unexpected sentiment values in labeled data: {unmapped_values}"

df['sentiment'] = sentiment_ids.astype('int64')
df.head()

Unnamed: 0,date_published,url,title,sample_type,newspaper,sentiment
0,2024-06-16 18:13:20,https://www.prothomalo.com/politics/e5tc3bxkma,মিয়ানমারের সঙ্গে গায়ে পড়ে যুদ্ধ বাধানোর ইচ্ছা ...,headline,prothom_alo,1
1,2024-06-13 23:45:10,https://www.prothomalo.com/bangladesh/qbn6uzf2qz,প্রধানমন্ত্রী শেখ হাসিনা এবার\nদ্বিপক্ষীয় সফরে...,headline,prothom_alo,1
2,2024-06-02 18:07:55,https://www.prothomalo.com/politics/rrlne6snmk,বিতর্কিতদের নিয়ে প্রশ্ন উঠছে আওয়ামী লীগে,headline,prothom_alo,0
3,2024-06-25 00:57:56,https://www.prothomalo.com/bangladesh/capital/...,আওয়ামী লীগের ৭৫তম প্রতিষ্ঠাবার্ষিকীতে হাতিরঝিল...,headline,prothom_alo,1
4,2024-05-19 09:45:00,https://www.prothomalo.com/bangladesh/district...,ঝালকাঠিতে উপজেলা নির্বাচন নিয়ে বিভক্ত আওয়ামী লীগ,headline,prothom_alo,0


In [4]:
# Sanity check: list rows whose sentiment is missing after the mapping step.
# An empty frame means every row carries a usable label.
df.loc[df['sentiment'].isna()]

Unnamed: 0,date_published,url,title,sample_type,newspaper,sentiment


In [5]:
# Plain Python lists: headlines as tokenizer input, class ids as targets.
texts = df['title'].to_list()
labels = df['sentiment'].to_list()

### 3. Tokenize the headlines

In [6]:
# Encode every headline; sequences are padded to a common length and cut at 128 tokens.
encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)

# Tensor order (input_ids, attention_mask, labels) is the order batches are unpacked in later.
dataset = TensorDataset(
  *(torch.tensor(encodings[field]) for field in ('input_ids', 'attention_mask')),
  torch.tensor(labels),
)

### 4. Create a DataLoader

In [7]:
# Sequential (unshuffled) batches of 16 samples for the baseline evaluation pass.
dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

### 5. Evaluate the Model Without Fine-Tuning

In [8]:
from tqdm import tqdm

# Baseline: score the model as loaded, before any fine-tuning.
model.eval()
all_predictions, all_labels = [], []

# Follow the model's current device so the cell still runs if the model has
# been moved off the CPU (for example by a Trainer) before a re-run.
device = next(model.parameters()).device

with torch.no_grad():
  for batch in tqdm(dataloader, desc="Evaluating", unit="batch"):
    # Use batch_* names: unpacking into `labels` would overwrite the
    # notebook-level `labels` list that the tokenization cell depends on.
    batch_input_ids, batch_attention_mask, batch_labels = batch
    outputs = model(
      input_ids=batch_input_ids.to(device),
      attention_mask=batch_attention_mask.to(device),
    )
    # The predicted class is the index of the largest logit.
    batch_predictions = torch.argmax(outputs.logits, dim=-1)
    all_predictions.extend(batch_predictions.cpu().tolist())
    all_labels.extend(batch_labels.tolist())

Evaluating: 100%|██████████| 285/285 [04:43<00:00,  1.01batch/s]


### 6. Calculate And Display Metrics

In [9]:
# Fraction of headlines whose predicted class matches the annotation.
accuracy = accuracy_score(y_true=all_labels, y_pred=all_predictions)
print(f"Accuracy without fine tuning: {accuracy * 100:.2f}%")

Accuracy without fine tuning: 53.90%


### 7. Detailed Classification Report

In [10]:
# Per-class precision / recall / F1 for the baseline; class id 0 -> 'Negative', 1 -> 'Positive'.
print(
  classification_report(
    y_true=all_labels,
    y_pred=all_predictions,
    target_names=['Negative', 'Positive'],
  )
)

              precision    recall  f1-score   support

    Negative       0.50      0.46      0.48      2118
    Positive       0.56      0.61      0.59      2439

    accuracy                           0.54      4557
   macro avg       0.53      0.53      0.53      4557
weighted avg       0.54      0.54      0.54      4557



## Train the model

In [11]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import torch
import accelerate

### 1. Split into Train, Validation and Test Sets

In [12]:

# First cut: hold out 20% of the headlines; the rest is the training set.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
  df['title'].tolist(),
  df['sentiment'].tolist(),
  test_size=0.2,
  random_state=42,
)

# Second cut: halve the held-out part into validation and test sets.
val_texts, test_texts, val_labels, test_labels = train_test_split(
  temp_texts,
  temp_labels,
  test_size=0.5,
  random_state=42,
)

### 2. Tokenize

In [13]:

tokenizer = AutoTokenizer.from_pretrained("sagorsarker/bangla-bert-base")

# Same tokenizer settings for all three splits: truncate at 512 tokens and
# pad within each split (each call pads independently).
train_encodings, val_encodings, test_encodings = (
  tokenizer(split_texts, truncation=True, padding=True, max_length=512)
  for split_texts in (train_texts, val_texts, test_texts)
)

### 3. Prepare Data for PyTorch

In [14]:
class SentimentDataset(torch.utils.data.Dataset):
  """Map-style dataset pairing tokenizer output with one label per sample.

  `encodings` is a mapping from field name to a per-sample sequence;
  `labels` is a sequence aligned with those samples.
  """

  def __init__(self, encodings, labels):
    self.labels = labels
    self.encodings = encodings

  def __len__(self):
    # The label sequence defines the dataset size.
    return len(self.labels)

  def __getitem__(self, idx):
    """Return sample `idx` as a dict of tensors, with the target under 'labels'."""
    sample = {}
    for field, column in self.encodings.items():
      sample[field] = torch.tensor(column[idx])
    sample['labels'] = torch.tensor(self.labels[idx])
    return sample

# Wrap each split's encodings and labels in a SentimentDataset.
train_dataset, val_dataset, test_dataset = (
  SentimentDataset(split_encodings, split_labels)
  for split_encodings, split_labels in (
    (train_encodings, train_labels),
    (val_encodings, val_labels),
    (test_encodings, test_labels),
  )
)

### 4. Define a compute_metrics Function for Evaluation

In [15]:

def compute_metrics(pred):
  """Summarise one evaluation pass as accuracy plus macro-averaged precision, recall and F1.

  `pred` must expose `.predictions` (per-class scores, reduced with argmax over
  the last axis) and `.label_ids` (the reference labels).
  """
  scores = torch.tensor(pred.predictions)
  predicted_classes = torch.argmax(scores, dim=-1).numpy()

  summary = classification_report(
    pred.label_ids,
    predicted_classes,
    target_names=["Negative", "Positive"],
    output_dict=True,
  )
  macro = summary['macro avg']

  return {
    'accuracy': summary['accuracy'],
    'precision': macro['precision'],
    'recall': macro['recall'],
    'f1': macro['f1-score'],
  }

### 5. Fine-Tune the model

In [16]:
from transformers import Trainer, TrainingArguments

# learning_rate is deliberately not set, so the library default applies.
training_args = TrainingArguments(
  # Where checkpoints and logs are written
  output_dir='./results',
  logging_dir='./logs',
  # Schedule and batch sizes
  num_train_epochs=5,
  per_device_train_batch_size=16,
  per_device_eval_batch_size=16,
  # Optimisation
  warmup_steps=500,
  weight_decay=0.01,
  max_grad_norm=0.5,
  # Log every 15 steps; evaluate and checkpoint once per epoch
  logging_steps=15,
  eval_strategy="epoch",
  save_strategy="epoch",
  # Reload the best checkpoint after training. No metric_for_best_model is given,
  # so the library default decides "best" (presumably validation loss; confirm in the docs).
  load_best_model_at_end=True,
  # Do not report to external experiment trackers such as W&B
  report_to="none",
)

trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=train_dataset,
  eval_dataset=val_dataset,
  compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()




  0%|          | 0/1140 [00:00<?, ?it/s]

{'loss': 0.6833, 'grad_norm': 5.325006008148193, 'learning_rate': 1.5e-06, 'epoch': 0.07}
{'loss': 0.6787, 'grad_norm': 4.302872180938721, 'learning_rate': 3e-06, 'epoch': 0.13}
{'loss': 0.6402, 'grad_norm': 5.738388538360596, 'learning_rate': 4.5e-06, 'epoch': 0.2}
{'loss': 0.6138, 'grad_norm': 6.003082275390625, 'learning_rate': 6e-06, 'epoch': 0.26}
{'loss': 0.5728, 'grad_norm': 8.131525993347168, 'learning_rate': 7.5e-06, 'epoch': 0.33}
{'loss': 0.5144, 'grad_norm': 6.369797229766846, 'learning_rate': 9e-06, 'epoch': 0.39}
{'loss': 0.4713, 'grad_norm': 12.196466445922852, 'learning_rate': 1.05e-05, 'epoch': 0.46}
{'loss': 0.4718, 'grad_norm': 10.987630844116211, 'learning_rate': 1.2e-05, 'epoch': 0.53}
{'loss': 0.5002, 'grad_norm': 9.614204406738281, 'learning_rate': 1.3500000000000001e-05, 'epoch': 0.59}
{'loss': 0.4717, 'grad_norm': 13.784036636352539, 'learning_rate': 1.5e-05, 'epoch': 0.66}
{'loss': 0.4879, 'grad_norm': 13.954841613769531, 'learning_rate': 1.65e-05, 'epoch': 0.

  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.4131354093551636, 'eval_accuracy': 0.8135964912280702, 'eval_precision': 0.8168981481481481, 'eval_recall': 0.818724172050521, 'eval_f1': 0.813523850761348, 'eval_runtime': 18.6441, 'eval_samples_per_second': 24.458, 'eval_steps_per_second': 1.555, 'epoch': 1.0}
{'loss': 0.3827, 'grad_norm': 7.845691680908203, 'learning_rate': 2.4e-05, 'epoch': 1.05}
{'loss': 0.3437, 'grad_norm': 17.505434036254883, 'learning_rate': 2.5500000000000003e-05, 'epoch': 1.12}
{'loss': 0.3942, 'grad_norm': 12.609180450439453, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.18}
{'loss': 0.2978, 'grad_norm': 5.617586612701416, 'learning_rate': 2.8499999999999998e-05, 'epoch': 1.25}
{'loss': 0.3513, 'grad_norm': 12.413505554199219, 'learning_rate': 3e-05, 'epoch': 1.32}
{'loss': 0.2664, 'grad_norm': 10.763534545898438, 'learning_rate': 3.15e-05, 'epoch': 1.38}
{'loss': 0.4123, 'grad_norm': 18.09748649597168, 'learning_rate': 3.3e-05, 'epoch': 1.45}
{'loss': 0.4262, 'grad_norm': 11.6752119064

  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.4237270951271057, 'eval_accuracy': 0.8048245614035088, 'eval_precision': 0.8030853696257346, 'eval_recall': 0.8041732145975204, 'eval_f1': 0.8035310577197714, 'eval_runtime': 19.195, 'eval_samples_per_second': 23.756, 'eval_steps_per_second': 1.511, 'epoch': 2.0}
{'loss': 0.3087, 'grad_norm': 19.488483428955078, 'learning_rate': 4.6500000000000005e-05, 'epoch': 2.04}
{'loss': 0.2694, 'grad_norm': 1.1958409547805786, 'learning_rate': 4.8e-05, 'epoch': 2.11}
{'loss': 0.3181, 'grad_norm': 34.84136199951172, 'learning_rate': 4.9500000000000004e-05, 'epoch': 2.17}
{'loss': 0.2346, 'grad_norm': 18.119441986083984, 'learning_rate': 4.921875e-05, 'epoch': 2.24}
{'loss': 0.2301, 'grad_norm': 7.944274425506592, 'learning_rate': 4.8046875e-05, 'epoch': 2.3}
{'loss': 0.2684, 'grad_norm': 31.6678409576416, 'learning_rate': 4.6875e-05, 'epoch': 2.37}
{'loss': 0.3082, 'grad_norm': 12.241788864135742, 'learning_rate': 4.5703125e-05, 'epoch': 2.43}
{'loss': 0.2642, 'grad_norm': 7.940335

  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.4266470670700073, 'eval_accuracy': 0.831140350877193, 'eval_precision': 0.8308080808080809, 'eval_recall': 0.8278621733310052, 'eval_f1': 0.829001387975747, 'eval_runtime': 23.127, 'eval_samples_per_second': 19.717, 'eval_steps_per_second': 1.254, 'epoch': 3.0}
{'loss': 0.1953, 'grad_norm': 15.099349021911621, 'learning_rate': 3.5156250000000004e-05, 'epoch': 3.03}
{'loss': 0.1438, 'grad_norm': 9.023579597473145, 'learning_rate': 3.3984375000000004e-05, 'epoch': 3.09}
{'loss': 0.2184, 'grad_norm': 19.188974380493164, 'learning_rate': 3.2812500000000005e-05, 'epoch': 3.16}
{'loss': 0.1508, 'grad_norm': 1.7518222332000732, 'learning_rate': 3.1640625e-05, 'epoch': 3.22}
{'loss': 0.1816, 'grad_norm': 18.01801872253418, 'learning_rate': 3.0468750000000002e-05, 'epoch': 3.29}
{'loss': 0.1201, 'grad_norm': 2.092465877532959, 'learning_rate': 2.9296875000000002e-05, 'epoch': 3.36}
{'loss': 0.1443, 'grad_norm': 1.3128206729888916, 'learning_rate': 2.8125000000000003e-05, 'epoch'

  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.7052695751190186, 'eval_accuracy': 0.8135964912280702, 'eval_precision': 0.8176218428979687, 'eval_recall': 0.819131598859205, 'eval_f1': 0.8135525549938187, 'eval_runtime': 27.037, 'eval_samples_per_second': 16.866, 'eval_steps_per_second': 1.073, 'epoch': 4.0}
{'loss': 0.0843, 'grad_norm': 4.065337181091309, 'learning_rate': 1.7578125000000002e-05, 'epoch': 4.01}
{'loss': 0.0808, 'grad_norm': 12.534356117248535, 'learning_rate': 1.6406250000000002e-05, 'epoch': 4.08}
{'loss': 0.0902, 'grad_norm': 0.5454608201980591, 'learning_rate': 1.5234375000000001e-05, 'epoch': 4.14}
{'loss': 0.0495, 'grad_norm': 12.187219619750977, 'learning_rate': 1.4062500000000001e-05, 'epoch': 4.21}
{'loss': 0.0761, 'grad_norm': 0.07699603587388992, 'learning_rate': 1.2890625e-05, 'epoch': 4.28}
{'loss': 0.0867, 'grad_norm': 0.12628336250782013, 'learning_rate': 1.171875e-05, 'epoch': 4.34}
{'loss': 0.0675, 'grad_norm': 42.31182098388672, 'learning_rate': 1.0546875e-05, 'epoch': 4.41}
{'loss'

  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.8614694476127625, 'eval_accuracy': 0.8486842105263158, 'eval_precision': 0.8473325109860458, 'eval_recall': 0.8496304056806938, 'eval_f1': 0.8479816402947216, 'eval_runtime': 26.6219, 'eval_samples_per_second': 17.129, 'eval_steps_per_second': 1.089, 'epoch': 5.0}
{'train_runtime': 4845.9918, 'train_samples_per_second': 3.761, 'train_steps_per_second': 0.235, 'train_loss': 0.28651638485883413, 'epoch': 5.0}


TrainOutput(global_step=1140, training_loss=0.28651638485883413, metrics={'train_runtime': 4845.9918, 'train_samples_per_second': 3.761, 'train_steps_per_second': 0.235, 'total_flos': 374624920620000.0, 'train_loss': 0.28651638485883413, 'epoch': 5.0})

### 6. Evaluate the Fine-Tuned Model on the Test Set

In [21]:
# Score the held-out test split with the same metrics used for validation.
test_results = trainer.evaluate(test_dataset)
print(test_results)

  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.3889409303665161, 'eval_accuracy': 0.8223684210526315, 'eval_precision': 0.821055165110234, 'eval_recall': 0.8243464052287581, 'eval_f1': 0.8215436646938037, 'eval_runtime': 25.911, 'eval_samples_per_second': 17.599, 'eval_steps_per_second': 1.119, 'epoch': 5.0}


In [22]:
# Per-class breakdown on the test split: take raw model outputs, reduce them to
# class ids with argmax, then compare against the reference labels.
predictions = trainer.predict(test_dataset)
predicted_labels = torch.tensor(predictions.predictions).argmax(dim=1).numpy()
report = classification_report(
  test_labels,
  predicted_labels,
  target_names=["Negative", "Positive"],
  digits=2,
)
print(report)

  0%|          | 0/29 [00:00<?, ?it/s]

              precision    recall  f1-score   support

    Negative       0.78      0.84      0.81       204
    Positive       0.86      0.81      0.83       252

    accuracy                           0.82       456
   macro avg       0.82      0.82      0.82       456
weighted avg       0.83      0.82      0.82       456



### Save the Fine-Tuned Model

In [18]:
# Write the model and the tokenizer into the same directory.
model.save_pretrained("./fine_tuned_banglabert")
# Last expression of the cell: its return value (the written file paths) is displayed.
tokenizer.save_pretrained("./fine_tuned_banglabert")

('./fine_tuned_banglabert\\tokenizer_config.json',
 './fine_tuned_banglabert\\special_tokens_map.json',
 './fine_tuned_banglabert\\vocab.txt',
 './fine_tuned_banglabert\\added_tokens.json',
 './fine_tuned_banglabert\\tokenizer.json')