In [24]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score, classification_report

### 1. Load Pre-trained BanglaBERT and Tokenizer

In [25]:
# Hub id of the pre-trained checkpoint shared by the tokenizer and the classifier.
checkpoint = "sagorsarker/bangla-bert-base"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Two output classes: the sentiment labels are remapped to 0/1 further down.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 2. Load and Preprocess Dataset

In [26]:
# Read the labelled headlines; the raw `sentiment` column is encoded as 1 / -1.
df = pd.read_csv("../training_data/labeled_data.csv")

# Re-encode to the class ids used by the classifier: 1 -> 1 (Positive), -1 -> 0 (Negative).
# Series.map turns every value missing from the mapping into NaN, so fail loudly here
# instead of letting NaN / float labels leak into the tensors built later on.
label_map = {1: 1, -1: 0}
mapped_sentiment = df['sentiment'].map(label_map)
unmapped_rows = mapped_sentiment.isna()
if unmapped_rows.any():
  bad_values = df.loc[unmapped_rows, 'sentiment'].unique().tolist()
  raise ValueError(
    f"{int(unmapped_rows.sum())} row(s) have a sentiment outside {sorted(label_map)}: {bad_values}"
  )
df['sentiment'] = mapped_sentiment.astype(int)
df.head()

Unnamed: 0,date_published,url,title,sample_type,newspaper,sentiment
0,2024-06-16 18:13:20,https://www.prothomalo.com/politics/e5tc3bxkma,মিয়ানমারের সঙ্গে গায়ে পড়ে যুদ্ধ বাধানোর ইচ্ছা ...,headline,prothom_alo,1
1,2024-06-13 23:45:10,https://www.prothomalo.com/bangladesh/qbn6uzf2qz,প্রধানমন্ত্রী শেখ হাসিনা এবার\r\nদ্বিপক্ষীয় সফ...,headline,prothom_alo,1
2,2024-06-02 18:07:55,https://www.prothomalo.com/politics/rrlne6snmk,বিতর্কিতদের নিয়ে প্রশ্ন উঠছে আওয়ামী লীগে,headline,prothom_alo,0
3,2024-06-25 00:57:56,https://www.prothomalo.com/bangladesh/capital/...,আওয়ামী লীগের ৭৫তম প্রতিষ্ঠাবার্ষিকীতে হাতিরঝিল...,headline,prothom_alo,1
4,2024-05-19 09:45:00,https://www.prothomalo.com/bangladesh/district...,ঝালকাঠিতে উপজেলা নির্বাচন নিয়ে বিভক্ত আওয়ামী লীগ,headline,prothom_alo,0


In [27]:
# Sanity check: list any rows whose sentiment is NaN (expected to be empty).
df.loc[df['sentiment'].isna()]

Unnamed: 0,date_published,url,title,sample_type,newspaper,sentiment


In [28]:
# Pull the headline strings and their class ids out of the frame as plain Python lists.
texts = df.loc[:, 'title'].tolist()
labels = df.loc[:, 'sentiment'].tolist()

### 3. Tokenize the headlines

In [29]:
# Tokenise every headline in a single call (padding enabled, truncation at max_length=128).
encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)

# One tensor per field, in the order the evaluation loop unpacks each batch:
# input ids, attention mask, label.
tensor_fields = [encodings['input_ids'], encodings['attention_mask'], labels]
dataset = TensorDataset(*(torch.tensor(field) for field in tensor_fields))

### 4. Create a DataLoader

In [30]:
# Fixed-order batches of 16 examples (no shuffling is requested).
dataloader = DataLoader(dataset=dataset, batch_size=16)

### 5. Evaluate the Model Without Fine-Tuning

In [31]:
# Baseline: score the model before any fine-tuning and collect per-example results.
model.eval()  # switch to evaluation mode (e.g. disables dropout)
all_predictions, all_labels = [], []

# Feed each batch to whatever device the model's parameters currently live on.
device = next(model.parameters()).device

with torch.no_grad():  # inference only, no gradients needed
  for batch_input_ids, batch_attention_mask, batch_labels in dataloader:
    # Batch-local names on purpose: unpacking into `labels` would overwrite the
    # notebook-level `labels` list and break a re-run of the tokenisation cell.
    outputs = model(
      input_ids=batch_input_ids.to(device),
      attention_mask=batch_attention_mask.to(device),
    )
    # The predicted class is the index of the largest logit.
    predictions = torch.argmax(outputs.logits, dim=-1)
    all_predictions.extend(predictions.cpu().numpy())
    all_labels.extend(batch_labels.cpu().numpy())

### 6. Calculate And Display Metrics

In [32]:
# Fraction of headlines whose predicted class matches the true label.
accuracy = accuracy_score(y_true=all_labels, y_pred=all_predictions)
print(f"Accuracy without fine tuning: {accuracy * 100:.2f}%")

Accuracy without fine tuning: 52.00%


### 7. Detailed Classification Report

In [33]:
# Per-class precision / recall / F1; class id 0 is shown as Negative, 1 as Positive.
report_text = classification_report(
  all_labels,
  all_predictions,
  target_names=['Negative', 'Positive'],
)
print(report_text)

              precision    recall  f1-score   support

    Negative       0.38      0.06      0.10      1549
    Positive       0.53      0.92      0.67      1801

    accuracy                           0.52      3350
   macro avg       0.45      0.49      0.39      3350
weighted avg       0.46      0.52      0.41      3350



## Train the model

In [34]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import torch
import accelerate

### 1. Split into Train and Validation Sets

In [35]:

# Hold out 10% of the headlines for validation; the fixed seed makes the split repeatable.
split_inputs = [df[column].tolist() for column in ('title', 'sentiment')]
train_texts, val_texts, train_labels, val_labels = train_test_split(
    *split_inputs, test_size=0.1, random_state=42
)

### 2. Tokenize

In [36]:

tokenizer = AutoTokenizer.from_pretrained("sagorsarker/bangla-bert-base")

# Identical tokenisation settings for both splits.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)

### 3. Prepare Data for PyTorch

In [37]:
class SentimentDataset(torch.utils.data.Dataset):
  """Map-style dataset pairing tokenizer encodings with one label per example.

  `encodings` is a mapping from field name to a per-example sequence of values;
  `labels` is a sequence holding one label per example.
  """

  def __init__(self, encodings, labels):
    self.labels = labels
    self.encodings = encodings

  def __len__(self):
    # The dataset size is defined by the number of labels.
    return len(self.labels)

  def __getitem__(self, idx):
    """Return example `idx` as a dict of tensors, with the label under 'labels'."""
    sample = {}
    for field, values in self.encodings.items():
      sample[field] = torch.tensor(values[idx])
    sample['labels'] = torch.tensor(self.labels[idx])
    return sample

# Wrap each split's encodings and labels in an indexable dataset.
train_dataset = SentimentDataset(encodings=train_encodings, labels=train_labels)
val_dataset = SentimentDataset(encodings=val_encodings, labels=val_labels)

### 4. Define a compute_metrics Function for Evaluation

In [38]:

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision / recall / F1 for one evaluation pass.

    Args:
        pred: Object exposing `predictions` (class logits, or a tuple whose first
            element is assumed to hold them) and `label_ids` (true class ids,
            reported as 0 = Negative, 1 = Positive).

    Returns:
        dict with the float entries 'accuracy', 'precision', 'recall' and 'f1'.
    """
    logits = pred.predictions
    if isinstance(logits, tuple):
        # Extra model outputs may be returned next to the logits; use the first entry.
        logits = logits[0]
    predictions = torch.argmax(torch.as_tensor(logits), dim=-1).numpy()
    labels = pred.label_ids

    # Pin the label set: without `labels=`, classification_report raises when only one
    # class is present in labels/predictions, because two target names are supplied.
    # zero_division=0 scores a never-predicted class as 0 without emitting a warning.
    report = classification_report(
        labels,
        predictions,
        labels=[0, 1],
        target_names=["Negative", "Positive"],
        output_dict=True,
        zero_division=0,
    )
    macro_avg = report['macro avg']

    return {
      # Computed directly: the report's 'accuracy' key may be absent once `labels` is passed.
      'accuracy': float(accuracy_score(labels, predictions)),
      'precision': macro_avg['precision'],
      'recall': macro_avg['recall'],
      'f1': macro_avg['f1-score']
    }

### 5. Fine-Tune the model

In [None]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
  # Output locations
  output_dir='./results',
  logging_dir='./logs',
  # Optimisation schedule
  num_train_epochs=10,
  learning_rate=2e-5,
  warmup_steps=500,
  weight_decay=0.01,
  max_grad_norm=1.0,
  # Batch sizes
  per_device_train_batch_size=16,
  per_device_eval_batch_size=16,
  # Logging, evaluation and checkpointing: evaluate and save once per epoch,
  # then reload the best checkpoint when training ends.
  logging_steps=30,
  eval_strategy="epoch",
  save_strategy="epoch",
  load_best_model_at_end=True,
  disable_tqdm=False,
  # report_to="none",  # uncomment to disable the W&B integration
)

# Train on the training split, evaluate on the validation split with compute_metrics.
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=train_dataset,
  eval_dataset=val_dataset,
  compute_metrics=compute_metrics,
)

# Run the fine-tuning loop.
trainer.train()

  0%|          | 0/1890 [00:00<?, ?it/s]

{'loss': 0.7047, 'grad_norm': 4.774619102478027, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.16}
{'loss': 0.6654, 'grad_norm': 4.194238662719727, 'learning_rate': 2.4000000000000003e-06, 'epoch': 0.32}
{'loss': 0.6162, 'grad_norm': 4.609460353851318, 'learning_rate': 3.6000000000000003e-06, 'epoch': 0.48}
{'loss': 0.588, 'grad_norm': 5.373795032501221, 'learning_rate': 4.800000000000001e-06, 'epoch': 0.63}


### Evaluate the model

In [None]:
# Evaluate on the trainer's eval dataset and display the resulting metrics dict.
eval_metrics = trainer.evaluate()
eval_metrics

  0%|          | 0/21 [00:00<?, ?it/s]

{'eval_loss': 0.3854467272758484,
 'eval_accuracy': 0.817910447761194,
 'eval_precision': 0.8181081081081081,
 'eval_recall': 0.8158770485937165,
 'eval_f1': 0.8166294272305525,
 'eval_runtime': 19.2355,
 'eval_samples_per_second': 17.416,
 'eval_steps_per_second': 1.092,
 'epoch': 3.0}

### Save the Fine-Tuned Model

In [None]:
# Store the model weights and the tokenizer files in the same directory.
save_dir = "./fine_tuned_banglabert"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./fine_tuned_banglabert\\tokenizer_config.json',
 './fine_tuned_banglabert\\special_tokens_map.json',
 './fine_tuned_banglabert\\vocab.txt',
 './fine_tuned_banglabert\\added_tokens.json',
 './fine_tuned_banglabert\\tokenizer.json')