In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
model_name = 'csebuetnlp/banglabert'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

# Define device (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
import pandas as pd
train_url = 'train_data.csv'
test_url = 'test_data.csv'
df_train = pd.read_csv(train_url)
df_test = pd.read_csv(test_url)
stop_words_df = pd.read_excel('stopwords_bangla.xlsx',index_col=False)

In [4]:
STOPWORDS = set([word.strip() for word in stop_words_df['words']])

In [5]:
import re
def preprocess(x):
    html_pattern = re.compile('<.*?>')
    x = html_pattern.sub(r'', x)
    x = " ".join([word for word in str(x).split() if word not in STOPWORDS])
    return x
df_train['Comment'] = df_train['Comment'].apply(lambda x: preprocess(x))
df_test['Comment'] = df_test['Comment'].apply(lambda x:preprocess(x))

In [6]:
import numpy as np
allcats = set(df_train['Category'].dropna().tolist())
allcats

{'Code Switching', 'Grammatical', 'Multiple Errors', 'Spelling'}

In [7]:
allcats.add('Correct')
labeldict = {}
counter = 0
for i in allcats:
    labeldict[i] = counter
    counter += 1
labeldict

{'Correct': 0,
 'Grammatical': 1,
 'Code Switching': 2,
 'Spelling': 3,
 'Multiple Errors': 4}

In [8]:
def manage(x):
    if x in labeldict:
        return labeldict[x]
    else:
        return labeldict['Correct']
df_train['Category'] = df_train['Category'].apply(lambda x:manage(x))
df_test['Category'] = df_test['Category'].apply(lambda x:manage(x))

In [9]:
df_train

Unnamed: 0,Video Title,Genre,Comment,Error,Category,Correct Form
0,"ওবায়দুল কাদের বললেন, ‘খেলা হবে’; আর রুমিন ফারহ...",Politics,কাদের খেলব কাদের খেলতেই না,1,1,কাদের কি খেলবে কাদের তো খেলতেই পারে না
1,পুলিশের গাড়ির ওপর চড়াও বিএনপির কর্মীরা | BN...,Politics,এসব আরো ঠাসা,1,3,এসব করে আরো কোণঠাসা হবে
2,Ayub Bachchu | Ek Akash Tara | আইয়ুব বাচ্চু |...,Entertainment,যুগ যুগ গেথে গান,0,0,যুগ যুগ ধরে আমাদের মনে গেথে থাকবে এ গান
3,যে প্রেম কাহিনী কোন বাধা মানেনি | BBC Bangla,Miscellaneous,অাচছা অাপু এলাজী থাকলে টিকা জাবেনা,1,3,আচ্ছা আপু এলার্জী থাকলে টিকা নেওয়া যাবেনা
4,তুরস্কের চেয়ে ভয়াবহ ভূমিকম্পের ঝুঁকিতে বাংলাদে...,News,হে আল্লাহ জালিমদের সন্তান সন্তদের হেফাজত,0,0,হে আল্লাহ এই জালিমদের থেকে আমাদের সন্তান সন্তদ...
...,...,...,...,...,...,...
8027,Shitom Ahmed - Chorabali (Lyrics) || কেন লাগে ...,Entertainment,সত্যি শুন্য লাগে,0,0,সত্যি তাকে ছাড়া খুব শুন্য লাগে
8028,হৃদয় ছুঁয়ে যাওয়া ৭টি সেরা ইমোশনাল বিজ্ঞাপন ...,Entertainment,বিজ্ঞাপন গুলো চোখে পানি আসলো ভাই,0,0,বিজ্ঞাপন গুলো দেখে চোখে পানি চলে আসলো ভাই
8029,আইপিএলের নিলাম তালিকায় পাঁচ বাংলাদেশি | IPL | ...,Sports,টাকা সবদিক,1,3,একবার যখন টাকা হয় তখন সব দিক দিয়ে আসে
8030,মাহমুদুল্লাহর সেরা ১০টি ইনিংস || 10 Greatest I...,Sports,অসাধারণ ইউটিউব ভিডিও সাথে চমৎকার ব্যাকগ্রাউন্ড,0,0,আমার দেখা অসাধারণ ইউটিউব ভিডিও তার সাথে চমৎকার...


In [10]:
data_no = 5

# Prepare the training data
train_texts = df_train['Comment'].tolist()
train_labels = df_train['Category'].tolist()

test_texts = df_test['Comment'].tolist()
test_labels = df_test['Category'].tolist()

In [11]:
# Tokenize and encode the training texts
train_encodings = tokenizer(train_texts, truncation=True, max_length=128,
        padding='max_length', return_tensors = 'pt')

# Convert the labels to tensors
train_labels = torch.tensor(train_labels)

# Create a PyTorch dataset
train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'],
                                               train_encodings['attention_mask'],
                                               train_labels)

# Create a data loader
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)

model = model.to(device)

In [12]:
from tqdm.notebook import tqdm
import torch
import torch.optim as optim
import torch.nn as nn
from sklearn.metrics import accuracy_score
import time

# Reset peak GPU memory tracking
if torch.cuda.is_available():
    torch.cuda.reset_max_memory_allocated()

# Set the model to training mode
model.train()

# Define the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

losses = []
accuracies = []  # To store accuracy per epoch
num_epochs = 5

start_time = time.time()  # Start time before training

# Training loop
for epoch in tqdm(range(num_epochs)):  # Number of training epochs
    running_loss = 0.0
    predicted_labels = []  # To store predicted labels for accuracy calculation
    true_labels = []  # To store true labels for accuracy calculation

    for batch in tqdm(train_loader):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Convert logits to predicted labels
        _, predicted = torch.max(logits, dim=1)
        predicted_labels.extend(predicted.cpu().tolist())
        true_labels.extend(labels.cpu().tolist())

    epoch_loss = running_loss / len(train_loader)
    losses.append(epoch_loss)

    # Calculate and store accuracy
    accuracy = accuracy_score(true_labels, predicted_labels)
    accuracies.append(accuracy)

    print(f'Epoch {epoch + 1}/{num_epochs} - Loss: {epoch_loss:.4f} - Accuracy: {accuracy:.4f}')
    


end_time = time.time()  # End time after training
total_time = end_time - start_time

max_memory = torch.cuda.max_memory_allocated() / 1e9 if torch.cuda.is_available() else 0  # Convert bytes to GB

print("Witout AMP: ")
print(f"\nTotal training time: {total_time:.2f} seconds")
if torch.cuda.is_available():
    print(f"Maximum GPU memory allocated: {max_memory:.4f} GB")
else:
    print("GPU not available, cannot measure GPU memory usage.")

# Save the model
torch.save(model.state_dict(), 'model.pth')




  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/502 [00:00<?, ?it/s]

Epoch 1/5 - Loss: 1.0181 - Accuracy: 0.6500


  0%|          | 0/502 [00:00<?, ?it/s]

Epoch 2/5 - Loss: 0.7522 - Accuracy: 0.7480


  0%|          | 0/502 [00:00<?, ?it/s]

Epoch 3/5 - Loss: 0.6218 - Accuracy: 0.8045


  0%|          | 0/502 [00:00<?, ?it/s]

Epoch 4/5 - Loss: 0.5071 - Accuracy: 0.8434


  0%|          | 0/502 [00:00<?, ?it/s]

Epoch 5/5 - Loss: 0.4086 - Accuracy: 0.8794
Witout AMP: 

Total training time: 545.27 seconds
Maximum GPU memory allocated: 3.0138 GB


In [13]:
#dgfdgdfgdgffdgdfd1212jhkhk

In [14]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

def predict_labels(text):
    train_encodings = tokenizer(text, truncation=True, max_length=128,
        padding='max_length', return_tensors = 'pt')
    input_ids = train_encodings['input_ids'].to(device)
    attention_mask = train_encodings['attention_mask'].to(device)

    # Set the model to evaluation mode
    model.eval()

    # Disable gradient calculation
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)
    predicted_class = torch.argmax(probabilities, dim=1)
    

    return predicted_class.item(), probabilities[:,1].item()

In [15]:
predicted_labels = []
predicted_probs = []
for text in tqdm(test_texts):
    predicted_label, prob = predict_labels(text)
    predicted_labels.append(predicted_label)
    predicted_probs.append(prob)

# Calculate accuracy and F1 score
accuracy = accuracy_score(test_labels, predicted_labels)
# f1 = f1_score(test_labels, predicted_labels)
# roc_auc = roc_auc_score(test_labels, predicted_probs)

print('Accuracy:', accuracy)
# print('F1 Score:', f1)
# print('ROC-AUC:', roc_auc)

  0%|          | 0/2010 [00:00<?, ?it/s]

Accuracy: 0.7318407960199005


In [16]:
print('Accuracy:', accuracy)

Accuracy: 0.7318407960199005


In [17]:
from sklearn.metrics import roc_auc_score, classification_report

print('\nThe Classification Report is as follows\n')
print(classification_report(test_labels, predicted_labels, digits = 4))


The Classification Report is as follows

              precision    recall  f1-score   support

           0     0.7751    0.8755    0.8222      1157
           1     0.5000    0.2578    0.3402       128
           2     0.6212    0.7785    0.6910       158
           3     0.7077    0.5884    0.6425       498
           4     0.3600    0.1304    0.1915        69

    accuracy                         0.7318      2010
   macro avg     0.5928    0.5261    0.5375      2010
weighted avg     0.7145    0.7318    0.7151      2010



In [18]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
import torch.cuda.amp as amp
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
import time

# Define dataset class
class BanglaCommentDataset(Dataset):
    def __init__(self, comments, labels, tokenizer, max_length=128):
        self.comments = comments
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.comments)
    
    def __getitem__(self, idx):
        comment = str(self.comments[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            comment,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }
        return item

# Prepare dataloaders
train_dataset = BanglaCommentDataset(df_train['Comment'].tolist(), df_train['Category'].tolist(), tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Model, loss, optimizer setup
model.to(device)
criterion = CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-5)
scaler = amp.GradScaler()  # Initialize GradScaler for mixed precision

model.train()
num_epochs = 3

# Reset peak GPU memory tracking
if torch.cuda.is_available():
    torch.cuda.reset_max_memory_allocated()

start_time = time.time()

for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        with amp.autocast():  # Mixed precision forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)

        scaler.scale(loss).backward()  # Scaled backward pass
        scaler.step(optimizer)         # Optimizer step with scaled gradients
        scaler.update()                # Update scaler for next iteration

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

end_time = time.time()
total_time = end_time - start_time

max_memory = torch.cuda.max_memory_allocated() / 1e9  # Convert bytes to GB if using GPU

print("With AMP: ")
print(f"\nTotal training time: {total_time:.2f} seconds")
if torch.cuda.is_available():
    print(f"Maximum GPU memory allocated: {max_memory:.4f} GB")
else:
    print("GPU not available, cannot measure GPU memory usage.")


  scaler = amp.GradScaler()  # Initialize GradScaler for mixed precision
  with amp.autocast():  # Mixed precision forward pass


Epoch 1/3, Loss: 0.4893
Epoch 2/3, Loss: 0.3706
Epoch 3/3, Loss: 0.2722
With AMP: 

Total training time: 180.30 seconds
Maximum GPU memory allocated: 2.7001 GB


In [19]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

def predict_labels(text):
    train_encodings = tokenizer(text, truncation=True, max_length=128,
        padding='max_length', return_tensors = 'pt')
    input_ids = train_encodings['input_ids'].to(device)
    attention_mask = train_encodings['attention_mask'].to(device)

    # Set the model to evaluation mode
    model.eval()

    # Disable gradient calculation
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)
    predicted_class = torch.argmax(probabilities, dim=1)
    

    return predicted_class.item(), probabilities[:,1].item()

In [20]:
predicted_labels = []
predicted_probs = []
for text in tqdm(test_texts):
    predicted_label, prob = predict_labels(text)
    predicted_labels.append(predicted_label)
    predicted_probs.append(prob)

# Calculate accuracy and F1 score
accuracy = accuracy_score(test_labels, predicted_labels)
# f1 = f1_score(test_labels, predicted_labels)
# roc_auc = roc_auc_score(test_labels, predicted_probs)

print('Accuracy:', accuracy)
# print('F1 Score:', f1)
# print('ROC-AUC:', roc_auc)

  0%|          | 0/2010 [00:00<?, ?it/s]

Accuracy: 0.7134328358208956


In [21]:
print('Accuracy:', accuracy)

Accuracy: 0.7134328358208956


In [22]:
from sklearn.metrics import roc_auc_score, classification_report

print('\nThe Classification Report is as follows\n')
print(classification_report(test_labels, predicted_labels, digits = 4))


The Classification Report is as follows

              precision    recall  f1-score   support

           0     0.8157    0.7917    0.8035      1157
           1     0.4595    0.2656    0.3366       128
           2     0.6944    0.6329    0.6623       158
           3     0.6016    0.7430    0.6649       498
           4     0.2593    0.2029    0.2276        69

    accuracy                         0.7134      2010
   macro avg     0.5661    0.5272    0.5390      2010
weighted avg     0.7113    0.7134    0.7086      2010

