<a href="https://colab.research.google.com/github/Dehan001/NLP/blob/main/vitd_baseline_exp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [23]:
model_name = 'csebuetnlp/banglabert'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

# Define device (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
import pandas as pd
url='https://drive.google.com/file/d/11vpbRMgjAzB64omxO2UVj0meb9Jdqea3/view?usp=sharing'
url='https://drive.google.com/uc?id=' + url.split('/')[-2]
df = pd.read_csv(url)

In [25]:
from sklearn.model_selection import train_test_split
train_df,test_df= train_test_split(df, test_size=0.25, random_state=121)

In [26]:
data_no = 5

# Prepare the training data
train_texts = train_df['Comment'].tolist()
train_labels = train_df['Category'].tolist()

test_texts = test_df['Comment'].tolist()
test_labels = test_df['Category'].tolist()

In [28]:
# prompt: write label encoder for train_labels and text_labels

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_labels = le.fit_transform(train_labels)
test_labels = le.transform(test_labels)


In [29]:
# Tokenize and encode the training texts
train_encodings = tokenizer(train_texts, truncation=True, max_length=128,
        padding='max_length', return_tensors = 'pt')

# Convert the labels to tensors
train_labels = torch.tensor(train_labels)

# Create a PyTorch dataset
train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'],
                                               train_encodings['attention_mask'],
                                               train_labels)

# Create a data loader
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)

model = model.to(device)

In [30]:
from tqdm.notebook import tqdm
# Set the model to training mode
model.train()

# Define the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

losses = []
num_epochs = 3
# Training loop
for epoch in tqdm(range(num_epochs)):  # Number of training epochs
    running_loss = 0.0
    for batch in tqdm(train_loader):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    epoch_loss = running_loss / len(train_loader)
    losses.append(epoch_loss)
    print(f'Epoch {epoch+1}/{num_epochs} - Loss: {epoch_loss:.4f}')


# Save the model
torch.save(model.state_dict(), 'model.pth')

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1177 [00:00<?, ?it/s]

Epoch 1/3 - Loss: 0.9291


  0%|          | 0/1177 [00:00<?, ?it/s]

Epoch 2/3 - Loss: 0.6729


  0%|          | 0/1177 [00:00<?, ?it/s]

Epoch 3/3 - Loss: 0.5725


In [31]:
#dgfdgdfgdgffdgdfd1212jhkhk

In [32]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

def predict_labels(text):
    train_encodings = tokenizer(text, truncation=True, max_length=128,
        padding='max_length', return_tensors = 'pt')
    input_ids = train_encodings['input_ids'].to(device)
    attention_mask = train_encodings['attention_mask'].to(device)

    # Set the model to evaluation mode
    model.eval()

    # Disable gradient calculation
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)
    predicted_class = torch.argmax(probabilities, dim=1)


    return predicted_class.item(), probabilities[:,1].item()

In [33]:
predicted_labels = []
predicted_probs = []
for text in tqdm(test_texts):
    predicted_label, prob = predict_labels(text)
    predicted_labels.append(predicted_label)
    predicted_probs.append(prob)

# Calculate accuracy and F1 score
accuracy = accuracy_score(test_labels, predicted_labels)
# f1 = f1_score(test_labels, predicted_labels)
# roc_auc = roc_auc_score(test_labels, predicted_probs)

print('Accuracy:', accuracy)
# print('F1 Score:', f1)
# print('ROC-AUC:', roc_auc)

  0%|          | 0/6277 [00:00<?, ?it/s]

Accuracy: 0.7519515692209654


In [34]:
print('Accuracy:', accuracy)

Accuracy: 0.7519515692209654


In [35]:
from sklearn.metrics import roc_auc_score, classification_report

print('\nThe Classification Report is as follows\n')
print(classification_report(test_labels, predicted_labels, digits = 4))


The Classification Report is as follows

              precision    recall  f1-score   support

           0     0.6817    0.7658    0.7213       730
           1     0.0000    0.0000    0.0000       117
           2     0.6254    0.3824    0.4746       489
           3     0.7877    0.7645    0.7759      2548
           4     0.7546    0.8466    0.7980      2393

    accuracy                         0.7520      6277
   macro avg     0.5699    0.5519    0.5540      6277
weighted avg     0.7354    0.7520    0.7400      6277



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
