In [1]:
import numpy as np
import pandas as pd
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [2]:
!pip install transformers -q

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertForSequenceClassification
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

2025-06-01 03:50:58.952499: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748749859.176406      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748749859.241361      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [10]:
data = pd.read_csv('/kaggle/input/text-document-classification-dataset/df_file.csv')
print("Data shape:", data.shape)
print("Label distribution:")
print(data['Label'].value_counts())
print("Unique labels:", data['Label'].unique())


Data shape: (2225, 2)
Label distribution:
Label
1    511
4    510
0    417
2    401
3    386
Name: count, dtype: int64
Unique labels: [0 1 2 3 4]


In [11]:
train_X, test_X, train_Y, test_Y = train_test_split(
    data['Text'], 
    data['Label'],  
    train_size=0.7, 
    random_state=42  
)

In [12]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
train_tokens = tokenizer(list(train_X), padding=True, truncation=True, max_length=512)
test_tokens = tokenizer(list(test_X), padding=True, truncation=True, max_length=512)

In [21]:
class TokenData(Dataset):
    def __init__(self, train=False): 
        if train:
            self.text_data = train_X
            self.tokens = train_tokens
            self.labels = list(train_Y)
        else:
            self.text_data = test_X
            self.tokens = test_tokens
            self.labels = list(test_Y)
    
    def __len__(self):  
        return len(self.text_data)
    
    def __getitem__(self, idx):  
        sample = {}
        for k, v in self.tokens.items():
            sample[k] = torch.tensor(v[idx])
        sample['Label'] = torch.tensor(self.labels[idx], dtype=torch.long)  
        return sample

In [22]:
batch_size = 40
train_dataset = TokenData(train=True)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_dataset = TokenData(train=False)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)


In [23]:
num_classes = len(data['Label'].unique())
print(f"Number of classes in dataset: {num_classes}")

Number of classes in dataset: 5


In [24]:
bert_model = BertForSequenceClassification.from_pretrained(
    'bert-base-cased', 
    num_labels=num_classes
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
optimizer = AdamW(bert_model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

num_epochs = 4
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
bert_model.to(device)

Using device: cuda


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [27]:
for epoch in range(num_epochs):
    print(f"Epoch: {epoch + 1}")
    
    # Training
    bert_model.train()
    total_train_loss = 0
    
    for i, batch in enumerate(train_loader):
        
        batch = {k: v.to(device) for k, v in batch.items()}
        
      
        optimizer.zero_grad()
        
       
        outputs = bert_model(
            input_ids=batch['input_ids'], 
            attention_mask=batch['attention_mask'],
            labels=batch['Label']  
        )
        
   
        loss = outputs.loss
        
       
        loss.backward()
        optimizer.step()
        
       
        total_train_loss += loss.item()
        
        if (i + 1) % 10 == 0:  # Print every 10 batches
            avg_loss = total_train_loss / (i + 1)
            print(f'Training batch {i + 1}, Average loss: {avg_loss:.4f}')
    
   
    avg_epoch_loss = total_train_loss / len(train_loader)
    print(f"Training epoch {epoch + 1} average loss: {avg_epoch_loss:.4f}\n")


Epoch: 1
Training batch 10, Average loss: 1.5646
Training batch 20, Average loss: 1.3644
Training batch 30, Average loss: 1.1399
Training epoch 1 average loss: 0.9659

Epoch: 2
Training batch 10, Average loss: 0.2291
Training batch 20, Average loss: 0.1876
Training batch 30, Average loss: 0.1649
Training epoch 2 average loss: 0.1528

Epoch: 3
Training batch 10, Average loss: 0.0586
Training batch 20, Average loss: 0.0545
Training batch 30, Average loss: 0.0543
Training epoch 3 average loss: 0.0537

Epoch: 4
Training batch 10, Average loss: 0.0305
Training batch 20, Average loss: 0.0264
Training batch 30, Average loss: 0.0257
Training epoch 4 average loss: 0.0242



In [30]:
print("Starting validation...")
bert_model.eval()
total_val_loss = 0
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        outputs = bert_model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['Label']
        )
        
        total_val_loss += outputs.loss.item()
        
        # Calculate accuracy
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct_predictions += (predictions == batch['Label']).sum().item()
        total_predictions += batch['Label'].size(0)

Starting validation...


In [31]:
avg_val_loss = total_val_loss / len(test_loader)
accuracy = correct_predictions / total_predictions

print(f"Validation Loss: {avg_val_loss:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")


Validation Loss: 0.0684
Validation Accuracy: 0.9835
