# Preparation and installing required libraries

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score

# Data loading and cleaning

In [None]:
# Load the complaints CSV from the mounted Drive folder.
# low_memory=False asks pandas to parse the file in one pass rather than in chunks,
# which avoids mixed-dtype inference on columns.
data = pd.read_csv("/content/drive/MyDrive/002_Sem_2/758O - AI/consumer_complaints.csv", low_memory=False)


In [None]:
data = data[data['consumer_complaint_narrative'].notnull()] # keep only the rows whose complaint narrative is not null

In [None]:
# (rows, columns) of the frame at this point in the notebook.
data.shape

(66806, 18)

In [None]:
# Down-sample to 20,000 rows; the fixed random_state makes the subset reproducible.
data = data.sample(n=20000, random_state=42)


In [None]:
# Inputs are the free-text complaint narratives; targets are the product names.
texts = data['consumer_complaint_narrative'].tolist()
labels = data['product'].tolist()

In [None]:
# Learn the product-name -> integer-id encoding, then replace the string labels with ids.
label_encoder = LabelEncoder().fit(labels)
labels = label_encoder.transform(labels)

# One output unit per distinct product category.
num_labels = label_encoder.classes_.shape[0]
print(f"Number of classes: {num_labels}")

Number of classes: 11


In [None]:
print(f"Classes: {label_encoder.classes_}") # sanity check: list the class names the encoder learned

Classes: ['Bank account or service' 'Consumer Loan' 'Credit card'
 'Credit reporting' 'Debt collection' 'Money transfers' 'Mortgage'
 'Other financial service' 'Payday loan' 'Prepaid card' 'Student loan']


In [None]:
# Lookup table from encoded class id back to the human-readable product name.
mapping = dict(enumerate(label_encoder.classes_))
print(mapping)

{0: 'Bank account or service', 1: 'Consumer Loan', 2: 'Credit card', 3: 'Credit reporting', 4: 'Debt collection', 5: 'Money transfers', 6: 'Mortgage', 7: 'Other financial service', 8: 'Payday loan', 9: 'Prepaid card', 10: 'Student loan'}


In [None]:
# Hold out 30% of the samples for testing; stratifying on the labels keeps the
# class proportions the same in both partitions.
_split = train_test_split(
    texts,
    labels,
    test_size=0.3,
    stratify=labels,
    random_state=42,
)
train_texts, test_texts, train_labels, test_labels = _split

# Creating Custom Dataset

In [None]:
# Tokenizer loaded from the "bert-base-uncased" checkpoint (the same checkpoint name the model uses).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class ComplaintDataset(Dataset):
    """Map-style dataset pairing complaint texts with integer class labels.

    Samples are tokenized lazily, one at a time, when they are indexed.

    Args:
        texts: Indexable sequence of complaint strings.
        labels: Indexable sequence of integer class ids, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a BERT tokenizer).
        max_length: Token length every sample is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=150):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples, taken from the text sequence."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer's tensors for sample ``idx`` plus a ``labels`` tensor."""
        text, label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        # With return_tensors='pt' every tensor carries a leading batch axis of
        # size 1; strip it so the DataLoader can add the real batch axis.
        sample = {}
        for name in encoded:
            sample[name] = encoded[name].squeeze(0)
        sample['labels'] = torch.tensor(label, dtype=torch.long)
        return sample

In [None]:
# Wrap each split in a ComplaintDataset (default max_length) sharing the one tokenizer.
train_dataset = ComplaintDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
test_dataset = ComplaintDataset(texts=test_texts, labels=test_labels, tokenizer=tokenizer)

In [None]:
# Batch both splits in groups of 16. Only the training data is shuffled; the test
# order stays fixed so predictions line up with the original test samples.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

# Model

In [None]:
class ConsumerComplaintClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and one linear classification layer.

    Args:
        num_labels: Number of output classes (size of the logit vector).
        dropout_rate: Dropout probability applied to the pooled BERT output.
    """

    def __init__(self, num_labels, dropout_rate=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(p=dropout_rate)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores computed from the encoder's pooled output."""
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        return self.classifier(self.dropout(pooled))

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the classifier and move its parameters onto the chosen device.
model = ConsumerComplaintClassifier(num_labels).to(device)
model  # last expression of the cell: displays the module tree

ConsumerComplaintClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-1

# Training Setup

In [None]:
# Optimizer, learning-rate schedule and loss used for fine-tuning.
#
# torch.optim.AdamW is used instead of the AdamW class exported by transformers,
# which is deprecated upstream in favour of the PyTorch implementation.
# eps and weight_decay are pinned to the defaults of the deprecated class
# (1e-6 and 0.0) so the optimisation behaves as before; PyTorch's own defaults
# (1e-8 and 0.01) differ.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
epochs = 3

# The schedule decays linearly to zero over this many scheduler steps
# (one per batch, for every epoch), with no warm-up phase.
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

# Takes raw logits and integer class ids.
criterion = nn.CrossEntropyLoss()



# Training Loop

In [None]:
# Fine-tune the model for `epochs` passes over train_loader, printing the mean
# batch loss after each pass, then save the trained weights to disk.
for epoch in range(epochs):
    # Set training mode at the start of every epoch so dropout is active even if
    # an evaluation step (model.eval()) is later added between epochs.
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Deliberately NOT named `labels`: that name holds the full encoded label
        # array used by the train/test split cell, and overwriting it with a
        # single batch would break that cell on re-run.
        batch_labels = batch['labels'].to(device)

        logits = model(input_ids, attention_mask)
        loss = criterion(logits, batch_labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        # The linear schedule is advanced once per optimizer step (per batch).
        scheduler.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs} - Average Loss: {avg_loss:.4f}")

# Save only the parameters (state_dict), not the pickled module object.
torch.save(model.state_dict(), "consumer_complaint_classifier.pth")


Epoch 1/3 - Average Loss: 0.8372
Epoch 2/3 - Average Loss: 0.4396
Epoch 3/3 - Average Loss: 0.3047


# Model Evaluation

In [None]:
# Reload the saved weights and score the model on the held-out test split.
#
# map_location=device lets a checkpoint written on a GPU runtime load on a
# CPU-only one. weights_only=True restricts unpickling to tensors and plain
# containers, which is all a state_dict holds, and avoids the torch.load
# FutureWarning about the unsafe default.
state_dict = torch.load(
    "consumer_complaint_classifier.pth", map_location=device, weights_only=True
)
model.load_state_dict(state_dict)
model.to(device)

model.eval()  # disable dropout for deterministic predictions
all_preds = []
all_labels = []
all_probs = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Deliberately NOT named `labels`, so the notebook-level encoded label
        # array used by the train/test split cell is not overwritten.
        batch_labels = batch['labels'].to(device)

        logits = model(input_ids, attention_mask)
        probs = torch.softmax(logits, dim=1)
        preds = torch.argmax(probs, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())
        all_probs.extend(probs.cpu().numpy())

# Compute accuracy
acc = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {acc:.4f}")

# Compute multi-class AUC score using one-vs-rest (ovr) approach.
# Note: roc_auc_score requires probability estimates for each class, passed here
# as one (n_samples, n_classes) array.
try:
    auc = roc_auc_score(all_labels, np.asarray(all_probs), multi_class='ovr')
    print(f"Test AUC: {auc:.4f}")
except ValueError as e:
    # scikit-learn raises ValueError when the AUC is undefined (for example a
    # class missing from the test labels). Any other exception is a real bug and
    # is allowed to propagate.
    print("Error computing AUC:", e)

  model.load_state_dict(torch.load("consumer_complaint_classifier.pth"))


Test Accuracy: 0.8493
Test AUC: 0.9507
