# 1. Data Preprocessing

## 1.1 Load libraries and GPU

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
import torch
print(torch.cuda.is_available())  # check GPU status
print(torch.cuda.get_device_name(0))  # show device name


True
NVIDIA GeForce RTX 2060


## 1.2 Load dataset

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
data = pd.read_csv(r'E:\Users\76044\Desktop\consumer_complaints.csv')
# data = pd.read_csv('/content/drive/MyDrive/758/consumer_complaints.csv')
data_selected = data[['consumer_complaint_narrative','product']]
data_selected.head()

  data = pd.read_csv(r'E:\Users\76044\Desktop\consumer_complaints.csv')


Unnamed: 0,consumer_complaint_narrative,product
0,,Mortgage
1,,Mortgage
2,,Credit reporting
3,,Student loan
4,,Debt collection


In [None]:
data_selected['consumer_complaint_narrative'].isnull().sum()

489151

## 1.3 Filter out the empty data

In [None]:
# 1.Filter out these records where the column of consumer_complaint_narrative are emtpy.
data_filtered = data_selected.dropna(subset=['consumer_complaint_narrative'])
data_filtered['consumer_complaint_narrative'].isnull().sum()

0

In [None]:
data_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 66806 entries, 190126 to 553096
Data columns (total 2 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   consumer_complaint_narrative  66806 non-null  object
 1   product                       66806 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [None]:
data_filtered['product'].value_counts()

product
Debt collection            17552
Mortgage                   14919
Credit reporting           12526
Credit card                 7929
Bank account or service     5711
Consumer Loan               3678
Student loan                2128
Prepaid card                 861
Payday loan                  726
Money transfers              666
Other financial service      110
Name: count, dtype: int64

In [None]:
data_filtered = data_filtered.copy()
data_filtered['text_length'] = data_filtered['consumer_complaint_narrative'].apply(lambda x: len(str(x)))
print(data_filtered['text_length'].describe())

count    66806.000000
mean      1039.587327
std        910.270430
min         10.000000
25%        391.000000
50%        740.000000
75%       1383.000000
max       5153.000000
Name: text_length, dtype: float64


In [None]:
data_filtered['consumer_complaint_narrative'].tolist()

['XXXX has claimed I owe them {$27.00} for XXXX years despite the PROOF of PAYMENT I sent them : canceled check and their ownPAID INVOICE for {$27.00}! \nThey continue to insist I owe them and collection agencies are after me. \nHow can I stop this harassment for a bill I already paid four years ago? \n',
 'Due to inconsistencies in the amount owed that I was told by M & T Bank and the amount that was reported to the credit reporting agencies, I was advised to write a good will letter in order to address the issue and request the negative entry be removed from my credit report all together. I had a vehicle that was stolen and it was declared a total loss by insurance company. The insurance company and the GAP insurancw companypaid the outstanding balance of the loan, but I was told by M & T Bank that there was still a balance due on the loan. In good faith, without having received any proof as to why there was still a balance, I made a partial payment towards the remaining debt. I then

In [None]:
import re
import pandas as pd
from bs4 import BeautifulSoup

# 1. copy DataFrame，avoiding Pandas view warning
data_filtered = data_filtered.copy()

# 2. clean HTML tags
def clean_html(text):
    return BeautifulSoup(text, "lxml").get_text()

# 3. replace date format（XXXX/XXXX/XX/XX/2015 → DATE）
def replace_dates(text):
    return re.sub(r'\d{1,2}/\d{1,2}/\d{2,4}', 'DATE', text)

# 4. replace money informatio（{$990.00} → MONEY）
def replace_money(text):
    return re.sub(r'\{\$\d+(?:\.\d{1,2})?\}', 'MONEY', text)

# 5. replace XX（如 XX XXX XXXX → UNKNOWN）
def replace_XX(text):
    return re.sub(r'XX{2,}', 'UNKNOWN', text)

# 6. clean whitespace
def clean_whitespace(text):
    text = text.replace("\n", " ")  # replace \n -> whitespace
    text = re.sub(r'\s+', ' ', text)  # clean
    return text.strip()

# 7. to lower（using bert-base-uncased）
def convert_lowercase(text):
    return text.lower()

# 8. to xx（ xx xxx xxxx → UNKNOWN）
def replace_xx(text):
    return re.sub(r'\bxx+\b', 'UNKNOWN', text)

# 9. apply func
def clean_text(text):
    text = clean_html(text)
    text = replace_dates(text)
    text = replace_money(text)
    text = replace_XX(text)
    text = replace_xx(text)
    text = clean_whitespace(text)
    text = convert_lowercase(text)
    return text

# 10. apply result
data_filtered["consumer_complaint_narrative"] = data_filtered["consumer_complaint_narrative"].apply(clean_text)

# 11. check
print(data_filtered["consumer_complaint_narrative"].head())


  return BeautifulSoup(text, "lxml").get_text()


190126    unknown has claimed i owe them money for unkno...
190135    due to inconsistencies in the amount owed that...
190155    in xx/xx/unknown my wages that i earned at my ...
190207    i have an open and current mortgage with chase...
190208    unknown was submitted xx/xx/unknown. at the ti...
Name: consumer_complaint_narrative, dtype: object


In [None]:
data_filtered['product'].tolist()

['Debt collection',
 'Consumer Loan',
 'Mortgage',
 'Mortgage',
 'Mortgage',
 'Mortgage',
 'Mortgage',
 'Mortgage',
 'Credit card',
 'Consumer Loan',
 'Mortgage',
 'Credit card',
 'Consumer Loan',
 'Debt collection',
 'Debt collection',
 'Debt collection',
 'Debt collection',
 'Mortgage',
 'Credit reporting',
 'Student loan',
 'Credit reporting',
 'Credit reporting',
 'Credit reporting',
 'Debt collection',
 'Credit reporting',
 'Credit reporting',
 'Credit reporting',
 'Credit reporting',
 'Debt collection',
 'Credit reporting',
 'Credit reporting',
 'Debt collection',
 'Credit reporting',
 'Debt collection',
 'Mortgage',
 'Consumer Loan',
 'Credit card',
 'Debt collection',
 'Debt collection',
 'Bank account or service',
 'Debt collection',
 'Debt collection',
 'Credit reporting',
 'Debt collection',
 'Debt collection',
 'Mortgage',
 'Credit card',
 'Student loan',
 'Mortgage',
 'Credit card',
 'Consumer Loan',
 'Debt collection',
 'Credit reporting',
 'Bank account or service',
 'St

In [None]:
texts = data_filtered['consumer_complaint_narrative'].values
labels = data_filtered['product'].values

In [None]:
# encode lable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
labels = le.fit_transform(labels)

In [None]:
print(type(labels))

<class 'numpy.ndarray'>


In [None]:
print(labels[:10])  # check head 10 label
print(type(labels))  # check labels type
print(labels.dtype)  # check dtype

[4 1 6 6 6 6 6 6 2 1]
<class 'numpy.ndarray'>
int32


## 1.4 split train and test

In [None]:
# 2.Train/test split ration: 7:3.
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.3, random_state=42)

# 2. Create a custom dataset class

## 2.1 Tokenization

In [None]:
!pip install transformers



In [None]:
# initialize tokenizer
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
print("original text:", texts[0])
print("tokenized text:", tokenizer.tokenize(texts[0]))
print("tokenized text id:", tokenizer.convert_tokens_to_ids(tokenizer.tokenize(texts[0])))

original text: unknown has claimed i owe them money for unknown years despite the proof of payment i sent them : canceled check and their ownpaid invoice for money! they continue to insist i owe them and collection agencies are after me. how can i stop this harassment for a bill i already paid four years ago?
tokenized text: ['unknown', 'has', 'claimed', 'i', 'owe', 'them', 'money', 'for', 'unknown', 'years', 'despite', 'the', 'proof', 'of', 'payment', 'i', 'sent', 'them', ':', 'canceled', 'check', 'and', 'their', 'own', '##pa', '##id', 'in', '##vo', '##ice', 'for', 'money', '!', 'they', 'continue', 'to', 'insist', 'i', 'owe', 'them', 'and', 'collection', 'agencies', 'are', 'after', 'me', '.', 'how', 'can', 'i', 'stop', 'this', 'harassment', 'for', 'a', 'bill', 'i', 'already', 'paid', 'four', 'years', 'ago', '?']
tokenized text id: [4242, 2038, 3555, 1045, 12533, 2068, 2769, 2005, 4242, 2086, 2750, 1996, 6947, 1997, 7909, 1045, 2741, 2068, 1024, 13261, 4638, 1998, 2037, 2219, 4502, 359

## 2.2 define dataset

In [None]:
from torch.utils.data import Dataset

In [None]:
class ComplaintDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_len=128):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer # use predefine tokenizer
    self.max_len = max_len # set up as 128 in default

  def __len__(self):
    return len(self.texts) # know the size of dataset to load batch

  def __getitem__(self, idx):
    text = str(self.texts[idx]) # get text according to idx

    # when text is none
    if not text or len(text) == 0:
        text = "[UNK]"  # [UNK] replace blank text

    label = int(self.labels[idx])

    try:
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True, # add [CLS], [SEP]
            max_length=self.max_len, # control len
            padding='max_length', # add 0 if it's smaller than max_len
            return_tensors='pt', # return tensor
            truncation=True # trunction
        )
    except Exception as e:
        print(f"Tokenization Error at index {idx}: {e}") # avoid tokenizing failed
        text = "[UNK]"
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt',
            truncation=True
        )

    return {
        'text': text,
        'input_ids': encoding['input_ids'].squeeze(0), # token ID, removing batch dimension
        'attention_mask': encoding['attention_mask'].squeeze(0), #  attention mask
        'label': torch.tensor(label, dtype=torch.long) # label
    }

## 2.3 Create Dataset and Dataloader

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
import sys
sys.path.append(r'E:\Users\76044\Desktop')
from dataset import ComplaintDataset

In [None]:
# use custom class complaintDataset to change train and test set into Dataset
train_dataset = ComplaintDataset(X_train, y_train, tokenizer)
test_dataset = ComplaintDataset(X_test, y_test, tokenizer)

# change batchsize 16 -> 32
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=4) # keep original order

# 3. Model architecture

In [None]:
from torch import nn

In [None]:
class BertClassifier(nn.Module):
  def __init__(self, n_classes):
    super(BertClassifier, self).__init__()
    self.bert = BertModel.from_pretrained('bert-base-uncased') # load BERT
    self.dropout = nn.Dropout(p=0.3) # use dropout to avoid overfit
    self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes) # layer for classifying, changing dimension from 768 to n_classes

  def forward(self, input_ids, attention_mask):
    outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True) # output （batch_size × seq_len × hidden_size）

    # ftech [cls] token's embedding as the representaion of whole sentence
    cls_embedding = outputs.last_hidden_state[:, 0, :] # use [CLS] token as representation
    cls_embedding = self.dropout(cls_embedding)
    logits = self.classifier(cls_embedding) # logistic classification
    return logits

In [None]:
n_classes = len(data_filtered['product'].unique())
model = BertClassifier(n_classes)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

# 4. Preparation for the model training

## 4.1 Define loss function, optimizer and some hyperparameters

In [None]:
from transformers import AdamW

In [None]:
epochs = 3
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
loss_fn = nn.CrossEntropyLoss().to(device)
total_steps = len(train_loader) * epochs
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=2e-5, steps_per_epoch=len(train_loader), epochs=epochs
)

In [None]:
total_steps

8769

## 4.2 Define training fuction

In [None]:
from torch.amp import autocast, GradScaler

scaler = GradScaler()

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    model = model.train() # setting as train mode
    losses = [] # store loss for each batch
    correct_predictions = 0 # calculate accuracy

    for batch in data_loader:
        optimizer.zero_grad() # clean gradient avoiding error

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        with torch.amp.autocast(device_type='cuda'): # use fp16 calculation to improve speed
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs, labels)

        scaler.scale(loss).backward() # calculate gradient
        scaler.step(optimizer) # update weight
        scaler.update() # adjust scaling

        if scheduler is not None and scheduler.last_epoch < total_steps:
          scheduler.step()
        else:
          print("Skipping scheduler step to prevent step overflow.")

        losses.append(loss.item())
        _, preds = torch.max(outputs, dim=1) # get predicted class
        correct_predictions += torch.sum(preds == labels)

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses) # calculate accuracy and loss

## 4.3 define evaluation function

In [None]:
def eval_model(model, data_loader, loss_fn, device):
  model = model.eval()
  losses = []
  correct_predictions = 0
  all_labels = []
  all_preds = []

  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['label'].to(device)

      outputs = model(input_ids=input_ids, attention_mask=attention_mask)
      loss = loss_fn(outputs, labels)
      losses.append(loss.item())

      _, preds = torch.max(outputs, dim=1)
      correct_predictions += torch.sum(preds == labels)

      all_labels.extend(labels.cpu().numpy())
      all_preds.extend(outputs.cpu().numpy())

  return correct_predictions.double() / len(data_loader.dataset), np.mean(losses), all_labels, all_preds

# 5. Model Training

In [None]:
labels

array([4, 1, 6, ..., 8, 6, 6])

In [None]:
import torch
torch.backends.cudnn.benchmark = True  #  PyTorch automate calculation efficiency

In [None]:
for epoch in range(epochs):
  print(f'Epoch {epoch + 1}/{epochs}')

  train_acc, train_loss = train_epoch(model, train_loader, loss_fn, optimizer, device, scheduler)
  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss, val_labels, val_preds = eval_model(model, test_loader, loss_fn, device)
  print(f'Val loss {val_loss} accuracy {val_acc}')
  print("-"*10)
  # print(val_labels)
  # print(val_preds)

Epoch 1/3
Train loss 1.8441615300842478 accuracy 0.3752245316910444
Val loss 0.8895451206972574 accuracy 0.73560522901906
----------
Epoch 2/3
Train loss 0.6683400540860842 accuracy 0.8033743905568386
Val loss 0.5188709682200018 accuracy 0.8418321524797925
----------
Epoch 3/3
Skipping scheduler step to prevent step overflow.
Skipping scheduler step to prevent step overflow.
Skipping scheduler step to prevent step overflow.
Skipping scheduler step to prevent step overflow.
Skipping scheduler step to prevent step overflow.
Skipping scheduler step to prevent step overflow.
Skipping scheduler step to prevent step overflow.
Skipping scheduler step to prevent step overflow.
Skipping scheduler step to prevent step overflow.
Skipping scheduler step to prevent step overflow.
Skipping scheduler step to prevent step overflow.
Skipping scheduler step to prevent step overflow.
Skipping scheduler step to prevent step overflow.
Skipping scheduler step to prevent step overflow.
Skipping scheduler ste

# 6. Model evaluation

## 6.1 Calculate AUC

In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score

In [None]:
# there are many classes, so we need to process their label
all_labels_array = np.array(val_labels, dtype=int) # change label to numpy arrary
val_preds_prob = np.vstack(val_preds) # n_samples x n_classes

all_labels_binarized = label_binarize(all_labels_array, classes=np.arange(n_classes)) # one-hot real label

# calculate multiple classes' AUC
auc = roc_auc_score(all_labels_binarized, val_preds_prob, average="macro", multi_class="ovr")
print(f"AUC: {auc:.4f}")

AUC: 0.9320


# Reference

1.   https://zhuanlan.zhihu.com/p/143209797

2.   https://mccormickml.com/2019/07/22/BERT-fine-tuning/

3.   https://blog.csdn.net/zhong_ddbb/article/details/109276751

4.   https://zhuanlan.zhihu.com/p/524036087

5.   https://zhuanlan.zhihu.com/p/46833276