<a href="https://colab.research.google.com/github/ErayImamoglu/BERT-MultiClass/blob/main/multiclass_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets transformers==4.28.0

In [27]:
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

# Preparing for TPU usage
# import torch_xla
# import torch_xla.core.xla_model as xm
# device = xm.xla_device()

In [28]:
# Pick the compute device: use the GPU when CUDA is available, otherwise the CPU.
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Echo the selected device as the cell output.
device

'cuda'

In [None]:
# Load the raw training data.
df = pd.read_csv("/content/train_set_cost_related (2) (2).csv")

# Keep only the rows where 'CostRelated' is 1. The explicit .copy() makes
# filtered_df an independent frame, so adding a column to it below neither
# raises SettingWithCopyWarning nor depends on view-vs-copy semantics.
filtered_df = df[df['CostRelated'] == 1].copy()

# Pack the four columns at 0-based positions 4..7 into one Python list per row.
# NOTE(review): presumably these are the four label columns - confirm against the CSV header.
filtered_df['list'] = filtered_df[filtered_df.columns[4:8]].values.tolist()

# Create a new dataframe with only the 'Text' and 'list' columns
new_df = filtered_df[['Text', 'list']].copy()
new_df

In [160]:
# Configuration: key values used by the dataset, the loaders and the training loop.

MAX_LEN = 256          # every text is truncated / padded to this many tokens
TRAIN_BATCH_SIZE = 8   # batch size of the training DataLoader
VALID_BATCH_SIZE = 8   # batch size of the evaluation DataLoader
EPOCHS = 50            # number of full passes over the training set
LEARNING_RATE = 1e-5   # step size handed to the optimizer

# Tokenizer matching the pretrained checkpoint used by the model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [161]:
class CustomDataset(Dataset):
  """Torch dataset that tokenizes the ``Text`` column and pairs it with the ``list`` targets.

  Args:
    dataframe: Frame exposing a ``Text`` column (input text) and a ``list``
      column (one sequence of target values per row).
    tokenizer: Object providing ``encode_plus`` that returns ``input_ids``,
      ``attention_mask`` and ``token_type_ids``.
    max_len: Length that every encoded sequence is truncated / padded to.
  """

  def __init__(self, dataframe, tokenizer, max_len):
    self.tokenizer = tokenizer
    self.data = dataframe
    self.title = dataframe.Text
    self.targets = self.data.list
    self.max_len = max_len

  def __len__(self):
    """Return the number of rows in the dataset."""
    return len(self.title)

  def __getitem__(self, index):
    """Return the encoded text and float targets for the row at position ``index``.

    Rows are looked up by position (``.iloc``), so the dataset also works when
    the dataframe's index is not a fresh 0..n-1 range (e.g. after filtering or
    sampling without ``reset_index``).
    """
    title = str(self.title.iloc[index])
    # Collapse any run of whitespace (newlines, tabs, repeated spaces) into one space.
    title = " ".join(title.split())

    inputs = self.tokenizer.encode_plus(
        title,
        add_special_tokens=True,
        truncation=True,
        max_length=self.max_len,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=True
    )

    return {
        'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
        'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
        'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
    }

In [162]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8

# Draw the training rows with a fixed seed; every row that was not drawn
# becomes part of the test set. Both frames get a fresh 0..n-1 index.
train_rows = new_df.sample(frac=train_size, random_state=200)
test_dataset = new_df.drop(index=train_rows.index).reset_index(drop=True)
train_dataset = train_rows.reset_index(drop=True)

# Report the (rows, columns) shape of each frame.
print(f"FULL Dataset: {new_df.shape}")
print(f"TRAIN Dataset: {train_dataset.shape}")
print(f"TEST Dataset: {test_dataset.shape}")


FULL Dataset: (183, 2)
TRAIN Dataset: (146, 2)
TEST Dataset: (37, 2)


In [None]:
# Wrap both splits in the tokenizing dataset.
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

# Sanity check: fetch the first training example and show every tensor it holds.
first_item = training_set[0]
for label, key in (
    ("Token IDs", "ids"),
    ("Attention Mask", "mask"),
    ("Token Type IDs", "token_type_ids"),
    ("Target", "targets"),
):
    print(f"{label}: {first_item[key]}")

In [164]:
# Loader settings: the training data is reshuffled every epoch, while the test
# data keeps its order so predictions line up with the test frame's rows.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Creating the customized model by adding a dropout and a dense layer on top of BERT (bert-base-uncased) to produce the final output logits.
import transformers
class BERTClass(torch.nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear output layer.

    The forward pass returns raw logits (no sigmoid / softmax applied), one
    value per label.

    Args:
        num_labels: Number of output logits. Defaults to 4.
        dropout: Dropout probability applied to the pooled token vector.
            Defaults to 0.3.
        model_name: Checkpoint passed to ``BertModel.from_pretrained``.
            Defaults to ``'bert-base-uncased'``.
    """

    def __init__(self, num_labels=4, dropout=0.3, model_name='bert-base-uncased'):
        super().__init__()
        # Pretrained BERT model - serves as the backbone for feature extraction.
        self.l1 = transformers.BertModel.from_pretrained(model_name)
        # Regularization - helps to prevent overfitting during training.
        self.l2 = torch.nn.Dropout(dropout)
        # Fully connected output layer. The input width is read from the encoder's
        # config (768 for bert-base-uncased) so other checkpoints work as well.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return one logit per label for every sequence in the batch.

        Args:
            ids: Token id tensor fed to the encoder.
            mask: Attention mask tensor matching ``ids``.
            token_type_ids: Segment id tensor matching ``ids``.
        """
        # Use the hidden state at sequence position 0 as the sentence representation.
        # NOTE(review): this is the [CLS] token when inputs are encoded with special tokens.
        output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids).last_hidden_state[:, 0, :]
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

# Build the classifier and move its parameters to the selected device
# (Module.to returns the module itself, so the assignment keeps the same object).
model = BERTClass().to(device)
model

In [166]:
def loss_fn(outputs, targets):
    """Return the binary cross-entropy between raw logits and float targets.

    The sigmoid is applied inside the loss, so ``outputs`` must be the
    un-activated model logits. The result is averaged over all elements.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [167]:
# Adam over every model parameter (encoder and output layer alike).
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [168]:
def train(epoch):
    """Run one optimisation pass over ``training_loader``, updating ``model`` in place.

    Args:
        epoch: Index of the current epoch; only used in the progress message.
    """
    model.train()
    for step, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)

        # Progress message every 5000 batches (always fires on the first batch).
        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Clear gradients left over from the previous step once, right before
        # the backward pass accumulates the new ones.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# Train for the configured number of epochs.
for epoch in range(EPOCHS):
    train(epoch)

# After training, report every parameter tensor that contains at least one NaN.
nan_parameter_names = [
    name
    for name, param in model.named_parameters()
    if torch.isnan(param).any()
]
for name in nan_parameter_names:
    print(f'Nan in {name}')


In [170]:
def validation(epoch):
    """Run the model over ``testing_loader`` without tracking gradients.

    Args:
        epoch: Accepted for symmetry with ``train``; not used by this function.

    Returns:
        A tuple ``(fin_outputs, fin_targets, fin_sentences)``: the sigmoid
        probabilities and the targets as nested Python lists (one inner list
        per example), and the input texts decoded back from their token ids.
    """
    model.eval()
    fin_targets, fin_outputs, fin_sentences = [], [], []

    with torch.no_grad():
        for batch in testing_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            # Recover readable text from the token ids (special tokens removed).
            fin_sentences += [
                tokenizer.decode(row, skip_special_tokens=True)
                for row in ids.cpu().numpy()
            ]

            logits = model(ids, mask, token_type_ids)
            probabilities = torch.sigmoid(logits)

            fin_targets += targets.cpu().numpy().tolist()
            fin_outputs += probabilities.cpu().numpy().tolist()

    return fin_outputs, fin_targets, fin_sentences


In [171]:
# Collect probabilities, targets and decoded sentences for the test set.
probabilities, targets, sentences = validation(epoch)

# Turn the probabilities into boolean label predictions with a 0.5 threshold.
outputs = np.asarray(probabilities) >= 0.5

# Accuracy here is computed on whole label rows at once; the F1 scores are
# reported with both micro and macro averaging.
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro, f1_score_macro = (
    metrics.f1_score(targets, outputs, average=averaging)
    for averaging in ('micro', 'macro')
)

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.8378378378378378
F1 Score (Micro) = 0.8799999999999999
F1 Score (Macro) = 0.8840758192332405


In [None]:
column_names = ["Accommodation&Transportation", "AdministrativeCost", "Office_Equipment_Labs", "Personnel"]


def _active_labels(flags):
    """Return the entries of ``column_names`` whose flag in ``flags`` equals 1."""
    return [column_names[i] for i, val in enumerate(flags) if val == 1]


# Show the predicted and the true label names side by side for every test sentence.
for sentence, output, target in zip(sentences, outputs, targets):
    print("Sentence:", sentence)
    print("Predicted Labels:", _active_labels(output))
    print("True Labels:", _active_labels(target))
    print("------------")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import multilabel_confusion_matrix

# One confusion matrix per label, computed from the thresholded predictions.
mcm = multilabel_confusion_matrix(targets, outputs)

# Draw each label's matrix as an annotated heatmap in its own figure.
for label_index, label_matrix in enumerate(mcm):
    plt.figure(figsize=(10, 7))
    sns.set(font_scale=1.2)  # for label size
    ax = sns.heatmap(label_matrix, annot=True, fmt="g", cmap="Blues")

    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(f'Confusion Matrix for Label {label_index}')
    plt.show()

