In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.cluster import KMeans
import transformers
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

In [2]:
# Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
# device = torch.device("cpu")

In [3]:
# encodes the string values in the "Severity" column into numerical values, storing the result in a new column
from sklearn.preprocessing import LabelEncoder
df = pd.read_csv('../DataSet/CleanDataSet.csv')
label_encoder = LabelEncoder()
df["EncodedSeverity"] = label_encoder.fit_transform(df["Severity"])

In [26]:
# Defining some key variables that will be used later
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
INFERENCE_BATCH_SIZE = 8
EPOCHS = 2
LEARNING_RATE = 3e-5
MODEL_NAME = "bert-base-uncased"
NUM_CLUSTERS = 6

In [5]:
# Preparing and returning data in a format that can be directly used for training a model
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text_description = dataframe.TextDescription
        self.targets = dataframe.EncodedSeverity
        self.max_len = max_len

    def __len__(self):
        return len(self.text_description)

    def __getitem__(self, index):
        text_description = str(self.text_description[index])
        text_description = " ".join(text_description.split())

        inputs = self.tokenizer.encode_plus(
            text_description,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_token_type_ids=True
        )
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.long)
        }

In [6]:
# Creating the customized model, by adding a drop out and a dense layer on top of bert to get the final output for the model. 

class BERTClass(torch.nn.Module):

    def __init__(self):
        super(BERTClass, self).__init__()
        self.pretrainedLayer = transformers.BertModel.from_pretrained(MODEL_NAME)
        self.dropOutLayer = torch.nn.Dropout(0.3)
        self.linearLayer = torch.nn.Linear(768, 6)
    
    def forward(self, input_ids, attention_mask, token_type_ids):
        _, output_1= self.pretrainedLayer(input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.dropOutLayer(output_1)
        output = self.linearLayer(output_2)

        return output

In [7]:
# Initializes the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BERTClass()
model.to(device)

BERTClass(
  (pretrainedLayer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [8]:
# Creating the dataset and dataloader

train_size = 0.7
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)


In [9]:
# Setting up the DataLoader objects for training and testing datasets, specifying how the data should be loaded during training and evaluation
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [10]:
from sklearn.utils import class_weight
# Adapting weights to balance an unbalanced dataset 
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=
                                                 np.unique(df.EncodedSeverity),
                                                 y=df.EncodedSeverity)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss(class_weights)(outputs, targets)


In [11]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [12]:
def train(epoch):
    model.train()
    fin_outputs = []
    for _,data in enumerate(training_loader, 0):
        input_ids = data['input_ids'].to(device, dtype = torch.long)
        attention_mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(input_ids, attention_mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        loss.backward()
        optimizer.step()

        fin_outputs.extend(outputs.cpu().detach().numpy().tolist())
    return fin_outputs

In [13]:
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  1.6038150787353516
Epoch: 1, Loss:  1.943310260772705


In [14]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            input_ids = data['input_ids'].to(device, dtype = torch.long)
            attention_mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(input_ids, attention_mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.argmax(outputs, dim=1).cpu().detach().numpy().tolist())

    print("Sample Predictions:", fin_outputs[:20])  # Print first 20 predictions
    print("Actual Labels:", fin_targets[:20]) 

    return fin_outputs, fin_targets

In [15]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs)
    accuracy = metrics.accuracy_score(targets, outputs)
    b_accuracy = metrics.balanced_accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"Balanced Accuracy Score = {b_accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

Sample Predictions: [4, 4, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4]
Actual Labels: [4, 4, 4, 4, 4, 4, 2, 5, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4]
Accuracy Score = 0.7233082706766917
Balanced Accuracy Score = 0.2639721912657049
F1 Score (Micro) = 0.7233082706766917
F1 Score (Macro) = 0.23765277052732323
Sample Predictions: [4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 3, 0, 4, 4, 2, 4, 4, 4, 4, 4]
Actual Labels: [3, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4]
Accuracy Score = 0.7233082706766917
Balanced Accuracy Score = 0.2639721912657049
F1 Score (Micro) = 0.7233082706766917
F1 Score (Macro) = 0.23765277052732323


In [16]:
torch.save(model.state_dict(), "../Model/fine_tuned_bert.pth")

In [19]:
# Initialize the pretrained model
pretrained_model = BERTClass()
pretrained_model.to(device)

# Load the saved model
pretrained_model.load_state_dict(torch.load("../Model/fine_tuned_bert.pth", map_location=device))

pretrained_model.eval()  # Set the model to evaluation mode

BERTClass(
  (pretrainedLayer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [None]:
def get_fine_tuned_embeddings(data_loader):
    pretrained_model.eval()  # Set to evaluation mode
    all_labels = []
    embeddings_list = []

    with torch.no_grad():
        for data in data_loader:

            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            token_type_ids = data['token_type_ids'].to(device)
            labels = data['targets'].cpu().numpy()
            all_labels.append(labels)

            # Get embeddings from the fine-tuned model
            outputs = pretrained_model.pretrainedLayer(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, return_dict=True)
            embeddings = outputs['pooler_output']  # Shape: [batch_size, 768]
            embeddings_list.append(embeddings.cpu().numpy())
            
    # Concatenate all embeddings into a single numpy array
    all_embeddings = np.vstack(embeddings_list)
    return np.array(all_embeddings), np.array(all_labels)

In [24]:
inference_dataset = CustomDataset(dataframe=df, tokenizer=tokenizer, max_len=MAX_LEN)
inference_params = {
    'batch_size': INFERENCE_BATCH_SIZE, 
    'shuffle': False,
    'num_workers': 0 
}
inference_loader=DataLoader(inference_dataset, **inference_params)

In [None]:
embeddings = get_fine_tuned_embeddings(inference_loader)

print("Embeddings shape:", embeddings.shape) 

Embeddings shape: (15517, 768)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

cosine_sim_matrix = cosine_similarity(embeddings)
cosine_dist_matrix = 1 - cosine_sim_matrix

In [None]:
for seed in range(5):

    kmeans = KMeans(
        n_clusters=NUM_CLUSTERS,
        max_iter=100,
        init='k-means++',
        n_init="auto",
        random_state=seed,
    ).fit(embeddings)

    cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)

    print(f"Number of elements assigned to each cluster: {cluster_sizes}")
    cluster_labels = kmeans.fit_predict(embeddings)
    accuracy = metrics.accuracy_score(y_true, cluster_labels, normalize=True, sample_weight=None)
    print("Accuracy: ", accuracy)
    nmi = metrics.normalized_mutual_info_score(labels_true, cluster_labels, average_method='arithmetic')
    print("NMI: ", nmi)
    ari = metrics.adjusted_rand_score(labels_true, cluster_labels)
    print("ARI: ", ari)


Number of elements assigned to each cluster: [3699  964 4343 2460 1949 2102]
[2 0 5 ... 1 3 1]
Number of elements assigned to each cluster: [1166 3446 2406 3738 2152 2609]
[1 5 4 ... 0 1 0]
Number of elements assigned to each cluster: [3732 2612 1166 3436 2404 2167]
[3 1 5 ... 2 3 2]
Number of elements assigned to each cluster: [2617 3460 3743 2148 2388 1161]
[1 0 3 ... 5 1 5]
Number of elements assigned to each cluster: [ 953 4343 1977 3700 2094 2450]
[1 3 4 ... 0 5 0]
