In [109]:
import numpy as np
import pandas as pd
from sklearn import metrics

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import transformers
from transformers import BertTokenizer, BertModel, BertConfig

In [110]:
#Setting up the device for GPU usage
device = torch.device("cuda" if torch.cuda.is_available() 
                      else  "mps" if torch.backends.mps.is_available()
                      else "cpu"
                      )

## Load data

In [111]:
df=pd.read_csv("../Datasets/Cleaned_Datasets/Dataset_1_test.csv")
new_df=df[["title", "polarity"]].copy()
new_df = new_df.head(10)
new_df.polarity=new_df.polarity-1
new_df.head()

Unnamed: 0,title,polarity
0,great cd,1
1,one of the best game music soundtracks for a...,1
2,batteries died within a year ...,0
3,"works fine, but maha energy is better",1
4,great for the non audiophile,1


In [124]:
MAX_LEN=512
TRAIN_BATCH_SIZE=4
VALID_BATCH_SIZE=4
LEARNING_RATE=1e-05
EPOCHS=10

In [113]:
#Initialize a BERT tokenizer from the 'bert-base-uncased' pre-trained model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [114]:
class AmazonTitles_Dataset(Dataset):
    """
    A PyTorch Dataset class for tokenizing and encoding Amazon product titles
    using a given tokenizer and preparing them for model training or inference.

    Args:
        dataframe (pandas.DataFrame): The input DataFrame containing the titles and polarity labels.
        tokenizer: The tokenizer object used to tokenize and encode the titles.
        max_len (int): The maximum length of the tokenized titles after encoding.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer=tokenizer
        self.data=dataframe
        self.titles=dataframe.title
        self.targets=self.data.polarity
        self.max_len=max_len
    
    def __len__(self):
        return len(self.titles)
    
    def __getitem__(self, index):
        titles=str(self.titles[index])
        titles = " ".join(titles.split())
    
        inputs=self.tokenizer.encode_plus(
            titles,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids=inputs['input_ids']
        mask=inputs['attention_mask']
        token_type_ids=inputs['token_type_ids'] 

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }      

In [115]:
train_size=0.8
train_dataset=new_df.sample(frac=train_size, random_state=200)
test_dataset=new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

In [116]:
print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))


FULL Dataset: (10, 2)
TRAIN Dataset: (8, 2)
TEST Dataset: (2, 2)


In [117]:
training_set = AmazonTitles_Dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = AmazonTitles_Dataset(test_dataset, tokenizer, MAX_LEN)

In [118]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

## Model

In [119]:
class BertClass(torch.nn.Module):

    def __init__(self, dropout=0.1):
        super(BertClass, self).__init__()
        self.transformer = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.drop1 = torch.nn.Dropout(dropout)
        self.l1 = torch.nn.Linear(768, 300)
        self.act1=torch.nn.Tanh()
        self.drop2 = torch.nn.Dropout(dropout)
        self.l2 = torch.nn.Linear(300, 100)
        self.act2=torch.nn.Tanh()
        self.drop3=torch.nn.Dropout(dropout)
        self.l3=torch.nn.Linear(100,1)
        self.output=torch.nn.Sigmoid()

    def forward(self, ids, mask, token_type_ids):
        _, output_1=self.transformer(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        output_2=self.drop1(output_1)
        output_3=self.drop2(self.act1(self.l1(output_2)))
        output_4=self.drop3(self.act2(self.l2(output_3)))
        output=self.output(self.l3(output_4))

        return output

In [120]:
model=BertClass()
model.to(device)

BertClass(
  (transformer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [121]:
def loss_fn(outputs, targets):
    return torch.nn.BCELoss()(outputs, targets)

In [122]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

## Train

In [123]:
def train(epoch):
    model.train()
    total_loss=0.0
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        targets = torch.unsqueeze(targets, dim=1)
        
        outputs = model(ids, mask, token_type_ids)

        #optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        
        # Accumulate the total loss
        total_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Calculate the average loss for the epoch
    avg_loss = total_loss / len(training_loader)

    # Print the average loss for the epoch
    print(f'Epoch: {epoch}, Average Loss: {avg_loss}')

In [125]:
for epoch in range(EPOCHS):
    train(epoch)

KeyboardInterrupt: 

## Validation

In [35]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            targets = torch.unsqueeze(targets, dim=1)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [36]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.5
F1 Score (Micro) = 0.5
F1 Score (Macro) = 0.3333333333333333
Accuracy Score = 0.5
F1 Score (Micro) = 0.5
F1 Score (Macro) = 0.3333333333333333
Accuracy Score = 0.5
F1 Score (Micro) = 0.5
F1 Score (Macro) = 0.3333333333333333
Accuracy Score = 0.5
F1 Score (Micro) = 0.5
F1 Score (Macro) = 0.3333333333333333
Accuracy Score = 0.5
F1 Score (Micro) = 0.5
F1 Score (Macro) = 0.3333333333333333
Accuracy Score = 0.5
F1 Score (Micro) = 0.5
F1 Score (Macro) = 0.3333333333333333
Accuracy Score = 0.5
F1 Score (Micro) = 0.5
F1 Score (Macro) = 0.3333333333333333
Accuracy Score = 0.5
F1 Score (Micro) = 0.5
F1 Score (Macro) = 0.3333333333333333
Accuracy Score = 0.5
F1 Score (Micro) = 0.5
F1 Score (Macro) = 0.3333333333333333
Accuracy Score = 0.5
F1 Score (Micro) = 0.5
F1 Score (Macro) = 0.3333333333333333
