<a href="https://colab.research.google.com/github/Ahmadsaidnouh/Term7-Artificial-Intelligence-Assignments/blob/main/lab4/jupyter%20notebooks/NLP_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
%pip install transformers
%pip install pandas 
%pip install torch
%pip install tqdm
%pip install nltk

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.optim import Adam
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re
import gc
# Fetch the NLTK resources listed below (tokenizer models, stop-word lists,
# WordNet and its multilingual extension). No visible cell in this notebook
# uses them yet -- presumably kept for text-preprocessing experiments; TODO confirm.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
# Mount Google Drive only when the notebook actually runs inside Colab.
# The data-loading cell reads from a local filesystem path, so outside Colab
# the `google.colab` package is absent and an unconditional import would abort
# the run before any data is read.
try:
    from google.colab import drive
except ImportError:
    # Not on Colab: leave a sentinel so later cells can test `drive is None`.
    drive = None
    print("google.colab is not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive/')

In [None]:
# Colab alternative (paths under the '/content/drive/' mount point):
#df_train = pd.read_csv(f"/content/drive/MyDrive/Lab4_Data/train.csv")
#df_val = pd.read_csv(f"/content/drive/MyDrive/Lab4_Data/val.csv")
#df_test = pd.read_csv(f"/content/drive/MyDrive/Lab4_Data/test.csv")

# Read the three CSV splits from the local checkout, in train / val / test order.
df_train, df_val, df_test = map(
    pd.read_csv,
    (
        "/mnt/nvme0n1p2/Study/Term7/AI/Term7-Artificial-Intelligence-Assignments/lab4/data/train.csv",
        "/mnt/nvme0n1p2/Study/Term7/AI/Term7-Artificial-Intelligence-Assignments/lab4/data/val.csv",
        "/mnt/nvme0n1p2/Study/Term7/AI/Term7-Artificial-Intelligence-Assignments/lab4/data/test.csv",
    ),
)

# Print the row count of each split, then leave the training frame as the
# cell's final expression so the notebook renders it.
print(*(len(frame) for frame in (df_train, df_val, df_test)))
df_train


In [None]:
# Module-level tokenizer used by the Dataset class and by evaluate().
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# Sentiment name -> integer class id.
labels = dict(negative=0, positive=1)


class Dataset(torch.utils.data.TensorDataset):
    """Eagerly tokenized review dataset that yields ``(encoding, label)`` pairs.

    Construction reads two columns of ``df``: ``"sentiment"`` (mapped to an
    integer through the module-level ``labels`` dict) and ``"review"``
    (tokenized with the module-level ``tokenizer``). The parent
    ``TensorDataset`` constructor is not called; length and indexing are
    provided by the overrides below.
    """

    def __init__(self, df):
        # Integer class ids, one per row; an unknown sentiment raises KeyError.
        label_ids = []
        for sentiment in df["sentiment"]:
            label_ids.append(labels[sentiment])
        self.labels = label_ids

        # One tokenizer output per review, padded/truncated to 512 tokens and
        # returned as PyTorch tensors.
        encodings = []
        for review in df["review"]:
            encoded = tokenizer(
                review,
                padding="max_length",
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )
            encodings.append(encoded)
        self.texts = encodings

    def classes(self):
        """Return the list of integer labels."""
        return self.labels

    def __len__(self):
        """Return the number of stored labels."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) at ``idx`` wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the stored tokenizer output(s) at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(tokenizer_output, label_array)`` for ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)


In [None]:
class BertClassifier(nn.Module):
    """BERT encoder followed by a four-layer MLP head with a sigmoid output.

    Args:
        dropout: Drop probability applied to the BERT output before the head.
            Defaults to 0.5, the value that used to be hard-coded, so
            ``BertClassifier()`` builds exactly the same network as before.

    ``forward`` returns one value in (0, 1) per input row (last dimension 1),
    suitable for ``nn.BCELoss``.
    """

    def __init__(self, dropout=0.5):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        # Previously `nn.Dropout(0.5)`, which silently ignored the `dropout`
        # constructor argument.
        self.dropout_bert = nn.Dropout(dropout)
        # Lighter dropout shared by every hidden layer of the head.
        self.dropout = nn.Dropout(0.2)
        self.linear1 = nn.Linear(768, 512)
        self.batchnorm1 = nn.BatchNorm1d(512)
        self.linear2 = nn.Linear(512, 256)
        self.batchnorm2 = nn.BatchNorm1d(256)
        self.linear3 = nn.Linear(256, 128)
        self.linear4 = nn.Linear(128, 64)
        self.linear_out = nn.Linear(64, 1)
        # Attribute names are kept unchanged so existing checkpoints still load.
        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()
        self.relu3 = nn.ReLU()
        self.relu4 = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_id, mask):
        """Return the sigmoid score for each row of ``input_id``.

        Args:
            input_id: Token ids passed to BERT as ``input_ids``.
            mask: Attention mask passed to BERT as ``attention_mask``.
        """
        # With return_dict=False BERT returns a tuple; only its second element
        # (the pooled output per the transformers documentation) feeds the head.
        _, x = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        x = self.dropout_bert(x)
        x = self.relu1(self.linear1(x))
        x = self.batchnorm1(x)
        x = self.dropout(x)
        x = self.relu2(self.linear2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.relu3(self.linear3(x))
        x = self.dropout(x)
        x = self.relu4(self.linear4(x))
        x = self.dropout(x)
        x = self.sigmoid(self.linear_out(x))
        return x


In [None]:
def save_checkpoint(model, epoch, optimizer, best_acc, model_path):
    """Serialize a training snapshot to ``model_path`` with ``torch.save``.

    The saved dict holds four entries, in this order: ``'epoch'`` (stored as
    ``epoch + 1``), ``'model'`` (the model's state dict), ``'best accuracy'``
    (``best_acc`` as given) and ``'optimizer'`` (the optimizer's state dict).
    """
    checkpoint = {}
    checkpoint['epoch'] = epoch + 1
    checkpoint['model'] = model.state_dict()
    checkpoint['best accuracy'] = best_acc
    checkpoint['optimizer'] = optimizer.state_dict()
    torch.save(checkpoint, model_path)

In [None]:
# Mini-batch size used by the DataLoaders and passed to train().
batch_size = 16

# Tokenize the three splits up front, in train / val / test order.
trainn, val, test = (Dataset(split) for split in (df_train, df_val, df_test))

In [None]:
# Bare expression: as the last statement of a notebook cell it displays the
# repr of the training dataset object.
trainn

In [None]:
# Shuffle only the training split: without it every epoch visits the rows in
# CSV order, so the mini-batches are identical from epoch to epoch and any
# ordering in the file (e.g. rows grouped by sentiment) biases the updates.
# Validation and test stay in file order so their printed results are reproducible.
train_dataloader = torch.utils.data.DataLoader(trainn, batch_size=batch_size, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val, batch_size=batch_size)
test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size)

In [None]:
# Per-epoch training history: five independent, initially empty lists that
# train() appends to and plot_all() reads.
epoch_list, train_loss, val_loss, train_accuracy, val_accuracy = (
    [] for _ in range(5)
)
def plot_all():
    """Plot the recorded loss and accuracy histories side by side.

    Reads the module-level ``epoch_list``, ``train_loss``, ``val_loss``,
    ``train_accuracy`` and ``val_accuracy`` lists. The left panel shows loss,
    the right panel accuracy; each has a train and a validation curve.
    """
    plt.figure(figsize=[18, 5])
    plt.suptitle("Loss and Accuracy in training")

    # (y-axis label, train series, validation series) for each panel, left to right.
    panels = (
        ("Loss", train_loss, val_loss),
        ("Accuracy", train_accuracy, val_accuracy),
    )
    for position, (y_label, train_series, val_series) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epoch_list, train_series)
        plt.plot(epoch_list, val_series)
        plt.xlabel("Epochs")
        plt.ylabel(y_label)
        plt.legend(["Train", "Validation"])

    plt.show()
    

In [None]:
def _forward_batch(model, criterion, batch_input, batch_label, device):
    """Run one batch through ``model``; return ``(loss, n_correct, n_samples)``."""
    batch_label = batch_label.to(device)
    mask = batch_input['attention_mask'].to(device)
    # The tokenizer output carries an extra length-1 dimension per sample.
    input_id = batch_input['input_ids'].squeeze(1).to(device)

    scores = model(input_id, mask).squeeze(1)
    loss = criterion(scores, batch_label.float())
    # The model emits sigmoid scores, so rounding thresholds them into 0/1.
    n_correct = (torch.round(scores) == batch_label).sum().item()
    return loss, n_correct, batch_label.size(0)


def train(model,train_data, val_data, train_dataloader, val_dataloader, learning_rate, epochs, bach_size, best_model_path, last_model_path):
    """Train ``model`` with BCE loss, checkpointing and early stopping.

    Args:
        model: Module called as ``model(input_id, mask)`` that returns one
            score in [0, 1] per sample (it is fed to ``nn.BCELoss``).
        train_data, val_data: Only ``len()`` is taken, to normalise the
            per-epoch loss and accuracy.
        train_dataloader, val_dataloader: Yield ``(tokenizer_output, label)``
            batches.
        learning_rate: Learning rate for the Adam optimizer.
        epochs: Upper bound on the number of epochs.
        bach_size: Unused; kept (including its spelling) so existing callers
            keep working.
        best_model_path: Destination of the checkpoint written whenever the
            validation accuracy improves.
        last_model_path: Destination of the checkpoint written every epoch.

    Side effects: appends one entry per epoch to the module-level
    ``epoch_list``, ``train_loss``, ``train_accuracy``, ``val_loss`` and
    ``val_accuracy`` lists, writes checkpoints through ``save_checkpoint`` and
    calls ``plot_all`` after each epoch that does not trigger early stopping.

    Changes from the previous version:
      * CUDA is used when available. A leftover ``use_cuda = False`` used to
        force CPU and made the GPU branch unreachable.
      * Accuracy bookkeeping uses plain Python numbers. The old code called
        ``.cpu()`` / ``.item()`` on values that are ints when a loader is
        empty or accuracy never exceeds 0, and compared a float32 tensor with
        float64 history entries using ``!=``, which could stop training
        spuriously.
      * Reported losses are true per-sample means. Summing per-batch means and
        dividing by the sample count under-reported them by roughly the batch
        size.
      * The "best accuracy" stored in checkpoints is now a float, not a tensor.
    """
    # Stop once the best validation accuracy is older than this many epochs.
    early_stop_items_num = 5
    # -inf guarantees the first epoch always writes a "best" checkpoint.
    val_acc_best = float("-inf")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.BCELoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    for epoch_num in range(epochs):
        gc.collect()
        torch.cuda.empty_cache()

        # ---- training pass ----
        model.train()
        total_acc_train = 0
        total_loss_train = 0.0
        for train_input, train_label in tqdm(train_dataloader):
            gc.collect()
            torch.cuda.empty_cache()

            batch_loss, n_correct, n_samples = _forward_batch(
                model, criterion, train_input, train_label, device)
            # BCELoss averages over the batch; re-weight by the batch size so
            # dividing by the sample count below yields a per-sample mean.
            total_loss_train += batch_loss.item() * n_samples
            total_acc_train += n_correct

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        # ---- validation pass ----
        model.eval()
        total_acc_val = 0
        total_loss_val = 0.0
        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                gc.collect()
                torch.cuda.empty_cache()

                batch_loss, n_correct, n_samples = _forward_batch(
                    model, criterion, val_input, val_label, device)
                total_loss_val += batch_loss.item() * n_samples
                total_acc_val += n_correct

        # ---- bookkeeping ----
        epoch_train_loss = total_loss_train / len(train_data)
        epoch_train_acc = total_acc_train / len(train_data)
        epoch_val_loss = total_loss_val / len(val_data)
        val_acc_new = total_acc_val / len(val_data)

        train_loss.append(epoch_train_loss)
        train_accuracy.append(epoch_train_acc)
        val_loss.append(epoch_val_loss)
        val_accuracy.append(val_acc_new)
        epoch_list.append(epoch_num + 1)

        # The continuation lines' leading spaces are part of the printed text.
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {epoch_train_loss: .3f} \
                | Train Accuracy: {epoch_train_acc: .3f} \
                | Val Loss: {epoch_val_loss: .3f} \
                | Val Accuracy: {val_acc_new: .3f}')

        if val_acc_new > val_acc_best:
            val_acc_best = val_acc_new
            save_checkpoint(model, epoch_num, optimizer, val_acc_best, best_model_path)

        save_checkpoint(model, epoch_num, optimizer, val_acc_new, last_model_path)

        # Early stopping: none of the most recent epochs reached the best
        # accuracy. Both sides are Python floats computed the same way, so the
        # comparison is exact.
        recent = val_accuracy[-early_stop_items_num:]
        if len(val_accuracy) >= early_stop_items_num and max(recent) < val_acc_best:
            print("Early stopping at epoch:", epoch_num+1)
            break

        plot_all()


            
            
                  

In [None]:
# Hyper-parameters. EPOCHS is only an upper bound: train() has its own
# early-stopping check.
EPOCHS = 10000
LR = 1e-6
model = BertClassifier()

# Colab alternative for the checkpoint locations:
#best_model_path = f"/content/drive/MyDrive/Lab4_Data/model_best.pth.tar"
#last_model_path = f"/content/drive/MyDrive/Lab4_Data/model_last.pth.tar"

# Checkpoint destinations: best-so-far model and most recent epoch.
best_model_path = "/mnt/nvme0n1p2/Study/Term7/AI/Term7-Artificial-Intelligence-Assignments/lab4/data/model_best.pth.tar"
last_model_path = "/mnt/nvme0n1p2/Study/Term7/AI/Term7-Artificial-Intelligence-Assignments/lab4/data/model_last.pth.tar"

# Free whatever memory can be reclaimed before the long-running call.
gc.collect()
torch.cuda.empty_cache()

train(
    model,
    df_train,
    df_val,
    train_dataloader,
    val_dataloader,
    LR,
    EPOCHS,
    batch_size,
    best_model_path,
    last_model_path,
)

In [None]:
from sklearn.metrics import confusion_matrix

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
    return numerator / denominator if denominator else 0.0


def evaluate(model, test_data, test_dataloader):
    """Print per-sample predictions and summary metrics for the test split.

    Args:
        model: Module called as ``model(input_id, mask)`` that returns one
            sigmoid score per sample with a trailing dimension of size 1.
        test_data: Only ``len()`` is taken, as the accuracy denominator.
        test_dataloader: Yields ``(tokenizer_output, label)`` batches.

    Uses the module-level ``tokenizer`` to decode inputs for printing and the
    module-level ``labels`` dict (name -> id) to name the classes. Precision,
    recall, specificity and F1 treat label id 1 as the positive class.

    Fixes over the previous version:
      * Predictions come from rounding the sigmoid score. ``argmax(dim=1)``
        over a single-column output always returned 0.
      * Class names are looked up through an inverted ``labels`` mapping.
        Indexing ``labels`` with an integer id raised ``KeyError``.
      * ``confusion_matrix`` is always 2x2 (``labels=[0, 1]``). A batch
        holding a single class used to yield a 1x1 matrix that broadcast into
        every cell of the running total.
      * TN/FP/FN/TP are read from the cells sklearn documents (rows are true
        labels, columns are predictions). Previously TP and TN were swapped.
      * Metrics whose denominator is zero are reported as 0.0.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # `labels` maps name -> id; printing needs the opposite direction.
    id_to_label = {class_id: name for name, class_id in labels.items()}

    final_confusion_matrix = np.zeros([2, 2])
    total_acc_test = 0

    model.eval()
    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            # Same decision rule as the training loop: round the sigmoid score.
            y_pred_tag = torch.round(output.squeeze(1)).long()

            y_true = test_label.cpu().numpy()
            y_pred = y_pred_tag.cpu().numpy()

            for ids, predicted, actual in zip(input_id, y_pred, y_true):
                tokens = tokenizer.convert_ids_to_tokens(ids.tolist())
                text = tokenizer.convert_tokens_to_string(tokens)
                text = text.replace('[CLS] ','').replace(' [SEP]  ','').replace('[PAD]','')
                print(text + " | Output : "+ id_to_label[int(predicted)] + " | Actual : " + id_to_label[int(actual)])

            total_acc_test += int((y_pred == y_true).sum())
            final_confusion_matrix += confusion_matrix(y_true, y_pred, labels=[0, 1])

    # sklearn layout: rows are true labels, columns are predicted labels.
    (tn, fp), (fn, tp) = final_confusion_matrix
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    specificity = _safe_ratio(tn, tn + fp)
    f1_score = _safe_ratio(2 * precision * recall, precision + recall)

    # Output labels keep their original spelling so existing logs stay comparable.
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    print(f'Percision : {precision}')
    print(f'Sensitivity (Recall) : {recall}')
    print(f'Specifity : {specificity}')
    print(f'F1 Score = {f1_score}')
    print('Confusion Matrix :')
    print(final_confusion_matrix)
    


In [None]:
# Restore the best checkpoint onto the CPU regardless of the device it was
# saved from: without map_location a checkpoint written on a GPU cannot be
# loaded on a CPU-only machine. evaluate() moves the model to the GPU itself
# when one is available.
loaded_checkpoint = torch.load(best_model_path, map_location="cpu")
model = BertClassifier()
model.load_state_dict(loaded_checkpoint["model"])

In [None]:
# Score the current `model` on the test split; df_test is passed as
# `test_data`, which evaluate() uses only for its length.
evaluate(model, df_test, test_dataloader)