In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig, AutoTokenizer, AlbertConfig, BertTokenizerFast
import gzip
import json
import torch.nn as nn
import gc
import torch.nn.functional as F

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
df_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/cse258_assignment2/train_ub.csv')
df_valid = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/cse258_assignment2/valid_ub.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/cse258_assignment2/test_ub.csv')
df_train.head()

Unnamed: 0,text,label,user_prop,book_prop
0,Book Title: Where the Road Takes Me; Review: I...,0,0.030928,0.05
1,Book Title: East of Eden; Review: My all-time ...,0,0.073955,0.041096
2,"Book Title: More Than Exes (Chasing The Dream,...",0,0.0,0.076923
3,Book Title: Pride and Prejudice; Review: For s...,0,0.0,0.022222
4,"Book Title: Long Way Down (Calloway Sisters, #...",0,0.065574,0.068182


In [4]:
print(len(df_train), len(df_valid), len(df_test))

167722 20942 20963


In [5]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = self.data.text
        self.u_prop = self.data.user_prop
        self.b_prop = self.data.book_prop
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        target = self.targets[index]
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(target, dtype=torch.long),
            'u_prop': torch.tensor(self.u_prop[index], dtype=torch.float32),
            'b_prop': torch.tensor(self.b_prop[index], dtype=torch.float32)
        }

In [None]:
df_test = pd.concat([df_valid, df_test], ignore_index=True)

In [6]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 3
LEARNING_RATE = 1e-05
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

train_size = 0.8

print("FULL Dataset: {}".format(df_train.shape[0] + df_test.shape[0]))
print("TRAIN Dataset: {}".format(df_train.shape[0]))
print("TEST Dataset: {}".format(df_test.shape[0]))

training_set = CustomDataset(df_train, tokenizer, MAX_LEN)
testing_set = CustomDataset(df_test, tokenizer, MAX_LEN)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

FULL Dataset: 188685
TRAIN Dataset: 167722
TEST Dataset: 20963


In [7]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 2
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 2
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [15]:
class BERTcls(nn.Module):
    def __init__(self):
        super(BERTcls, self).__init__()
        self.bert = transformers.BertModel.from_pretrained('bert-base-cased')
        self.cls_head = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(768 + 2, 2)
        )

    def forward(self, ids, mask, token_type_ids, u_prop, b_prop):
        _, pooled_output= self.bert(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        pooled_output = torch.concat([pooled_output, u_prop, b_prop], dim=1)
        output = self.cls_head(pooled_output)
        return output

In [9]:
def save_checkpoint(save_path, model, optimizer, valid_loss):
    if save_path == None:
        return

    state_dict = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'valid_loss': valid_loss}

    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_checkpoint(load_path, model, optimizer):
    if load_path==None:
        return

    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')

    model.load_state_dict(state_dict['model_state_dict'])
    optimizer.load_state_dict(state_dict['optimizer_state_dict'])

    return state_dict['valid_loss']

In [18]:
import time
from tqdm import tqdm

def train(model, epoch, load_model=False):
    if load_model:
        load_checkpoint('/content/drive/MyDrive/Colab Notebooks/cse258_assignment2/BERT.pt', model, optimizer)
    model.train()
    train_start = time.time()
    avg_loss = 0
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        u_prop = data['u_prop'].to(device).unsqueeze(1)
        b_prop = data['b_prop'].to(device).unsqueeze(1)
        targets = data['targets'].to(device, dtype = torch.long)
        targets = F.one_hot(targets, 2).to(dtype=torch.float32)

        outputs = model(ids, mask, token_type_ids, u_prop, b_prop)

        loss = criterion(outputs, targets)
        avg_loss += loss.item() / len(training_loader)
        if _%2500==0:
            print(f'Epoch: {epoch}, Step: {_}, Loss:  {avg_loss}')
            train_end = time.time()
            print("Training Epoch: ", train_end - train_start)
            train_start = train_end
        if _%5000 == 0 and _ != 0:
            fin_outputs, fin_targets = validation(model, False)
            print(classification_report(fin_targets, fin_outputs))
            print(accuracy_score(fin_targets, fin_outputs))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def validation(model, load_model=False):
    if load_model:
        load_checkpoint('/content/drive/MyDrive/Colab Notebooks/cse258_assignment2/model_ub82.pt', model, optimizer)
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    best_valid_loss = float("Inf")
    valid_running_loss = 0.0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            u_prop = data['u_prop'].to(device).unsqueeze(1)
            b_prop = data['b_prop'].to(device).unsqueeze(1)
            targets_l = F.one_hot(targets, 2).to(dtype=torch.float32)
            outputs = model(ids, mask, token_type_ids, u_prop, b_prop)
            loss = criterion(outputs, targets_l)
            outputs = torch.argmax(F.sigmoid(outputs), dim=1)

            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(outputs.cpu().detach().numpy().tolist())
            valid_running_loss += loss.item()
        average_valid_loss = valid_running_loss / len(testing_loader)
    print('Valid Loss: {:.4f}'.format(average_valid_loss))

    if best_valid_loss > average_valid_loss:
            best_valid_loss = average_valid_loss
            save_checkpoint('.' + '/model_ub.pt', model, optimizer, best_valid_loss)
    return fin_outputs, fin_targets

In [None]:
from sklearn.metrics import accuracy_score, classification_report

model = BERTcls().to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    train(model, epoch, load_model=False)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Step: 0, Loss:  3.270621371612651e-05
Training Epoch:  0.5562646389007568


193it [02:28,  1.30it/s]


KeyboardInterrupt: ignored

In [19]:
from sklearn.metrics import accuracy_score, classification_report

model = BERTcls().to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

fin_outputs, fin_targets = validation(BERTcls().to(device), load_model=True)
print(classification_report(fin_targets, fin_outputs))
print(accuracy_score(fin_targets, fin_outputs))

Model loaded from <== /content/drive/MyDrive/Colab Notebooks/cse258_assignment2/model_ub82.pt


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Valid Loss: 0.3978
Model saved to ==> ./model_ub.pt
              precision    recall  f1-score   support

           0       0.83      0.85      0.84     12000
           1       0.80      0.76      0.78      8963

    accuracy                           0.82     20963
   macro avg       0.81      0.81      0.81     20963
weighted avg       0.82      0.82      0.82     20963

0.8156275342269713


In [23]:
from sklearn.metrics import precision_score, recall_score, f1_score

print(f1_score(fin_targets, fin_outputs))

0.7799350908159198
