In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import os
from pathlib import Path
import pickle
from sklearn.model_selection import train_test_split
# !pip install transformers
from transformers import DistilBertTokenizer
from transformers import DistilBertForTokenClassification,AdamW
from tqdm import tqdm
import numpy as np

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 14.9MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 44.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 54.6MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=b7c8883d42

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Exploring the dataset
The  dataset consists of negative and positive sentiments of movie reviews both in train and test set. The positive reviews are labelled as 1 and negative are labelled as 0. 

In [3]:
# To create a dataset function.
PATH = "IMDB dataset/"
def get_data(args): #args are train or test
    path = Path(os.path.join(PATH,args))
    print(path)
    texts = []
    labels = []
    label_dir = ["pos","neg"]
    for text_files in (path/label_dir[0]).iterdir():
        texts.append(text_files.read_text(encoding="utf8"))
        labels.append(1)
    for text_files in (path/label_dir[1]).iterdir():
        texts.append(text_files.read_text(encoding="utf8"))
        labels.append(0)
    return texts,labels

In [None]:
#It took long to run hence saved as pickle serialized object
# train_texts,train_labels = get_data("train")
# test_texts,test_labels = get_data("test")

In [None]:
# len(train_texts),len(test_texts)

In [4]:
#saving the list and then unpickling to access it
def saving_lists(name,list_):
    with open(f"{name}.txt","wb") as fp:
        pickle.dump(list_,fp)
def unpickling(filename):
    with open(filename,"rb") as fp:
        file = pickle.load(fp)
    return file

In [None]:
# saving_lists("Train_texts",train_texts)
# saving_lists("Test_texts",test_texts)
# saving_lists("Train_labels",train_labels)
# saving_lists("Test_labels",test_labels)

In [5]:
train_texts = unpickling("/content/drive/MyDrive/Train_texts.txt")
train_labels = unpickling("/content/drive/MyDrive/Train_labels.txt")
test_texts = unpickling("/content/drive/MyDrive/Test_texts.txt")
test_labels = unpickling("/content/drive/MyDrive/Test_labels.txt")

In [6]:
#splitting into train and validation data with test size of 20%
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

In [7]:
#Distilled Bert is a cut down version of Bert Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [8]:
#Ensure same maximum length of encodings
#Encodings for train, validation and test dataset
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [9]:
#Source Hugging face https://huggingface.co/transformers/custom_datasets.html#seq-imdb
#IMDB dataset to encodings and labels 
class IMDBdataset(torch.utils.data.Dataset):
    def __init__(self,encodings,labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self,idx):
        item = {key:torch.tensor(val[idx]) for key,val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

In [10]:
train_dataset = IMDBdataset(train_encodings,train_labels)
test_dataset = IMDBdataset(test_encodings,test_labels)
val_dataset = IMDBdataset(val_encodings,val_labels)

#Dataloader functions
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset,batch_size=8,shuffle=True)

In [11]:
#Using GPU device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [55]:
#Initialize tensorboard
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("/content/drive/MyDrive/runs_final/BERT_IMDB")

In [56]:
# Defining the last layers of Bert Model which would be trained to fit our specific purpose
from transformers import BertModel
class BertClassifier(nn.Module):
    def __init__(self,freeze_bert=False):

        super(BertClassifier,self).__init__()
        D_in,H,D_out = 768,50,2

        self.bert = BertModel.from_pretrained('distilbert-base-uncased')

        self.classifier = torch.nn.Sequential(
            nn.Linear(D_in,H),
            nn.ReLU(),
            nn.Linear(H,D_out)
        )

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

        def forward(self,input_ids,attention_mask):
        outputs = self.bert(input_ids,attention_mask)
        last_hidden_cls = outputs[0][:,0,:]
        logits = self.classifier(last_hidden_cls)

        return logits

In [57]:
#Initialize optimizer and scheduler
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    bert_classifier = BertClassifier(freeze_bert=False)

    bert_classifier.to(device)

    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    total_steps = len(train_loader) * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, 
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [58]:
#Training loop
import random
import time

loss_fn = nn.CrossEntropyLoss()

def train(model, train_loader, val_loader=None, epochs=4, evaluation=False):
    print("Start training...\n")
    for epoch_i in range(epochs):
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Elapsed':^9}")
        print("-"*70)

        t0_epoch, t0_batch = time.time(), time.time()

        total_loss, batch_loss, batch_counts = 0, 0, 0

        model.train()

        losses = []

        for step, batch in enumerate(train_loader):
            batch_counts +=1
            b_input_ids = batch['input_ids'].to(device)
            b_attn_mask = batch["attention_mask"].to(device)
            b_labels = batch['labels'].to(device)

            model.zero_grad()

            logits = model(b_input_ids, b_attn_mask)

            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()
            losses.append(batch_loss)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()
            
            #Callbacks
            if batch_counts > 200:
                if (losses[step]>losses[step-1]) and (losses[step]>losses[step-2]) and (losses[step]>losses[step-3]):
                    break

            if (step % 20 == 0 and step != 0) or (step == len(train_loader) - 1):
      
                time_elapsed = time.time() - t0_batch
                writer.add_scalar('At training step the loss ', step,round(batch_loss/batch_counts,4))
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {time_elapsed:^9.2f}")

                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()
        avg_train_loss = total_loss / len(train_loader)
    torch.save(model.state_dict(),"/content/drive/MyDrive/BERTModel.pth")
    return model,avg_train_loss

In [59]:
#Evaluation loop
from sklearn.metrics import f1_score
def evaluate(model, val_loader):
    model.eval()

    val_accuracy = []
    val_loss = []
    f1_scores = []
    for idx,batch in enumerate(val_loader):
        b_input_ids = batch['input_ids'].to(device)
        b_attn_mask = batch["attention_mask"].to(device)
        b_labels = batch['labels'].to(device)
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        preds = torch.argmax(logits, dim=1).flatten()

        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

        score = f1_score(b_labels.cpu().numpy(),preds.cpu().numpy())
        f1_scores.append(score) 
        if idx == 20:
            writer.add_scalar('For step validation accuracy is  and loss respectively are ', idx, accuracy,round(loss.item(),2))
            writer.add_scalar("The f1 score is ", score)
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)
    F1_score = np.mean(f1_scores)

    return val_loss, val_accuracy,F1_score

In [60]:
%%time
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)

bert_classifier, optimizer, scheduler = initialize_model(epochs=2)
model,avg_train_loss = train(bert_classifier, train_loader, val_loader, epochs=2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertModel: ['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn.lin2.weight', 'd

Start training...

 Epoch  |  Batch  |  Train Loss  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.739524   |   8.90   
   1    |   40    |   0.714741   |   8.48   
   1    |   60    |   0.709170   |   8.59   
   1    |   80    |   0.748598   |   8.54   
   1    |   100   |   0.720216   |   8.51   
   1    |   120   |   0.700601   |   8.46   
   1    |   140   |   0.705310   |   8.41   
   1    |   160   |   0.714570   |   8.38   
   1    |   180   |   0.683978   |   8.36   
   1    |   200   |   0.686814   |   8.37   
   1    |   220   |   0.693333   |   8.41   
   1    |   240   |   0.723810   |   8.46   
   1    |   260   |   0.703569   |   8.46   
   1    |   280   |   0.691303   |   8.43   
   1    |   300   |   0.699373   |   8.44   
   1    |   320   |   0.718428   |   8.41   
   1    |   340   |   0.692792   |   8.43   
   1    |   360   |   0.700358   |   8.38   
   1    |   380   |   0.693604   |   8.38   
   1    | 

In [61]:
val_loss, val_accuracy,F1_score = evaluate(model,val_loader)

In [63]:
#Validation results
print(f"The validation loss is {val_loss}, accuracy is {val_accuracy} and F1 score is {F1_score}")

The validation loss is 0.6931706943035125, accuracy is 50.34 and F1 score is 0.6332190476190476


In [None]:
%load_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/runs_final/BERT_IMDB