In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
import os
from pathlib import Path
import pickle
from sklearn.model_selection import train_test_split
!pip install transformers
from transformers import DistilBertTokenizer
from transformers import DistilBertForTokenClassification,AdamW
from tqdm import tqdm
import numpy as np



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Exploring the dataset
The  dataset consists of negative and positive sentiments of movie reviews both in train and test set. The positive reviews are labelled as 1 and negative are labelled as 0. 

In [3]:
PATH = "IMDB dataset/"
def get_data(args): #args are train or test
    path = Path(os.path.join(PATH,args))
    print(path)
    texts = []
    labels = []
    label_dir = ["pos","neg"]
    for text_files in (path/label_dir[0]).iterdir():
        texts.append(text_files.read_text(encoding="utf8"))
        labels.append(1)
    for text_files in (path/label_dir[1]).iterdir():
        texts.append(text_files.read_text(encoding="utf8"))
        labels.append(0)
    return texts,labels

In [None]:
# train_texts,train_labels = get_data("train")
# test_texts,test_labels = get_data("test")

IMDB dataset\train
IMDB dataset\test


In [None]:
# len(train_texts),len(test_texts)

(25000, 25000)

In [4]:
def saving_lists(name,list_):
    with open(f"{name}.txt","wb") as fp:
        pickle.dump(list_,fp)
def unpickling(filename):
    with open(filename,"rb") as fp:
        file = pickle.load(fp)
    return file

In [None]:
# saving_lists("Train_texts",train_texts)
# saving_lists("Test_texts",test_texts)
# saving_lists("Train_labels",train_labels)
# saving_lists("Test_labels",test_labels)

In [5]:
train_texts = unpickling("/content/drive/MyDrive/Train_texts.txt")
train_labels = unpickling("/content/drive/MyDrive/Train_labels.txt")
test_texts = unpickling("/content/drive/MyDrive/Test_texts.txt")
test_labels = unpickling("/content/drive/MyDrive/Test_labels.txt")

In [6]:
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

In [7]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [8]:
#Ensure same maximum length of encodings
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [10]:
class IMDBdataset(torch.utils.data.Dataset):
    def __init__(self,encodings,labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self,idx):
        item = {key:torch.tensor(val[idx]) for key,val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

In [12]:
train_dataset = IMDBdataset(train_encodings,train_labels)
test_dataset = IMDBdataset(test_encodings,test_labels)
val_dataset = IMDBdataset(val_encodings,val_labels)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset,batch_size=16,shuffle=True)

In [13]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [36]:
from transformers import BertModel
class BertClassifier(nn.Module):
  def __init__(self,freeze_bert=False):

    super(BertClassifier,self).__init__()
    D_in,H,D_out = 768,50,2

    self.bert = BertModel.from_pretrained('distilbert-base-uncased')

    self.classifier = torch.nn.Sequential(
        nn.Linear(D_in,H),
        nn.ReLU(),
        nn.Linear(H,D_out)
    )

    if freeze_bert:
      for param in self.bert.parameters():
        param.requires_grad = False

  def forward(self,input_ids,attention_mask):
    outputs = self.bert(input_ids,attention_mask)
    last_hidden_cls = outputs[0][:,0,:]
    logits = self.classifier(last_hidden_cls)

    return logits

In [37]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_loader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [38]:
import random
import time

# Specify loss function
loss_fn = nn.CrossEntropyLoss()

def train(model, train_loader, val_loader=None, epochs=4, evaluation=False):
    print("Start training...\n")
    for epoch_i in range(epochs):
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        t0_epoch, t0_batch = time.time(), time.time()

        total_loss, batch_loss, batch_counts = 0, 0, 0

        model.train()

        for step, batch in enumerate(train_loader):
            batch_counts +=1
            b_input_ids = batch['input_ids'].to(device)
            b_attn_mask = batch["attention_mask"].to(device)
            b_labels = batch['labels'].to(device)

            model.zero_grad()

            logits = model(b_input_ids, b_attn_mask)

            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            if (step % 20 == 0 and step != 0) or (step == len(train_loader) - 1):
      
                time_elapsed = time.time() - t0_batch

                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        avg_train_loss = total_loss / len(train_loader)

In [39]:
def evaluate(model, val_loader):
  model.eval()

  val_accuracy = []
  val_loss = []

  for batch in val_loader:
    b_input_ids = batch['input_ids'].to(device)
    b_attn_mask = batch["attention_mask"].to(device)
    b_labels = batch['labels'].to(device)
    with torch.no_grad():
        logits = model(b_input_ids, b_attn_mask)

    loss = loss_fn(logits, b_labels)
    val_loss.append(loss.item())

    preds = torch.argmax(logits, dim=1).flatten()

    accuracy = (preds == b_labels).cpu().numpy().mean() * 100
    val_accuracy.append(accuracy)

  val_loss = np.mean(val_loss)
  val_accuracy = np.mean(val_accuracy)

  return val_loss, val_accuracy

In [40]:
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)

bert_classifier, optimizer, scheduler = initialize_model(epochs=2)
train(bert_classifier, train_loader, val_loader, epochs=2)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertModel: ['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn.lin2.weight', 'd

RuntimeError: ignored