In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "4"

In [2]:
import torch
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from ml_things import fix_text
import torch
import torch.nn as nn
from transformers import (set_seed,
                          Trainer,
                          GPT2Config,
                          GPT2Tokenizer,
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification, GPT2Model)

In [3]:
set_seed(123)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
import numpy as np

class ImdbDataset(Dataset):
    r"""PyTorch Dataset class for loading data.

    This is where the data parsing happens.

    This class is built with reusability in mind: it can be used as is as.

    Arguments:

    path (:obj:`str`):
        Path to the data partition.

    """

    def __init__(self, path):        
        self.texts = []
        self.labels = []
        for label in ['pos', 'neg']:
            sentiment_path = os.path.join(path, label)
            files_names = os.listdir(sentiment_path)
            for file_name in tqdm(files_names, desc=f'{label} files'):
                file_path = os.path.join(sentiment_path, file_name)
                with open(file_path, "r", encoding='utf-8') as f:
                    content = f.readlines()[0]
              
                content = fix_text(content)
                self.texts.append(content)
                self.labels.append(label == 'pos')

        # Number of exmaples.
        self.n_examples = len(self.labels)
        self.texts = np.array(self.texts)
        self.labels = torch.tensor(self.labels).type(torch.float32)

    def __len__(self):
        r"""When used `len` return the number of examples.

        """

        return len(self.texts)

    def __getitem__(self, item):
        r"""Given an index return an example from the position.

        Arguments:

          item (:obj:`int`):
              Index position to pick an example to return.

        Returns:
          :obj:`Dict[str, str]`: Dictionary of inputs that contain text and 
          asociated labels.

        """

        return {'text':self.texts[item],
                'label':self.labels[item]}



In [5]:
class Collator(object):
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, sequences):
        r"""
        This function allowes the class objesct to be used as a function call.
        Sine the PyTorch DataLoader needs a collator function, I can use this 
        class as a function.

        Arguments:

          item (:obj:`list`):
              List of texts and labels.

        Returns:
          :obj:`Dict[str, object]`: Dictionary of inputs that feed into the model.
          It holddes the statement `model(**Returned Dictionary)`.
        """

        # Get all texts from sequences list.
        texts = [sequence['text'] for sequence in sequences]
        labels = torch.tensor([sequence['label'] for sequence in sequences])
        # Call tokenizer on all texts to convert into tensors of numbers with 
        # appropriate padding.
        inputs = self.tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True, 
                                max_length=100, return_token_type_ids=True)
        # Update the inputs with the associated encoded labels as tensor.
        inputs.update({'labels':torch.tensor(labels)})

        return inputs





In [6]:
n_epochs = 4
batch_size = 32

set_seed(123)
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

train_dataset = ImdbDataset(path='./aclImdb/train')
val_dataset = ImdbDataset(path='./aclImdb/test')

pos files:   0%|          | 0/12500 [00:00<?, ?it/s]

neg files:   0%|          | 0/12500 [00:00<?, ?it/s]

pos files:   0%|          | 0/12500 [00:00<?, ?it/s]

neg files:   0%|          | 0/12500 [00:00<?, ?it/s]

In [24]:
from collections import defaultdict


class Trainer:
    def __init__(self, model, n_epochs, train_dataloader, val_dataloader=None):
        self.model = model
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        
        self.optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer, 
                                num_warmup_steps = 0, # Default value in run_glue.py
                                num_training_steps = n_epochs * len(train_dataloader))
        self.n_epochs = n_epochs
        
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader


        
    def train(self):
        
        log_dict = defaultdict(list)
        
        for epoch in tqdm(range(self.n_epochs)):
            train_loss = []
            train_acc = []
            pred_list = []
            labels_list = []
            for b in tqdm(self.train_dataloader, leave=False):
                
                self.optimizer.zero_grad()
                loss, out = self.model(b)
                
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

                self.optimizer.step()
                self.scheduler.step()
                
                pred = out.detach().cpu().numpy() > 0
                train_loss.append(loss.item())
                pred_list.append(pred)
                labels_list.append(b["labels"].detach().cpu().numpy())
             

            print(np.concatenate(pred_list).shape, np.concatenate(labels_list).shape)


            train_acc = (np.concatenate(pred_list) == np.concatenate(labels_list)).mean()
            
            
            print(f"EPOCH: {epoch} | tr loss: {np.mean(train_loss)} | tr acc: {train_acc}")
            log_dict["train_loss"].append(np.mean(train_loss))
            log_dict["train_acc"].append(train_acc)
            if self.val_dataloader is not None:
                with torch.no_grad():
                    val_loss = []
                    val_acc = []
                    pred_list = []
                    labels_list = []
                    for b in tqdm(self.val_dataloader, leave=False): 
                        loss, out = self.model(b)

                        pred = out.detach().cpu().numpy() > 0
                        val_loss.append(loss.item())
                        pred_list.append(pred)
                        labels_list.append(b["labels"].detach().cpu().numpy())
                val_acc = (np.concatenate(pred_list) == np.concatenate(labels_list)).mean()
                
                
                print(f"val loss: {np.mean(val_loss)} | val acc: {val_acc}")
                log_dict["val_loss"].append(np.mean(val_loss))
                log_dict["val_acc"].append(val_acc)
        return log_dict

In [25]:
class GPT2_token_classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained("gpt2")
        self.classifier_head = nn.Sequential(nn.Linear(768, 1))
        self.criterion = nn.BCEWithLogitsLoss()
        
    def forward(self, b):
        o = self.gpt2(**{i:b[i].to(device) for i in b.keys() if i != 'labels'})
        out = self.classifier_head(o.last_hidden_state[:, -1, :]).squeeze(1)

        loss = self.criterion(out, b["labels"].to(device))
        return loss, out

In [26]:
model = GPT2_token_classifier()
model.to(device)

GPT2_token_classifier(
  (gpt2): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Drop

In [27]:
gpt_tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path="gpt2")

gpt_tokenizer.padding_side = "left"
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token


collator = Collator(gpt_tokenizer)



train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, 
                              collate_fn=collator)

val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, 
                              collate_fn=collator)

In [28]:
trainer = Trainer(model, 4, train_dataloader, val_dataloader)
gpt_res = trainer.train()

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/782 [00:00<?, ?it/s]

  inputs.update({'labels':torch.tensor(labels)})


(25000,) (25000,)
EPOCH: 0 | tr loss: 0.6440678208570956 | tr acc: 0.68424


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.4590030928116168 | val acc: 0.78332


  0%|          | 0/782 [00:00<?, ?it/s]

(25000,) (25000,)
EPOCH: 1 | tr loss: 0.390949447651196 | tr acc: 0.82248


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.3785052659833218 | val acc: 0.83124


  0%|          | 0/782 [00:00<?, ?it/s]

(25000,) (25000,)
EPOCH: 2 | tr loss: 0.29855528608193177 | tr acc: 0.87036


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.3733039725943447 | val acc: 0.83768


  0%|          | 0/782 [00:00<?, ?it/s]

(25000,) (25000,)
EPOCH: 3 | tr loss: 0.20484630036575105 | tr acc: 0.91768


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.42124552636519264 | val acc: 0.83464


In [29]:
from transformers import BertModel, BertTokenizer, DistilBertModel


class BERT_token_classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.classifier_head = nn.Sequential(
            nn.Linear(768, 256), nn.Dropout(0.3), nn.Linear(256, 1))
        self.criterion = nn.BCEWithLogitsLoss()
        
    def forward(self, b):
        o = self.bert(**{i:b[i].to(device) for i in b.keys() if i != 'labels'})
        out = self.classifier_head(o.last_hidden_state[:, 0, :]).squeeze(1)

        loss = self.criterion(out, b["labels"].to(device))
        return loss, out

In [30]:
gpt_res

defaultdict(list,
            {'train_loss': [0.6440678208570956,
              0.390949447651196,
              0.29855528608193177,
              0.20484630036575105],
             'train_acc': [0.68424, 0.82248, 0.87036, 0.91768],
             'val_loss': [0.4590030928116168,
              0.3785052659833218,
              0.3733039725943447,
              0.42124552636519264],
             'val_acc': [0.78332, 0.83124, 0.83768, 0.83464]})

In [31]:
bert_tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path="bert-large-uncased")

collator = Collator(bert_tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, 
                              collate_fn=collator)

val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, 
                              collate_fn=collator)

In [32]:
model = BERT_token_classifier()
model.to(device)

BERT_token_classifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [33]:
trainer = Trainer(model, 4, train_dataloader, val_dataloader)
res_bert = trainer.train()

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/782 [00:00<?, ?it/s]

  inputs.update({'labels':torch.tensor(labels)})


(25000,) (25000,)
EPOCH: 0 | tr loss: 0.35203473379506783 | tr acc: 0.8424


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.2934613299587041 | val acc: 0.8716


  0%|          | 0/782 [00:00<?, ?it/s]

(25000,) (25000,)
EPOCH: 1 | tr loss: 0.15693824112777363 | tr acc: 0.94308


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.4210827107442414 | val acc: 0.85436


  0%|          | 0/782 [00:00<?, ?it/s]

(25000,) (25000,)
EPOCH: 2 | tr loss: 0.04231357999296938 | tr acc: 0.98924


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.5824533229715505 | val acc: 0.86496


  0%|          | 0/782 [00:00<?, ?it/s]

(25000,) (25000,)
EPOCH: 3 | tr loss: 0.012043124046715576 | tr acc: 0.99772


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.6414202705288129 | val acc: 0.86508


In [34]:
class DistilBERT_token_classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.classifier_head = nn.Sequential(
            nn.Linear(768, 256), nn.Dropout(0.3), nn.Linear(256, 1))
        self.criterion = nn.BCEWithLogitsLoss()
        
    def forward(self, b):
        o = self.bert(**{i:b[i].to(device) for i in b.keys() if not i in ['labels', 'token_type_ids']})
        out = self.classifier_head(o.last_hidden_state[:, 0, :]).squeeze(1)

        loss = self.criterion(out, b["labels"].to(device))
        return loss, out

In [35]:
bert_tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path="bert-large-uncased")

collator = Collator(bert_tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, 
                              collate_fn=collator)

val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, 
                              collate_fn=collator)

In [36]:
model = DistilBERT_token_classifier()
model.to(device)

DistilBERT_token_classifier(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): L

In [37]:
trainer = Trainer(model, 4, train_dataloader, val_dataloader)
res_distilbert = trainer.train()

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/782 [00:00<?, ?it/s]

  inputs.update({'labels':torch.tensor(labels)})


(25000,) (25000,)
EPOCH: 0 | tr loss: 0.3746855812590293 | tr acc: 0.8288


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.3215711982468205 | val acc: 0.8568


  0%|          | 0/782 [00:00<?, ?it/s]

(25000,) (25000,)
EPOCH: 1 | tr loss: 0.21050816873455291 | tr acc: 0.92016


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.3510178682844505 | val acc: 0.85772


  0%|          | 0/782 [00:00<?, ?it/s]

(25000,) (25000,)
EPOCH: 2 | tr loss: 0.08931827561094967 | tr acc: 0.97304


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.5258881528302074 | val acc: 0.85044


  0%|          | 0/782 [00:00<?, ?it/s]

(25000,) (25000,)
EPOCH: 3 | tr loss: 0.03405001855847161 | tr acc: 0.99268


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.6394833690067634 | val acc: 0.8498
