In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "4"

In [2]:
import torch
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from ml_things import fix_text
import torch
import torch.nn as nn
from transformers import (set_seed,
                          Trainer,
                          GPT2Config,
                          GPT2Tokenizer,
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification, GPT2Model)

In [3]:
set_seed(123)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
import numpy as np

class ImdbDataset(Dataset):
    r"""PyTorch Dataset class for loading data.

    This is where the data parsing happens.

    This class is built with reusability in mind: it can be used as is as.

    Arguments:

    path (:obj:`str`):
        Path to the data partition.

    """

    def __init__(self, path):        
        self.texts = []
        self.labels = []
        for label in ['pos', 'neg']:
            sentiment_path = os.path.join(path, label)
            files_names = os.listdir(sentiment_path)
            for file_name in tqdm(files_names, desc=f'{label} files'):
                file_path = os.path.join(sentiment_path, file_name)
                with open(file_path, "r", encoding='utf-8') as f:
                    content = f.readlines()[0]
              
                content = fix_text(content)
                self.texts.append(content)
                self.labels.append(label == 'pos')

        # Number of exmaples.
        self.n_examples = len(self.labels)
        self.texts = np.array(self.texts)
        self.labels = torch.tensor(self.labels).type(torch.float32)

    def __len__(self):
        r"""When used `len` return the number of examples.

        """

        return len(self.texts)

    def __getitem__(self, item):
        r"""Given an index return an example from the position.

        Arguments:

          item (:obj:`int`):
              Index position to pick an example to return.

        Returns:
          :obj:`Dict[str, str]`: Dictionary of inputs that contain text and 
          asociated labels.

        """

        return {'text':self.texts[item],
                'label':self.labels[item]}



In [5]:
class Collator(object):
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, sequences):
        r"""
        This function allowes the class objesct to be used as a function call.
        Sine the PyTorch DataLoader needs a collator function, I can use this 
        class as a function.

        Arguments:

          item (:obj:`list`):
              List of texts and labels.

        Returns:
          :obj:`Dict[str, object]`: Dictionary of inputs that feed into the model.
          It holddes the statement `model(**Returned Dictionary)`.
        """

        # Get all texts from sequences list.
        texts = [sequence['text'] for sequence in sequences]
        labels = torch.tensor([sequence['label'] for sequence in sequences])
        # Call tokenizer on all texts to convert into tensors of numbers with 
        # appropriate padding.
        inputs = self.tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True, 
                                max_length=100, return_token_type_ids=True)
        # Update the inputs with the associated encoded labels as tensor.
        inputs.update({'labels':torch.tensor(labels)})

        return inputs





In [6]:
n_epochs = 4
batch_size = 32

set_seed(123)
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

train_dataset = ImdbDataset(path='./aclImdb/train')
val_dataset = ImdbDataset(path='./aclImdb/test')

pos files:   0%|          | 0/12500 [00:00<?, ?it/s]

neg files:   0%|          | 0/12500 [00:00<?, ?it/s]

pos files:   0%|          | 0/12500 [00:00<?, ?it/s]

neg files:   0%|          | 0/12500 [00:00<?, ?it/s]

In [7]:
from collections import defaultdict


class Trainer:
    def __init__(self, model, n_epochs, train_dataloader, val_dataloader=None, 
                 lr=2e-5, 
                 weight_decay=1e-2, num_warmup_steps=0, accum_steps=1):
        self.model = model
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        
        self.optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer, 
                                num_warmup_steps = num_warmup_steps, # Default value in run_glue.py
                                num_training_steps = n_epochs * len(train_dataloader))
        self.n_epochs = n_epochs
        
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader

        self.accum_steps = accum_steps
        
    def train(self):
        
        log_dict = defaultdict(list)
        
        for epoch in tqdm(range(self.n_epochs)):
            
            train_loss = []
            train_acc = []
            pred_list = []
            labels_list = []
            for i, b in tqdm(enumerate(self.train_dataloader), leave=False):
                loss, out = self.model(b)
                
                loss.backward()
                pred = out.detach().cpu().numpy() > 0
                train_loss.append(loss.item())
                pred_list.append(pred)
                labels_list.append(b["labels"].detach().cpu().numpy())
                
                if i % self.accum_steps == 0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

                    self.optimizer.step()
                    self.scheduler.step()
                    self.optimizer.zero_grad()
             
            


            train_acc = (np.concatenate(pred_list) == np.concatenate(labels_list)).mean()
            
            
            print(f"EPOCH: {epoch} | tr loss: {np.mean(train_loss)} | tr acc: {train_acc}")
            log_dict["train_loss"].append(np.mean(train_loss))
            log_dict["train_acc"].append(train_acc)
            if self.val_dataloader is not None:
                with torch.no_grad():
                    val_loss = []
                    val_acc = []
                    pred_list = []
                    labels_list = []
                    for b in tqdm(self.val_dataloader, leave=False): 
                        loss, out = self.model(b)

                        pred = out.detach().cpu().numpy() > 0
                        val_loss.append(loss.item())
                        pred_list.append(pred)
                        labels_list.append(b["labels"].detach().cpu().numpy())
                val_acc = (np.concatenate(pred_list) == np.concatenate(labels_list)).mean()
                
                
                print(f"val loss: {np.mean(val_loss)} | val acc: {val_acc}")
                log_dict["val_loss"].append(np.mean(val_loss))
                log_dict["val_acc"].append(val_acc)
        return log_dict

In [8]:
class GPT2_token_classifier(nn.Module):
    def __init__(self, mode="last"):
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained("gpt2")
        self.classifier_head = nn.Sequential(nn.Linear(768, 1))
        self.criterion = nn.BCEWithLogitsLoss()
        self.mode = mode
        
    def forward(self, b):
        o = self.gpt2(**{i:b[i].to(device) for i in b.keys() if i != 'labels'})
        if self.mode == "last":
            out = self.classifier_head(o.last_hidden_state[:, -1, :]).squeeze(1)
        elif self.mode == "mean":
            out = self.classifier_head(o.last_hidden_state.mean(axis=1)).squeeze(1)
        else:
            raise ValueError
        loss = self.criterion(out, b["labels"].to(device))
        return loss, out

In [15]:
model = GPT2_token_classifier()
model.to(device)

GPT2_token_classifier(
  (gpt2): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Drop

In [16]:
gpt_tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path="gpt2")

gpt_tokenizer.padding_side = "left"
gpt_tokenizer.pad_token = "<PAD>"
gpt_tokenizer.bos_token = "<BOS>"
gpt_tokenizer.sep_token = "<SEP>"

collator = Collator(gpt_tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, 
                                  collate_fn=collator)
    
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, 
                                  collate_fn=collator)

In [17]:
accum_steps_grid = [2, 4, 8]
batch_size = 32

exp_1 = {}
for accum_steps in accum_steps_grid:
    model = GPT2_token_classifier()
    model.to(device)
    trainer = Trainer(model, 4, train_dataloader, val_dataloader, accum_steps=accum_steps)
    gpt_res = trainer.train()
    exp_1[accum_steps] = gpt_res

  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  inputs.update({'labels':torch.tensor(labels)})


EPOCH: 0 | tr loss: 0.5766748278342244 | tr acc: 0.6926


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.4470638093036002 | val acc: 0.78828


0it [00:00, ?it/s]

EPOCH: 1 | tr loss: 0.4213090934373839 | tr acc: 0.80412


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.41636418394954006 | val acc: 0.80848


0it [00:00, ?it/s]

EPOCH: 2 | tr loss: 0.3504871911633655 | tr acc: 0.84464


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.3895478224348458 | val acc: 0.81876


0it [00:00, ?it/s]

val loss: 0.3448543058960791 | val acc: 0.84768


  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

EPOCH: 0 | tr loss: 0.6998165727538221 | tr acc: 0.65996


  0%|          | 0/782 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



val loss: 0.3733390817642593 | val acc: 0.82896


0it [00:00, ?it/s]

EPOCH: 3 | tr loss: 0.3092244379222393 | tr acc: 0.86492


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.40850111489158 | val acc: 0.82772


  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



EPOCH: 2 | tr loss: 0.4910261570416448 | tr acc: 0.76424


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.4444928779969435 | val acc: 0.7896


0it [00:00, ?it/s]

EPOCH: 3 | tr loss: 0.41915066523091565 | tr acc: 0.80672


  0%|          | 0/782 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [18]:
accum_steps = list(exp_1.keys())[np.argmax([max(v)  for v in exp_1.values()])]

In [19]:
exp_2 = {}
accum_steps = 2 
lr_grid = [1e-4, 4e-4, 2e-5, 1e-5]
for lr in lr_grid:    
    model = GPT2_token_classifier()
    model.to(device)
    trainer = Trainer(model, 4, train_dataloader, val_dataloader, lr=lr, accum_steps=accum_steps)
    gpt_res = trainer.train()
    exp_2[accum_steps] = gpt_res

  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  inputs.update({'labels':torch.tensor(labels)})


EPOCH: 0 | tr loss: 0.5184117731878825 | tr acc: 0.7306


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.3692732937062335 | val acc: 0.83364


0it [00:00, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



EPOCH: 3 | tr loss: 0.1290771245201359 | tr acc: 0.95176


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.44042506153859634 | val acc: 0.84828


  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

EPOCH: 0 | tr loss: 0.514335650483818 | tr acc: 0.73352


  0%|          | 0/782 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



val loss: 0.38826294936230193 | val acc: 0.83552


0it [00:00, ?it/s]

EPOCH: 3 | tr loss: 0.16027215989235113 | tr acc: 0.93976


  0%|          | 0/782 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



val loss: 0.36234461473267704 | val acc: 0.83332


0it [00:00, ?it/s]

EPOCH: 2 | tr loss: 0.2992802283790944 | tr acc: 0.87056


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.32725635108054446 | val acc: 0.85616


0it [00:00, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



EPOCH: 1 | tr loss: 0.3034337144869063 | tr acc: 0.8698


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.3147247013876505 | val acc: 0.86168


0it [00:00, ?it/s]

EPOCH: 2 | tr loss: 0.20079458763827676 | tr acc: 0.9204


  0%|          | 0/782 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [20]:

lr = list(exp_2.keys())[np.argmax([max(v)  for v in exp_2.values()])]
wd_grid = [1, 1e-1, 1e-2]

exp_3 = {}
for wd in wd_grid:    
    model = GPT2_token_classifier()
    model.to(device)
    trainer = Trainer(model, 4, train_dataloader, val_dataloader, lr=lr, weight_decay=wd, accum_steps=accum_steps)
    gpt_res = trainer.train()
    exp_3[wd] = gpt_res

  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  inputs.update({'labels':torch.tensor(labels)})


EPOCH: 0 | tr loss: 0.4788222128854078 | tr acc: 0.75492


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.3401996571418193 | val acc: 0.8496


0it [00:00, ?it/s]

EPOCH: 1 | tr loss: 0.2987068461163727 | tr acc: 0.8722


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.30688019992921817 | val acc: 0.86764


0it [00:00, ?it/s]

EPOCH: 2 | tr loss: 0.20867365611302655 | tr acc: 0.91688


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.3292151781041985 | val acc: 0.8652


0it [00:00, ?it/s]

EPOCH: 3 | tr loss: 0.10925276169572454 | tr acc: 0.96128


  0%|          | 0/782 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



val loss: 0.3779243638505564 | val acc: 0.828


0it [00:00, ?it/s]

EPOCH: 2 | tr loss: 0.3381673367717839 | tr acc: 0.85148


  0%|          | 0/782 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



val loss: 0.328585504461318 | val acc: 0.8518


0it [00:00, ?it/s]

EPOCH: 2 | tr loss: 0.2150495457403419 | tr acc: 0.9148


  0%|          | 0/782 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [21]:
wd = list(exp_3.keys())[np.argmax([max(v) for v in exp_3.values()])]

model = GPT2_token_classifier(mode="mean")
model.to(device)
trainer = Trainer(model, 4, train_dataloader, val_dataloader, lr=lr, weight_decay=wd, 
                  accum_steps=accum_steps)
gpt_res_mean = trainer.train()

  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  inputs.update({'labels':torch.tensor(labels)})


EPOCH: 0 | tr loss: 0.48158043129441075 | tr acc: 0.75204


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.37015750232483724 | val acc: 0.82916


0it [00:00, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



val loss: 0.3858559117366648 | val acc: 0.8614


In [22]:
from transformers import BertModel, BertTokenizer, DistilBertModel


class BERT_token_classifier(nn.Module):
    def __init__(self, mode="last"):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.classifier_head = nn.Sequential(
            nn.Linear(768, 256), nn.Dropout(0.3), nn.Linear(256, 1))
        self.criterion = nn.BCEWithLogitsLoss()
        self.mode = mode
        
    def forward(self, b):
        o = self.bert(**{i:b[i].to(device) for i in b.keys() if i != 'labels'})
        if self.mode == "last":
            out = self.classifier_head(o.last_hidden_state[:, 0, :]).squeeze(1)
        elif self.mode == "mean":
            out = self.classifier_head(o.last_hidden_state.mean(axis=1)).squeeze(1)
        else:
            raise ValueError 

        loss = self.criterion(out, b["labels"].to(device))
        return loss, out

In [23]:
bert_tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path="bert-large-uncased")

collator = Collator(bert_tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, 
                              collate_fn=collator)

val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, 
                              collate_fn=collator)

In [24]:
accum_steps_grid = [2, 4, 8]
batch_size = 32

b_exp_1 = {}
for accum_steps in accum_steps_grid:
    model = BERT_token_classifier()
    model.to(device)
    trainer = Trainer(model, 4, train_dataloader, val_dataloader, accum_steps=accum_steps)
    bert_res = trainer.train()
    b_exp_1[accum_steps] = gpt_res

  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  inputs.update({'labels':torch.tensor(labels)})


EPOCH: 0 | tr loss: 0.3549032825547868 | tr acc: 0.84024


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.3363087719825604 | val acc: 0.8544


0it [00:00, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



val loss: 0.6573284476301144 | val acc: 0.86388


  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

EPOCH: 0 | tr loss: 0.3643641300080225 | tr acc: 0.83776


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.3102813884711174 | val acc: 0.86112


0it [00:00, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



val loss: 0.6325494082551688 | val acc: 0.85524


  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

EPOCH: 0 | tr loss: 0.412744663960641 | tr acc: 0.80692


  0%|          | 0/782 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



val loss: 0.5314432618778694 | val acc: 0.85212


In [26]:
b_exp_2 = {}
accum_steps = list(b_exp_1.keys())[np.argmax([max(v)  for v in b_exp_1.values()])]
lr_grid = [1e-4, 4e-4, 2e-5, 1e-5]
for lr in lr_grid:    
    model = BERT_token_classifier()
    model.to(device)
    trainer = Trainer(model, 4, train_dataloader, val_dataloader, lr=lr, accum_steps=accum_steps)
    bert_res = trainer.train()
    b_exp_2[accum_steps] = gpt_res

  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  inputs.update({'labels':torch.tensor(labels)})


EPOCH: 0 | tr loss: 0.3626275332192021 | tr acc: 0.83596


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.3060876177552411 | val acc: 0.866


0it [00:00, ?it/s]

EPOCH: 1 | tr loss: 0.18677594974551284 | tr acc: 0.92864


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.3193466570871451 | val acc: 0.87016


0it [00:00, ?it/s]

EPOCH: 2 | tr loss: 0.06558046036470166 | tr acc: 0.98068


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.5104519775991216 | val acc: 0.85876


0it [00:00, ?it/s]

EPOCH: 3 | tr loss: 0.02157933735157675 | tr acc: 0.99504


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.7233465179162936 | val acc: 0.85768


  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

EPOCH: 0 | tr loss: 0.35435784411857196 | tr acc: 0.84112


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.3028607758624322 | val acc: 0.86764


0it [00:00, ?it/s]

EPOCH: 1 | tr loss: 0.18409269551634597 | tr acc: 0.92956


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.34112903531497857 | val acc: 0.8658


0it [00:00, ?it/s]

EPOCH: 2 | tr loss: 0.06630943126230955 | tr acc: 0.9804


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.5148017930238784 | val acc: 0.86304


0it [00:00, ?it/s]

EPOCH: 3 | tr loss: 0.020622159311216793 | tr acc: 0.99532


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.7079404456800122 | val acc: 0.86244


  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

EPOCH: 0 | tr loss: 0.3633579827768876 | tr acc: 0.83644


  0%|          | 0/782 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



val loss: 0.5304892659529715 | val acc: 0.85816


0it [00:00, ?it/s]

KeyboardInterrupt: 

In [None]:
# lr = 4e-4#list(b_exp_2.keys())[np.argmax([max(v)  for v in b_exp_2.values()])]
# wd_grid = [1, 1e-1, 1e-2]

# b_exp_3 = {}
# for wd in wd_grid:    
#     model = BERT_token_classifier()
#     model.to(device)
#     trainer = Trainer(model, 4, train_dataloader, val_dataloader, lr=lr, weight_decay=wd, 
#                       accum_steps=accum_steps)
#     bert_res = trainer.train()
#     b_exp_3[wd] = gpt_res

In [27]:
lr = 4e-4

model = BERT_token_classifier(mode="mean")
model.to(device)
trainer = Trainer(model, 4, train_dataloader, val_dataloader, lr=lr, 
                  accum_steps=accum_steps)
bert_res_mean = trainer.train()

  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  inputs.update({'labels':torch.tensor(labels)})


EPOCH: 0 | tr loss: 0.36383255434882306 | tr acc: 0.83636


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.3172559440064499 | val acc: 0.86096


0it [00:00, ?it/s]

EPOCH: 1 | tr loss: 0.19802299298374626 | tr acc: 0.9238


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.3538132235026725 | val acc: 0.86776


0it [00:00, ?it/s]

EPOCH: 2 | tr loss: 0.07807431518709969 | tr acc: 0.97696


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.4567586320834687 | val acc: 0.86232


0it [00:00, ?it/s]

EPOCH: 3 | tr loss: 0.02895740947202015 | tr acc: 0.99396


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.6752312300042571 | val acc: 0.86352


In [28]:
class DistilBERT_token_classifier(nn.Module):
    def __init__(self, mode="last"):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.classifier_head = nn.Sequential(
            nn.Linear(768, 256), nn.Dropout(0.3), nn.Linear(256, 1))
        self.criterion = nn.BCEWithLogitsLoss()
        self.mode = mode
        
    def forward(self, b):
        o = self.bert(**{i:b[i].to(device) for i in b.keys() if not i in ['labels', 'token_type_ids']})
        if self.mode == "last":
            out = self.classifier_head(o.last_hidden_state[:, 0, :]).squeeze(1)
        elif self.mode == "mean":
            out = self.classifier_head(o.last_hidden_state.mean(axis=1)).squeeze(1)
        else:
            raise ValueError 

        loss = self.criterion(out, b["labels"].to(device))
        return loss, out

In [None]:
# accum_steps_grid = [2, 4, 8]
# batch_size = 32

# db_exp_1 = {}
# for accum_steps in accum_steps_grid:
#     model = DistilBERT_token_classifier()
#     model.to(device)
#     trainer = Trainer(model, 4, train_dataloader, val_dataloader, accum_steps=accum_steps)
#     bert_res = trainer.train()
#     db_exp_1[accum_steps] = gpt_res

In [None]:
# db_exp_2 = {}
# accum_steps = list(db_exp_1.keys())[np.argmax([max(v)  for v in db_exp_1.values()])]

# lr_grid = [1e-4, 4e-4, 2e-5, 1e-5]:
# for lr in lr_grid:    
#     model = DistilBERT_token_classifier()
#     model.to(device)
#     trainer = Trainer(model, 4, train_dataloader, val_dataloader, lr=lr, accum_steps=accum_steps)
#     bert_res = trainer.train()
#     db_exp_2[accum_steps] = gpt_res

In [None]:
# lr = list(db_exp_2.keys())[np.argmax([max(v)  for v in db_exp_2.values()])]
# wd_grid = [1, 1e-1, 1e-2]

# db_exp_3 = {}
# for wd in wd_grid:    
#     model = DistilBERT_token_classifier()
#     model.to(device)
#     trainer = Trainer(model, 4, train_dataloader, val_dataloader, lr=lr, weight_decay=wd, 
#                       accum_steps=accum_steps)
#     bert_res = trainer.train()
#     db_exp_3[wd] = gpt_res

In [None]:
# wd = list(db_exp_3.keys())[np.argmax([max(v) for v in db_exp_3.values()])]

# model = DistilBERT_token_classifier(mode="mean")
# model.to(device)
# trainer = Trainer(model, 4, train_dataloader, val_dataloader, lr=lr, weight_decay=wd, 
#                   accum_steps=accum_steps)
# dbert_res_mean = trainer.train()

In [29]:
#wd = list(db_exp_3.keys())[np.argmax([max(v) for v in db_exp_3.values()])]

model = DistilBERT_token_classifier()
model.to(device)
trainer = Trainer(model, 4, train_dataloader, val_dataloader, lr=lr, weight_decay=wd, 
                  accum_steps=accum_steps)
dbert_res_last = trainer.train()

  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  inputs.update({'labels':torch.tensor(labels)})


EPOCH: 0 | tr loss: 0.38462774023947205 | tr acc: 0.82536


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.3331771263485903 | val acc: 0.851


0it [00:00, ?it/s]

EPOCH: 1 | tr loss: 0.23890103725835565 | tr acc: 0.9042


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.33511649134576016 | val acc: 0.85672


0it [00:00, ?it/s]

EPOCH: 2 | tr loss: 0.1173041365157022 | tr acc: 0.96044


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: 0.43115347505742424 | val acc: 0.85144


0it [00:00, ?it/s]

KeyboardInterrupt: 

In [None]:
assert 0

In [39]:
max(gpt_res_mean["val_acc"]), max(bert_res_mean["val_acc"])

(0.86172, 0.86776)

In [33]:
max(max(i["val_acc"]) for i in exp_1.values())

0.84768

In [34]:
max(max(i["val_acc"]) for i in exp_2.values())

0.86232

In [35]:
max(max(i["val_acc"]) for i in exp_3.values())

0.86764

In [36]:
max(max(i["val_acc"]) for i in b_exp_1.values())

0.86208

In [37]:
max(max(i["val_acc"]) for i in b_exp_2.values())

0.86208

In [49]:
{k: max(v["val_acc"]) for k, v in exp_2.items()}

{2: 0.86232}

## Cond GPT2

In [8]:
from transformers import GPT2LMHeadModel

mgp = GPT2LMHeadModel.from_pretrained("gpt2")

In [9]:
mgp.cuda()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [10]:
from transformers import GPT2LMHeadModel

class GPT2_token_classifier_bayes(nn.Module):
    def __init__(self, model, tokenizer, n_classes=2):
        super().__init__()
        self.gpt2 = model#GPT2LMHeadModel.from_pretrained("gpt2")
        self.criterion = nn.CrossEntropyLoss()
        self.criterion_full = nn.CrossEntropyLoss(reduction="none")
        self.tokenizer = tokenizer
        self.n_classes = n_classes
        
    def forward(self, b):
        b_cuda = {i:b[i].cuda() for i in b.keys() if i not in ["class_label"]}
        #o_cond = self.gpt2(b_cuda["input_ids"], labels=b_cuda["labels"], 
        #                   attention_mask=b_cuda["attention_mask"])
        o_cond = self.gpt2(b_cuda["input_ids"], 
                           attention_mask=b_cuda["attention_mask"])
        
        #o_raw = self.gpt2(b_cuda["input_ids"][:, 2:], 
        #                   attention_mask=b_cuda["attention_mask"][:, 2:])
       
        
        mask = b_cuda["labels"][:, 1:].ravel() != gpt_tokenizer.pad_token_id
        loss_cond = self.criterion(o_cond.logits[:, 2:-1].reshape(-1, o_cond.logits[:, 2:-1].shape[-1])[mask], 
                              b_cuda["labels"][:, 1:].ravel()[mask])
        #with torch.no_grad():
        #    loss_raw = self.criterion(o_raw.logits[:, :-1].reshape(-1, o_raw.logits[:, :-1].shape[-1])[mask], 
        #                          b_cuda["labels"][:, 1:].ravel()[mask])
        #print(loss_cond, loss_raw)

        return loss_cond #- loss_raw.detach()
    
    def predict(self, b):
        b_cuda = {i:b[i].cuda() for i in b.keys() if i not in ["class_label"]}
        res = []
        for c in range(self.n_classes):
            input_ids = b_cuda["input_ids"] + 0
            input_ids[:, 0] = self.tokenizer._convert_token_to_id(str(c))
        
            o_cond = self.gpt2(input_ids, 
                               attention_mask=b_cuda["attention_mask"])

            o_raw = self.gpt2(b_cuda["input_ids"][:, 2:], 
                             attention_mask=b_cuda["attention_mask"][:, 2:])


            mask = b_cuda["labels"][:, 1:] != gpt_tokenizer.pad_token_id
            
#             print(o_cond.logits[:, 2:-1].shape)
#             print(o_raw.logits[:, :-1].permute(0, 2, 1).shape)
#             print(b_cuda["labels"][:, 1:].shape)
#             print(mask.shape)
            
            
            loss_cond = (self.criterion_full(o_cond.logits[:, 2:-1].permute(0, 2, 1), 
                                  b_cuda["labels"][:, 1:])*mask).sum(axis=-1)

            loss_raw = (self.criterion_full(o_raw.logits[:, :-1].permute(0, 2, 1), 
                                 b_cuda["labels"][:, 1:])*mask).sum(axis=-1)
            res.append(torch.exp(-(loss_cond - loss_raw)))
#             res.append(torch.exp(-(loss_cond)))
        res = torch.stack(res, dim=-1)
        return res, res.argmax(dim=-1)

In [11]:
class ConditionalCollator(object):
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, sequences):
        r"""
        This function allowes the class objesct to be used as a function call.
        Sine the PyTorch DataLoader needs a collator function, I can use this 
        class as a function.

        Arguments:

          item (:obj:`list`):
              List of texts and labels.

        Returns:
          :obj:`Dict[str, object]`: Dictionary of inputs that feed into the model.
          It holddes the statement `model(**Returned Dictionary)`.
        """

        # Get all texts from sequences list.
        

        texts = [str(int(sequence['label'].item())) +" <SEP> " + sequence['text'] for sequence in sequences]
        labels = torch.tensor([sequence['label'] for sequence in sequences])
        # Call tokenizer on all texts to convert into tensors of numbers with 
        # appropriate padding.
    
        out = self.tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True, 
                                max_length=50, return_token_type_ids=True)
        # Update the inputs with the associated encoded labels as tensor.
        out.update({'class_label':torch.tensor(labels)})
        out.update({'labels':out.input_ids[:, 2:]})

        return out







In [12]:
from collections import defaultdict


class ConditionalTrainer:
    def __init__(self, model, n_epochs, train_dataloader, val_dataloader=None, 
                 lr=2e-5, 
                 weight_decay=1e-2, num_warmup_steps=0, accum_steps=1):
        self.model = model
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        
        self.optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=weight_decay)
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer, 
                                num_warmup_steps = num_warmup_steps, # Default value in run_glue.py
                                num_training_steps = n_epochs * len(train_dataloader))
        self.n_epochs = n_epochs
        
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader

        self.accum_steps = accum_steps
        
    def train(self):
        
        log_dict = defaultdict(list)
        
        for epoch in tqdm(range(self.n_epochs)):
            train_loss = []
            train_acc = []
            pred_list = []
            labels_list = []
            for i, b in enumerate(tqdm(self.train_dataloader, leave=False)):
                loss = self.model(b) - np.log(0.5).item()
                loss.backward()
#                 pred = out.detach().cpu().numpy() > 0
                train_loss.append(loss.item())
#                 pred_list.append(pred)
#                 labels_list.append(b["labels"].detach().cpu().numpy())
                
                if i % self.accum_steps == 0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

                    self.optimizer.step()
                    self.scheduler.step()
                    self.optimizer.zero_grad()
             

#             print(np.concatenate(pred_list).shape, np.concatenate(labels_list).shape)


#             train_acc = (np.concatenate(pred_list) == np.concatenate(labels_list)).mean()
            train_acc = 0
            
        
            print(f"EPOCH: {epoch} | tr loss: {np.mean(train_loss)} | tr acc: {train_acc}")
            log_dict["train_loss"].append(np.mean(train_loss))
            log_dict["train_acc"].append(train_acc)
            if self.val_dataloader is not None:
                with torch.no_grad():
                    val_loss = []
                    val_acc = []
                    pred_list = []
                    labels_list = []
                    for b in tqdm(self.val_dataloader, leave=False):                        
                        res, pred  = self.model.predict(b)


                        pred = pred.detach().cpu().numpy() > 0
                        #val_loss.append(loss.item())
                        pred_list.append(pred)
                        labels_list.append(b["class_label"].detach().cpu().numpy())
                val_acc = (np.concatenate(pred_list) == np.concatenate(labels_list)).mean()
                
                
                print(f"val loss: {np.mean(val_loss)} | val acc: {val_acc}")
                log_dict["val_loss"].append(np.mean(val_loss))
                log_dict["val_acc"].append(val_acc)
        return log_dict

In [13]:
np.log(0.5).item()

-0.6931471805599453

In [13]:
gpt_tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path="gpt2")

gpt_tokenizer.padding_side = "left"
gpt_tokenizer.pad_token = "<PAD>"
gpt_tokenizer.bos_token = "<BOS>"
gpt_tokenizer.sep_token = "<SEP>"



collator = ConditionalCollator(gpt_tokenizer)



train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, 
                              collate_fn=collator)

val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, 
                              collate_fn=collator)
    


In [15]:
r = []
for i in train_dataset:
    r.append(i["label"].item())
np.unique(r, return_counts=True)   

(array([0., 1.]), array([12500, 12500]))

In [22]:
model = GPT2_token_classifier_bayes(mgp, gpt_tokenizer, 2)
#model.to(device)

In [23]:
trainer = ConditionalTrainer(model, 4, train_dataloader, val_dataloader, lr=1e-3)

In [24]:
res_gpt_bayes = trainer.train()

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/782 [00:00<?, ?it/s]

  out.update({'class_label':torch.tensor(labels)})


EPOCH: 0 | tr loss: 4.048966794367641 | tr acc: 0


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: nan | val acc: 0.76676


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


  0%|          | 0/782 [00:00<?, ?it/s]

EPOCH: 1 | tr loss: 3.941113164967588 | tr acc: 0


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: nan | val acc: 0.76524


  0%|          | 0/782 [00:00<?, ?it/s]

EPOCH: 2 | tr loss: 3.8679225575893432 | tr acc: 0


  0%|          | 0/782 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [25]:
mgp = GPT2LMHeadModel.from_pretrained("gpt2")
mgp.cuda()
model = GPT2_token_classifier_bayes(mgp, gpt_tokenizer, 2)
trainer = ConditionalTrainer(model, 4, train_dataloader, val_dataloader, lr=2e-4)

In [26]:
res_gpt_bayes2 = trainer.train()

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/782 [00:00<?, ?it/s]

  out.update({'class_label':torch.tensor(labels)})


EPOCH: 0 | tr loss: 3.847259479105625 | tr acc: 0


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: nan | val acc: 0.76732


  0%|          | 0/782 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [14]:
from transformers import GPT2LMHeadModel

class GPT2_token_classifier_bayes_opposite(nn.Module):
    def __init__(self, model, tokenizer, n_classes=2):
        super().__init__()
        self.gpt2 = model#GPT2LMHeadModel.from_pretrained("gpt2")
        self.criterion = nn.CrossEntropyLoss()
        self.criterion_full = nn.CrossEntropyLoss(reduction="none")
        self.tokenizer = tokenizer
        self.n_classes = n_classes
        
    def forward(self, b):
        b_cuda = {i:b[i].cuda() for i in b.keys() if i not in ["class_label"]}
        
        input_ids_opp = b["input_ids"] + 0 
        
        classes = b["class_label"].numpy().tolist()
        opposite_c = torch.tensor([self.tokenizer._convert_token_to_id(str(int(1 - i))) for i in classes])
        input_ids_opp[:, 0] = opposite_c
        input_ids_opp = input_ids_opp.cuda()
        
        
        
        o_cond = self.gpt2(b_cuda["input_ids"], 
                           attention_mask=b_cuda["attention_mask"])
        
        o_cond_opp = self.gpt2(input_ids_opp, 
                          attention_mask=b_cuda["attention_mask"])
       
        
        mask = b_cuda["labels"][:, 1:].ravel() != gpt_tokenizer.pad_token_id
        loss_cond = self.criterion(o_cond.logits[:, 2:-1].reshape(-1, o_cond.logits[:, 2:-1].shape[-1])[mask], 
                              b_cuda["labels"][:, 1:].ravel()[mask])
        
        loss_cond_opp = self.criterion(o_cond_opp.logits[:, 2:-1].reshape(-1, o_cond_opp.logits[:, 2:-1].shape[-1])[mask], 
                              b_cuda["labels"][:, 1:].ravel()[mask])
        #with torch.no_grad():
        #    loss_raw = self.criterion(o_raw.logits[:, :-1].reshape(-1, o_raw.logits[:, :-1].shape[-1])[mask], 
        #                          b_cuda["labels"][:, 1:].ravel()[mask])
        #print(loss_cond, loss_raw)

        return loss_cond - loss_cond_opp#- loss_raw.detach()
    
    def predict(self, b):
        b_cuda = {i:b[i].cuda() for i in b.keys() if i not in ["class_label"]}
        res = []
        for c in range(self.n_classes):
            input_ids = b_cuda["input_ids"] + 0
            input_ids[:, 0] = self.tokenizer._convert_token_to_id(str(c))
        
            o_cond = self.gpt2(input_ids, 
                               attention_mask=b_cuda["attention_mask"])

            o_raw = self.gpt2(b_cuda["input_ids"][:, 2:], 
                              attention_mask=b_cuda["attention_mask"][:, 2:])


            mask = b_cuda["labels"][:, 1:] != gpt_tokenizer.pad_token_id
            
#             print(o_cond.logits[:, 2:-1].shape)
#             print(o_raw.logits[:, :-1].permute(0, 2, 1).shape)
#             print(b_cuda["labels"][:, 1:].shape)
#             print(mask.shape)
            
            
            loss_cond = (self.criterion_full(o_cond.logits[:, 2:-1].permute(0, 2, 1), 
                                  b_cuda["labels"][:, 1:])*mask).sum(axis=-1)

            loss_raw = (self.criterion_full(o_raw.logits[:, :-1].permute(0, 2, 1), 
                                  b_cuda["labels"][:, 1:])*mask).sum(axis=-1)
            res.append(torch.exp(-(loss_cond - loss_raw)))
        res = torch.stack(res, dim=-1)
        return res, res.argmax(dim=-1)

In [15]:
model = GPT2_token_classifier_bayes_opposite(mgp, gpt_tokenizer, 2)
#model.to(device)

In [16]:
trainer = ConditionalTrainer(model, 4, train_dataloader, val_dataloader, lr=1e-4)

In [17]:
res_gpt_bayes_opp = trainer.train()

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/782 [00:00<?, ?it/s]

  out.update({'class_label':torch.tensor(labels)})


EPOCH: 0 | tr loss: -30.526235630750048 | tr acc: 0


  0%|          | 0/782 [00:00<?, ?it/s]

val loss: nan | val acc: 0.54692


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


  0%|          | 0/782 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
gpt_tokenizer.pad_token_id

In [None]:
gpt_tokenizer.sep_token_id = 50257

In [None]:
gpt_tokenizer.sep_token_id