In [1]:
# Scratch cell: prints a literal; no other visible cell depends on it.
print('b')

b


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2ForSequenceClassification

# One checkpoint name shared by every model and the tokenizer below.
MODEL_NAME = "distilgpt2"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Scorer used to rank generated samples. The load warning reports that its
# `score.weight` head is newly initialised, so its logits are not a trained
# reward signal unless the head is fine-tuned elsewhere.
sentiment_model = GPT2ForSequenceClassification.from_pretrained(MODEL_NAME).to(device)
# Language model that is sampled from and updated by the training cell.
gpt2_model = GPT2LMHeadModel.from_pretrained(MODEL_NAME).to(device)
# Second copy of the same checkpoint; not referenced by the visible cells.
gpt2_model_ref = GPT2LMHeadModel.from_pretrained(MODEL_NAME).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)

# The tokenizer gets no dedicated pad token here, so EOS doubles as padding.
# Left padding keeps the prompt adjacent to the tokens that are generated.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Give every model the same pad id (the reference copy was previously left
# without one).
sentiment_model.config.pad_token_id = tokenizer.eos_token_id
gpt2_model.config.pad_token_id = tokenizer.eos_token_id
gpt2_model_ref.config.pad_token_id = tokenizer.eos_token_id

Some weights of the model checkpoint at distilgpt2 were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from datasets import load_dataset
from tqdm import tqdm

# Load the JSON records stored under the top-level 'data' field.
# NOTE(review): both splits point at the same "-test" file — confirm this is
# intentional and not a placeholder for separate train/validation files.
datasets = load_dataset(
    "json",
    data_files={
        "train": "../data/tldr-filtered-test.json",
        "validation": "../data/tldr-filtered-test.json",
    },
    field='data',
)

# prep dataset
def tokenize_function(examples, max_length=512, suffix=' TLDR:'):
    """Tokenise a batch of posts into prompts that always end with `suffix`.

    The post body is truncated to leave room for the suffix tokens, which are
    appended afterwards. Appending the suffix to the raw text and then
    truncating (the previous approach) cut the marker off any post longer
    than `max_length` tokens, leaving the model with no summarisation cue.

    Args:
        examples: mapping with a 'content' entry holding a list of strings.
        max_length: maximum prompt length in tokens, suffix included.
        suffix: text appended to every post to cue the summary.

    Returns:
        The padded encoding produced by `tokenizer.pad`, containing
        'input_ids' and 'attention_mask' padded to the longest prompt in
        this batch.
    """
    suffix_ids = tokenizer(suffix)['input_ids']
    body = tokenizer(examples['content'],
                     max_length=max_length - len(suffix_ids),
                     truncation=True)
    input_ids = [ids + suffix_ids for ids in body['input_ids']]
    # Pad to the longest prompt in the batch, as the original call did with
    # padding=True; the attention mask is rebuilt by `pad`.
    output = tokenizer.pad({'input_ids': input_ids}, padding=True)
    return output

# Tokenise every split in batches and drop the raw columns so that only the
# tokenizer outputs remain.
tokenized_datasets = datasets.map(
    tokenize_function,
    remove_columns=datasets["train"].column_names,
    batched=True,
    num_proc=1,
)

Using custom data configuration default
Reusing dataset json (/home/kip/.cache/huggingface/datasets/json/default-a8082db3ce507167/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514)
Loading cached processed dataset at /home/kip/.cache/huggingface/datasets/json/default-a8082db3ce507167/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514/cache-f7fcc241cef89abd.arrow
Loading cached processed dataset at /home/kip/.cache/huggingface/datasets/json/default-a8082db3ce507167/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514/cache-fa5a61946f26d8f2.arrow


In [4]:
def collate_wrapper(batch):
    """Collate dataset examples into one padded batch of PyTorch tensors.

    Delegates to `tokenizer.pad`, so padding follows the tokenizer's
    configured pad token and padding side.
    """
    padded = tokenizer.pad(batch, return_tensors='pt')
    return padded

from torch.utils.data import DataLoader
# Batches of 32 tokenised prompts, padded per batch by collate_wrapper.
loader = DataLoader(
    tokenized_datasets['train'],
    batch_size=32,
    collate_fn=collate_wrapper,
    pin_memory=False,
)

In [5]:
# NOTE(review): `response_len` and `sequences` are not read by the visible
# cells — the training loop passes its own literals (32 and 6) to
# query_model. Confirm whether they should be wired in or removed.
response_len = 64
sequences = 4
# Number of top-scoring generated samples kept per batch for the LM update.
keep = 2
    
def query_model(query, batch_size, response_len = 32, samples_per_input = 2):
    """Sample continuations from `gpt2_model` for a batch of prompts.

    Prompts are processed in chunks of `batch_size` rows. Every row is
    covered, including a trailing chunk smaller than `batch_size` (the
    previous `int(n / batch_size)` loop silently dropped it).

    Args:
        query: mapping with 2-D 'input_ids' and 'attention_mask' tensors
            that share the same first dimension.
        batch_size: number of prompts passed to one `generate` call.
        response_len: number of tokens allowed beyond the padded prompt
            width.
        samples_per_input: sequences sampled per prompt.

    Returns:
        A tensor on `device` holding all sampled sequences (prompt plus
        continuation), `samples_per_input` rows per prompt, padded by the
        tokenizer to a common length.
    """
    input_ids = query['input_ids']
    attention_mask = query['attention_mask']
    # All prompts share one padded width, so the length cap is constant.
    max_length = attention_mask.size(1) + response_len

    response_tensors = []
    for start in range(0, input_ids.size(0), batch_size):
        stop = start + batch_size
        with torch.no_grad():
            generation_output = gpt2_model.generate(input_ids=input_ids[start:stop],
                                                    attention_mask=attention_mask[start:stop],
                                                    max_length=max_length,
                                                    do_sample=True,
                                                    top_p = 1.0,
                                                    num_return_sequences=samples_per_input)
        # Chunks can differ in width, so collect individual rows and let the
        # tokenizer pad them to one length afterwards.
        response_tensors.extend(generation_output)

    torch.cuda.empty_cache()

    if not response_tensors:
        # No prompts were supplied: return an empty batch instead of failing
        # inside the padding step.
        return torch.empty((0, max_length), dtype=torch.long, device=device)

    return tokenizer.pad({'input_ids': response_tensors},
                         padding=True)['input_ids'].to(device)

def shift_left(inputs, pad_token_id=50256):
    """Pack the non-pad tokens of every row to the left and build a mask.

    Each row of `inputs` is rewritten so that all tokens different from
    `pad_token_id` come first, in their original order, followed by
    `pad_token_id` up to the original width. Every occurrence of
    `pad_token_id` is treated as padding, including ones in the middle of a
    row.

    Args:
        inputs: 2-D integer tensor of token ids.
        pad_token_id: id treated as padding; defaults to 50256, the value
            this function previously hard-coded.

    Returns:
        dict with 'input_ids' (the packed ids) and 'attention_mask' (1 on
        real tokens, 0 on padding), both with the shape and dtype of
        `inputs`.
    """
    input_ids = torch.full_like(inputs, pad_token_id)
    attention_mask = torch.zeros_like(inputs)
    is_token = torch.ne(inputs, pad_token_id)

    for row in range(is_token.shape[0]):
        n_tokens = int(is_token[row].sum())
        # Boolean indexing preserves the left-to-right order of kept tokens.
        input_ids[row, :n_tokens] = inputs[row, is_token[row]]
        attention_mask[row, :n_tokens] = 1

    return {'input_ids': input_ids, 'attention_mask': attention_mask}

def calculate_scores(generated, batch_size, model=None):
    """Score sequences with a classifier, `batch_size` rows at a time.

    Every row is scored, including a trailing chunk smaller than
    `batch_size` (the previous `int(n / batch_size)` loop dropped it). Only
    a trailing size-1 logit dimension is squeezed, so a chunk holding a
    single row stays 1-D and can still be concatenated.

    Args:
        generated: mapping with 2-D 'input_ids' and 'attention_mask' tensors.
        batch_size: number of rows sent through the model per forward pass.
        model: callable accepting `input_ids` and `attention_mask` keyword
            arguments and returning a mapping with a 'logits' entry.
            Defaults to the notebook-level `sentiment_model`.

    Returns:
        The per-chunk logits concatenated along dim 0. When the model emits
        one logit per row this is a 1-D tensor with one score per input row.
        An empty 1-D tensor is returned for an empty input.
    """
    scorer = sentiment_model if model is None else model
    input_ids = generated['input_ids']
    attention_mask = generated['attention_mask']

    scores_tensor = []
    for start in range(0, input_ids.size(0), batch_size):
        stop = start + batch_size
        with torch.no_grad():
            logits = scorer(input_ids = input_ids[start:stop],
                            attention_mask = attention_mask[start:stop])['logits']
        # squeeze(-1) leaves the batch dimension alone even when it is 1.
        scores_tensor.append(logits.squeeze(-1))

    torch.cuda.empty_cache()

    if not scores_tensor:
        return torch.empty(0, device=input_ids.device)
    return torch.cat(scores_tensor)

In [7]:
import wandb
wandb.init(project='transformer_bap')

optimizer = torch.optim.Adam(gpt2_model.parameters(), lr=4e-5)

# Loop outline: sample several continuations per prompt, score them, keep
# the `keep` highest-scoring sequences and take a language-modelling step on
# those sequences only.
optimizer.zero_grad()
for idx, batch in enumerate(tqdm(loader)):
    batch = batch.to(device)

    generated = query_model(batch, batch_size = 2, response_len = 32, samples_per_input = 6)

    # Move real tokens to the front so padding sits on the right for the
    # scorer and for the LM update.
    left_generated = shift_left(generated)

    scores = calculate_scores(left_generated, batch_size = 2)

    # assumes `scores` is 1-D (one logit per sequence) — TODO confirm; a 2-D
    # tensor would be sorted per row instead of ranking the sequences.
    _, sorted_idx = torch.sort(scores, descending=True)
    sorted_idx = sorted_idx[:keep]

    selected_ids = left_generated['input_ids'][sorted_idx]
    # Fix: the attention mask must come from 'attention_mask'. Passing the
    # token ids here (as before) handed raw ids to the model as a mask.
    # NOTE(review): labels still include the pad/EOS positions, so padding
    # contributes to the loss — consider setting those labels to -100.
    out = gpt2_model(input_ids = selected_ids,
               attention_mask = left_generated['attention_mask'][sorted_idx],
               labels = selected_ids.clone())

    out['loss'].backward()

    # Accumulate gradients over two batches before each optimizer update.
    if (idx + 1) % 2 == 0:
        optimizer.step()
        optimizer.zero_grad()

    # Log plain Python floats rather than tensors.
    wandb.log({
        "loss": out['loss'].item(),
        "scores": scores.mean().item()
    })

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
loss,7.61611
scores,1.07309
_runtime,6.0
_timestamp,1623881977.0
_step,0.0


0,1
loss,▁
scores,▁
_runtime,▁
_timestamp,▁
_step,▁


[34m[1mwandb[0m: wandb version 0.10.32 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


 46%|████▌     | 579/1263 [56:15<1:06:27,  5.83s/it]


KeyboardInterrupt: 

In [9]:
# Inspect the shape of the `scores` tensor left in the kernel by the training cell.
scores.shape

torch.Size([128])

In [14]:
# Decode one generated row (index 10) from the last training batch, dropping special tokens.
tokenizer.decode(generated[10], skip_special_tokens=True)

'Before I explain the situation that has led up to this, just know this is my first post and I\'m on mobile. I apologize for my bad grammar. \n My baby daddy and I have always had problems. We had an off-and-on-again type of relationship. He was into drugs and I was left supporting a beautiful baby girl without his help. Despite all the fuck ups, I always took him back. \n We recently split up again because he was falling into the hole again. He would hardly ask about our baby and eventually he "vanished." I was devastated but I always knew he would come back to us. He came back alright... the first call I received from him in a few weeks was from the hospital. He was hit by a car and broke his arm. I was scared, worried, and angry because I didn\'t know where he was until that day. However, he was not alone. He admitted to his girlfriend being there. I was taken aback. He never mentioned her up until that point. He said her name was Cassidy. \n Rewind a month earlier, we were together

In [10]:
# Show the indices of the samples selected in the last training iteration.
sorted_idx

tensor([ 92, 120,  26, 114,  81,  16,  29,   3,   2,   5, 108,  79,  89,  28,
          0, 113], device='cuda:0')