In [220]:
!pip install transformers



In [3]:
!pip install tqdm



In [358]:
print(transformers.__version__)
print(torch.__version__)

4.24.0
1.13.0.dev20220619


In [260]:
import transformers

from torch.utils.data import Dataset
import json

from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from torch.optim import Adam, AdamW
from torch.utils.data import DataLoader
import tqdm
import torch

In [222]:
print(transformers.__version__)

4.24.0


In [172]:
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

# Define our Dataset class.

In [346]:
class ASCIIDataset(Dataset):
    def __init__(self, path, tokenizer):
        '''
            Path: string of the path to the raw data to convert to a dataset.

                The path should a contain a file formatted as an array of JSON objects where the object is of the form:
                    {
                        "prompt": "...",
                        "text": "..."
                    }

            Tokenizer:

                The tokenizer used on the prompts and responses of the dataset. Should be GPT2 pre-trained tokenizer.
        '''
        self.data = None
        self.X = []
        self.tokenizer = tokenizer
        # ID OF THE TOKEN USED TO INDICATE WHEN A RESPONSE BEGINS
        self.res_id = 50260 # = self.decode_str("<RES>:")


        with open(path, 'r') as file:
            self.data = json.load(file)
            for entry in self.data[:100]:
                prompt = entry['prompt'] 
                text = entry['text']
                self.X.append(f'<BOS> {prompt}\n\n<RES>:\n\n{text}\n\n<EOS>')
            # print(test[0])

        # for entry in self.data:
        #     prompt = entry['prompt']
        #     text = entry['text']
        #     self.X.append(f'<BOS> {prompt} <bot>: {text} <EOS>')
        



        print("Tokenizing Text...")
        self.X_encoded = self.tokenizer(self.X, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
        # print(self.X_encoded.size())
        # self.X_encoded = self.X_encoded.to(device)
        print("Done Tokenizing.")
        self.input_ids = self.X_encoded['input_ids']
        print(self.input_ids.size())
        self.attention_mask = self.X_encoded['attention_mask']
        '''
            https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2Tokenizer
            
            1 for tokens that are not masked,
            0 for tokens that are masked.
        '''
        # Mask the ground truth responses so the model can learn.
        for i in range(len(self.X)):
            res_i = (self.input_ids[i] == self.res_id).nonzero(as_tuple=True)[0]
            self.attention_mask[i][res_i + 1:] = 0
            # print(res_i)
            # print(self.attention_mask[i].sum())

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_mask[idx])
        # return {'input_ids': self.input_ids[idx], 'attention_mask': self.attention_mask[idx]}
        # return (self.X[idx])
    def decode(self, tokens):
        return self.tokenizer.decode(tokens)
    
    def decode_token(self, token_id):
        return self.tokenizer.decoder.get(token_id)
    
    def decode_str(self, word):
        return self.tokenizer.get_vocab()[word]

## Instantiate our Tokenizer, model, dataset, and optimizer

In [347]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "<pad>", 
                                "bos_token": "<BOS>",
                                "eos_token": "<EOS>"})
tokenizer.add_tokens(["<RES>:"])

model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
model.resize_token_embeddings(len(tokenizer))

# optim = Adam(model.parameters())

ASCII_DATA = ASCIIDataset("./raw_data.json", tokenizer)

Tokenizing Text...
Done Tokenizing.
torch.Size([100, 1024])


In [348]:
print(ASCII_DATA[0][0].size())

# for _ in (ASCII_DATA[0][1]):
#     print(_)
ASCII_DATA.decode_str("<RES>:")
# ASCII_DATA[0]['input_ids'].size()
# for _ in range(5):
#     print(ASCII_DATA[_][0][:10])
#     print(ASCII_DATA[_][1][:20])
#     print("\n\n\n")

torch.Size([1024])


50260

In [349]:
dataloader = DataLoader(ASCII_DATA, batch_size=16) # can be changed to the number of available cores

In [355]:
def train(model, optimizer, dataloader):
 
    e = 1

    model.train()

    for _ in tqdm.tqdm(range(e)):

        for b_ids, b_masks in dataloader:

            # batch = {k:v.type(torch.long).to(device) for k,v in batch.items()}
            # b_ids = batch['input_ids']
            # b_masks = batch['attention_mask']
            # b_ids = torch.tensor(b_ids.to(device)).unsqueeze(0)
            # b_ids = batch['input_ids']
            # b_masks = batch['attention_mask']
            # b_ids = b_ids.to(device)
            # b_labels = b_ids.to(device)
            # b_masks = torch.tensor(b_masks).to(device)
            b_ids = b_ids.to(device)
            b_masks = b_masks.to(device)
            b_labels =

            print(b_ids.size(), b_masks.size(), b_labels.size())

            loss = model(input_ids=b_ids, 
                         labels=b_labels,
                         attention_mask=b_masks).loss()


            # print(batch['input_ids'].size())
            # print(batch['input_ids'])
#           p

            # test = torch.ones(16, 1024).type(torch.long).to(device)
            # test = torch.ones(1024).type(torch.long).to(device)
            # print(test.unsqueeze(0))
            # print(test.unsqueeze(0).size())

            # loss = model(input_ids=test.unsqueeze(0))
            # loss = model(input_ids=batch['input_ids'])


            # print(test.size())

            # loss = torch.zeros(100, 1023)

            # print(loss)

            break

        # for (b_ids, b_masks) in dataloader:
        #     # b_ids = torch.tensor(b_ids.to(device)).unsqueeze(0)
        #     b_ids = torch.tensor(b_ids).unsqueeze(0).to(device)
        #     b_labels = b_ids.to(device)
        #     b_masks = torch.tensor(b_masks).unsqueeze(0).to(device)

        #     print(b_ids.size(), b_masks.size(), b_labels.size())

        #     loss = model(**b_ids, 
        #                 #  labels=b_labels,
        #                  attention_mask=b_masks).loss()
        #     loss.backward()
        #     optimizer.step()
            
        torch.save(model.state_dict(), f"model_state_{_}.pt")
        break

In [248]:
def prompt(prompt, text):
    inp = f'<BOS> {prompt}\n<RES>:\n{text}\n<EOS>'
    inp = tokenizer(inp, return_tensors="pt")
    x = inp["input_ids"].to(device)
    a = inp["attention_mask"].to(device)
    output = model.generate(x, attention_mask=a)
    output = tokenizer.decode(output[0])
    return output

In [353]:
model = model.to(device)
model.train()

BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 3e-5
WARMUP_STEPS = 5000
MAX_SEQ_LEN = 1024

optim = AdamW(model.parameters(), lr=LEARNING_RATE)
# scheduler = WarmupLinearSchedule(optimizer, warmup_steps=WARMUP_STEPS, t_total = -1)

In [356]:
train(model, optim, dataloader)

  0%|          | 0/1 [00:00<?, ?it/s]

torch.Size([16, 1024]) torch.Size([16, 1024]) torch.Size([16, 1024])





RuntimeError: tensors must be 2-D

In [None]:
<BOS> Apple emoji of grinning face <RES>: @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@.;+++****++++;@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@+++************+++@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@+++***??%%%%%%??***+++@@@@@@@@@@@@@
@@@@@@@@@@@@+++**?%SS#########S%?**+++@@@@@@@@@@@
@@@@@@@@@@@++**?%S###############S?**++@@@@@@@@@@
@@@@@@@@@+++**%S##################S%?*+++@@@@@@@@
@@@@@@@@:++*?S##S###############SS##S?*++:@@@@@@@
@@@@@@@*++*?SSSSSSSSS#########SSSSSSSS?*++*@@@@@@
@@@@@@@++*?SSSSSSSSSSSSSSSSSSSSSSSSSSSS?*++@@@@@@
@@@@@@++**%SSSSSSSSSSSSSSSSSSSSSSSSSSSSS?*++@@@@@
@@@@@++**%%%SSSSSSSSSSSSSSSSSSSSSSSSSS%%%**++@@@@
@@@@@++*?%%%%SSSSSSSSSSSSSSSSSSSSSSS%%%%%%*++@@@@
@@@@++*?%%%%%%%%%%%%SSSSSSSSS%%%%%%%%%%%%%?*++@@@
@@@@++*?%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%?*++@@@
@@@;++*??%%%%%%%%??%%%%%%%%%%%?*?%%%%%%%%???*+;@@
@@@++*?????%%%%%+,,;%%%%%%%%%?:,:*%%%%%?????*++@@
@@:++*???????%%*,,,,+%%%%%%%%;,:,:?%%???????*++;@
@@+++*????????%+,:::;%%%%%%%?::::,*?????????*++;@
@@;++*?????????+::;:;?????%%?::;::*?????????*++;@
@@+++**????????*::::+????????;::::?????????**+++@
@@+++**?????????+::;?????????*:::*?????????**++;@
@@;++***?????????**????????????*??????????***++;@
@@;++***??????????????????????????????????***++;@
@@;++***??????????????????????????????????***++;@
@@;++***??????????????????????????????????***++;@
@@;++***?**???????????????????????????*******++;@
@@;;+***?+,:+**????????????????????*+;:,*?***++;@
@@;;++***;:;,,::;++****????****++;:,,:;,+***++;;@
@@;;++***;;S%?*;:,,,,,,,,,,,,,,,,:;*?%?:****++;;@
@@?;+++**+:%SSSSS%%??*++++++*??%%SSSSS*:***+++;+@
@@@;;++***:*SSSS##################SSS%;;***++;;@@
@@@;;+++**;:*?%SS################SS%?+:***+++;;@@
@@@@;;++***::::+*?%SS######SSS%?*+;::,;**+++;;@@@
@@@@;;+++**+::::::::;;;;;;;;;:::::::::***+++;;@@@
@@@@:;;+++**;::;::::::::::::::::::;::+**+++;;@@@@
@@@@@;;;+++**;::::::::::::::::::::::+**+++;;;@@@@
@@@@@@;;;+++**;::::::::::::::::::::+**+++;;;@@@@@
@@@@@@@;;;+++**+:::::::::::::::::;***+++;;;@@@@@@
@@@@@@@+;;;++++**;:::::::::::::;+**++++;;;+@@@@@@
@@@@@@@@*;;+++++***+;::::::::;+***++++;;;*@@@@@@@
@@@@@@@@@;;;++++++****++++++****++++++;;+@@@@@@@@
@@@@@@@@@@@;;;++++++++******++++++++;;;@@@@@@@@@@
@@@@@@@@@@@@;;;;++++++++++++++++++;;;+@@@@@@@@@@@
@@@@@@@@@@@@@@;;;;;++++++++++++;;;;;@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@;;;;;;;;;;;;;;;;;;@@@*?@@@@@@@@@@
@@@@@@@@@@@@@@;+@@+;;;;;;;;;;;;+@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ <EOS> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

In [None]:
<BOS> ASCII art of an emoji of grinning face <RES>: %?????******?????%                    
                S???*****++;;;;;;++*****???S               
             S???**++;::............::;;+**???%            
          ???**+;::.....................:;+**???          
        ???**+::::::::..............::::::::;**???        
       %??**+;:::::::::::::::::::::::::::::::::;**??S      
     ???*+;::::::::::::::::::::::::::::::::::;:;+*???     
    ??**+;;;;;;;;;;;::::::::::::::::::;;;;;;;;;;;+*???    
    %??*+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;+*??%   
   S??**+++++;;;;;;*%SS%+;;;;;;;;;;*%SS%+;;;;;;++++++*???  
  ???*++++++++++;*#SSS#%+;;;;;;;;*##SS#S++++++++++++*??%  
  *%??**++++++++++*#SSSS%++++++++;*SSSSSS+++++++++++**??%? 
 ????***++++++++++*%%%%+++++++++++*%%%%*++++++++++***??%% 
  %%??****+++++++++++++++++++++++++++++++++++++++++***??%% 
  %%??***+++++++++++++++++++++++++++++++++++++++++****??%% 
 ?%???***+*?**++++++++++++++++++++++++++++++**?*+***???%% 
  *%???****%#%%S%%%???****************???%%%S%S#?****???%? 
   %%???***%S;;;++**?%%%%%%SSS%SS%%%%%%??*+++;*S****???%%  
   *%%???***S%;:::........................::;+S?***???%%?  
   ?%%???***SS%??*++;::..........:::;++**?%%S%***???%%?   
    ?%%????**%SSSSSSSSS%%%%%%%%%%%SSSSSSSSSS?***???%%%    
      %%%????**?%SSSSSSSSSSSSSSSSSSSSSSSSSS%***????%%?     
      ?%%%????**?%SSSSSSSSSSSSSSSSSSSSS%?***????%%%*      
         %%%%?????**??%%SSSSSSSSSSSS%%?***?????%%%%        
           %%%%??????***???????????****??????%%%%          
            ?%%%%%???????????*??????????%%%%%%            
                %%%%%%%%%??????????%%%%%%%%%               
                     %%%%%%%%%%%%%%%%%%S <EOS>