In [1]:
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [2]:
!pip install transformers



In [3]:
!pip install tqdm



In [4]:
import transformers

from torch.utils.data import Dataset
import json

from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from torch.optim import Adam, AdamW
from torch.utils.data import DataLoader
import tqdm
import torch

In [5]:
print(transformers.__version__)
print(torch.__version__)

4.24.0
1.13.0.dev20220619


In [6]:
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

device

'mps'

# Define our Dataset class.

In [9]:
class ASCIIDataset(Dataset):
    def __init__(self, path, tokenizer):
        '''
            Path: string of the path to the raw data to convert to a dataset.

                The path should a contain a file formatted as an array of JSON objects where the object is of the form:
                    {
                        "prompt": "...",
                        "text": "..."
                    }

            Tokenizer:

                The tokenizer used on the prompts and responses of the dataset. Should be GPT2 pre-trained tokenizer.
        '''
        self.data = None
        self.X = []
        self.tokenizer = tokenizer
        # ID OF THE TOKEN USED TO INDICATE WHEN A RESPONSE BEGINS
        self.res_id = 50260 # = self.decode_str("<RES>:")


        with open(path, 'r') as file:
            self.data = json.load(file)
            # for entry in self.data[:100]:
            #     prompt = entry['prompt'] 
            #     text = entry['text']
            #     self.X.append(f'<BOS> {prompt}\n<RES>:\n{text}\n<EOS>')

            # For now just train on the first image repeatedly to see if it can learn on it
            for entry in self.data[:1]:
                prompt = entry['prompt'] 
                text = entry['text']
                self.X.append(f'<BOS> {prompt}\n<RES>:\n{text}\n<EOS>')
            for _ in range(99):
                self.X.append(self.X[0])
            # print(test[0])

        # for entry in self.data:
        #     prompt = entry['prompt']
        #     text = entry['text']
        #     self.X.append(f'<BOS> {prompt} <bot>: {text} <EOS>')
        



        print("Tokenizing Text...")
        self.X_encoded = self.tokenizer(self.X, truncation=True, return_tensors="pt")
        # for I in range(999):
        #     self.X_encoded.append(self.X_encoded[0])
        # self.X_encoded = 
        # print(self.X_encoded.size())
        # self.X_encoded = self.X_encoded.to(device)
        print("Done Tokenizing.")
        self.input_ids = self.X_encoded['input_ids']
        print(self.input_ids.size())
        self.attention_mask = self.X_encoded['attention_mask']
        '''
            https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2Tokenizer
            
            1 for tokens that are not masked,
            0 for tokens that are masked.
        '''
        # Mask the ground truth responses so the model can learn.
        for i in range(len(self.X)):
            res_i = (self.input_ids[i] == self.res_id).nonzero(as_tuple=True)[0]
            self.attention_mask[i][res_i + 1:] = 0
        #     # print(res_i)
        #     # print(self.attention_mask[i].sum())

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_mask[idx])
        # return {'input_ids': self.input_ids[idx], 'attention_mask': self.attention_mask[idx]}
        # return (self.X[idx])
    def decode(self, tokens):
        return self.tokenizer.decode(tokens)
    
    def decode_token(self, token_id):
        return self.tokenizer.decoder.get(token_id)
    
    def decode_str(self, word):
        return self.tokenizer.get_vocab()[word]

## Instantiate our Tokenizer, model, dataset, and optimizer

In [10]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "<pad>", 
                                "bos_token": "<BOS>",
                                "eos_token": "<EOS>"})
tokenizer.add_tokens(["<RES>:"])

model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
model.resize_token_embeddings(len(tokenizer))

# optim = Adam(model.parameters())

ASCII_DATA = ASCIIDataset("./raw_data.json", tokenizer)

Tokenizing Text...
Done Tokenizing.
torch.Size([100, 1024])


In [11]:
len(ASCII_DATA)
ASCII_DATA.decode(ASCII_DATA[0][0])

'<BOS> Apple emoji of grinning face <RES>:...........................................................\n.....................%?????******?????%....................\n................S???*****++;;;;;;++*****???S...............\n.............S???**++;::,,,,,,,,,,,,::;;+**???%............\n...........???**+;::,,,,,,,,,,,,,,,,,,,,,:;+**???..........\n.........???**+::::::::,,,,,,,,,,,,,,::::::::;**???........\n.......%??**+;:::::::::::::::::::::::::::::::::;**??S......\n......???*+;::::::::::::::::::::::::::::::::::;:;+*???.....\n.....??**+;;;;;;;;;;;::::::::::::::::::;;;;;;;;;;;+*???....\n....%??*+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;+*??%...\n...S??**+++++;;;;;;*%SS%+;;;;;;;;;;*%SS%+;;;;;;++++++*???..\n...???*++++++++++;*#SSS#%+;;;;;;;;*##SS#S++++++++++++*??%..\n..*%??**++++++++++*#SSSS%++++++++;*SSSSSS+++++++++++**??%?.\n..????***++++++++++*%%%%+++++++++++*%%%%*++++++++++***??%%.\n..%%??****+++++++++++++++++++++++++++++++++++++++++***??%%.\n..%%??***++++++++++++++++++++++++++++++++++

In [12]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

count_parameters(model)

354827264

In [13]:
print(ASCII_DATA[0][0].size())

# for _ in (ASCII_DATA[0][1]):
#     print(_)
ASCII_DATA.decode_str("<RES>:")
# ASCII_DATA[0]['input_ids'].size()
# for _ in range(5):
#     print(ASCII_DATA[_][0][:10])
#     print(ASCII_DATA[_][1][:20])
#     print("\n\n\n")

torch.Size([1024])


50260

In [14]:
dataloader = DataLoader(ASCII_DATA, batch_size=4) # can be changed to the number of available cores

In [15]:
def train(model, optimizer, dataloader):
 
    e = 1

    model.train()

    for _ in tqdm.tqdm(range(e)):

        for b_ids, b_masks in dataloader:
            

            b_ids = b_ids.to(device)
            b_masks = b_masks.to(device)
            b_labels = b_ids

            # print(b_ids.size(), b_masks.size(), b_labels.size())


            optim.zero_grad()

            out = model(input_ids=b_ids, 
                         labels=b_labels,
                         attention_mask=b_masks)
            
            loss = out.loss
            # print('poo')
            # print(loss.loss)
            # test = loss.loss()
            # print('poo')
            
            print(f'current loss: {loss}')
            loss.backward()
            optim.step()


        torch.save(model.state_dict(), f"model_state_{_}.pt")

In [20]:
'''
    The JSON input should be of the same form that is passed to the dataloader.

    {
        "prompt: "...",
        "text": "..."
    }
'''

def prompt(prompt):

    # prompt = json_input['prompt']
    # text = ""
    # inp = f'<BOS> {prompt}\n<RES>:\n{tbext}\n<EOS>'
    inp = f'<BOS> {prompt}\n<RES>:\n'
    print(inp)
    # inp = tokenizer(inp, return_tensors="pt")
    # inp = tokenizer(inp, max_new_tokens=1024, truncation=True, padding="max_length", return_tensors="pt")
    inp = tokenizer(prompt, return_tensors='pt')
    # print(inp.size())
    x = inp["input_ids"].to(device)
    a = inp["attention_mask"].to(device)
    # print(a)

    # res_i = (x == 50260).nonzero(as_tuple=True)[0]
    # a[res_i + 1:] = 0

    pad_token = ASCII_DATA.decode_str("<pad>")
    # eos_token = ASCII_DATA.decode_str("<pad>")

    output = model.generate(x, attention_mask=a, pad_token_id=pad_token, max_length=1024)
    output = tokenizer.decode(output[0])
    return output

In [19]:
prompt('Apple emoji of grinning face')

<BOS> Apple emoji of grinning face
<RES>:



NotImplementedError: The operator 'aten::cumsum.out' is not current implemented for the MPS device. If you want this op to be added in priority during the prototype phase of this feature, please comment on https://github.com/pytorch/pytorch/issues/77764. As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.

In [18]:
model = model.to(device)
model.train()

BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 3e-5
WARMUP_STEPS = 5000
MAX_SEQ_LEN = 1024

optim = AdamW(model.parameters(), lr=LEARNING_RATE)
# scheduler = WarmupLinearSchedule(optimizer, warmup_steps=WARMUP_STEPS, t_total = -1)

In [21]:
train(model, optim, dataloader)

  0%|          | 0/1 [00:01<?, ?it/s]


RuntimeError: tensors must be 2-D

In [None]:
!nvidia-smi

In [61]:
print(prompt("Apple emoji of grinning face"))

<BOS> Apple emoji of grinning face
<RES>:

Apple emoji of grinning face <pad> @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@
@@@@@@@@+++************+++@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@+++************+++@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@
@@@@@@@@@@+++************+++@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@+++************+++@@@@@@@@@@@@@
@@@@@@@@@@@@@@@
@@@@@@@@:@@@@@@@+++??%%%%%%??***+++@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@
@@@@@@@@@@@@+++**?%SS#########S%?**++@@@@@@@@@@@
@@@@@@@@@@@@@
@@@@@@@@@@@@@++**%S#########S?**++@@@@@@@@@@@@
@@@@@@@@@@@@@
@@@@@@@@:++**%S###############S?**++*@@@@@@@@@@
@@@@@@@@:++*?S#################S%?*++*?S?*++*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*?*

In [47]:
torch.save(model.state_dict(), './test.pt')

In [50]:
tokenizer.save_pretrained('./tokenizer')

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.json',
 './tokenizer/merges.txt',
 './tokenizer/added_tokens.json')

In [51]:
from transformers import pipeline

chef = pipeline('ascii-gen',model='./test.pt', tokenizer='./tokenizer',config={'max_length':1024})

KeyError: ignored

In [None]:
<BOS> Apple emoji of grinning face <RES>:...........................................................
.....................%?????******?????%....................
................S???*****++;;;;;;++*****???S...............
.............S???**++;::,,,,,,,,,,,,::;;+**???%............
...........???**+;::,,,,,,,,,,,,,,,,,,,,,:;+**???..........
.........???**+::::::::,,,,,,,,,,,,,,::::::::;**???........
.......%??**+;:::::::::::::::::::::::::::::::::;**??S......
......???*+;::::::::::::::::::::::::::::::::::;:;+*???.....
.....??**+;;;;;;;;;;;::::::::::::::::::;;;;;;;;;;;+*???....
....%??*+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;+*??%...
...S??**+++++;;;;;;*%SS%+;;;;;;;;;;*%SS%+;;;;;;++++++*???..
...???*++++++++++;*#SSS#%+;;;;;;;;*##SS#S++++++++++++*??%..
..*%??**++++++++++*#SSSS%++++++++;*SSSSSS+++++++++++**??%?.
..????***++++++++++*%%%%+++++++++++*%%%%*++++++++++***??%%.
..%%??****+++++++++++++++++++++++++++++++++++++++++***??%%.
..%%??***+++++++++++++++++++++++++++++++++++++++++****??%%.
..?%???***+*?**++++++++++++++++++++++++++++++**?*+***???%%.
..*%???****%#%%S%%%???****************???%%%S%S#?****???%?.
...%%???***%S;;;++**?%%%%%%SSS%SS%%%%%%??*+++;*S****???%%..
...*%%???***S%;:::,,,,,,,,,,,,,,,,,,,,,,,,::;+S?***???%%?..
....?%%???***SS%??*++;::,,,,,,,,,,:::;++**?%%S%***???%%?...
.....?%%????**%SSSSSSSSS%%%%%%%%%%%SSSSSSSSSS?***???%%%....
......%%%????**?%SSSSSSSSSSSSSSSSSSSSSSSSSS%***????%%?.....
.......?%%%????**?%SSSSSSSSSSSSSSSSSSSSS%?***????%%%*......
.........%%%%?????**??%%SSSSSSSSSSSS%%?***?????%%%%........
...........%%%%??????***???????????****??????%%%%..........
.............?%%%%%???????????*??????????%%%%%%............
................%%%%%%%%%??????????%%%%%%%%%...............
.....................%%%%%%%%%%%%%%%%%%S................... <EOS> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

In [None]:
<BOS> Please generate ASCII art of an emoji of grinning face <RES>: %?????******?????%                    
                S???*****++;;;;;;++*****???S               
             S???**++;::............::;;+**???%            
          ???**+;::.....................:;+**???          
        ???**+::::::::..............::::::::;**???        
       %??**+;:::::::::::::::::::::::::::::::::;**??S      
     ???*+;::::::::::::::::::::::::::::::::::;:;+*???     
    ??**+;;;;;;;;;;;::::::::::::::::::;;;;;;;;;;;+*???    
    %??*+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;+*??%   
   S??**+++++;;;;;;*%SS%+;;;;;;;;;;*%SS%+;;;;;;++++++*???  
  ???*++++++++++;*#SSS#%+;;;;;;;;*##SS#S++++++++++++*??%  
  *%??**++++++++++*#SSSS%++++++++;*SSSSSS+++++++++++**??%? 
 ????***++++++++++*%%%%+++++++++++*%%%%*++++++++++***??%% 
  %%??****+++++++++++++++++++++++++++++++++++++++++***??%% 
  %%??***+++++++++++++++++++++++++++++++++++++++++****??%% 
 ?%???***+*?**++++++++++++++++++++++++++++++**?*+***???%% 
  *%???****%#%%S%%%???****************???%%%S%S#?****???%? 
   %%???***%S;;;++**?%%%%%%SSS%SS%%%%%%??*+++;*S****???%%  
   *%%???***S%;:::........................::;+S?***???%%?  
   ?%%???***SS%??*++;::..........:::;++**?%%S%***???%%?   
    ?%%????**%SSSSSSSSS%%%%%%%%%%%SSSSSSSSSS?***???%%%    
      %%%????**?%SSSSSSSSSSSSSSSSSSSSSSSSSS%***????%%?     
      ?%%%????**?%SSSSSSSSSSSSSSSSSSSSS%?***????%%%*      
         %%%%?????**??%%SSSSSSSSSSSS%%?***?????%%%%        
           %%%%??????***???????????****??????%%%%          
            ?%%%%%???????????*??????????%%%%%%            
                %%%%%%%%%??????????%%%%%%%%%               
                     %%%%%%%%%%%%%%%%%%S <EOS>