In [220]:
!pip install transformers



In [3]:
!pip install tqdm



In [217]:
print(transformers.__version__)

NameError: name 'transformers' is not defined

In [221]:
import transformers

from torch.utils.data import Dataset
import json

from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.optim import Adam, AdamW
from torch.utils.data import DataLoader
import tqdm
import torch

In [222]:
print(transformers.__version__)

4.24.0


In [172]:
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

# Define our Dataset class.

In [195]:
class ASCIIDataset(Dataset):
    def __init__(self, path, tokenizer):
        '''
            Path: string of the path to the raw data to convert to a dataset.

                The path should a contain a file formatted as an array of JSON objects where the object is of the form:
                    {
                        "prompt": "...",
                        "text": "..."
                    }

            Tokenizer:

                The tokenizer used on the prompts and responses of the dataset. Should be GPT2 pre-trained tokenizer.
        '''
        self.data = None
        self.X = []
        self.tokenizer = tokenizer
        # ID OF THE TOKEN USED TO INDICATE WHEN A RESPONSE BEGINS
        self.res_id = 50260 # = self.decode_str("<RES>:")


        with open(path, 'r') as file:
            self.data = json.load(file)
            for entry in self.data[:100]:
                prompt = entry['prompt']
                text = entry['text']
                self.X.append(f'<BOS> {prompt}\n<RES>:\n{text}\n<EOS>')
            # print(test[0])

        # for entry in self.data:
        #     prompt = entry['prompt']
        #     text = entry['text']
        #     self.X.append(f'<BOS> {prompt} <bot>: {text} <EOS>')
        



        print("Tokenizing Text...")
        self.X_encoded = self.tokenizer(self.X, truncation=True, padding="max_length", return_tensors="pt")
        # print(self.X_encoded.size())
        # self.X_encoded = self.X_encoded.to(device)
        print("Done Tokenizing.")
        self.input_ids = self.X_encoded['input_ids']
        print(self.input_ids.size())
        self.attention_mask = self.X_encoded['attention_mask']
        '''
            https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2Tokenizer
            
            1 for tokens that are not masked,
            0 for tokens that are masked.
        '''
        # Mask the ground truth responses so the model can learn.
        for i in range(len(self.X)):
            res_i = (self.input_ids[i] == self.res_id).nonzero(as_tuple=True)[0]
            self.attention_mask[i][res_i + 1:] = 0
            # print(res_i)
            # print(self.attention_mask[i].sum())

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_mask[idx])
        # return (self.X[idx])
    def decode(self, tokens):
        return self.tokenizer.decode(tokens)
    
    def decode_token(self, token_id):
        return self.tokenizer.decoder.get(token_id)
    
    def decode_str(self, word):
        return self.tokenizer.get_vocab()[word]

## Instantiate our Tokenizer, model, dataset, and optimizer

In [196]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "<pad>", 
                                "bos_token": "<BOS>",
                                "eos_token": "<EOS>"})
tokenizer.add_tokens(["<RES>:"])

model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
model.resize_token_embeddings(len(tokenizer))

# optim = Adam(model.parameters())

ASCII_DATA = ASCIIDataset("./raw_data.json", tokenizer)

Tokenizing Text...
Done Tokenizing.
torch.Size([100, 1024])


In [213]:
print(ASCII_DATA[0][0].size())

# for _ in (ASCII_DATA[0][1]):
#     print(_)
ASCII_DATA.decode_str("<RES>:")
len(ASCII_DATA[0])
for _ in range(5):
    print(ASCII_DATA[_][0][:10])
    print(ASCII_DATA[_][1][:20])
    print("\n\n\n")

torch.Size([1024])
tensor([50258, 16108, 44805,   286, 45702,  1986, 50260, 37991, 37991, 37991])
tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])




tensor([50258, 11708, 44805,   286, 45702,  1986, 50260, 37991, 37991, 37991])
tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])




tensor([50258, 12025, 44805,   286, 45702,  1986, 50260, 37991, 37991, 37991])
tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])




tensor([50258, 11209, 44805,   286, 45702,  1986, 50260, 37991, 37991, 37991])
tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])




tensor([50258, 14254, 44805,   286, 45702,  1986, 50260, 37991, 37991,    31])
tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])






In [198]:
dataloader = DataLoader(ASCII_DATA, batch_size=16) # can be changed to the number of available cores

In [230]:
def train(model, optimizer, dataloader):
 
    e = 1

    model.train()

    for _ in tqdm.tqdm(range(e)):

        for (b_ids, b_masks) in dataloader:
            # b_ids = torch.tensor(b_ids.to(device)).unsqueeze(0)
            b_ids = torch.tensor(b_ids).unsqueeze(0).to(device)
            b_labels = b_ids.to(device)
            b_masks = torch.tensor(b_masks).unsqueeze(0).to(device)

            print(b_ids.size(), b_masks.size(), b_labels.size())

            loss = model(**b_ids, 
                        #  labels=b_labels,
                         attention_mask=b_masks).loss()
            loss.backward()
            optimizer.step()
            
        torch.save(model.state_dict(), f"model_state_{_}.pt")
        break

In [131]:
def prompt(prompt, text):
    inp = f'<BOS> {prompt}\n<RES>:\n{text}\n<EOS>'
    inp = tokenizer(inp, return_tensors="pt")
    x = inp["input_ids"].to(device)
    a = inp["attention_mask"].to(device)
    output = model.generate(x, attention_mask=a)
    output = tokenizer.decode(output[0])
    return output

In [224]:
model = model.to(device)
model.train()

BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 3e-5
WARMUP_STEPS = 5000
MAX_SEQ_LEN = 1024

optim = AdamW(model.parameters(), lr=LEARNING_RATE)
# scheduler = WarmupLinearSchedule(optimizer, warmup_steps=WARMUP_STEPS, t_total = -1)

In [231]:
train(model, optim, dataloader)

  b_ids = torch.tensor(b_ids).unsqueeze(0).to(device)
  b_masks = torch.tensor(b_masks).unsqueeze(0).to(device)
  0%|          | 0/1 [00:00<?, ?it/s]

torch.Size([1, 16, 1024]) torch.Size([1, 16, 1024]) torch.Size([1, 16, 1024])





TypeError: GPT2LMHeadModel object argument after ** must be a mapping, not Tensor