In [1]:
from torch.utils.data import Dataset
import json

class ChatData(Dataset):
    def __init__(self, path:str, tokenizer):
        self.data = json.load(open(path, "r"))

        self.X = []
        for i in self.data:
            for j in i['dialog']:
                self.X.append(j['text'])

        for idx, i in enumerate(self.X):
            try:
                self.X[idx] = "<startofstring> "+i+" <bot>: "+self.X[idx+1]+" <endofstring>"
            except:
                break

        self.X = self.X[:5000]

        print(self.X[0])

        self.X_encoded = tokenizer(self.X,max_length=40, truncation=True, padding="max_length", return_tensors="pt")
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_mask[idx])

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# from ChatData import ChatData
from torch.optim import Adam
from torch.utils.data import DataLoader
import tqdm
import torch

def train(chatData, model, optim):

    epochs = 12

    for i in tqdm.tqdm(range(epochs)):
        for X, a in chatData:
            X = X.to(device)
            a = a.to(device)
            optim.zero_grad()
            loss = model(X, attention_mask=a, labels=X).loss
            loss.backward()
            optim.step()
        torch.save(model.state_dict(), "model_state.pt")
        print(infer("hello how are you"))

def infer(inp):
    inp = "<startofstring> "+inp+" <bot>: "
    inp = tokenizer(inp, return_tensors="pt")
    X = inp["input_ids"].to(device)
    a = inp["attention_mask"].to(device)
    output = model.generate(X, attention_mask=a )
    output = tokenizer.decode(output[0])
    return output


device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "<pad>",
                                "bos_token": "<startofstring>",
                                "eos_token": "<endofstring>"})
tokenizer.add_tokens(["<bot>:"])

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

model = model.to(device)

# print(tokenizer.decode(model.generate(**tokenizer("hey i was good at basketball but ",
#                          return_tensors="pt"))[0]))

chatData = ChatData("./chat_data.json", tokenizer)
chatData =  DataLoader(chatData, batch_size=64)

model.train()

optim = Adam(model.parameters(), lr=1e-3)

print("training .... ")
train(chatData, model, optim)

print("infer from model : ")
while True:
  inp = input()
  print(infer(inp))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

<startofstring> I love iphone! i just bought new iphone! <bot>: Thats good for you, i'm not very into new tech <endofstring>
training .... 


  0%|          | 0/12 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  8%|▊         | 1/12 [00:37<06:56, 37.87s/it]

<startofstring> hello how are you  <bot>:  <pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 17%|█▋        | 2/12 [01:15<06:19, 37.93s/it]

<startofstring> hello how are you  <bot>:   <bot>:  Hi, i am a huge gamer <endofstring><pad><pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|██▌       | 3/12 [01:59<06:03, 40.39s/it]

<startofstring> hello how are you  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 33%|███▎      | 4/12 [02:37<05:17, 39.74s/it]

<startofstring> hello how are you  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 42%|████▏     | 5/12 [03:19<04:43, 40.53s/it]

<startofstring> hello how are you  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 6/12 [03:58<03:59, 39.86s/it]

<startofstring> hello how are you  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad> i am not sure what


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 58%|█████▊    | 7/12 [04:36<03:17, 39.44s/it]

<startofstring> hello how are you  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 67%|██████▋   | 8/12 [05:17<02:39, 39.75s/it]

<startofstring> hello how are you  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 75%|███████▌  | 9/12 [05:56<01:58, 39.44s/it]

<startofstring> hello how are you  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 83%|████████▎ | 10/12 [06:35<01:18, 39.27s/it]

<startofstring> hello how are you  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 92%|█████████▏| 11/12 [07:13<00:39, 39.08s/it]

<startofstring> hello how are you  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 12/12 [07:54<00:00, 39.55s/it]


<startofstring> hello how are you  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad> i am a huge
infer from model : 
hi


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> hi  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
how are you


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> how are you  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad> i am a huge<pad>
wanna be my friend


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> wanna be my friend  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
are you fucking idiot


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> are you fucking idiot  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
how old are you


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> how old are you  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad> i am a huge
how tall are you?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> how tall are you?  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad> I am
are you gay??


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> are you gay??  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
tell me something


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> tell me something  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
wanna be my friend


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> wanna be my friend  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
what is your name?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> what is your name?  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
are you bill gates?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> are you bill gates?  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
are you male/female?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> are you male/female?  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad>
are old are you?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> are old are you?  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
what is your name?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> what is your name?  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
how are you


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> how are you  <bot>:  <endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


In [None]:
!pip install transformers huggingface_hub
