In [1]:
pip install transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m88.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m87.4 MB/s[0m eta [36m0:00:00[0m
Colle

In [2]:
from torch.utils.data import Dataset
import json

class ChatData(Dataset):
    def __init__(self, path:str, tokenizer):
        self.data = json.load(open(path, "r"))

        self.X = []
        for i in self.data:
            for j in i['dialog']:
                self.X.append(j['text'])

        for idx, i in enumerate(self.X):
            try:
                self.X[idx] = "<startofstring> "+i+" <bot>: "+self.X[idx+1]+" <endofstring>"
            except:
                break

        self.X = self.X[:5000]

        print(self.X[0])

        self.X_encoded = tokenizer(self.X,max_length=40, truncation=True, padding="max_length", return_tensors="pt")
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_mask[idx])

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.optim import Adam
from torch.utils.data import DataLoader
import tqdm
import torch

def train(chatData, model, optim):

    epochs = 12

    for i in tqdm.tqdm(range(epochs)):
        for X, a in chatData:
            X = X.to(device)
            a = a.to(device)
            optim.zero_grad()
            loss = model(X, attention_mask=a, labels=X).loss
            loss.backward()
            optim.step()
        torch.save(model.state_dict(), "model_state.pt")
        print(infer("hello how are you"))

def infer(inp):
    inp = "<startofstring> "+inp+" <bot>: "
    inp = tokenizer(inp, return_tensors="pt")
    X = inp["input_ids"].to(device)
    a = inp["attention_mask"].to(device)
    output = model.generate(X, attention_mask=a )
    output = tokenizer.decode(output[0])
    return output


device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "<pad>",
                                "bos_token": "<startofstring>",
                                "eos_token": "<endofstring>"})
tokenizer.add_tokens(["<bot>:"])

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

model = model.to(device)

chatData = ChatData("/content/chat_data.json", tokenizer)
chatData =  DataLoader(chatData, batch_size=64)

model.train()

optim = Adam(model.parameters(), lr=1e-3)

print("training .... ")
train(chatData, model, optim)

print("infer from model : ")
while True:
  inp = input()
  print(infer(inp))

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

<startofstring> How’s it going? <bot>: i listen to rap music. you? <endofstring>
training .... 


  0%|          | 0/12 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  8%|▊         | 1/12 [00:53<09:43, 53.02s/it]

<startofstring>hello how are you <bot>: <pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 17%|█▋        | 2/12 [01:42<08:30, 51.05s/it]

<startofstring>hello how are you <bot>: I am a teacher.<endofstring><pad><pad><pad><pad><pad><pad><pad><pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|██▌       | 3/12 [02:37<07:55, 52.87s/it]

<startofstring>hello how are you <bot>: I am a little lazy. I am a little lazy. I am


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 33%|███▎      | 4/12 [03:33<07:11, 53.99s/it]

<startofstring>hello how are you <bot>: I am a little lazy person.<endofstring><pad><pad><pad><pad><pad><pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 42%|████▏     | 5/12 [04:24<06:11, 53.02s/it]

<startofstring>hello how are you <bot>: hello?<endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 6/12 [05:21<05:25, 54.19s/it]

<startofstring>hello how are you <bot>: I am doing well. Just got back from the gym. I am


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 58%|█████▊    | 7/12 [06:17<04:33, 54.77s/it]

<startofstring>hello how are you <bot>: hello, how are you this morning?<endofstring><pad><pad><pad><pad><pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 67%|██████▋   | 8/12 [07:14<03:41, 55.49s/it]

<startofstring>hello how are you <bot>: I'm fine. How are you?<endofstring><pad><pad><pad><pad><pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 75%|███████▌  | 9/12 [08:10<02:47, 55.84s/it]

<startofstring>hello how are you <bot>: i am good 🉐 set up on my new book i love


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 83%|████████▎ | 10/12 [09:06<01:51, 55.88s/it]

<startofstring>hello how are you <bot>: I am doing well. Just got back from the store. Where do


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 92%|█████████▏| 11/12 [10:03<00:56, 56.11s/it]

<startofstring>hello how are you <bot>: I am doing well. Just got done with my girlfriend. Do you


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 12/12 [11:06<00:00, 55.51s/it]


<startofstring>hello how are you <bot>: i am good. tell me about yourself<endofstring><pad><pad><pad><pad><pad>
infer from model : 
Hello


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring>Hello <bot>: hi<endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
What is the time right now?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring>What is the time right now? <bot>: Time to give me 5 stars :)<endofstring><pad><pad><pad>
You are funny


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring>You are funny <bot>: I am.<endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
and cheeky too!


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring>and cheeky too! <bot>: I love to bear my dog.<endofstring><pad><pad><pad><pad><pad>
what is the name of your dog?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring>what is the name of your dog? <bot>: mickey! he's such a cute thing.
do you have a cat?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring>do you have a cat? <bot>: I do not. I like cats.<endofstring><pad><pad><pad>
do you like rain?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring>do you like rain? <bot>: hope it is not really a thing?<endofstring><pad><pad><pad>
do you like stars?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring>do you like stars? <bot>: I do not know.<endofstring><pad><pad><pad><pad><pad><pad><pad>
what do you know?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring>what do you know? <bot>: I know.<endofstring><pad><pad><pad><pad><pad><pad><pad><pad><pad>
but what?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring>but what? <bot>: I do not know what I do.<endofstring><pad><pad><pad><pad><pad><pad>
what is your name?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring>what is your name? <bot>: I am steve. I am not sure what that is.
what is your name?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring>what is your name? <bot>: I'm steve. I'm steve.<endofstring><pad><pad>
