In [1]:
!pip install transformers
!pip install torch
!pip install pandas
!pip install tqdm



In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import tqdm
import torch

In [3]:
!gdown 'https://drive.google.com/uc?id=1zc-eqUjoBuFs8Q9m1PZxu_Y_0Zd0zydz'

Downloading...
From: https://drive.google.com/uc?id=1zc-eqUjoBuFs8Q9m1PZxu_Y_0Zd0zydz
To: /content/dialogs.txt
  0% 0.00/244k [00:00<?, ?B/s]100% 244k/244k [00:00<00:00, 30.8MB/s]


In [4]:
chats=pd.read_csv('dialogs.txt' , sep='\t')
chats.to_csv('dialogs.csv', index=False)
chats.columns=['Questions','Answers']

In [5]:
a= {'Questions':'how are you doing','Answers':"i'm fine. how about yourself?"}
b={'Questions':'how are you','Answers':"i'm fine. how about yourself?"}
c = {'Questions':'Hello','Answers':'hi'}
d = {'Questions':'Hi','Answers':'hello'}
chats = pd.concat([chats, pd.DataFrame([a,b,c,d])], ignore_index=True)

In [6]:
chats.to_csv('chat_data.csv', index=False)

In [7]:
class ChatData(Dataset):
    def __init__(self, csv_path, tokenizer, max_length=100):
        self.data = pd.read_csv(csv_path)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        question = self.data.iloc[idx]['Questions']
        answer = self.data.iloc[idx]['Answers']

        input_text = f"<startofstring> {question} <bot>: {answer} <endofstring>"

        encoding = self.tokenizer(
            input_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        input_ids = encoding["input_ids"].squeeze()
        attention_mask = encoding["attention_mask"].squeeze()
        return input_ids, attention_mask

In [8]:
def train(chatData, model, optim):
    epochs = 12
    for epoch in range(epochs):
        for X, a in tqdm.tqdm(chatData):
            X = X.to(device)
            a = a.to(device)
            optim.zero_grad()
            loss = model(X, attention_mask=a, labels=X).loss
            loss.backward()
            optim.step()
        torch.save(model.state_dict(), "model_state.pt")
        print(f"Epoch {epoch+1} Loss: {loss.item()}")
        print("Sample inference:", infer("hello how are you"))

In [9]:
def infer(inp):
    inp = f"<startofstring> {inp} <bot>: "
    inputs = tokenizer(inp, return_tensors="pt", truncation=True, max_length=100)
    X = inputs["input_ids"].to(device)
    a = inputs["attention_mask"].to(device)

    output = model.generate(X, attention_mask=a, max_length=100, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

In [10]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({
    "pad_token": "<pad>",
    "bos_token": "<startofstring>",
    "eos_token": "<endofstring>"
})
tokenizer.add_tokens(["<bot>:"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



1

In [11]:
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [12]:
csv_path = "./chat_data.csv"
chatData = ChatData(csv_path, tokenizer)
chatData = DataLoader(chatData, batch_size=8, shuffle=True)

In [13]:
model.train()
optim = Adam(model.parameters(), lr=1e-3)

In [15]:
print("Training the model...")
train(chatData, model, optim)

Training the model...


100%|██████████| 466/466 [01:48<00:00,  4.28it/s]


Epoch 1 Loss: 0.13994286954402924
Sample inference:  hello how are you  <bot>:  


100%|██████████| 466/466 [01:49<00:00,  4.25it/s]


Epoch 2 Loss: 0.18362519145011902
Sample inference:  hello how are you  <bot>:  


100%|██████████| 466/466 [01:48<00:00,  4.28it/s]


Epoch 3 Loss: 0.1462448239326477
Sample inference:  hello how are you  <bot>:  and you mean? 


100%|██████████| 466/466 [01:49<00:00,  4.27it/s]


Epoch 4 Loss: 0.1322510540485382
Sample inference:  hello how are you  <bot>:  


100%|██████████| 466/466 [01:48<00:00,  4.28it/s]


Epoch 5 Loss: 0.14313626289367676
Sample inference:  hello how are you  <bot>:  


100%|██████████| 466/466 [01:48<00:00,  4.28it/s]


Epoch 6 Loss: 0.14406462013721466
Sample inference:  hello how are you  <bot>:  and you know what you're talking about? 


100%|██████████| 466/466 [01:49<00:00,  4.25it/s]


Epoch 7 Loss: 0.14633840322494507
Sample inference:  hello how are you  <bot>:  


100%|██████████| 466/466 [01:49<00:00,  4.24it/s]


Epoch 8 Loss: 0.15844348073005676
Sample inference:  hello how are you  <bot>:  


100%|██████████| 466/466 [01:49<00:00,  4.24it/s]


Epoch 9 Loss: 0.12763020396232605
Sample inference:  hello how are you  <bot>:  


100%|██████████| 466/466 [01:49<00:00,  4.24it/s]


Epoch 10 Loss: 0.13205641508102417
Sample inference:  hello how are you  <bot>:  


100%|██████████| 466/466 [01:49<00:00,  4.24it/s]


Epoch 11 Loss: 0.11799348145723343
Sample inference:  hello how are you  <bot>:  


100%|██████████| 466/466 [01:49<00:00,  4.24it/s]


Epoch 12 Loss: 0.13670039176940918
Sample inference:  hello how are you  <bot>:  


In [16]:
print("Inference from model:")
while True:
    inp = input("User: ")
    if inp.lower() in ['exit', 'quit', 'bye']:
        print("Chatbot: Goodbye!")
        break
    print("Chatbot:", infer(inp))

Inference from model:
User: hi
Chatbot:  hi  <bot>:  iceiceiceiceiceiceiceiceice available. 
User: how are you
Chatbot:  how are you  <bot>:  
User: are you well
Chatbot:  are you well  <bot>:  
User: say something
Chatbot:  say something  <bot>:  
User: exit
Chatbot: Goodbye!
