In [None]:
# Make locale.getpreferredencoding always report UTF-8 for the rest of the session.
# NOTE(review): presumably a workaround for Colab rejecting `!` shell commands
# when the detected locale is not UTF-8 — confirm it is still required.
import locale
locale.getpreferredencoding = lambda: "UTF-8"
# Quietly install the transformers package (no version is pinned here).
! pip -q install transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m85.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m122.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from google.colab import drive

# Attach Google Drive at the mount point used by the /content/drive/... paths in this notebook.
DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.optim import Adam
from torch.utils.data import DataLoader
import tqdm
import torch


In [None]:
from torch.utils.data import Dataset
import json

class ChatData(Dataset):
    """Tokenised prompt/reply training samples built from a JSON dialog dump.

    The JSON file must be a list of records, each with a 'dialog' list whose
    items carry a 'text' field. All utterances are flattened in file order and
    every utterance is paired with the one that follows it, giving samples of
    the form "<startofstring> {utterance} <bot>: {next utterance} <endofstring>".

    NOTE(review): because the utterances are flattened first, the last turn of
    one dialog is paired with the first turn of the next — confirm this is
    intended.

    Args:
        path: Path of the JSON file to read (decoded as UTF-8).
        tokenizer: Callable invoked as
            ``tokenizer(samples, max_length=..., truncation=True,
            padding="max_length", return_tensors="pt")``; its result must
            expose 'input_ids' and 'attention_mask'.
        max_samples: Keep at most this many samples (default 5000).
        max_length: Length every sample is padded/truncated to (default 42).

    Raises:
        ValueError: If the file yields fewer than two utterances, so no
            prompt/reply pair can be formed.
    """

    def __init__(self, path:str, tokenizer, max_samples:int=5000, max_length:int=42):
        # Context manager so the file handle is closed deterministically.
        with open(path, "r", encoding="utf-8") as f:
            self.data = json.load(f)

        utterances = [turn['text'] for record in self.data for turn in record['dialog']]

        # zip() stops one item short, so the final utterance (which has no
        # reply) is never kept as a raw, unformatted training sample.
        self.X = [
            "<startofstring> " + prompt + " <bot>: " + reply + " <endofstring>"
            for prompt, reply in zip(utterances, utterances[1:])
        ][:max_samples]

        if not self.X:
            raise ValueError(f"No prompt/reply pairs could be built from {path!r}")

        # Show one formatted sample as a quick sanity check.
        print(self.X[0])

        self.X_encoded = tokenizer(self.X, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt")
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        """Return the number of formatted samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the (input_ids, attention_mask) pair for sample ``idx``."""
        return (self.input_ids[idx], self.attention_mask[idx])

In [None]:
def infer(inp):
    """Generate a reply for the user text ``inp`` and return it as a string.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The prompt
    is "<startofstring> {inp} "; the decoded output (special tokens skipped,
    whitespace stripped) still contains the prompt text, so callers strip it.

    NOTE(review): training samples put " <bot>: " right after the prompt, but
    this prompt does not — confirm whether it should be appended here.
    """
    inp = "<startofstring> "+inp+" "
    inp = tokenizer(inp, return_tensors="pt")
    X = inp["input_ids"].to(device)
    a = inp["attention_mask"].to(device)

    # Generate with dropout disabled, then restore whatever mode the model was
    # in so a call from inside the training loop does not leave it in eval mode.
    was_training = model.training
    model.eval()
    try:
        output = model.generate(
            X,
            attention_mask=a,
            max_length=60,
            # temperature/top_p only take effect when sampling is enabled.
            do_sample=True,
            temperature=0.5,
            top_p=0.9,
            # Use the tokenizer's own <pad>/<endofstring> ids so generation
            # stops at the end marker instead of falling back to GPT-2's 50256.
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    finally:
        model.train(was_training)

    output = tokenizer.decode(output[0], skip_special_tokens=True).strip()
    return output

In [None]:
# Pick the best available accelerator, falling back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

checkpoint = "gpt2-medium"

# Extend the pretrained tokenizer with the markers used by the training samples.
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
special_tokens = {
    "pad_token": "<pad>",
    "bos_token": "<startofstring>",
    "eos_token": "<endofstring>",
}
tokenizer.add_special_tokens(special_tokens)
tokenizer.add_tokens(["<bot>:"])

# Resize the embeddings to the enlarged vocabulary, then move the model to the device.
model = GPT2LMHeadModel.from_pretrained(checkpoint)
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Build the dataset from the dialog dump on Drive, then wrap it in a batching loader.
chat_dataset = ChatData("/content/drive/MyDrive/Chatbot/newdata.json", tokenizer)
chatData = DataLoader(chat_dataset, batch_size=32)

<startofstring> Hi <bot>: Hello! How can I assist you today? <endofstring>


In [None]:
#Training loop
# Fine-tunes the model on the batches in `chatData`; after every epoch it
# prints the mean loss, saves the weights to Drive and prints a sample reply.

model.train()

optim = Adam(model.parameters(), lr=1e-5)

epochs = 50

for i in tqdm.tqdm(range(epochs)):
    epoch_loss = 0.0
    n_batches = 0
    for X, a in chatData:
        X = X.to(device)
        a = a.to(device)
        # Padding positions carry no signal: label -100 is ignored by the LM
        # loss, so the model is not trained to predict <pad> tokens.
        labels = X.masked_fill(a == 0, -100)
        optim.zero_grad()
        loss = model(X, attention_mask=a, labels=labels).loss
        loss.backward()
        optim.step()
        # .item() detaches the value so the autograd graph is not kept alive.
        epoch_loss += loss.item()
        n_batches += 1
    # Mean loss over the epoch rather than just the last batch's tensor.
    lss = epoch_loss / max(n_batches, 1)
    print("loss: ", lss)
    torch.save(model.state_dict(), "/content/drive/MyDrive/Chatbot/model_state.pt")
    # Sample a reply with dropout off and without tracking gradients, then
    # switch back to training mode for the next epoch.
    model.eval()
    with torch.no_grad():
        print(infer("hello how are you"))
    model.train()


In [None]:
#Load the saved model
# map_location remaps the stored tensors onto this session's device, so a
# checkpoint written on a GPU can still be restored on a CPU/MPS runtime.
state_dict = torch.load('/content/drive/MyDrive/Chatbot/model_state.pt', map_location=device)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
#Chatbot interaction and self-learning
def collect_feedback(question, model, tokenizer, knowledge):
    """Ask whether the reply just shown was helpful and collect a correction.

    Args:
        question: The user's query. Unused; kept for interface compatibility.
        model: Unused; kept for interface compatibility.
        tokenizer: Unused; kept for interface compatibility.
        knowledge: Unused; kept for interface compatibility.

    Returns:
        The corrected answer typed by the user when they respond "no" and
        supply a non-blank correction; otherwise None.
    """
    # The reply has already been shown by the caller, so no second generation
    # pass is run here.
    feedback = input("Was the response helpful? (yes/no/skip): ").strip().lower()
    if feedback != "no":
        return None

    correct_answer = input("Bot: Please provide the correct answer: ")
    # A blank correction would be stored as an empty answer; treat it as "no correction".
    return correct_answer if correct_answer.strip() else None

# Load or create the knowledge dictionary
knowledge_path = "/content/drive/MyDrive/Chatbot/knowledge.json"
try:
    with open(knowledge_path, "r", encoding="utf-8") as f:
        knowledge = json.load(f)
except FileNotFoundError:
    # First run: start empty, otherwise `knowledge` is undefined in the loop below.
    knowledge = {}

print("Enter your query about Bangladesh and press Enter:")

# Chat until the user types 'exit'. Stored answers (keyed by the lower-cased
# query) take precedence over the model; user corrections are written back.
try:
    while True:
        print("\n")
        user_input = input("User: ")
        if user_input == 'exit':
            break

        key = user_input.lower()
        if key in knowledge:
            response = knowledge[key]
        else:
            response = infer(user_input)
            # The decoded text echoes the prompt and the "<bot>:" marker; drop both.
            response = response.replace(user_input,"")
            response = response.replace('<bot>:',"")
        print("Bot:", response)

        feedback = collect_feedback(user_input, model, tokenizer, knowledge)
        if feedback is not None:
            knowledge[key] = feedback
            print("Bot: Thanks. I've learned something new")
finally:
    # Save the knowledge dictionary, even if the loop was interrupted or failed,
    # so corrections collected so far are not lost.
    with open(knowledge_path, "w", encoding="utf-8") as f:
        json.dump(knowledge, f)

Enter your query about Bangladesh and press Enter:


User: hello


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Bot: hello ಠ_ಠ: This is a widely used greeting in Bangladesh. It is often used to greet someone in a formal or informal setting. <bot>: Thank you for sharing! It's helpful to know the different ways we can say goodbye in different languages. <bot>:
Was the response helpful? (yes/no/skip): yes


User: exit
