In [50]:
import torch
import torch.nn as nn
import re
import random
from nltk.tokenize import word_tokenize
from collections import defaultdict

#Function to clean tokens
def clean_token(token):
    cleaned_token = re.sub(r'[^a-zA-Z0-9\s]', '', token)
    return cleaned_token

with open('harrypotter.txt', 'r', encoding='utf-8') as file:
    text = file.read().lower()

tokens = word_tokenize(text)

cleaned_tokens = [clean_token(token) for token in tokens]

cleaned_tokens = [token for token in cleaned_tokens if token]

vocab = set(cleaned_tokens)
vocablist = list(vocab)

vocab.add('<unk>')
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}

print(word2idx)


def getResponse(text, num_words=5):
    random_words = random.sample(vocablist, num_words)
    resp = ' '.join(random_words)
    return resp



def tokens_to_tensor(tokens):
    indices = [word2idx.get(token, 1) for token in tokens] 
    return torch.tensor(indices, dtype=torch.long)

#chatBot()


{'whisper': 0, 'swooping': 1, 'goodfornothing': 2, 'head': 3, 'lips': 4, 'casually': 5, 'some': 6, 'mood': 7, 'would': 8, 'strange': 9, 'rain': 10, 'real': 11, 'found': 12, 'saw': 13, 'peered': 14, 'normally': 15, 'neither': 16, 'such': 17, 'drummed': 18, 'boy': 19, 'well': 20, 'sleep': 21, 'shot': 22, 'tight': 23, 'er': 24, 'tabby': 25, 'none': 26, 'dressed': 27, 'busy': 28, 'tail': 29, 'black': 30, 'happening': 31, 'do': 32, 'office': 33, 'kiss': 34, 'clock': 35, 'passed': 36, 'someone': 37, 'smile': 38, 'mysterious': 39, 'yelled': 40, 'pulled': 41, 'gets': 42, 'happy': 43, 'by': 44, 'around': 45, 'blushed': 46, 'albus': 47, 'hoped': 48, 'fixed': 49, 'together': 50, 'tantrum': 51, 'wrestled': 52, 'overhead': 53, 'dundee': 54, 'everything': 55, 'nose': 56, 'wife': 57, 'distance': 58, 'shouted': 59, 'strangely': 60, 'unable': 61, 'something': 62, 'angrily': 63, 'dashed': 64, 'pointed': 65, 'flickered': 66, 'turned': 67, 'openmouthed': 68, 'perfectly': 69, 'heads': 70, 'one': 71, 'strai

In [None]:
class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.heads = heads
        self.head_dim = embed_size // heads
        self.values = nn.Linear(embed_size, embed_size)  
        self.keys = nn.Linear(embed_size, embed_size)
        self.queries = nn.Linear(embed_size, embed_size)
        self.fc_out = nn.Linear(embed_size, embed_size)
        
        
    def forward(self, values, keys, queries):
        N = values.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], queries.shape[1]
        
        #print("Before: values = ",values)
        
        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)
        
        
#         print("After: values = ",values)
#         print("After: keys = ",keys)
#         print("After: queries = ",queries)
        
        
        values = values.view(N, value_len, self.heads, self.head_dim).permute(0, 2, 1, 3)
        keys = keys.view(N, key_len, self.heads, self.head_dim).permute(0, 2, 1, 3)
        queries = queries.view(N, query_len, self.heads, self.head_dim).permute(0, 2, 1, 3)
        
        attention_score = torch.einsum("nqhd,nkhd->nhqk", queries, keys) / (self.head_dim ** 0.5)
        #print("attention_score", attention_score)
        
        attention = torch.softmax(attention_score, dim=-1)
        #print("attention", attention)
        out = torch.einsum("nhql,nlhd->nqhd", attention, values).reshape(N, query_len, -1)
        #print("out", out)
        
        return self.fc_out(out)
        
        
class SentenceGenerator(nn.Module):
    def __init__(self, vocab_size, embed_size, heads, max_len=10):
        super(SentenceGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.attention = SelfAttention(embed_size, heads)
        self.fc_out = nn.Linear(embed_size, vocab_size)
        self.max_len = max_len
        
    def forward(self, x):
        embedding = self.embedding(x)
        #print("embedding = ",embedding)
        attention = self.attention(embedding, embedding, embedding)
        out = self.fc_out(attention)
        return out

    
    def generate_sentence(self, user_input, temperature=1.0):
        seed_tokens = word_tokenize(user_input.lower())
        seed_idx = [word2idx.get(word, word2idx['the']) for word in seed_tokens]
        sentence = seed_idx[:]
        
        print("seedtokens = ",seed_tokens)
        #print("seedIdx = ",seed_idx)
        
        for _ in range(self.max_len - len(seed_idx)):
            
            input_tensor = torch.tensor(sentence, dtype= torch.long).unsqueeze(0)
            print("input_tensor", input_tensor)
            logits = self.forward(input_tensor)
            #print("logits", logits)
            logits = logits.squeeze(0)
            #print("After sqeeze logits", logits)
            probs = torch.softmax(logits[-1], dim=-1)
            #print("probs", probs)
            top_probs, top_idx = torch.topk(probs, 3)
            #top_probs = top_probs.sqeeze().tolist()
            top_idx = top_idx.squeeze().tolist()
        
            print("Top 3 most probable words are:")
            for idx, prob in zip(top_idx, top_probs):
                word = idx2word.get(idx, '<unk>')
                print(f"Word: {word}, Probability: {prob:.4f}")
        
            next_word = random.choices(top_idx, k=1)[0]
            sentence.append(next_word)

        return ' '.join([idx2word.get(idx, 'the') for idx in sentence])
    
def trainings_pairs(tokens, seq_len=3):
    inputs, targets = [], []
    for i in range(len(tokens) - seq_len):
        input_seq = tokens[i:i+seq_len]
        target_seq = tokens[i+1:i+1+seq_len]
        inputs.append(input_seq)
        targets.append(target_seq)
    input_tensors = [tokens_to_tensor(seq) for seq in inputs]
    target_tensors = [tokens_to_tensor(seq) for seq in targets]
    return input_tensors, target_tensors
    
def train(model, input_tensors, target_tensors, epochs, batch_size=1):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.CrossEntropyLoss()
    for epoch in range(epochs):
        total_loss = 0
        for i in range(0, len(input_tensors), batch_size):
            batch_inputs = input_tensors[i:i+batch_size]
            batch_targets = target_tensors[i:i+batch_size]
            for inputs, targets in zip(batch_inputs, batch_targets):
                inputs, targets = inputs.unsqueeze(0), targets.unsqueeze(0)
                outputs = model(inputs).view(-1, len(vocab))
                targets = targets.view(-1)
                loss = loss_fn(outputs, targets)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
        avg_loss = total_loss / len(input_tensors)
        print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')
    
    
    
def chat_with_bot(model):
    print("Chatbot: Hi! I am your simple chatbot. Type 'quit' to end the chat.")
    while True:
        user_input = input("You: ")
        if user_input.lower() == 'quit':
            print("Chatbot: Bye! Have a great day!")
            break
        response = model.generate_sentence(user_input)
        print(f"Chatbot: {response}")

embed_size = 256
heads = 1
model = SentenceGenerator(len(vocab), embed_size, heads)

input_tensors, target_tensors = trainings_pairs(cleaned_tokens)
train(model, input_tensors, target_tensors, epochs=10)

chat_with_bot(model)
        

Epoch 1/10, Loss: 6.5780
Epoch 2/10, Loss: 5.8156
Epoch 3/10, Loss: 5.4719
Epoch 4/10, Loss: 5.2218
Epoch 5/10, Loss: 5.1101
Epoch 6/10, Loss: 4.9536
Epoch 7/10, Loss: 4.8696
Epoch 8/10, Loss: 4.7734
Epoch 9/10, Loss: 4.7452
Epoch 10/10, Loss: 4.6726
Chatbot: Hi! I am your simple chatbot. Type 'quit' to end the chat.
You: voldemort said
seedtokens =  ['voldemort', 'said']
input_tensor tensor([[377, 130]])
Top 3 most probable words are:
Word: the, Probability: 0.2793
Word: dumbledore, Probability: 0.1248
Word: a, Probability: 0.0542
input_tensor tensor([[377, 130, 119]])
Top 3 most probable words are:
Word: piercing, Probability: 0.7809
Word: woman, Probability: 0.1822
Word: cat, Probability: 0.0080
input_tensor tensor([[377, 130, 119, 560]])
Top 3 most probable words are:
Word: had, Probability: 0.9965
Word: james, Probability: 0.0031
Word: are, Probability: 0.0003
input_tensor tensor([[377, 130, 119, 560, 827]])
Top 3 most probable words are:
Word: are, Probability: 0.9925
Word: she, 

You: they were
seedtokens =  ['they', 'were']
input_tensor tensor([[179, 875]])
Top 3 most probable words are:
Word: bound, Probability: 0.8169
Word: to, Probability: 0.0326
Word: she, Probability: 0.0295
input_tensor tensor([[179, 875, 844]])
Top 3 most probable words are:
Word: find, Probability: 0.4800
Word: believe, Probability: 0.1164
Word: be, Probability: 0.0202
input_tensor tensor([[179, 875, 844, 565]])
Top 3 most probable words are:
Word: dead, Probability: 0.8572
Word: choosing, Probability: 0.0783
Word: true, Probability: 0.0559
input_tensor tensor([[179, 875, 844, 565, 458]])
Top 3 most probable words are:
Word: dumbledore, Probability: 0.9590
Word: professor, Probability: 0.0388
Word: james, Probability: 0.0010
input_tensor tensor([[179, 875, 844, 565, 458, 807]])
Top 3 most probable words are:
Word: mcgonagall, Probability: 0.9890
Word: surely, Probability: 0.0038
Word: i, Probability: 0.0007
input_tensor tensor([[179, 875, 844, 565, 458, 807, 183]])
Top 3 most probable 