# K(AI)ra version 3.1.2


### Changelog
-   reworked entire structure
-   added printout clarity
-   changed dataset from cornell movie dialogues corpus to daily dialogues

### Fixes / Improvements / additions needed
-   pyVTS integration
-   speech synthesis
-   output length
-   output logs
-   control panel
-   speech recognition
-   screen vision
-   username = voice detection
-   new word adding to dictionary
-   latency
-   dataset modernisation
-   automatic data collection
-   model saving/loading
-   multithreading

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pyttsx3 as tts
import re
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


### Variable Settings

In [2]:
batch_size       = 32           #32
epochs           = 1            #1
embedding_dim    = 10           #10
hidden_dim       = 50           #50
learning_rate    = 0.001        #0.001
acceptable_loss  = 0.01         #0.01
file_path        = 'C:/Users/logan/Documents/Coding/Python/kAIra/3.0/Main/dialogues.txt'

### Text processing and data preparation

In [3]:
with open(file_path, "r", errors="ignore") as file:
    data = file.read()

def parse(text):
    text = re.sub(r" â€™ ", "'", text)
    text = re.sub(r'[A-Z]', lambda match: match.group().lower(), text)
    
    return text

def process_text_file(file_path):
    tuples_list = []
    with open(file_path, 'r', errors='ignore') as file:
        lines = file.readlines()
        for line in lines:
            line = line.strip()
            entries = line.split('__eou__')
            for i in range(len(entries) - 1):
                tuple_entry = (entries[i], entries[i + 1])
                tuples_list.append(tuple_entry)
    
    return tuples_list


data = process_text_file(file_path)
print(data[:10])

[('the kitchen stinks . ', " i'll throw out the garbage . "), (" i'll throw out the garbage . ", ''), ('so dick , how about getting some coffee for tonight ? ', " coffee ? i don't honestly like that kind of stuff . "), (" coffee ? i don't honestly like that kind of stuff . ", ' come on , you can at least try a little , besides your cigarette . '), (' come on , you can at least try a little , besides your cigarette . ', " what's wrong with that ? cigarette is the thing i go crazy for . "), (" what's wrong with that ? cigarette is the thing i go crazy for . ", ' not for me , dick . '), (' not for me , dick . ', ''), ('are things still going badly with your houseguest ? ', " getting worse . now he's eating me out of house and home . i've tried talking to him but it all goes in one ear and out the other . he makes himself at home , which is fine . but what really gets me is that yesterday he walked into the living room in the raw and i had company over ! that was the last straw . "), (" ge

### Tokenization and vocabulary building

In [4]:
word2idx = {}
idx2word = {}
for sentence, response in data:
    for word in sentence.split() + response.split():
        if word not in word2idx:
            idx2word[len(word2idx)] = word
            word2idx[word] = len(word2idx)

def tokenize(sentence):
    return [word2idx[word] for word in sentence.split()]

def detokenize(tokens):
    return ' '.join([idx2word[token] for token in tokens])

input_data = [tokenize(sentence) for sentence, _ in data]
target_data = [tokenize(response) for _, response in data]

def pad_sequence(seq, max_length):
    return seq + [0] * (max_length - len(seq))

max_length = max(max(len(seq) for seq in input_data), max(len(seq) for seq in target_data))
input_data = [pad_sequence(seq, max_length) for seq in input_data]
target_data = [pad_sequence(seq, max_length) for seq in target_data]

input_data = torch.tensor(input_data, dtype=torch.long)
target_data = torch.tensor(target_data, dtype=torch.long)


### Model Definition

In [5]:
class ChatbotModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(ChatbotModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        logits = self.fc(lstm_out)
        return logits

vocab_size = len(word2idx)
output_dim = vocab_size

model = ChatbotModel(vocab_size, embedding_dim, hidden_dim, output_dim)


### Training

In [6]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


class Mydataset(Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]

data = torch.randint(0, vocab_size, (1000, 20))
targets = torch.randint(0, vocab_size, (1000, 20))

dataset = Mydataset(data, targets)
data_loader = DataLoader(dataset, batch_size = batch_size, shuffle = True)

for epoch in range(epochs):
    for input_data, target_data in data_loader:
        optimizer.zero_grad()
        output = model(input_data)
        loss = criterion(output.view(-1, vocab_size), target_data.view(-1))
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')
    
    if loss.item() < acceptable_loss:
        print('Loss under {acceptable_loss}, finishing training')
        break


Epoch [1/1], Loss: 10.0759


### Response generation


In [7]:
def predict(sentence):
    model.eval()
    with torch.no_grad():
        input_seq = torch.tensor([pad_sequence(tokenize(sentence), max_length)], dtype=torch.long)
        output = model(input_seq)
        output_seq = torch.argmax(output, dim=2).numpy().flatten()
        response = detokenize(output_seq)
    return response



### Speech synthesis

In [8]:
def speak(speech):
    tts.speak(speech)

### User interface

In [9]:
def printblank():
    print('    |')

print(f'    | Chat opened')
while True:
    try:
        user_input = input("You: ")
        if user_input.lower() == "quit":
            printblank()
            print(f'____| Quit statement used')
            break
        response = predict(parse(user_input))
        printblank()
        print(f'    | Bot: {response}')
        speak(response)
    except KeyError:
        printblank()
        print(f'!!! | KeyError raised')
        print(f'!!! | User: {user_input}')
        printblank()
        print(f'??? | Do you wish to add this term to the dictionary? (y/n)')
        print(f'??? | kAIra will not be trained with it.')
        addyesno = input('')
        if addyesno == 'y':
            printblank()
            print(f'+++ | Adding term...')
            
            #add terms !!!!!!!!!!!!!!!!!
            
            print(f'+++ | {user_input} added.')
        else:
            printblank()
            print(f'xxx | Term not added')



    | Chat opened
    |
!!! | KeyError raised
!!! | User: you're a silly goober
    |
??? | Do you wish to add this term to the dictionary? (y/n)
??? | kAIra will not be trained with it.
    |
xxx | Term not added
    |
    | Bot: drink lunar lowly properly madmen madmen 1433 1433 1433 1433 yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.hey yup.