# K(AI)ra version 3.1.3


### Changelog
-   updated printout clarity
-   reworked stopping conditions
-   output logs
-   Unknown word handling

### Fixes / Improvements / additions needed
-   pyVTS integration
-   speech synthesis
-   output length
-   control panel
-   speech recognition
-   screen vision
-   username = voice detection
-   latency
-   dataset modernisation
-   automatic data collection
-   model saving/loading
-   multithreading

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pyttsx3 as tts
import re
import threading
from torch.utils.data import Dataset, DataLoader
import time
import datetime

### Variable Settings

In [6]:
#       VARIABLE              DEFAULT             DESCRIPTION
#
#       batch_size            (32)                Lower batch size causes faster training, but lower accuracy
#       embedding_dim         (100)               Higher embedding dimensions can cause higher performance, but too high and the performance is decreased.
#       hidden_dim            (128)               Deeper models are better at generalization, but wider models are better at memorization.
#       learning_rate         (0.001)             Higher learning rate causes faster changes.
#
#       stoptype              (t, e, l)           When to stop training t = time, e = epochs, l = loss. training will stop at epochs if it has not reached loss or time.
#
#       acceptable_loss       (0.001)             An amount considered to be acceptable. training will stop here even if the amount of epochs is not reached.
#       stop_minutes          (480)               Amount of minutes until training stops
#       epochs                (10000)             How many generations to train for. The more, the longer training but the more accurate the model is.

batch_size       = 32           
embedding_dim    = 64           
hidden_dim       = 128           
learning_rate    = 0.001        

stoptype         = "t"

acceptable_loss  = 0.001        
stop_minutes     = 30    
epochs           = 100

file_path        = 'dialogues_text.txt'

### Text processing and data preparation

In [3]:
with open(file_path, "r", errors="ignore") as file:
    data = file.read()

def parse(text):
    text = re.sub(r" â€™ ", "'", text)
    text = re.sub(r'[A-Z]', lambda match: match.group().lower(), text)
    
    return text

def process_text_file(file_path):
    tuples_list = []
    with open(file_path, 'r', errors='ignore') as file:
        lines = file.readlines()
        for line in lines:
            line = line.strip()
            entries = line.split('__eou__')
            for i in range(len(entries) - 1):
                tuple_entry = (entries[i], entries[i + 1])
                tuples_list.append(tuple_entry)
    
    return tuples_list


data = process_text_file(file_path)
print(data[:10])

[('The kitchen stinks . ', " I'll throw out the garbage . "), (" I'll throw out the garbage . ", ''), ('So Dick , how about getting some coffee for tonight ? ', ' Coffee ? I don â€™ t honestly like that kind of stuff . '), (' Coffee ? I don â€™ t honestly like that kind of stuff . ', ' Come on , you can at least try a little , besides your cigarette . '), (' Come on , you can at least try a little , besides your cigarette . ', ' What â€™ s wrong with that ? Cigarette is the thing I go crazy for . '), (' What â€™ s wrong with that ? Cigarette is the thing I go crazy for . ', ' Not for me , Dick . '), (' Not for me , Dick . ', ''), ('Are things still going badly with your houseguest ? ', ' Getting worse . Now he â€™ s eating me out of house and home . I â€™ Ve tried talking to him but it all goes in one ear and out the other . He makes himself at home , which is fine . But what really gets me is that yesterday he walked into the living room in the raw and I had company over ! That was th

### Tokenization and vocabulary building

In [4]:
word2idx = {}
idx2word = {}

idx2word[0] = '<EOS>'
word2idx['<EOS>'] = 0

for sentence, response in data:
    for word in sentence.split() + response.split():
        if word not in word2idx:
            idx2word[len(word2idx)] = word
            word2idx[word] = len(word2idx)

idx2word[len(word2idx)] = '<UNK>'
word2idx['<UNK>'] = len(word2idx)

def tokenize(sentence):
    tokens = []
    for word in sentence.split():
        if word in word2idx:
            tokens.append(word2idx[word])
        else:
            tokens.append(word2idx['<UNK>'])
    return tokens

def detokenize(tokens):
    return ' '.join([idx2word[token] for token in tokens])

input_data = [tokenize(sentence) for sentence, _ in data]
target_data = [tokenize(response) for _, response in data]

def pad_sequence(seq, max_length):
    return seq + [word2idx['<EOS>']] * (max_length - len(seq))

max_length = max(len(seq) for seq in input_data + target_data)
input_data = [pad_sequence(seq, max_length) for seq in input_data]
target_data = [pad_sequence(seq, max_length) for seq in target_data]

print(max_length)
print(input_data[:1])
print(target_data[:1])

input_data = torch.tensor(input_data, dtype=torch.long)
target_data = torch.tensor(target_data, dtype=torch.long)

278
[[1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[5, 6, 7, 8, 9, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

### Model Definition

In [5]:
class ChatbotModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(ChatbotModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        logits = self.fc(lstm_out)
        return logits

vocab_size = len(word2idx)
output_dim = vocab_size
model = ChatbotModel(vocab_size, embedding_dim, hidden_dim, output_dim)


### Stat printing

In [6]:
def printstats(epoch, loss, start_time,):
    if stoptype == "t":
        time_elapsed = time.time() - start_time
        time_epoch = time_elapsed / (epoch + 1)
        time_left = (stop_minutes * 60) - time_elapsed
        loss_dif = oldloss - loss.item()
        ETA_time = time.time() + time_left
        progress_total = 100 * (time_elapsed / (60 * stop_minutes))
        formatted_time = datetime.datetime.fromtimestamp(ETA_time).strftime("%H:%M:%S")
        expected_epochs = time_left / time_epoch
        
        print(f'Epoch {epoch+1}:')
        print(f' -  Loss:                   {loss.item():.16f}')
        if loss_dif < 0:
            print(f' -  Difference:            {loss_dif:.16f}')
            print(f' -  Change:                 {"Regressed"}')
        else:
            print(f' -  Difference:             {loss_dif:.16f}')
            print(f' -  Change:                 {"Improved"}')
        print(f' -  Progress:               {progress_total:.4f}%')
        print(f' -  Time for last epoch:    {time_epoch:.4f} seconds')
        print(f' -  Expected epochs left:   {expected_epochs:.0f}')
        print(f' -  Time left:              {time_left:.4f} seconds')
        print(f' -  ETA:                    {formatted_time}')
        print(f'')
    
    
    
    if stoptype == "e":
        time_elapsed = time.time() - start_time
        time_epoch = time_elapsed / (epoch + 1)
        progress_total = 100 * (epoch + 1) / epochs
        progress_left = 100 - progress_total
        time_left = time_epoch * progress_left
        loss_dif = oldloss - loss.item()
        ETA_time = time.time() + time_left
        formatted_time = datetime.datetime.fromtimestamp(ETA_time).strftime("%H:%M:%S")
        
        
        print(f'Epoch {epoch+1}:')
        print(f' -  Loss:                   {loss.item():.16f}')
        if loss_dif < 0:
            print(f' -  Difference:            {loss_dif:.16f}')
            print(f' -  Change:                 {"Regressed"}')
        else:
            print(f' -  Difference:             {loss_dif:.16f}')
            print(f' -  Change:                 {"Improved"}')
        print(f' -  Progress:               {progress_total:.4f}%')
        print(f' -  Time for last epoch:    {time_epoch:.4f} seconds')
        print(f' -  Time left:              {time_left:.4f} seconds')
        print(f' -  ETA:                    {formatted_time}')
        print(f'')
    
    
    
    if stoptype == "l":
        time_elapsed = time.time() - start_time
        time_epoch = time_elapsed / (epoch + 1)
        progress_total = 100 * ((first_loss - loss) / (first_loss - acceptable_loss))
        progress_left = 100 - progress_total
        loss_dif = oldloss - loss
        time_left = progress_left * time_epoch
        ETA_time = time.time() + time_left
        formatted_time = datetime.datetime.fromtimestamp(ETA_time).strftime("%H:%M:%S")
        
        
        print(f'Epoch {epoch+1}:')
        print(f' -  Loss:                   {loss.item():.16f}')
        if loss_dif < 0:
            print(f' -  Difference:            {loss_dif:.16f}')
            print(f' -  Change:                 {"Regressed"}')
        else:
            print(f' -  Difference:             {loss_dif:.16f}')
            print(f' -  Change:                 {"Improved"}')
        print(f' -  Progress:               {progress_total:.4f}%')
        print(f' -  Time for last epoch:    {time_epoch:.4f} seconds') 
        print(f' -  Time left:              {time_left:.4f} seconds')
        print(f' -  ETA:                    {formatted_time}')
        print(f'')
    


### Training

In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

class Mydataset(Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]

data = torch.randint(0, vocab_size, (1000, 20))
targets = torch.randint(0, vocab_size, (1000, 20))

dataset = Mydataset(data, targets)
data_loader = DataLoader(dataset, batch_size = batch_size, shuffle = True)

start_time = time.time()
oldloss = 0

for epoch in range(epochs):
    for input_data, target_data in data_loader:
        optimizer.zero_grad()
        output = model(input_data)
        loss = criterion(output.view(-1, vocab_size), target_data.view(-1))
        loss.backward()
        optimizer.step()
    
    if epoch == 0:
        first_loss = loss
    
    printstats(epoch, loss, start_time,)
    if stoptype == "l":
        epochs = epochs + 1
        if loss.item() < acceptable_loss:
            print(f'Loss under {acceptable_loss}, finishing training')
            break
    elif stoptype == "t":
        epochs = epochs + 1
        if (time.time() - start_time) >= (stop_minutes * 60):
            print(f'Time over {stop_minutes} minutes, finishing training')
            break
    
    oldloss = loss



Epoch 1:
 -  Loss:                   12.1403999328613281
 -  Difference:            -12.1403999328613281
 -  Change:                 Regressed
 -  Progress:               0.5870%
 -  Time for last epoch:    10.5666 seconds
 -  Expected epochs left:   169
 -  Time left:              1789.4334 seconds
 -  ETA:                    20:09:16

Epoch 2:
 -  Loss:                   10.8412437438964844
 -  Difference:             1.2991561889648438
 -  Change:                 Improved
 -  Progress:               1.1903%
 -  Time for last epoch:    10.7129 seconds
 -  Expected epochs left:   166
 -  Time left:              1778.5742 seconds
 -  ETA:                    20:09:16

Epoch 3:
 -  Loss:                   9.9647388458251953
 -  Difference:             0.8765048980712891
 -  Change:                 Improved
 -  Progress:               1.7792%
 -  Time for last epoch:    10.6754 seconds
 -  Expected epochs left:   166
 -  Time left:              1767.9737 seconds
 -  ETA:                  

### Response generation

In [8]:
def predict(sentence):
    model.eval()
    with torch.no_grad():
        input_seq = torch.tensor([pad_sequence(tokenize(sentence), max_length)], dtype=torch.long)
        output = model(input_seq)
        output_seq = torch.argmax(output, dim=2).numpy().flatten()
        response = []
        for token in output_seq:
            if idx2word[token] == '<EOS>' or idx2word[token] == '<UNK>':
                break
            response.append(idx2word[token])
        response = ' '.join(response)
    return response



### Speech synthesis

In [9]:
engine = tts.init()
voice = engine.getProperty('voices')
engine.setProperty('voice', voice[1].id)
def speak(speech):
    tts.speak(speech)

### User interface

In [5]:
def printblank():
    print('    |')

print(f'    | Chat opened')
while True:
    try:
        user_input = input("You: ")
        if user_input.lower() == "quit":
            printblank()
            print(f'____| Quit statement used')
            break
        if user_input.lower() == "tokensplease":
            printblank()
            print(f'<o> | Tokens: {tokenize(response)}')
            continue
        if user_input.lower() == "wordsplease":
            printblank()
            print(f'<o> | Words: {len(response.split())}')
            continue
        if user_input.lower() == "logplease":
            printblank()
            with open('log.txt', 'w') as f:
                f.write(response)
                f.close
            print(f'<o> | Added bot response to log')
        response = predict(parse(user_input))
        printblank()
        print(f'    | Bot: {response}')
        #speak(response)
    except KeyError:
        printblank()
        print(f'!!! | KeyError raised')
        print(f'!!! | User: {user_input}')

    | Chat opened
    |
____| Quit statement used
