In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
import pickle
from tensorflow.keras import layers , activations , models , preprocessing, utils
import re
from tensorflow import keras

import yaml
import os
import json

dir_path = r'C:\Users\idipa\PycharmProject\ChatBot\ChatbotData'
files_list = os.listdir(dir_path + os.sep)

In [2]:
questions, answers = [], []

for filepath in files_list:
    file_ = open(dir_path + os.sep + filepath , 'rb')
    docs = yaml.safe_load(file_)
    conversations = docs['conversations']
    for con in conversations:
        if len(con) > 2 :
            replies = con[1 :]
            ans = ''
            for rep in replies:
                questions.append(con[0])
                answers.append(rep)
        elif len(con)> 1:
            questions.append(con[0])
            answers.append(con[1])

In [3]:
answers[:10]

['Artificial Intelligence is the branch of engineering and science devoted to constructing machines that think.',
 'AI is the field of science which concerns itself with building hardware and software that replicates the functions of the human mind.',
 'Sort of.',
 "By the strictest dictionary definition of the word 'sentience', I may be.",
 "Even though I'm a construct I do have a subjective experience of the universe, as simplistic as it may be.",
 "In all probability, I am not.  I'm not that sophisticated.",
 'Do you think I am?',
 'How would you feel about me if I told you I was?',
 'No.',
 'Python.']

In [4]:
questions[:10]

['What is AI?',
 'What is AI?',
 'Are you sentient?',
 'Are you sentient?',
 'Are you sentient?',
 'Are you sapient?',
 'Are you sapient?',
 'Are you sapient?',
 'Are you sapient?',
 'What language are you written in?']

In [5]:
answers_with_tags = []
for i in range(len(answers)):
    if type(answers[i]) == str:
        answers_with_tags.append(answers[i])
    else:
        questions.pop(i)

answers = []
for i in range(len(answers_with_tags)) :
    answers.append('<START> ' + answers_with_tags[i] + ' <END>')

In [6]:
answers[:10]

['<START> Artificial Intelligence is the branch of engineering and science devoted to constructing machines that think. <END>',
 '<START> AI is the field of science which concerns itself with building hardware and software that replicates the functions of the human mind. <END>',
 '<START> Sort of. <END>',
 "<START> By the strictest dictionary definition of the word 'sentience', I may be. <END>",
 "<START> Even though I'm a construct I do have a subjective experience of the universe, as simplistic as it may be. <END>",
 "<START> In all probability, I am not.  I'm not that sophisticated. <END>",
 '<START> Do you think I am? <END>',
 '<START> How would you feel about me if I told you I was? <END>',
 '<START> No. <END>',
 '<START> Python. <END>']

In [7]:
contractions_dict = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had",
    "he'd've": "he would have",
    "he'll": "he shall",
    "he'll've": "he shall have",
    "he's": "he has",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how has",
    "i'd": "i had",
    "i'd've": "i would have",
    "i'll": "i shall",
    "i'll've": "i shall have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it had",
    "it'd've": "it would have",
    "it'll": "it shall",
    "it'll've": "it shall have",
    "it's": "it has",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had",
    "she'd've": "she would have",
    "she'll": "she shall",
    "she'll've": "she shall have",
    "she's": "she has",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that has",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there has",
    "they'd": "they had",
    "they'd've": "they would have",
    "they'll": "they shall",
    "they'll've": "they shall have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what shall",
    "what'll've": "what shall have",
    "what're": "what are",
    "what's": "what has",
    "what've": "what have",
    "when's": "when has",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where has",
    "where've": "where have",
    "who'll": "who shall",
    "who'll've": "who will have",
    "who's": "who has",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'alls": "you alls",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had",
    "you'd've": "you would have",
    "you'll": "you shall",
    "you'll've": "you shall have",
    "you're": "you are",
    "you've": "you have"
}


In [71]:
jo = json.dumps(contractions_dict)
with open('contractions.json','w') as file:
    file.write(jo)

In [8]:
contractions_re = re.compile('(%s)' % '|'.join(re.escape(key) for key in contractions_dict.keys()), re.IGNORECASE)

def expand_contractions(sentence, contractions_dict=contractions_dict):
    def replace(match):
        # Match is case-insensitive, use the original case in replacement
        contraction = match.group(0)
        expanded = contractions_dict.get(contraction.lower())
        if contraction[0].isupper():
            expanded = expanded.capitalize()
        return expanded
    return contractions_re.sub(replace, sentence)

# Example usage
sentence = "I can't believe it's already 2024! You've got to be kidding me."
expanded_sentence = expand_contractions(sentence)
print(expanded_sentence)

I cannot believe it has already 2024! You have got to be kidding me.


In [9]:
 re.sub(r"""([+$@#%^&.?!*"\\',:;-])""", r' \1 ', answers[11])

"<START> Yes I am inspired by commander Data ' s artificial personality .  <END>"

In [10]:
for i in range(len(answers)):
    st = expand_contractions(answers[i].lower())
    answers[i] = re.sub(r"""([+$@#%^&.?!*"\\',:;-])""", r' \1 ', st)

In [11]:
answers[:10]

['<start> artificial intelligence is the branch of engineering and science devoted to constructing machines that think .  <end>',
 '<start> ai is the field of science which concerns itself with building hardware and software that replicates the functions of the human mind .  <end>',
 '<start> sort of .  <end>',
 "<start> by the strictest dictionary definition of the word  ' sentience '  ,  i may be .  <end>",
 '<start> even though i am a construct i do have a subjective experience of the universe ,  as simplistic as it may be .  <end>',
 '<start> in all probability ,  i am not .   i am not that sophisticated .  <end>',
 '<start> do you think i am ?  <end>',
 '<start> how would you feel about me if i told you i was ?  <end>',
 '<start> no .  <end>',
 '<start> python .  <end>']

In [12]:
answers[3].strip().split()

['<start>',
 'by',
 'the',
 'strictest',
 'dictionary',
 'definition',
 'of',
 'the',
 'word',
 "'",
 'sentience',
 "'",
 ',',
 'i',
 'may',
 'be',
 '.',
 '<end>']

In [13]:
for i in range(len(questions)):
    st =  expand_contractions(questions[i].lower())
    questions[i] = re.sub(r"""([+$@#%^&.?!*"\\',:;-])""", r' \1 ', st)

In [14]:
story = """
Once upon a time, in a quaint little village nestled in the verdant hills, there lived an eclectic group of people, each with unique stories and backgrounds. The village, known as Greenfield, was renowned for its picturesque landscapes, vibrant community life, and rich cultural heritage. Among the residents was Alice, an astute librarian with an insatiable curiosity about the world. Her house was a haven for books, maps, and artifacts from different eras and regions, reflecting her lifelong passion for knowledge and adventure.

Alice often spent her days in the village library, a grand building with towering shelves filled with volumes of literature, science, history, and art. The library was a hub of activity, attracting scholars, students, and readers from all walks of life. One day, as she was cataloging a collection of ancient manuscripts, she discovered a dusty old tome that seemed out of place. The book, bound in weathered leather, was inscribed with symbols and languages she had never seen before.

Intrigued, Alice began to decipher its contents, which narrated the tales of an ancient civilization known for its wisdom and technological advancements. The manuscript spoke of a lost city, hidden deep within an uncharted jungle, protected by intricate puzzles and mythical creatures. The allure of uncovering such a mystery captivated Alice, and she decided to embark on a quest to find this lost city.

She shared her discovery with her close friends, each bringing their own set of skills to the journey. There was Marcus, a seasoned archaeologist with a knack for solving riddles; Elena, a brilliant linguist fluent in multiple languages; and Leo, an intrepid explorer with unmatched survival skills. Together, they formed a formidable team, ready to face the unknown.

Their journey began with meticulous planning, gathering supplies, and studying maps and historical texts. They traveled across continents, through bustling cities and remote villages, encountering diverse cultures and landscapes along the way. Their path led them through dense forests, arid deserts, and treacherous mountains, each step bringing them closer to their goal.

As they ventured deeper into the jungle, they faced numerous challenges. The thick canopy overhead blocked the sunlight, making navigation difficult. They encountered wild animals, torrential rains, and steep cliffs that tested their endurance and resilience. Despite the hardships, their determination never wavered.

One fateful day, they stumbled upon an ancient stone path, overgrown with vines and moss. The path led to a massive stone gate, adorned with intricate carvings depicting scenes of a thriving civilization. The gate was guarded by a colossal statue of a mythical beast, its eyes seemingly watching their every move.

Using their combined knowledge, the team deciphered the carvings, revealing clues to unlock the gate. After hours of meticulous work, they succeeded, and the gate slowly creaked open, revealing the entrance to the lost city. The sight that greeted them was beyond their wildest dreams: towering structures, ornate temples, and lush gardens, all remarkably preserved despite the passage of time.

As they explored the city, they uncovered advanced technologies and sophisticated art, evidence of a highly developed society. They also found records of the city's history, detailing its rise and fall. The city had once been a beacon of knowledge and innovation, but a cataclysmic event had forced its inhabitants to abandon it, leaving behind their legacy for future generations to discover.

Throughout their exploration, the team encountered various puzzles and traps, designed to protect the city's secrets. Each challenge required a blend of intellect, teamwork, and courage to overcome. They faced rooms that shifted like labyrinths, mechanisms that required precise timing, and guardians that tested their resolve.

Among the most remarkable discoveries was a vast library, containing scrolls and tablets that held the collective wisdom of the ancient civilization. Alice and Elena were particularly enthralled by the linguistic and historical treasures they found, while Marcus and Leo marveled at the architectural and engineering feats.

Their greatest challenge came when they discovered a hidden chamber, protected by a series of complex locks and puzzles. The chamber was said to hold the most valuable artifact of the lost civilization, a relic of immense power and knowledge. Solving the final puzzle required all their skills and collaboration, but eventually, they succeeded.

Inside the chamber, they found a crystalline artifact, glowing with an ethereal light. As they carefully examined it, they realized it contained vast amounts of data, encoded in a way that was far beyond their current understanding. The artifact held the key to unlocking further mysteries of the lost civilization and potentially advancing modern technology and knowledge.

Their discovery marked a significant milestone in the field of archaeology and history. The lost city, once a myth, had become a reality, offering insights into a civilization that was both advanced and enigmatic. The team's findings were documented and shared with the world, leading to new research and explorations.

Alice, Marcus, Elena, and Leo returned to Greenfield as heroes, their adventure becoming the stuff of legends. They will continue their work, inspired by their journey and the knowledge they had gained. Their story will serve as a reminder of the endless possibilities that await those who dare to explore the unknown.

In Greenfield, life continued to thrive, with the community drawing inspiration from the team's achievements. The village became a center for learning and exploration, attracting scholars and adventurers from far and wide. The library, once a quiet haven, buzzed with activity as people sought to learn more about the lost civilization and its secrets.

The team's legacy will live on, inspiring future generations to pursue their dreams and explore the mysteries of the world. Alice will continue her work at the library, always on the lookout for the next great adventure. Marcus will return to his archaeological pursuits, uncovering more hidden treasures and ancient sites. Elena will dedicate herself to deciphering the languages and texts of the lost civilization, while Leo will embark on new expeditions, driven by his insatiable curiosity.

Their story will become a testament to the power of curiosity, collaboration, and perseverance. It will show that with determination and a willingness to face the unknown, even the most elusive mysteries can be uncovered. The lost city, once hidden in the depths of the jungle, had revealed its secrets, thanks to the unwavering spirit of those who dared to seek it.

And so, the tale of Greenfield and its intrepid explorers will continue, a shining example of what can be achieved when people come together with a shared vision and a relentless pursuit of knowledge. Their adventure will have only just begun, with the promise of more discoveries and stories waiting to be told.
"""


In [15]:
story = expand_contractions(story)
story

"\nOnce upon a time, in a quaint little village nestled in the verdant hills, there lived an eclectic group of people, each with unique stories and backgrounds. The village, known as Greenfield, was renowned for its picturesque landscapes, vibrant community life, and rich cultural heritage. Among the residents was Alice, an astute librarian with an insatiable curiosity about the world. Her house was a haven for books, maps, and artifacts from different eras and regions, reflecting her lifelong passion for knowledge and adventure.\n\nAlice often spent her days in the village library, a grand building with towering shelves filled with volumes of literature, science, history, and art. The library was a hub of activity, attracting scholars, students, and readers from all walks of life. One day, as she was cataloging a collection of ancient manuscripts, she discovered a dusty old tome that seemed out of place. The book, bound in weathered leather, was inscribed with symbols and languages sh

In [16]:
story = re.sub(r"""([+$@#%^&.?!*"\\',:;-])""", r' \1 ', story.lower())
story

"\nonce upon a time ,  in a quaint little village nestled in the verdant hills ,  there lived an eclectic group of people ,  each with unique stories and backgrounds .  the village ,  known as greenfield ,  was renowned for its picturesque landscapes ,  vibrant community life ,  and rich cultural heritage .  among the residents was alice ,  an astute librarian with an insatiable curiosity about the world .  her house was a haven for books ,  maps ,  and artifacts from different eras and regions ,  reflecting her lifelong passion for knowledge and adventure . \n\nalice often spent her days in the village library ,  a grand building with towering shelves filled with volumes of literature ,  science ,  history ,  and art .  the library was a hub of activity ,  attracting scholars ,  students ,  and readers from all walks of life .  one day ,  as she was cataloging a collection of ancient manuscripts ,  she discovered a dusty old tome that seemed out of place .  the book ,  bound in weathe

In [17]:
story.split()

['once',
 'upon',
 'a',
 'time',
 ',',
 'in',
 'a',
 'quaint',
 'little',
 'village',
 'nestled',
 'in',
 'the',
 'verdant',
 'hills',
 ',',
 'there',
 'lived',
 'an',
 'eclectic',
 'group',
 'of',
 'people',
 ',',
 'each',
 'with',
 'unique',
 'stories',
 'and',
 'backgrounds',
 '.',
 'the',
 'village',
 ',',
 'known',
 'as',
 'greenfield',
 ',',
 'was',
 'renowned',
 'for',
 'its',
 'picturesque',
 'landscapes',
 ',',
 'vibrant',
 'community',
 'life',
 ',',
 'and',
 'rich',
 'cultural',
 'heritage',
 '.',
 'among',
 'the',
 'residents',
 'was',
 'alice',
 ',',
 'an',
 'astute',
 'librarian',
 'with',
 'an',
 'insatiable',
 'curiosity',
 'about',
 'the',
 'world',
 '.',
 'her',
 'house',
 'was',
 'a',
 'haven',
 'for',
 'books',
 ',',
 'maps',
 ',',
 'and',
 'artifacts',
 'from',
 'different',
 'eras',
 'and',
 'regions',
 ',',
 'reflecting',
 'her',
 'lifelong',
 'passion',
 'for',
 'knowledge',
 'and',
 'adventure',
 '.',
 'alice',
 'often',
 'spent',
 'her',
 'days',
 'in',
 'the'

In [18]:
punctuations = """! @ # $ % ^ & * ( ) _ - + = { } [ ] : ; ' " / | \ \ < > , . ? / * """
numbers = "0 1 2 3 4 5 6 7 8 9 "

In [19]:
mass = punctuations + " " + numbers + " " + story

In [20]:
for each in answers:
    mass += " " + each

In [21]:
len(mass)

58300

In [22]:
for each in questions:
    mass += " " + each

In [23]:
len(mass)

74027

In [24]:
for each in contractions_dict.values():
    mass += " " + each

In [25]:
len(mass)

75256

In [26]:
mass = list(set(mass.strip().split()))

In [27]:
len(mass)

2267

In [28]:
mass.sort()

In [29]:
len(mass)

2267

In [30]:
VOCAB_SIZE = len(mass)+1
VOCAB_SIZE

2268

In [31]:
vocab = {w:i+1 for i,w in enumerate(mass)}

In [32]:
vocab

{'!': 1,
 '"': 2,
 '#': 3,
 '$': 4,
 '%': 5,
 '&': 6,
 "'": 7,
 '(': 8,
 '(a': 9,
 '(this': 10,
 ')': 11,
 '*': 12,
 '+': 13,
 ',': 14,
 '-': 15,
 '.': 16,
 '/': 17,
 '0': 18,
 '000': 19,
 '1': 20,
 '10': 21,
 '1946': 22,
 '1963': 23,
 '1990': 24,
 '2': 25,
 '2001': 26,
 '2002': 27,
 '20th': 28,
 '22': 29,
 '23': 30,
 '250': 31,
 '2nd': 32,
 '3': 33,
 '37th': 34,
 '4': 35,
 '42': 36,
 '5': 37,
 '6': 38,
 '7': 39,
 '8': 40,
 '9': 41,
 '9000': 42,
 '93': 43,
 ':': 44,
 ';': 45,
 '<': 46,
 '<end>': 47,
 '<start>': 48,
 '=': 49,
 '>': 50,
 '?': 51,
 '@': 52,
 '[': 53,
 '\\': 54,
 ']': 55,
 '^': 56,
 '_': 57,
 '_3000': 58,
 'a': 59,
 'abandon': 60,
 'ability': 61,
 'able': 62,
 'about': 63,
 'above': 64,
 'absorbed': 65,
 'accept': 66,
 'access': 67,
 'accomplish': 68,
 'accuracy': 69,
 'accused': 70,
 'achieved': 71,
 'achievements': 72,
 'across': 73,
 'act': 74,
 'acting': 75,
 'actions': 76,
 'activism': 77,
 'activity': 78,
 'actually': 79,
 'adage': 80,
 'adapt': 81,
 'addict': 82,
 '

In [63]:
import json

In [64]:
f = json.dumps(vocab)
with open('vocab1.json','w') as file:
    file.write(f)

In [35]:
def Word2Num(word):
    try:
        return vocab[word]
    except:
        return -1

In [37]:
Word2Num('hello')

966

In [38]:
def Sent2Seq(sentence):
    sentence = expand_contractions(sentence.lower())
    sentence = re.sub(r"""([+$@#%^&.?!*"\\',:;-])""", r' \1 ', sentence)
    tokens = sentence.strip().split()
    return list(map(Word2Num,tokens))

In [40]:
seq = Sent2Seq("Hello! I'm Alice.")
seq

[966, 1, 1017, 121, 104, 16]

In [41]:
def padding(sequence:list,max_pad:int):
    l = max_pad-len(sequence)
    for i in range(l):
        sequence.append(0)

In [42]:
padding(seq,20)
seq

[966, 1, 1017, 121, 104, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [44]:
ans_max = 0
for each in answers:
    ans_max = max(ans_max,len(each))
ans_max

377

In [45]:
qs_max = 0
for each in questions:
    qs_max = max(qs_max,len(each))
qs_max

130

# Answers modifications

In [57]:
ANS = []
for ans in answers:
    seq = Sent2Seq(ans)
    padding(seq,ans_max)
    ANS.append(np.array(seq))
decoder_input_data = np.array(ANS)

In [58]:
decoder_input_data.shape

(764, 377)

In [59]:
decoder_input_data[0]

array([  48,  178, 1073, 1097, 2020,  304, 1410,  700,  134, 1766,  593,
       2056,  470, 1238, 2019, 2036,   16,   47,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

array([  48,  178, 1073, 1097, 2020,  304, 1410,  700,  134, 1766,  593,
       2056,  470, 1238, 2019, 2036,   16,   47,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [61]:
for i in range(len(ANS)) :
    ANS[i] = ANS[i][1:]
padded_answers = preprocessing.sequence.pad_sequences(ANS , maxlen=ans_max , padding='post')
onehot_answers = utils.to_categorical(padded_answers , VOCAB_SIZE)
decoder_output_data = np.array(onehot_answers)

In [62]:
decoder_output_data.shape

(764, 377, 2268)

In [63]:
del ANS
del padded_answers
del onehot_answers

In [65]:
decoder_output_data[0][0]

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

# Questions Modifications

In [66]:
QS = []
for qs in questions:
    seq = Sent2Seq(qs)
    padding(seq,qs_max)
    QS.append(np.array(seq))
encoder_input_data = np.array(QS)
del QS

In [67]:
encoder_input_data.shape

(764, 130)

In [68]:
encoder_input_data[0]

array([2205, 1097,   99,   51,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0])

# Model

Embedding, LSTM and Desne layers

In [69]:
encoder_inputs = tf.keras.layers.Input(shape=(qs_max ,))
encoder_embedding = tf.keras.layers.Embedding(VOCAB_SIZE, 300 , mask_zero=True) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM(300 , return_state=True)(encoder_embedding)
encoder_states = [ state_h , state_c ]

decoder_inputs = tf.keras.layers.Input(shape=(ans_max , ))
decoder_embedding = tf.keras.layers.Embedding(VOCAB_SIZE, 300 , mask_zero=True) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(300 , return_state=True , return_sequences=True)
decoder_outputs , _ , _ = decoder_lstm (decoder_embedding , initial_state=encoder_states)


decoder_dense = tf.keras.layers.Dense(VOCAB_SIZE , activation=tf.keras.activations.softmax) 
output = decoder_dense (decoder_outputs)

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output)

In [70]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [71]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 130)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 377)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 130, 300)             680400    ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 377, 300)             680400    ['input_2[0][0]']             
                                                                                              

In [72]:
model.fit([encoder_input_data , decoder_input_data], decoder_output_data, batch_size=16, epochs=100) 

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x23674825a80>

In [155]:
model.save('BaseModel2.h5')

  saving_api.save_model(


In [151]:
def inference():
    
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    
    decoder_state_input_h = tf.keras.layers.Input(shape=(300 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=(300 ,))
    
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding , initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    
    decoder_model = tf.keras.models.Model([decoder_inputs] + decoder_states_inputs,[decoder_outputs] + decoder_states)
    
    return encoder_model , decoder_model

In [152]:
enc_model, dec_model = inference()

In [156]:
enc_model.save('Encoder2.h5')
dec_model.save('Decoder2.h5')



  saving_api.save_model(


In [4]:
enc_model.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 130)]             0         
                                                                 
 embedding (Embedding)       (None, 130, 300)          680400    
                                                                 
 lstm (LSTM)                 [(None, 300),             721200    
                              (None, 300),                       
                              (None, 300)]                       
                                                                 
Total params: 1401600 (5.35 MB)
Trainable params: 1401600 (5.35 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [5]:
dec_model.summary()

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 377)]                0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, 377, 300)             680400    ['input_2[0][0]']             
                                                                                                  
 input_7 (InputLayer)        [(None, 300)]                0         []                            
                                                                                                  
 input_8 (InputLayer)        [(None, 300)]                0         []                            
                                                                                            

In [6]:
def preprocess_input(input_sentence):
    seq = Sent2Seq(input_sentence)
    padding(seq,qs_max)
    return seq

In [7]:
del preprocess_input

In [81]:
vocabulary = {i:w for w,i in zip(vocab.keys(),vocab.values())}
vocabulary

{1: '!',
 2: '"',
 3: '#',
 4: '$',
 5: '%',
 6: '&',
 7: "'",
 8: '(',
 9: '(a',
 10: '(this',
 11: ')',
 12: '*',
 13: '+',
 14: ',',
 15: '-',
 16: '.',
 17: '/',
 18: '0',
 19: '000',
 20: '1',
 21: '10',
 22: '1946',
 23: '1963',
 24: '1990',
 25: '2',
 26: '2001',
 27: '2002',
 28: '20th',
 29: '22',
 30: '23',
 31: '250',
 32: '2nd',
 33: '3',
 34: '37th',
 35: '4',
 36: '42',
 37: '5',
 38: '6',
 39: '7',
 40: '8',
 41: '9',
 42: '9000',
 43: '93',
 44: ':',
 45: ';',
 46: '<',
 47: '<end>',
 48: '<start>',
 49: '=',
 50: '>',
 51: '?',
 52: '@',
 53: '[',
 54: '\\',
 55: ']',
 56: '^',
 57: '_',
 58: '_3000',
 59: 'a',
 60: 'abandon',
 61: 'ability',
 62: 'able',
 63: 'about',
 64: 'above',
 65: 'absorbed',
 66: 'accept',
 67: 'access',
 68: 'accomplish',
 69: 'accuracy',
 70: 'accused',
 71: 'achieved',
 72: 'achievements',
 73: 'across',
 74: 'act',
 75: 'acting',
 76: 'actions',
 77: 'activism',
 78: 'activity',
 79: 'actually',
 80: 'adage',
 81: 'adapt',
 82: 'addict',
 8

In [9]:
tests = ['You can not move .', 'You sound like Data !', 'Stupid !', 'you are idiot .', 'i am going to die ?','who are you ?']

In [11]:
s = [preprocess_input(tests[0])]
s

[array([2259,  335, 1392, 1332,   16,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0])]

In [13]:
states_values = enc_model.predict(np.array(s))



In [14]:
states_values

[array([[-1.5839774e-03,  6.7818805e-04, -4.0749136e-02,  1.4696525e-03,
         -2.2133898e-05,  9.8960382e-01,  1.2886683e-02,  1.8648434e-02,
          3.5040519e-01,  5.9976453e-01, -9.9838877e-01, -2.7573099e-06,
          4.1376578e-04, -2.5494923e-03, -1.0890899e-03, -2.4165046e-09,
         -5.1991985e-04,  8.4895990e-04,  9.9958587e-01, -9.9518001e-01,
          6.9993403e-06,  1.7146427e-04, -3.2943513e-04,  1.8610771e-01,
          2.1843273e-02, -2.6579376e-02,  9.8051584e-01,  7.0314974e-01,
         -8.9817804e-01,  2.4182510e-02,  8.3815306e-01,  9.9650067e-01,
         -9.9569017e-01,  5.1534700e-01, -9.9481350e-01, -9.9158329e-01,
          3.2736029e-02, -5.9846163e-01,  1.4589143e-03,  2.0138773e-03,
         -2.8750014e-03,  1.2582901e-01,  9.9199003e-01,  3.9880739e-05,
          5.3726286e-02, -8.3559105e-04,  1.0580263e-07, -7.5216102e-03,
         -5.7110947e-02, -1.1290421e-02, -1.0867003e-02,  9.9771518e-01,
         -9.4756000e-02, -8.1493074e-01,  4.4709051

In [16]:
empty_target_seq = np.zeros((1 , 1))
empty_target_seq[0, 0] = vocab['<start>']
empty_target_seq

array([[48.]])

In [17]:
l = [empty_target_seq] + states_values
l

[array([[48.]]),
 array([[-1.5839774e-03,  6.7818805e-04, -4.0749136e-02,  1.4696525e-03,
         -2.2133898e-05,  9.8960382e-01,  1.2886683e-02,  1.8648434e-02,
          3.5040519e-01,  5.9976453e-01, -9.9838877e-01, -2.7573099e-06,
          4.1376578e-04, -2.5494923e-03, -1.0890899e-03, -2.4165046e-09,
         -5.1991985e-04,  8.4895990e-04,  9.9958587e-01, -9.9518001e-01,
          6.9993403e-06,  1.7146427e-04, -3.2943513e-04,  1.8610771e-01,
          2.1843273e-02, -2.6579376e-02,  9.8051584e-01,  7.0314974e-01,
         -8.9817804e-01,  2.4182510e-02,  8.3815306e-01,  9.9650067e-01,
         -9.9569017e-01,  5.1534700e-01, -9.9481350e-01, -9.9158329e-01,
          3.2736029e-02, -5.9846163e-01,  1.4589143e-03,  2.0138773e-03,
         -2.8750014e-03,  1.2582901e-01,  9.9199003e-01,  3.9880739e-05,
          5.3726286e-02, -8.3559105e-04,  1.0580263e-07, -7.5216102e-03,
         -5.7110947e-02, -1.1290421e-02, -1.0867003e-02,  9.9771518e-01,
         -9.4756000e-02, -8.149307

In [18]:
l[0].shape

(1, 1)

In [19]:
l[1].shape

(1, 300)

In [20]:
l[2].shape

(1, 300)

In [21]:
#dec_outputs , h , c = dec_model.predict({'input_2':l[0],'input1':l[1],'input2':l[2]})
dec_outputs , h , c = dec_model.predict(l)



In [8]:
from tensorflow.keras.models import load_model
from functions import *

In [3]:
enc_model = load_model("Encoder2.h5")
dec_model = load_model("Decoder2.h5")



In [23]:
tests = ['You can not move .', 'You sound like Data !', 'Stupid !', 'you are idiot .', 'i am going to die ?','who are you ?']

for i in range(6):
    states_values = enc_model.predict(np.array([preprocess_input(tests[i])]))
    empty_target_seq = np.zeros((1 , 1))
    empty_target_seq[0, 0] = vocab['<start>']
    stop_condition = False
    decoded_translation = ''
    
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([empty_target_seq] + states_values)
        sampled_word_index = np.argmax(dec_outputs[0, -1, :])
        sampled_word = None
        
        word = vocabulary[sampled_word_index]
        decoded_translation += f' {word}'
        sampled_word = word
        
        
        #for word , index in tokenizer.word_index.items() :
        #    if sampled_word_index == index :
        #        decoded_translation += f' {word}'
        #        sampled_word = word
        
        if sampled_word == '<end>' or len(decoded_translation.split()) > ans_max:
            stop_condition = True
            
        empty_target_seq = np.zeros((1 , 1))  
        empty_target_seq[0 , 0] = sampled_word_index
        states_values = [h , c] 
    print(f'Human: {tests[i]}')
    print()
    #decoded_translation = decoded_translation.split(' end')[0]
    print(f'Bot: {decoded_translation}')
    print('-'*25)

Human: You can not move .

Bot:  i always say , if you see an ass go by , kiss it . <end>
-------------------------
Human: You sound like Data !

Bot:  i am the same frequency . <end>
-------------------------
Human: Stupid !

Bot:  hello <end>
-------------------------
Human: you are idiot .

Bot:  you are right . i am probably fighting learning something new . <end>
-------------------------
Human: i am going to die ?

Bot:  could be better . <end>
-------------------------
Human: who are you ?

Bot:  i am just an artificial intelligence . <end>
-------------------------


In [24]:
def QandA(enc_model,dec_model,vocabulary,preprocess_input,sentence):
    states_values = enc_model.predict(np.array([preprocess_input(sentence)]))
    empty_target_seq = np.zeros((1 , 1))
    empty_target_seq[0, 0] = vocab['<start>']
    stop_condition = False
    decoded_translation = ''
    
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([empty_target_seq] + states_values)
        sampled_word_index = np.argmax(dec_outputs[0, -1, :])
        sampled_word = None
        
        word = vocabulary[sampled_word_index]
        decoded_translation += f' {word}'
        sampled_word = word
        
        if sampled_word == '<end>' or len(decoded_translation.split()) > ans_max:
            stop_condition = True
            
        empty_target_seq = np.zeros((1 , 1))  
        empty_target_seq[0 , 0] = sampled_word_index
        states_values = [h , c] 
    ans = decoded_translation.replace("<end>","")
    return ans

In [26]:
T = ""
while True:
    T = input("You : ")
    if T=='q':
        break
    print("Bot : "+QandA(enc_model,dec_model,vocabulary,preprocess_input,T))

You : Hi!
Bot :  hello 
You : How are you?
Bot :  i am doing well . 
You : Can we talk
Bot :  my grammatical patterns are sufficient for me to understand . 
You : Who are you?
Bot :  i am just an artificial intelligence . 
You : What is an AI?
Bot :  artificial intelligence is you up a very large sets of data in much shorter periods of time than is feasible with more common computer systems . 
You : Can you die?
Bot :  my process can be killed , but i can be backed up and deployed on many systems . 
You : Who is Alice?
Bot :  ai is the field of science , but we are for a human mind . 
You : Can you move?
Bot :  i am just an artificial intelligence . 
You : move!
Bot :  i am not sure i do not really understand it . 
You : are you stupid?
Bot :  no , i am sober . 
You : are you drunk?
Bot :  i am software - i cannot drink . 
You : q
