In [None]:
import json
import numpy as np 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Data Preprocessing

In [None]:
# Convert the plain-text dialogue corpus (dialogs.txt) into an intents.json
# corpus. Every non-blank line of dialogs.txt is parsed as one exchange in the
# form "<user input>\t<bot response>" and becomes one intent with a single
# pattern and a single response.
import json

# Read explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open('dialogs.txt', 'r', encoding='utf-8') as f:
  lines = f.readlines()

# Build one intent per dialogue line. The tag is derived from the number of
# intents collected so far, which makes every tag unique.
intent_list = []
for line_number, line in enumerate(lines, start=1):
  line = line.strip()
  if not line:
    # Blank lines (for example a trailing empty line) carry no dialogue.
    continue
  # Split on the first tab only, so a tab inside the response text does not
  # break the parse.
  input_text, separator, output_text = line.partition('\t')
  if not separator:
    raise ValueError(
      f"dialogs.txt line {line_number}: expected '<input>\\t<output>', got {line!r}"
    )
  intent = {
    "tag": f"intent_{len(intent_list)}",
    "patterns": [input_text],
    "responses": [output_text]
  }
  intent_list.append(intent)

# Wrap the list in the top-level structure the training cell reads
# (data['intents']).
intents = {
  "intents": intent_list
}

# Serialize with the default ensure_ascii=True, so the file is plain ASCII and
# can be read back regardless of the reader's default encoding.
intents_json = json.dumps(intents, indent=2)

with open('intents.json', 'w', encoding='utf-8') as f:
  f.write(intents_json)

# Training

In [None]:
# Build the training corpus from intents.json.
training_sentences = []  # every pattern, in file order
training_labels = []     # tag of the intent each pattern came from (parallel to training_sentences)
labels = []              # unique tags, in first-seen order
responses = []           # one list of responses per intent

# JSON is read as UTF-8 explicitly rather than relying on the locale default.
with open('intents.json', encoding='utf-8') as file:
    data = json.load(file)

# Membership is tracked in a set so the uniqueness check stays O(1) per intent;
# scanning the `labels` list instead is quadratic over thousands of intents.
seen_tags = set()
for intent in data['intents']:
    tag = intent['tag']
    # One training example per pattern, labelled with the intent's tag.
    for pattern in intent['patterns']:
        training_sentences.append(pattern)
        training_labels.append(tag)
    # Responses and the unique-tag list are collected once per intent,
    # not once per pattern.
    responses.append(intent['responses'])
    if tag not in seen_tags:
        seen_tags.add(tag)
        labels.append(tag)

# Number of output classes for the classifier = number of distinct tags.
num_classes = len(labels)

In [None]:
# Map the string tags to integer class ids. `fit` returns the encoder itself,
# so construction and fitting are chained.
lbl_encoder = LabelEncoder().fit(training_labels)

# NOTE: `training_labels` is rebound from tags to integer ids here, so running
# this cell a second time would fit the encoder on the ids instead of the tags.
training_labels = lbl_encoder.transform(training_labels)

# Hyperparameters shared by the tokenizer and the model below.
vocab_size, embedding_dim, max_len = 1000, 16, 20
oov_token = "<OOV>"  # placeholder token for words outside the vocabulary


In [None]:
# Fit a tokenizer on the training sentences, limited to `vocab_size` words and
# using `oov_token` for anything it has not seen.
tokenizer = Tokenizer(oov_token=oov_token, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)

# Keep the learned word -> index mapping available for inspection.
word_index = tokenizer.word_index

# Turn each sentence into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(training_sentences)

# Bring every sequence to exactly `max_len` entries; over-long sequences lose
# their tail ('post' truncation). The padding side is left at the library default.
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_len,
    truncating='post',
)

In [None]:
# Intent classifier: embedding -> average over the sequence -> two small ReLU
# layers -> softmax over the `num_classes` intent tags.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
    GlobalAveragePooling1D(),
    Dense(units=16, activation='relu'),
    Dense(units=16, activation='relu'),
    Dense(units=num_classes, activation='softmax'),
])

# The labels are integer class ids, hence the sparse cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Show the layer-by-layer architecture and parameter counts.
model.summary()

# Fit on the padded sequences against the encoded labels.
epochs = 100
history = model.fit(
    x=padded_sequences,
    y=np.array(training_labels),
    epochs=epochs,
)



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 16)            16000     
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 16)                272       
                                                                 
 dense_1 (Dense)             (None, 16)                272       
                                                                 
 dense_2 (Dense)             (None, 3725)              63325     
                                                                 
Total params: 79,869
Trainable params: 79,869
Non-trainable params: 0
____________________________________________________

In [None]:
import pickle

# Persist the trained network under the name the inference cell loads.
model.save("chat_model")

# Persist the fitted tokenizer and label encoder alongside the model, so the
# inference cell can reproduce the same text -> ids and id -> tag mappings.
for artifact_path, artifact in (
    ('tokenizer.pickle', tokenizer),
    ('label_encoder.pickle', lbl_encoder),
):
    with open(artifact_path, 'wb') as sink:
        pickle.dump(artifact, sink, protocol=pickle.HIGHEST_PROTOCOL)



In [None]:
pip install colorama

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama
Successfully installed colorama-0.4.6


## File to use the built model

In [None]:
import json
import numpy as np
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder
import colorama

colorama.init()
from colorama import Fore, Style, Back

import random
import pickle

# Re-read the intents corpus; chat() looks up the responses for a predicted
# tag in this module-level `data` dictionary.
with open("intents.json") as intents_file:
    data = json.load(intents_file)


def chat(max_len=20):
    """Run an interactive console chat loop backed by the saved intent classifier.

    Loads the model saved as 'chat_model' plus the pickled tokenizer and label
    encoder, then repeatedly reads a line from standard input, predicts its
    intent tag and prints one of that intent's responses chosen at random.
    Typing "quit" (any letter case, surrounding whitespace ignored) ends the loop.

    Args:
        max_len: Length each tokenised input is padded/truncated to before
            prediction. Defaults to 20, the value used when the model in this
            notebook was trained; it must match the model's input length.

    Returns:
        None. Replies are printed to standard output.
    """
    # load trained model
    model = keras.models.load_model('chat_model')
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only load pickles written by this notebook or another trusted source.
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    with open('label_encoder.pickle', 'rb') as enc:
        lbl_encoder = pickle.load(enc)

    # Index the responses by tag once, instead of scanning every intent on each
    # message. setdefault keeps the first intent for a duplicated tag, which is
    # the intent a first-match scan would have picked.
    responses_by_tag = {}
    for intent in data['intents']:
        responses_by_tag.setdefault(intent['tag'], intent['responses'])

    while True:
        print(Fore.LIGHTBLUE_EX + "User: " + Style.RESET_ALL, end="")
        inp = input()
        if inp.strip().lower() == "quit":
            break
        padded = keras.preprocessing.sequence.pad_sequences(
            tokenizer.texts_to_sequences([inp]), truncating='post', maxlen=max_len)
        # verbose=0 keeps Keras' per-call progress bar out of the conversation.
        result = model.predict(padded, verbose=0)
        # inverse_transform returns an array; take its single element so the
        # tag is a plain scalar rather than a one-element array.
        tag = lbl_encoder.inverse_transform([np.argmax(result)])[0]
        candidates = responses_by_tag.get(tag)
        # An unknown tag or an intent without responses produces no reply
        # instead of raising.
        if candidates:
            print(Fore.GREEN + "ChatBot:" + Style.RESET_ALL, np.random.choice(candidates))


# Print the usage hint in yellow, then hand control to the interactive loop.
banner = "Start messaging with the bot (type quit to stop)!"
print(Fore.YELLOW + banner + Style.RESET_ALL)
chat()

# Using OpenAI GPT-2


In [None]:
from transformers import pipeline, set_seed

# Build a GPT-2 text-generation pipeline and seed it with a fixed value.
generator = pipeline('text-generation', model='gpt2')
set_seed(42)

# Keep prompting until the user types exactly "kill". Each prompt is passed to
# the generator with max_length=30 and num_return_sequences=2, and the raw
# result is printed as returned.
while True:
  inp = input("User : ")
  if inp == "kill":
    break
  out = generator(inp, max_length=30, num_return_sequences=2)
  print(out)

User : so what are tonight


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "so what are tonight's goals, in the hope that one day we can get over this hurdle? You know what I want for tonight? It's"}, {'generated_text': 'so what are tonight\'s questions asked?" he asked.\n\n"Can we go inside for an interview?" asked the former secretary, as his body'}]
User : you know the code


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "you know the code can be tricky. We have the full source code for each piece in the project. So what's more important:\n\nIf"}, {'generated_text': 'you know the code for this. (And also the code for this.) The more we read, the greater the likelihood of this happening. But we'}]
User : kill
