In [25]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/chatbots-intent-recognition-dataset/Intent.json


**Importing Libraries and the dataset**


In [26]:
import numpy as np
import json
import re
import tensorflow as tf
import random
import spacy
# Load spaCy's small English pipeline.
nlp = spacy.load('en_core_web_sm')

# Read the intent definitions. The encoding is passed explicitly so the file
# is decoded the same way regardless of the platform's default locale.
with open('/kaggle/input/chatbots-intent-recognition-dataset/Intent.json', encoding='utf-8') as f:
    intents = json.load(f)

Preprocessing and cleaning data

In [27]:
# Text cleaning function
def clean(line):
    """Normalise a raw utterance before tokenisation.

    Every character other than an ASCII letter, '.', '?', '!' or a single
    quote is replaced by a space, then runs of spaces are collapsed to one.

    Args:
        line: Text to clean.

    Returns:
        The cleaned text. Leading and trailing spaces are not stripped.
    """
    # The upper-case range must be A-Z: 'A-z' also spans the ASCII characters
    # that sit between 'Z' and 'a' ([ \ ] ^ _ `), which would survive cleaning.
    line = re.sub(r'[^a-zA-Z.?!\']', ' ', line)
    # Collapse runs of spaces left behind by the substitution above.
    line = re.sub(r'[ ]+', ' ', line)
    return line

In [28]:
# Cleaned utterances and, in the same order, the intent label of each one.
input_list, target_list = [], []

# Maps every intent label to the list of responses defined for it.
intent_document = {}

for entry in intents['intents']:
    label = entry['intent']

    # Create the response list on first sight of the label, reuse it afterwards.
    responses = intent_document.setdefault(label, [])

    # One training example per utterance: cleaned text paired with its label.
    for utterance in entry['text']:
        input_list.append(clean(utterance))
        target_list.append(label)

    # Collect all responses that belong to this intent.
    responses.extend(entry['responses'])


In [29]:
# Tokenization function for input data
def tokenize_data(input_list):
    """Fit a Keras tokenizer on the texts and encode them as padded sequences.

    The tokenizer is created with an empty filter string and '<unk>' as the
    out-of-vocabulary token. Sequences are padded at the front ('pre') so
    that all rows have the same length.

    Args:
        input_list: Iterable of text strings.

    Returns:
        A ``(tokenizer, padded_sequences)`` tuple.
    """
    text_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<unk>')
    text_tokenizer.fit_on_texts(input_list)

    padded = tf.keras.preprocessing.sequence.pad_sequences(
        text_tokenizer.texts_to_sequences(input_list),
        padding='pre',
    )
    return text_tokenizer, padded

# Preprocess input data: fit the tokenizer on the cleaned utterances and
# encode them as pre-padded integer sequences (one row per utterance).
tokenizer, input_tensor = tokenize_data(input_list)


In [30]:
# Function to create categorical targets and provide index-to-label mapping
def create_categorical_target(targets):
    """One-hot encode target labels and build the reverse index lookup.

    Labels receive consecutive integer indices (starting at 0) in order of
    first appearance.

    Args:
        targets: Iterable of hashable target labels.

    Returns:
        A ``(one_hot, index_to_label)`` tuple: the one-hot encoded targets
        (dtype int32, one column per distinct label) and a dict mapping each
        integer index back to its label.
    """
    label_to_index = {}
    encoded = []
    for label in targets:
        # setdefault assigns the next free index only for unseen labels.
        encoded.append(label_to_index.setdefault(label, len(label_to_index)))

    one_hot = tf.keras.utils.to_categorical(
        encoded, num_classes=len(label_to_index), dtype='int32'
    )
    index_to_label = {index: label for label, index in label_to_index.items()}
    return one_hot, index_to_label

# Preprocess output data: one-hot encode the intent labels in 'target_list'
# and keep the index -> label mapping for decoding predictions later.
target_tensor, trg_index_word = create_categorical_target(target_list)

In [31]:
# Sanity check: both tensors are built from lists filled in lockstep, so
# their first dimensions (number of examples) should match.
print('input shape: {} and output shape: {}'.format(input_tensor.shape, target_tensor.shape))

input shape: (143, 9) and output shape: (143, 22)


In [32]:
# Hyperparameters
epochs = 50          # upper bound on training epochs
# +1 on the vocabulary size: index 0 is used for padding by pad_sequences,
# while the tokenizer's word indices start at 1.
vocab_size = len(tokenizer.word_index) + 1
embed_dim = 512      # size of each word embedding vector
units = 128          # width of the LSTM (per direction) and of the hidden Dense layer
# One output unit per one-hot column, i.e. per distinct intent label.
target_length = target_tensor.shape[1]

In [33]:
# Build the intent classifier: embedding -> bidirectional LSTM -> dense head
# with a softmax over the intent classes.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, embed_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units, dropout=0.2)),
    tf.keras.layers.Dense(units, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(target_length, activation='softmax')
])

# Use 'learning_rate': the old 'lr' alias is deprecated and is rejected by
# newer Keras releases.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2)
# Targets are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 512)         66048     
                                                                 
 bidirectional_1 (Bidirecti  (None, 256)               656384    
 onal)                                                           
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_3 (Dense)             (None, 22)                2838      
                                                                 
Total params: 758166 (2.89 MB)
Trainable params: 758166 (2.89 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

In [34]:
# Stop early once the training loss has failed to improve for 4 epochs in a
# row (no validation data is passed, so the training loss is monitored).
early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=4)

# Train the model on the full dataset.
model.fit(x=input_tensor, y=target_tensor, epochs=epochs, callbacks=[early_stop])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50


<keras.src.callbacks.History at 0x786d6981f8b0>

In [35]:
def response(sentence):
    """Predict the intent of ``sentence`` and pick a reply for it.

    The sentence is preprocessed the same way as the training data: it is
    passed through ``clean``, encoded with the fitted ``tokenizer`` (unknown
    words map to '<unk>'), and pre-padded with zeros to at least the length
    of the training sequences.

    Args:
        sentence: Raw user text.

    Returns:
        A ``(reply, intent)`` tuple: a reply chosen at random from
        ``intent_document[intent]`` and the predicted intent label.
    """
    # Encode with the training tokenizer instead of spaCy on repr(sentence):
    # repr() wrapped the text in quote characters, and the spaCy tokens were
    # neither cleaned nor lowercased, so they could be split and cased
    # differently from the vocabulary learned during training.
    sent_seq = tokenizer.texts_to_sequences([clean(sentence)])

    # Pre-pad like the training data, but never truncate a longer input.
    # Padding also guarantees the model never receives a zero-length sequence.
    padded_len = max(len(sent_seq[0]), input_tensor.shape[1], 1)
    sent_seq = tf.keras.preprocessing.sequence.pad_sequences(
        sent_seq, maxlen=padded_len, padding='pre'
    )

    # Predict the class probabilities in inference mode (dropout disabled).
    pred = model(tf.convert_to_tensor(sent_seq), training=False)
    pred_class = np.argmax(pred.numpy(), axis=1)

    # Choose a random response for the predicted intent.
    intent = trg_index_word[pred_class[0]]
    return random.choice(intent_document[intent]), intent

# Interactive chat: keep answering until the user types 'quit' (any casing).
print("Note: Enter 'quit' to break the loop.")
input_ = input('You: ')
while input_.lower() != 'quit':
    res, typ = response(input_)
    print('Bot: {} -- TYPE: {}'.format(res, typ))
    print()
    input_ = input('You: ')

Note: Enter 'quit' to break the loop.


You:  what is time


Bot: One sec -- TYPE: TimeQuery



You:  quit
