In [1]:
##IMPORTING LIBRARIES

import pickle    #for serializing python objects (Eg- Saving models)
import random    #used to get random numbers
import json      #for working with JSON files

import numpy as np    #for numerical operations
import nltk           #for tokenizing and lemmatizing text
from nltk.stem import WordNetLemmatizer

#tensorflow.keras is used here to build DL models, and specifically using Keras, a high level API for  building neural networks within TF
from tensorflow.keras.models import Sequential          #Sequential is a special case of model where model is purely a stck of single-input, single-output layers
from tensorflow.keras.layers import Dense, Activation, Dropout      #Dense (Basic building block where every input connects to every output)
#Activation func. (Decides if a neuron should be activated or not, making the network smart)
#Dropout (Randomly ignores some neurons during training to make the network better at generalizing)

from tensorflow.keras.optimizers import SGD   #Stochastic gradient descent


In [2]:
nltk.download('punkt_tab') #for sentence tokenization
nltk.download('wordnet') #A lexical database fpr semantic understanding
nltk.download('omw-1.4')    #Open multilingual WordNet

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\chaan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chaan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\chaan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [11]:
# from google.colab import drive
# drive.mount('/content/drive')

# intents_file = open('/content/drive/MyDrive/Colab Notebooks/intents.json').read()
# intents = json.loads(intents_file)  #loads JSON file

# intents = json.loads(open('intents.json').read()) 

import json  

# Safely open and read the JSON file with the correct encoding  
try:  
    # Specify the encoding as UTF-8  
    with open('intents.json', 'r', encoding='utf-8') as file:  
        intents = json.load(file)  # Load the JSON data from the file  
except FileNotFoundError:  
    print("Error: The file 'intents.json' was not found.")  
except json.JSONDecodeError:  
    print("Error: The file 'intents.json' contains invalid JSON.")  
except UnicodeDecodeError as e:  
    print(f"Encoding error: {e}. It seems the file might not be encoded in 'utf-8'.")  
except Exception as e:  
    print(f"An unexpected error occurred: {e}")  

# Now you can use 'intents' as a regular Python data structure

In [12]:
lemmatizer = WordNetLemmatizer()  #converts into base or root form [Eg- 'Running' to 'Run]

# intents = json.load(open('intents.json'))  #loads JSON File

words = []   #to store all words found in the training patterns
classes = []  #to store unique intent tags (categories) [Eg- 'greeting' or 'neutral-response']
documents = []  #a list of tuples where each tuple contains a tokenized sentence (word list) and its associated intent tag
Ignore_Symbols = [',', '.', '?', '!']   #Specifies characters to ignore


In [13]:
for intent in intents['intents']:   #to go through all the intents in the json file
    tag = intent['tag']
    for pattern in intent['patterns']:   #to go through all the patterns in the intents
        word_list = nltk.word_tokenize(pattern)            #splits the sentences into individual words

        #tokenize and lemmatize each word in the pattern
        words_list = [lemmatizer.lemmatize(word.lower()) for word in nltk.word_tokenize(pattern) if word not in Ignore_Symbols]

        #Add words and documents
        words.extend(word_list)          #Add the tokenized words to the words list
        documents.append((word_list, tag))      #Appends the tokenized pattern and its tag as a tuple to documents

        #Add the tag to the classes if its not already present
        if tag not in classes:
            classes.append(tag)


In [None]:
#Removes duplicates by converting to a set, then back to a list, to ensure that the entries are unique and unordered
words = sorted(set(words))       #sorted() - converts the sets back to lists and sorts them
classes = sorted(set(classes))


In [16]:
#Saving data with pickle; saves the words and classes lists to files using pickle module
#It enables you to load them later without reprocessing
pickle.dump(words, open('model_words.pkl', 'wb'))
pickle.dump(classes, open('model_classes.pkl', 'wb'))

In [17]:
#Preparing training data;
training = []    #will store input-output pairs for training the model
output_empty = [0] * len(classes)       #creates a list of zeroes with equal length to the number of unique classes; this serves as a template for the output

In [18]:
##Focuses on converting the textual input and output into numeric format for ML algo

for document in documents:         #Iterates through each document in the documents list (Document is a tuple containing a tokenized sentence and its associated tag)
    bag = []        #Initialized to store an empty list for each bag of words
    word_patterns = document[0]         #Get the tokenized words from the documents
    word_patterns = [lemmatizer.lemmatize(word) for word in word_patterns if word not in Ignore_Symbols]        #Each word is being lemmatize while ignoring symbols defined in Ignore_Symbols

    for word in words:          #For each word in the words list (contains all unique words from training dataset), it checks if that word is present in word_patterns
        bag.append(1) if word in word_patterns else bag.append(0)       #It appends '1' if the word is present and '0' if not; creating binary representation of whether each word exists in the current document

    output_row = list(output_empty)         #create an output row initialized to zeroes; which is of same length as the number of unique classes (intents)
    output_row[classes.index(document[1])] = 1      #The index in output_row that corresponds to the current document's intent tag (document[1]) is set to '1'; Indicates the this particular output(class) is active
    training.append([bag, output_row])      #Append the bag and output row pair to the training list; Each entry in 'training' will be a pair of input features(bag of words) and the corresponding output (the intent representation)


In [19]:
random.shuffle(training)        #training list is shuffled to mix the order of samples; helps in preventing model from learning the sequence of training samples
training = np.array(training, dtype=object)   #Convert the list to numpy array for easier manipulation; use dtype=object for variable length arrays

train_x = list(training[:, 0])      #Extracts the input features (bag of words) from training array and converts it to the list
train_y = list(training[:, 1])      #Extracts the output labels (one-hot encoded intents) from training array and converts it into list

## ONE-HOT ENCODING - method for converting categorical variables into binary format (0s and 1s, where '1' indicates the presence of that category and '0' indicates its absence)

In [20]:
model = Sequential()    #initializes a sequential model, which is a type of neural network where the layers are stacked sequentially

In [None]:
model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu'))    #Dense(128) - adds a fully connected (dense) layer with 128 neurons(units). Each neuron will compute its output based on its inputs through weights and biases.
#input_shape=(len(train_x[0]),) - defines the input shape for the first layer. len(train_x[0]) indicates the number of features in the i/p data, ensuring the first layer knows how many inputs it will receive(the dimension of the feature vector).
#activation='relu' - specifies the activation function for the layer.
#The rectified Linear unit (ReLU) activation function is used because it helps mitigate issues like vanishing gradients and allows the network to learn complex patterns. It outputs the input directly if its positive; otherwise it outputs zero.

model.add(Dropout(0.5)) #This layer randomly sets 50% of the inputs to zero during training.
#Dropout is a regularization technique that helps prevent overfitting by ensuring that the model does not rely too heavily on any particular neuron.


In [32]:
model.add(Dense(64, activation='relu')) #Another dense layer with 64 neurons is added, again using ReLU activation function. This layer will learn complex representations and patterns from the input data.

model.add(Dropout(0.5))     #this adds regularization again to help reduce overfitting by randomly setting 50% of the activations to zero in the layer preceding it.

In [33]:
model.add(Dense(len(train_y[0]), activation='softmax'))     #this is the output layer of the network.
#The number of neurons in this layer equals the number of unique classes(intents), which corresponds to the length of train_y[0](the one-hot encoding o/p)
#activation='softmax' - used in output for multi-class classification problems. It converts the output into a probability distribution; each output value represents the relative likelihood of each class.


In [34]:
sgd = SGD(learning_rate=0.001, weight_decay=0.0001, nesterov=True) #SGD - this initializes Stochastic Gradient Descent optimizer, which updates the model weights based on the gradients of the loss function.
#learning_rate=0.001 - the step size at each iteration while moving towards a minimum of the loss function.
#weight_decay=1e-6 - this parameter applies L2 regularization to the weights, helping to prevent overfitting by penalizing large weights.
#Regularization is any mechanism that reduces overfitting.
#L1 regularization - a type of regularization that penalizes weights in proportion to the sum of the absolute value of the weights. L1 regularization helps drive the weights of irrelevant or barely relevant features to exactly 0. A feature with weight of 0 is effectively removed from the model.
#L2 regularization - a type of regularization that penalizes weights in proportions to the sum of the squares of the weights. L regularization helps drive outlier weights (those with high positive or low negative values) closer to 0 but not quite 0. Features with values very close to 0 remain in the model but don't influence the model's predictions very much.
#momentum=0.9 - this parameter helps accelerate SGD in the relevant direction and dampens oscillations; it is a technique used in gradient descent, which is a method for optimizing functions. Instead of just looking at the slope (or gradient) of the function at the current point to decide how to move, momentum also considers the slopes from the previous steps.
#nestrov=True - this enables Nestrov momentum, which provides a more responsive and faster convergence compares to standard momentum


In [35]:
model.compile(loss="categorical_crossentropy", optimizer=sgd, metrics=["accuracy"]) #compiles the model
#loss="categorical_crossentropy" - specifies the loss function to be used for training. Categorical cross-entropy is suitable for multi class classification problems, where the target outcomes are one-hot encoded.
#optimizer=sgd - sets the optimizer defined earlier to be used during training.
#metrics=["accuracy"] - performance measure during training and validation. This will track accuracy through out the training process.


In [37]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(  
    monitor='val_loss',      # Metric to monitor  
    patience=5,              # Number of epochs to wait before stopping  
    restore_best_weights=True # Restore the model weights from the epoch with the best value of the monitored quantity  
) 

In [38]:
model.fit(np.array(train_x), np.array(train_y), epochs=500, batch_size=32,  verbose=1) #fitting the model
#model.fit() - this methods starts the training proces
#np.array(train_x), np.array(train_y) - converts the training input features and corresponding labels into a numpy arrays(if not already in that format).
#epochs=50 - the number of times the entire training dataset will pass through the model.
#batch_size=5 - number of samples per gradient update. smaller batch provides regualar updates but can lead to more noisy gradients.
#verbose=1 - this controls the verbosity of the output during training. Setting it to 1 shows progress updates.


Epoch 1/500
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.0098 - loss: 4.4971
Epoch 2/500
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.0462 - loss: 4.4740
Epoch 3/500
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.1205 - loss: 4.4529
Epoch 4/500
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.1745 - loss: 4.4304
Epoch 5/500
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.1975 - loss: 4.4083
Epoch 6/500
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.2028 - loss: 4.3840
Epoch 7/500
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.2241 - loss: 4.3532
Epoch 8/500
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.2038 - loss: 4.3334
Epoch 9/500
[1m189/189[0m [32

<keras.src.callbacks.history.History at 0x288792445f0>

In [49]:
model.save('chatbot_model.keras') #Saves the trained model to the specifies file path. the model can later be reloaded to make predictions without needing to retain.