In [25]:
import os
import random
import json
import time
import pickle
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer

# TF_CPP_MIN_LOG_LEVEL has to be in the environment *before* TensorFlow is
# first imported; assigning it after the import does not silence the startup logs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Only logs errors

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import Adam

# NLTK resources needed later: the punkt tokenizer data for nltk.word_tokenize
# and the WordNet corpus for WordNetLemmatizer.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /home/faaiz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/faaiz/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/faaiz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
# Parent directory of the current working directory; the data/ and model/
# paths used by the following cells are joined onto it.
work_dir = os.path.dirname(os.path.abspath(os.curdir))

# Single lemmatizer instance shared by the vocabulary and bag-of-words cells.
lemmatizer = WordNetLemmatizer()

In [26]:
# Load the intent definitions. The file is opened in a `with` block so the
# handle is closed deterministically, and decoded as UTF-8 explicitly so the
# result does not depend on the platform's default locale encoding.
with open(os.path.join(work_dir, 'data/intents.json'), encoding="utf-8") as intents_file:
  intents = json.load(intents_file)

words = []      # every token seen in any pattern (deduplicated and lemmatized below)
classes = []    # intent tags that have at least one pattern
documents = []  # (token list, tag) pairs, one per pattern
ignores = ["?", "!", ".", ","]

for intent in intents["intents"]:
  tag = intent["tag"]
  for pattern in intent["patterns"]:
    word_list = nltk.word_tokenize(pattern)
    words.extend(word_list)
    documents.append((word_list, tag))
    if tag not in classes:
      classes.append(tag)

# Vocabulary: lowercase + lemmatize, drop the punctuation tokens, sort for a stable order.
words = sorted(set(lemmatizer.lemmatize(word.lower()) for word in words if word not in ignores))
classes = sorted(set(classes))

# Persist the vocabulary and class list next to the model; context managers
# make sure both pickle files are flushed and closed.
with open(os.path.join(work_dir, "model/words.pkl"), "wb") as words_file:
  pickle.dump(words, words_file)
with open(os.path.join(work_dir, "model/classes.pkl"), "wb") as classes_file:
  pickle.dump(classes, classes_file)

In [27]:
# Build one (bag-of-words, one-hot label) pair per tokenized pattern.
training = []
output_empty = [0] * len(classes)

for pattern_tokens, tag in documents:
  # Lemmatize the pattern once into a set so each vocabulary lookup is O(1).
  # Punctuation tokens may still be present here, but they never match because
  # the vocabulary in `words` was built without them.
  pattern_lemmas = {lemmatizer.lemmatize(token.lower()) for token in pattern_tokens}
  word_bag = [1 if word in pattern_lemmas else 0 for word in words]

  output_row = list(output_empty)
  output_row[classes.index(tag)] = 1
  training.append([word_bag, output_row])

if not training:
  raise ValueError("No training samples were built: `documents` is empty.")

print(len(training))  # Total number of samples
print(len(training[0][0]), len(words))  # Length of each word bag (should be len(words))
print(len(training[0][1]), len(classes))  # Length of each output row (should be len(classes))

# NOTE: no seed is set in this cell, so the sample order differs between runs.
random.shuffle(training)

# Split features and labels from the shuffled Python list rather than from the
# object array: when len(words) == len(classes) NumPy builds a 3-D array and
# the column slices would no longer be plain lists of ints.
train_x = [bag for bag, _ in training]
train_y = [row for _, row in training]

# Kept as an object array under the same name, as in the original cell.
training = np.array(training, dtype=object)

208
185 185
20 20


In [28]:
# Feed-forward classifier: bag-of-words vector in, softmax over intent classes out.
n_features = len(train_x[0])
n_classes = len(train_y[0])

model = Sequential([
  Dense(128, input_shape=(n_features,), activation='relu'),  # First hidden layer
  Dropout(0.5),                                              # Drop 50% of activations during training
  Dense(64, activation='relu'),                              # Second hidden layer
  Dropout(0.5),                                              # Drop 50% of activations during training
  Dense(n_classes, activation='softmax'),                    # Output layer, one unit per class
])

model.compile(
  loss='categorical_crossentropy',
  optimizer=Adam(learning_rate=0.001),
  metrics=['accuracy'],
)

# Train on the full data set (no validation split), then save the model in HDF5 format.
features = np.array(train_x)
labels = np.array(train_y)
model.fit(features, labels, epochs=200, batch_size=5, verbose=1)
model.save(os.path.join(work_dir, "model/chatbot.h5"))

Epoch 1/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 859us/step - accuracy: 0.0634 - loss: 3.0443 
Epoch 2/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 853us/step - accuracy: 0.1028 - loss: 2.8877
Epoch 3/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 794us/step - accuracy: 0.1215 - loss: 2.7082   
Epoch 4/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 793us/step - accuracy: 0.2029 - loss: 2.5660   
Epoch 5/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 769us/step - accuracy: 0.2866 - loss: 2.4376
Epoch 6/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 828us/step - accuracy: 0.3264 - loss: 2.1428
Epoch 7/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 888us/step - accuracy: 0.3806 - loss: 1.9301
Epoch 8/200
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 833us/step - accuracy: 0.4477 - loss: 1.6620
Epoch 9/200
[1m42/42[0m

