In [66]:
import numpy as np
import json
import tensorflow as tf
import nltk
from nltk.stem import PorterStemmer
import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical

# Load the intent dataset. The code below reads dataset['intents'], where each
# intent carries a 'tag' string and a list of example 'questions'.
with open('dataset.json', 'r') as dataset_file:
    dataset = json.load(dataset_file)

intents = dataset['intents']

# One label per intent, in the order the intents appear in the file.
labels = [intent['tag'] for intent in intents]
# Position-in-file -> tag. Note this is keyed by dataset order, not by any
# sorted order of the tags.
label_dict = dict(enumerate(labels))

# X holds one token list per question; y holds the tag of the intent that
# question belongs to, so X[i] and y[i] always describe the same example.
X = [
    nltk.word_tokenize(question)
    for intent in intents
    for question in intent['questions']
]
y = [
    intent['tag']
    for intent in intents
    for _ in intent['questions']
]

# Flat list of every token seen (duplicates kept, same order as X).
token_set = [token for question_tokens in X for token in question_tokens]

print(X[:5])
print(y[:5])

stemmer = PorterStemmer()

# Build the vocabulary: the sorted, de-duplicated set of lower-cased stems of
# every token except the question mark.
# The comparison must be by value (`!=`). The previous `is not "?"` tested
# object identity, which only works when the interpreter happens to intern the
# string and triggers "SyntaxWarning: "is not" with a literal".
token_stems = sorted(
    {stemmer.stem(token.lower()) for token in token_set if token != "?"}
)

# Class ids used for training are positions in this sorted list of tags.
labels = sorted(labels)

# print(token_stems)
# print(labels)

# Stem every question the same way the vocabulary was stemmed so that
# membership tests against token_stems line up.
stemmed_questions = [
    [stemmer.stem(word.lower()) for word in question]
    for question in X
]

# Binary bag-of-words: one row per question, one column per vocabulary stem,
# 1 if the stem occurs in the question and 0 otherwise.
X_bags = np.array([
    [int(stem in question_stems) for stem in token_stems]
    for question_stems in stemmed_questions
])

# Integer class id of each example = position of its tag in `labels`.
y_labels = np.array([labels.index(tag) for tag in y])

print(X_bags.shape)

print(set(y_labels))
print(len(X_bags[5]))

# Derive the layer sizes from the data instead of hard-coding them, so the
# network still matches the dataset when intents or vocabulary change.
num_classes = len(labels)
vocab_size = X_bags.shape[1]

model = keras.Sequential([
    # Each example is fed as a sequence of ONE timestep whose feature vector is
    # the whole bag-of-words row: input shape (1, vocab_size).
    layers.Input(shape=(1, vocab_size)),
    layers.LSTM(8, activation='relu'),  # LSTM with 8 units
    layers.Dense(8, activation='relu'),
    layers.Dense(4, activation='relu'),
    layers.Dense(num_classes, activation='softmax'),  # one probability per intent
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# categorical_crossentropy expects one-hot targets.
y_labels_one_hot = to_categorical(y_labels, num_classes=num_classes)
# Add the length-1 time axis expected by the LSTM: (samples, 1, vocab_size).
X_bags_reshaped = X_bags.reshape((-1, 1, vocab_size))

# NOTE(review): validation_data is the training set itself, so val_loss and
# val_accuracy only mirror the training fit and say nothing about
# generalisation. Hold out separate examples if a real validation signal is
# needed.
model.fit(X_bags_reshaped, y_labels_one_hot, epochs=200, validation_data=(X_bags_reshaped, y_labels_one_hot))


[['What', 'is', 'espresso', '?'], ['what', 'drink', 'is', 'espresso', '?'], ['How', 'is', 'espresso', 'different', 'than', 'coffee', '?'], ['What', 'is', 'the', 'origin', 'of', 'espresso', '?'], ['What', 'makes', 'espresso', 'unique', 'compared', 'to', 'other', 'brewing', 'methods', '?']]
['espresso', 'espresso', 'espresso', 'espresso', 'espresso']
(150, 184)
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}
184
Epoch 1/200


  if token is not "?":


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 67ms/step - accuracy: 0.0874 - loss: 2.7072 - val_accuracy: 0.0667 - val_loss: 2.7049
Epoch 2/200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0726 - loss: 2.7052 - val_accuracy: 0.0800 - val_loss: 2.7032
Epoch 3/200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0787 - loss: 2.7030 - val_accuracy: 0.0800 - val_loss: 2.7013
Epoch 4/200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.0884 - loss: 2.7019 - val_accuracy: 0.0867 - val_loss: 2.6993
Epoch 5/200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.1244 - loss: 2.6999 - val_accuracy: 0.0933 - val_loss: 2.6968
Epoch 6/200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0819 - loss: 2.6994 - val_accuracy: 0.0933 - val_loss: 2.6939
Epoch 7/200
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x22d596bfc50>

In [67]:
# Persist everything needed to run inference later: the trained network, the
# vocabulary (one stem per line, in bag-of-words column order) and the
# class-id -> tag mapping.
model.save('model.keras')

with open("vocab.txt", "w") as file:
    file.writelines(f"{item}\n" for item in token_stems)

# The model was trained with class ids taken from positions in the SORTED
# `labels` list (labels.index(...)). The mapping built while reading the
# dataset is keyed by dataset order instead, which decodes predictions to the
# wrong tag whenever the dataset is not already alphabetical. Rebuild the
# mapping from `labels` so the saved file matches the model's output indices.
label_dict = {idx: label for idx, label in enumerate(labels)}

with open("label_dict.json", "w") as file:
    json.dump(label_dict, file, indent=4)



