In [None]:
import json
import numpy as np
import random
import pickle
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, LSTM, Embedding, Flatten
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook requests, in the same order as before.
for nltk_resource in ('punkt', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)

# Lemmatizer used to normalise words
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Read the intents dataset (a JSON document) from disk.
with open("merged_dataset.json", mode="r", encoding="utf-8") as dataset_file:
    intents = json.load(dataset_file)


In [None]:
# Tokens listed here are dropped from the vocabulary.
ignore_words = ["?", "!", ".","(",")",",","'"]

# documents: one (token list, tag) pair per pattern.
documents = []
raw_tokens = []

# Walk every pattern of every intent and tokenize it.
for intent in intents['intents']:
    tag = intent['tag']
    for pattern in intent['patterns']:
        tokens = nltk.word_tokenize(pattern)
        raw_tokens += tokens
        documents.append((tokens, tag))

# Vocabulary: drop ignored tokens, lower-case, lemmatize, de-duplicate, sort.
words = sorted({
    lemmatizer.lemmatize(token.lower())
    for token in raw_tokens
    if token not in ignore_words
})

# Labels: every tag that contributed at least one pattern, de-duplicated and sorted.
classes = sorted({doc_tag for _, doc_tag in documents})


In [None]:
# Persist the vocabulary and the label list as pickle files.
for filename, payload in (("words.pkl", words), ("classes.pkl", classes)):
    with open(filename, "wb") as pickle_file:
        pickle.dump(payload, pickle_file)

In [None]:
# Vocabulary size: number of unique lower-cased, lemmatized tokens kept in `words`.
len(words)


735

In [None]:
# Number of distinct intent tags; this is also the width of the model's softmax output.
# NOTE(review): the recorded run shows 1540 tags while the training log shows 99 batches
# of 32 (at most 3168 patterns) — confirm the tags are meant to be this fine-grained.
len(classes)

1540

In [None]:
# Build the training data: one (bag-of-words vector, one-hot label) pair per pattern.
#
# Inputs (from earlier cells): `documents` as (token list, tag) pairs, `words` as the
# sorted vocabulary, `classes` as the sorted tag list, and `lemmatizer`.
# Outputs: `train_x` with one row per pattern and one column per vocabulary word,
# `train_y` with one row per pattern and one column per class.
output_empty = [0] * len(classes)

# Map each tag to its column once, instead of a linear classes.index() per document.
class_index = {tag: position for position, tag in enumerate(classes)}

training = []
for tokens, tag in documents:
    # Normalise tokens the same way the vocabulary was built so they can match;
    # a set makes each vocabulary lookup constant-time.
    pattern_lemmas = {lemmatizer.lemmatize(token.lower()) for token in tokens}
    bag = [1 if word in pattern_lemmas else 0 for word in words]

    # One-hot output label
    output_row = list(output_empty)
    output_row[class_index[tag]] = 1
    training.append([bag, output_row])

# Shuffle so the sample order no longer follows the dataset's intent order.
random.shuffle(training)

# Split features and labels directly from the shuffled pairs; this avoids slicing a
# ragged object array and also works when `documents` is empty.
train_x = np.array([features for features, _ in training])
train_y = np.array([label for _, label in training])

# Retained in case a later cell still expects `training` in its object-array form.
training = np.array(training, dtype=object)

In [None]:
# Feed-forward intent classifier: bag-of-words vector in, softmax over intent tags out.
# The input width is declared with an explicit Input layer rather than `input_shape=`
# on the first Dense layer, which Keras warns against for Sequential models.
model = Sequential([
    Input(shape=(train_x.shape[1],)),    # one feature per vocabulary word
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(train_y.shape[1], activation='softmax')    # one unit per class
])

model.summary()
model.input_shape


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


(None, 735)

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss (labels are
# one-hot rows), and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the model for 30 epochs in mini-batches of 32, printing per-epoch progress.
model.fit(x=train_x, y=train_y, batch_size=32, epochs=30, verbose=1)

Epoch 1/30
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 32ms/step - accuracy: 0.0030 - loss: 7.3232
Epoch 2/30
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.0196 - loss: 6.5247
Epoch 3/30
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.1071 - loss: 5.4388
Epoch 4/30
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2529 - loss: 4.4157
Epoch 5/30
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3425 - loss: 3.6584
Epoch 6/30
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4392 - loss: 3.0435
Epoch 7/30
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5066 - loss: 2.5957
Epoch 8/30
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5458 - loss: 2.1620
Epoch 9/30
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x78956931ceb0>

In [None]:
# Write the trained model to disk under the same file name as before.
model.save(filepath="chatbot_model.h5")

