## Persiapan Data

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Read a plain-text word list (one entry per line)
def load_words(filename):
    """Return the lines of a UTF-8 text file as a list of strings.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one string per line of the file, with the line
        terminators removed. Blank lines are kept as empty strings.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents.splitlines()

# Read the slang-word mapping, which is stored on disk as JSON
def load_slang_words(filename):
    """Parse a UTF-8 encoded JSON file and return the decoded object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The object the JSON document decodes to. This notebook calls
        ``.keys()`` and ``.values()`` on the result, so the file is
        expected to hold a JSON object (decoded to a ``dict``).
    """
    import json

    with open(filename, 'r', encoding='utf-8') as handle:
        return json.load(handle)

# Load the word lists and the slang mapping from disk
root_words = load_words('../Dataset/indonesian_word/combined_root_words.txt')
stop_words = load_words('../Dataset/indonesian_word/combined_stop_words.txt')
slang_dict = load_slang_words('../Dataset/indonesian_word/combined_slang_words.txt')

# Merge every source (root words, stop words, slang forms and their
# replacements) into one de-duplicated corpus for vocabulary building.
# The result is sorted on purpose: iterating a set of strings yields a
# different order on each interpreter run (hash randomization), and this order
# is what the tokenizer is fitted on and what the consecutive-word training
# pairs are built from. Sorting keeps both reproducible across kernel restarts.
all_words = sorted(set(root_words + stop_words + list(slang_dict.keys()) + list(slang_dict.values())))

# Fit the tokenizer on the merged corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_words)

# Token -> integer id mapping learned by the tokenizer
word_index = tokenizer.word_index
# One extra slot on top of the learned ids: the Keras tokenizer does not
# assign id 0, which pad_sequences uses as its default padding value.
vocab_size = len(word_index) + 1

## Membuat Model Embedding

In [2]:
# Model hyperparameters
embedding_dim = 50

# Assemble the network one layer at a time:
#   token ids -> embedding vectors -> mean over the sequence axis
#   -> hidden dense layer -> softmax over the whole vocabulary
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=None,  # sequence length is left unspecified
    )
)
model.add(tf.keras.layers.GlobalAveragePooling1D())
# Extra hidden layer for flexibility
model.add(tf.keras.layers.Dense(64, activation='relu'))
# Output layer; can be changed to suit the task
model.add(tf.keras.layers.Dense(vocab_size, activation='softmax'))

# Print the layer-by-layer summary
model.summary()

# Configure training: the labels passed to fit() are integer ids, hence the
# sparse variant of categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 50)          1465950   
                                                                 
 global_average_pooling1d (G  (None, 50)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 64)                3264      
                                                                 
 dense_1 (Dense)             (None, 29319)             1905735   
                                                                 
Total params: 3,374,949
Trainable params: 3,374,949
Non-trainable params: 0
_________________________________________________________________


## Training Model

In [3]:
import numpy as np

# Convert every corpus entry to a list of token ids, then pad at the end so
# all rows share the same length
sequences = tokenizer.texts_to_sequences(all_words)
padded_sequences = pad_sequences(sequences, padding='post')

# Simple training objective: each entry is paired with the entry that follows
# it in the corpus, and the label is the first token id of that next entry.
# Inputs are rows 0..n-2; labels come from column 0 of rows 1..n-1.
train_inputs = padded_sequences[:-1]
train_labels = np.array(padded_sequences[1:, 0])

# Train on the combined data, holding out 20% of the samples for validation
model.fit(
    x=train_inputs,
    y=train_labels,
    epochs=10,
    validation_split=0.2,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x216eb892aa0>

In [5]:
# Destination files for the two exported formats
h5_path = '../saved_model/base_model_saved/base_model_02/flexible_embedding_model.h5'
tflite_path = '../saved_model/base_model_saved/base_model_02/flexible_embedding_model.tflite'

# Save the Keras model (the .h5 extension selects the HDF5 format)
model.save(h5_path)

# Convert the in-memory Keras model to a TensorFlow Lite flatbuffer
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Write the converted model's bytes to disk
with open(tflite_path, 'wb') as tflite_file:
    tflite_file.write(tflite_model)


INFO:tensorflow:Assets written to: C:\Users\gabri\AppData\Local\Temp\tmpmtcnen0k\assets


INFO:tensorflow:Assets written to: C:\Users\gabri\AppData\Local\Temp\tmpmtcnen0k\assets


In [14]:
from tensorflow.keras.utils import plot_model

# Render the model architecture to a PNG file, including tensor shapes and
# layer names. This needs the pydot package and graphviz to be installed.
plot_model(
    model,
    to_file='model_architecture.png',
    show_shapes=True,
    show_layer_names=True,
)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.
