<a href="https://colab.research.google.com/github/Dh0t/Tugas/blob/main/Tuags_NLP_PTM11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import tensorflow as tf
import numpy as np
import random
import string

# Persiapan Data

input_texts = [
    "satu", "dua", "tiga", "empat", "lima",
    "satu dua", "dua tiga", "tiga empat", "empat lima",
    "satu dua tiga", "dua tiga empat", "tiga empat lima"
]

target_texts = [
    "01", "10", "11", "001", "010",
    "0110", "1011", "11001", "001010",
    "011011", "1011001", "11001010"
]

# Target sequence
target_texts_input = ['\t' + text for text in target_texts] # Input decoder
target_texts_output = [text + '\n' for text in target_texts] # Output decoder

# Karakter unik untuk input dan target
input_characters = sorted(list(set(' '.join(input_texts))))
target_characters = sorted(list(set(''.join(target_texts_input + target_texts_output))))

num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)

max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts]) + 2

print(f"Jumlah sampel: {len(input_texts)}")
print(f"Karakter unik encoder: {num_encoder_tokens}")
print(f"Karakter unik decoder: {num_decoder_tokens}")
print(f"Panjang sequence encoder maksimum: {max_encoder_seq_length}")
print(f"Panjang sequence decoder maksimum: {max_decoder_seq_length}")

# Untuk mapping karakter ke indeks integer
input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

# Inisialisasi array untuk data input, target input, dan target output (one-hot encoding)
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32'
)
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32'
)
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32'
)

# Isi array dengan data one-hot
for i, (input_text, target_text_input, target_text_output) in enumerate(zip(input_texts, target_texts_input, target_texts_output)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    for t, char in enumerate(target_text_input):
        decoder_input_data[i, t, target_token_index[char]] = 1.
    for t, char in enumerate(target_text_output):
        if t > 0:
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.

# Membangun Model Seq2Seq (Encoder-Decoder)

# Encoder
encoder_inputs = tf.keras.Input(shape=(None, num_encoder_tokens))
encoder_lstm = tf.keras.layers.LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = tf.keras.Input(shape=(None, num_decoder_tokens))
decoder_lstm = tf.keras.layers.LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = tf.keras.layers.Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Definisi model
model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Kompilasi model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

# Ringkasan model
model.summary()

# Pelatihan Model
batch_size = 64
epochs = 100

print("\nMulai pelatihan model...")
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)
print("Pelatihan selesai.")

# Inferensi (Prediksi) Model

# Encoder Inferensi: Mengambil input dan menghasilkan state
encoder_model = tf.keras.Model(encoder_inputs, encoder_states)

# Decoder Inferensi: Mengambil state awal dan input token saat ini
decoder_state_input_h = tf.keras.Input(shape=(256,))
decoder_state_input_c = tf.keras.Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = tf.keras.Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

# Fungsi untuk melakukan decoding (menerjemahkan)
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.

    # Loop untuk menghasilkan karakter demi karakter
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Ambil token dengan probabilitas tertinggi
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Kondisi berhenti: jika mencapai token akhir atau panjang maksimum
        if (sampled_char == '\n' or
                len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update target sequence (input untuk langkah berikutnya)
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

print("\n--- Hasil Prediksi ---")
for seq_index in range(len(input_texts)):
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print(f"Input: {input_texts[seq_index]}")
    print(f"Target: {target_texts[seq_index]}")
    print(f"Prediksi: {decoded_sentence.strip()}")
    print("-" * 10)

Jumlah sampel: 12
Karakter unik encoder: 12
Karakter unik decoder: 4
Panjang sequence encoder maksimum: 15
Panjang sequence decoder maksimum: 10



Mulai pelatihan model...
Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.4333 - loss: 0.4795 - val_accuracy: 0.3667 - val_loss: 0.9622
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 725ms/step - accuracy: 0.1556 - loss: 0.4740 - val_accuracy: 0.3333 - val_loss: 0.9539
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 299ms/step - accuracy: 0.1444 - loss: 0.4703 - val_accuracy: 0.3333 - val_loss: 0.9452
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step - accuracy: 0.1333 - loss: 0.4668 - val_accuracy: 0.3333 - val_loss: 0.9331
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step - accuracy: 0.1333 - loss: 0.4624 - val_accuracy: 0.3333 - val_loss: 0.9068
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step - accuracy: 0.1333 - loss: 0.4527 - val_accuracy: 0.3333 - val_loss: 0.8288
Epoch 7/100