### **Import Library**

In [1]:
import xml.etree.ElementTree as ET
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
import matplotlib.pyplot as plt

2024-06-26 15:17:10.247689: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-26 15:17:10.584138: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-26 15:17:11.907270: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### **Parsing XML Data**

In [2]:
def parse_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    kanji_data = []
    for child in root:
        data = {
            'id': int(child.find('id').text),
            'Comp_Word': child.find('Comp__Word').text,
            'Frequency': int(child.find('Frequency').text),
            'Grammatical_Feature': child.find('Grammatical_Feature').text,
            'Pronunciation': child.find('Pronunciation').text,
            'English_Translation': child.find('English_Translation').text,
            'Position': child.find('Position').text,
            'Kanji': child.find('Kanji').text,
            'KanjiID': int(child.find('KanjiID').text),
        }
        kanji_data.append(data)
    return kanji_data

file_path = 'Jukugo.xml'
data = parse_xml(file_path)

print(f"Jumlah data yang berhasil diparsing: {len(data)}")

Jumlah data yang berhasil diparsing: 52791


### **Menghapus data dengan value None**

In [3]:
kanji_data = [d for d in data if d['Comp_Word'] is not None and d['English_Translation'] is not None]

kanji_words = [d['Comp_Word'] for d in kanji_data]
english_translations = [d['English_Translation'] for d in kanji_data]

### **Tokenization**

In [4]:
tokenizer_kanji = Tokenizer(char_level=True)
tokenizer_kanji.fit_on_texts(kanji_words)
kanji_sequences = tokenizer_kanji.texts_to_sequences(kanji_words)
kanji_padded = pad_sequences(kanji_sequences, padding='post')

tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(english_translations)
eng_sequence = tokenizer_eng.texts_to_sequences(english_translations)
eng_padded = pad_sequences(eng_sequence, padding='post')

### **Membuat Model**

In [5]:
embedding_dim = 256
units = 512

kanji_input = Input(shape=(None,), dtype='int32', name='kanji_input')
embedding = Embedding(input_dim=len(tokenizer_kanji.word_index) + 1, output_dim=embedding_dim)(kanji_input)
lstm_1 = LSTM(units, return_sequences=True, return_state=True)
encoder_ouputs, state_h, state_c = lstm_1(embedding)
encoder_states = [state_h, state_c]

eng_input = Input(shape=(None,), dtype='int32', name='eng_input')
eng_embedding = Embedding(input_dim=len(tokenizer_eng.word_index) + 1, output_dim=embedding_dim)(eng_input)
lstm_2 = LSTM(units, return_sequences=True, return_state=True)
decoder_ouputs, _, _ = lstm_2(eng_embedding, initial_state=encoder_states)
dense = Dense(len(tokenizer_eng.word_index) + 1, activation='softmax')
output = dense(decoder_ouputs)

model = Model([kanji_input, eng_input], output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

### **Melatih Model**

In [6]:
target_data = np.expand_dims(eng_padded, -1)
model.fit([kanji_padded, eng_padded], target_data, batch_size=64, epochs=100, validation_split=0.2)

Epoch 1/100


2024-06-26 15:17:28.964234: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1480589312 exceeds 10% of free system memory.


[1m  1/660[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:36:29[0m 9s/step - loss: 9.3295

2024-06-26 15:17:33.691905: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1480589312 exceeds 10% of free system memory.


[1m  2/660[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m42:48[0m 4s/step - loss: 9.2951  

2024-06-26 15:17:37.534484: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1480589312 exceeds 10% of free system memory.


[1m  3/660[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m41:49[0m 4s/step - loss: 9.2366

2024-06-26 15:17:41.303424: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1480589312 exceeds 10% of free system memory.


[1m  4/660[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m41:24[0m 4s/step - loss: 9.1287

2024-06-26 15:17:45.014037: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1480589312 exceeds 10% of free system memory.


[1m350/660[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m19:01[0m 4s/step - loss: 1.3509

### **Evaluasi Model**

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()