In [None]:
%pip install numpy matplotlib seaborn scikit nltk tensorflow keras gensim

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, TimeDistributed
from tensorflow.keras.layers import Embedding

In [None]:
# Example POS (part-of-speech) dataset in Javanese.
# `sentences` holds one token list per sentence; `tags` holds the matching
# tag list, one tag per token, in the same order.

sentences = [
    ['Bapak', 'ngaso', 'wonten', 'griya'],
    ['Bapak', 'badhe', 'tumbas', 'buku', 'ing', 'toko'],
    # Tokens must not carry surrounding whitespace, otherwise " beras" and
    # "beras" would be treated as two different words.
    ["aku", "tuku", "beras"],
    ["dheweke", "mangan", "nasi"],
]

# NOTE(review): 'Bapak' is tagged NOUN in the first sentence and PRON in the
# second — confirm which label is intended.
tags = [
    ['NOUN', 'VERB', 'ADP', 'NOUN'],
    ['PRON', 'AUX', 'VERB', 'NOUN', 'ADP', 'NOUN'],
    ["PRON", "VERB", "NOUN"],
    ["PRON", "VERB", "NOUN"],
]

# Every sentence needs exactly one tag per token; zip() below would silently
# truncate a mismatched pair, so fail loudly instead.
assert len(sentences) == len(tags), "sentences and tags must have the same length"
for sentence, tag_list in zip(sentences, tags):
    assert len(sentence) == len(tag_list), f"token/tag count mismatch in {sentence}"

# Combine the words and tags into a list of sentences of (word, tag) pairs.
tagged_sentences = [list(zip(sentence, tag_list))
                    for sentence, tag_list in zip(sentences, tags)]

print(tagged_sentences)

In [None]:
# Save the dataset to a text file: each sample takes two consecutive lines,
# the space-joined words followed by the space-joined tags.

with open('pos_tags_sentences.txt', 'w', encoding='utf-8') as out_file:
    for words, word_tags in zip(sentences, tags):
        print(' '.join(words), file=out_file)
        print(' '.join(word_tags), file=out_file)

In [None]:
# Function that reads the dataset back from a text file.

def read_sentences_tags(filepath):
    """Load tokenized sentences and their tags from a two-lines-per-sample file.

    The file is expected to alternate between a line of whitespace-separated
    words and a line of whitespace-separated tags for those words. Blank
    lines (for example a trailing newline added by an editor) are ignored.

    Args:
        filepath: Path of the UTF-8 encoded text file to read.

    Returns:
        A ``(sentences, tags)`` tuple of two lists of token lists, where
        ``tags[i]`` is the tag line that followed ``sentences[i]``.

    Raises:
        ValueError: If a sentence line has no tag line after it (the file
            holds an odd number of non-blank lines).
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        # Tokenize every non-blank line; str.split() with no argument already
        # discards leading/trailing whitespace and the newline.
        rows = [line.split() for line in file if line.strip()]

    if len(rows) % 2:
        raise ValueError(
            f"{filepath}: expected sentence/tag line pairs, "
            f"but found {len(rows)} non-blank lines"
        )

    # Even rows are sentences, odd rows are their tags.
    sentences = rows[0::2]
    tags = rows[1::2]
    return sentences, tags

In [None]:
# Load the dataset back from the text file using read_sentences_tags.

filepath = 'pos_tags_sentences.txt'
sentences, tags = read_sentences_tags(filepath)
print(f"Loaded Sentences: {sentences}")
print(f"Loaded Tags: {tags}")

In [None]:
# Inspect the dataset: pair every word with its tag, sentence by sentence.

tagged_sentences = [
    list(zip(sentence, tag_list))
    for sentence, tag_list in zip(sentences, tags)
]

print(tagged_sentences)

In [None]:
# Lower-case every word, on the assumption that letter case does not
# influence the tag.

sentences = [list(map(str.lower, sublist)) for sublist in sentences]
print(sentences)

In [None]:
# Convert words and tags into integer tokens, because the model cannot work
# with raw strings.
# Later on, a more advanced technique is introduced: word embeddings, which
# can capture similarity between words.

# Tokenizers for sentences and tags.
# - The word tokenizer gets an explicit out-of-vocabulary token, so unseen
#   words at inference time map to that index instead of being dropped
#   (dropping them would shift the word/tag alignment).
# - The tag tokenizer must not lower-case, otherwise its index_word mapping
#   would return 'noun' instead of the dataset's 'NOUN'.
word_tokenizer = Tokenizer(oov_token='<OOV>')
tag_tokenizer = Tokenizer(lower=False)

# Fit tokenizers on data (inputs are already lists of tokens, one list per sentence)
word_tokenizer.fit_on_texts(sentences)
tag_tokenizer.fit_on_texts(tags)

# Convert sentences and tags into sequences of integer indices
X_train = word_tokenizer.texts_to_sequences(sentences)
y_train = tag_tokenizer.texts_to_sequences(tags)

# Example output after tokenizing
print("Tokenized sentences:", X_train)
print("Tokenized tags:", y_train)

In [None]:
# Padding brings every sentence to the same size, since the number of words
# may differ from one sentence to the next.

# Longest tokenized sentence determines the common length
max_len = max(map(len, X_train))

# Shared options so inputs and labels are padded identically
pad_options = dict(maxlen=max_len, padding='pre', truncating="post")
X_train_padded = pad_sequences(X_train, **pad_options)
y_train_padded = pad_sequences(y_train, **pad_options)

print("Padded sentences:", X_train_padded)
print("Padded tags:", y_train_padded)

In [None]:
# Number of distinct indices for words and tags. The +1 accounts for index 0,
# which the Keras Tokenizer never assigns and which pad_sequences uses as filler.
n_words = len(word_tokenizer.word_index)
n_tags = len(tag_tokenizer.word_index)
vocab_size, tag_size = n_words + 1, n_tags + 1

print(f"{vocab_size} {tag_size}")

In [None]:
# The RNN architecture starts here; this is the part to tweak for optimisation.

# Seed Python, NumPy and the Keras backend so that weight initialisation and
# batch shuffling — and therefore the training result — are reproducible
# across Restart & Run All.
keras.utils.set_random_seed(42)

# Dimensionality of the embedding space
embedding_dim = 100

# NOTE(review): the Embedding layer is created without mask_zero, so padded
# positions (index 0) are presumably treated as ordinary time steps and count
# towards the loss/accuracy as class 0 — confirm this is intended.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, trainable=True),  # Use the embedding layer as the first layer
    SimpleRNN(units=64, return_sequences=True),  # RNN layer; return_sequences=True yields one output per time step
    TimeDistributed(Dense(tag_size, activation='softmax'))  # Output layer: tag distribution for each token
])

# Labels are integer tag indices (not one-hot), hence the sparse loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Train the model on the padded inputs (X_train_padded) and padded tag
# sequences (y_train_padded).
model.fit(X_train_padded, y_train_padded, batch_size=2, epochs=40)

In [None]:
# Testing: prepare one sentence for inference.

test_sentence = ["aku tuku beras"]
test_seq = word_tokenizer.texts_to_sequences(test_sentence)
# Inference input must be preprocessed exactly like the training data, which
# is padded with padding='pre' and truncating="post"; a different padding side
# would place the words at positions the model never saw them in.
test_padded = pad_sequences(test_seq, maxlen=max_len, padding='pre', truncating="post")

In [None]:
# Predict a tag distribution for every position of the padded test sentence.
prediction = model.predict(test_padded)
# argmax over the last axis picks the most probable tag index per position;
# [0] selects the single sentence in the batch.
predicted_indices = np.argmax(prediction, axis=-1)[0]

# Keep only the positions that hold a real token (padding is index 0), so the
# predicted tags line up one-to-one with the words regardless of padding side.
token_positions = test_padded[0] != 0

# Translate indices to tags; .get avoids a KeyError for index 0, which the
# tag tokenizer never assigns.
predicted_tags = [tag_tokenizer.index_word.get(i, 'UNK') for i in predicted_indices[token_positions]]

print("Test Sentence:", test_sentence)
print("Predicted Tags:", predicted_tags)