In [1]:
import pandas as pd
import numpy as np

In [2]:
corpus_df = pd.read_csv("res/pan2malay.csv")

```
1, aba, aba-h
2, abaŋ, abaŋ-an
...
```

↓

`[aba, aba-h, abaŋ, abaŋ-an...]`

↓

`abaaba-habaŋabaŋ-an...`

↓

then get all the unique characters

In [3]:
def get_char_set():
    without_index = np.delete(corpus_df.to_numpy(), 0, 1)
    flat = without_index.flatten().astype("str")
    giant_string = "".join(flat)
    return set(giant_string)

In [4]:
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

In [5]:
encoder_mapping = list(get_char_set())
one_hot = tf.eye(len(encoder_mapping))

def encode(c):
    return one_hot[encoder_mapping.index(c)]

def decode(arr):
    return encoder_mapping[np.argmax(arr)]

vec_of_ng = encode("ŋ")
print(vec_of_ng)
print(decode(vec_of_ng))

tf.Tensor(
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.], shape=(52,), dtype=float32)
ŋ


In [13]:
WORD_MAX_LEN = 64

def text_encoder(doc):
    encoded = []
    for word in doc:
        word_matrix = tf.stack([encode(c) for c in word.ljust(WORD_MAX_LEN, " ")])
        encoded.append(word_matrix.numpy().flatten())
    return np.vstack(encoded)

def text_decoder(matrix_list):
    decoded = []
    for m in matrix_list:
        reshaped = m.reshape((WORD_MAX_LEN, len(encoder_mapping)))
        guess = "".join([decode(reshaped[i]) for i in range(WORD_MAX_LEN)])
        decoded.append(str.strip(guess))
    return decoded

print(corpus_df["pan"][:5])
doc_encoding_example = text_encoder(corpus_df["pan"][:5])
print("\nencode batch of words into matrix(flattened)")
print(doc_encoding_example)
doc_decoding_example = text_decoder(doc_encoding_example)
print("\ninterpret the guess matrix back into word")
print(doc_decoding_example)

0     aba
1    abaŋ
2     adu
3    aduq
4    agas
Name: pan, dtype: object

encode batch of words into matrix(flattened)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

interpret the guess matrix back into word
['aba', 'abaŋ', 'adu', 'aduq', 'agas']


In [131]:
model = keras.Sequential()
model.add(keras.Input(shape=(WORD_MAX_LEN*len(encoder_mapping), )))
model.add(keras.layers.Dense(128, activation='tanh'))
model.add(keras.layers.Dense(64, activation='tanh'))
model.add(keras.layers.Dense(WORD_MAX_LEN*len(encoder_mapping), activation='relu'))
model.compile(optimizer=keras.optimizers.Adam(0.01), loss='cosine_similarity')
model.summary()

Model: "sequential_27"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_80 (Dense)            (None, 128)               426112    
                                                                 
 dense_81 (Dense)            (None, 64)                8256      
                                                                 
 dense_82 (Dense)            (None, 3328)              216320    
                                                                 
Total params: 650,688
Trainable params: 650,688
Non-trainable params: 0
_________________________________________________________________


In [132]:
from sklearn.utils import shuffle

shuffled_corpus = shuffle(corpus_df).dropna()[:int(0.1 * corpus_df.shape[0])]
train = shuffled_corpus[:int(0.8 * shuffled_corpus.shape[0])]
test = shuffled_corpus[int(0.8 * shuffled_corpus.shape[0]):]

tf.stack(text_encoder(train["malay"]))

model.fit(
    text_encoder(train["malay"]), 
    text_encoder(train["pan"]), 
    epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1d3716d3d00>

In [133]:
pd.DataFrame({
    "Question": test["malay"].to_list(),
    "Prediction": text_decoder(
        model.predict(text_encoder(test["malay"]))),
    "CorrectAnswer": test["pan"].to_list()
})



Unnamed: 0,Question,Prediction,CorrectAnswer
0,pihak,*a*** *** ...,piqak
1,keréh,*a*** *** ...,keriq
2,titir,*a*** *** ...,tirtir
3,belimbiŋ,*a*** *** ...,baliŋbiŋ
4,tintiŋ,*a*** *** ...,tiŋtiŋ
5,jaŋkaŋ,*a*** *** ...,zaŋkaŋ
6,selap,*a*** *** ...,selep
7,lapus,*a*** *** ...,la(m)pus
8,taŋkis,*a*** *** ...,taŋkis
9,pukau,*a*** *** ...,pukaw
