In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the word-pair corpus from a path relative to the working directory.
# Later cells read its "pan" and "malay" columns, and get_char_set() discards
# the first column as a row index.
corpus_df = pd.read_csv("res/pan2malay.csv")

```
1, aba, aba-h
2, abaŋ, abaŋ-an
...
```

↓

`[aba, aba-h, abaŋ, abaŋ-an...]`

↓

`abaaba-habaŋabaŋ-an...`

↓

Then collect the set of unique characters in that string — this is the alphabet used for the one-hot encoding below.

In [3]:
def get_char_set(df=None):
    """Return the set of unique characters used by the words in a corpus.

    Args:
        df: DataFrame whose first column is a row index and whose remaining
            columns hold words. Defaults to the notebook-level ``corpus_df``.

    Returns:
        set[str]: every distinct character that appears in any word column.
        Missing cells (NaN / None) are skipped, so the letters of their
        string form ("nan", "None") do not leak into the alphabet.
    """
    if df is None:
        df = corpus_df
    # Drop the first column (the row index) by position, then flatten the
    # remaining word columns into one 1-D sequence.
    words = df.iloc[:, 1:].to_numpy().ravel()
    return set("".join(str(word) for word in words if not pd.isna(word)))

In [4]:
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

In [6]:
encoder_mapping = list(get_char_set())
one_hot = tf.eye(len(encoder_mapping))

def encode(c):
    """Return the one-hot row for character ``c``.

    The row is selected by the position of ``c`` in ``encoder_mapping``; a
    character that is not in the mapping raises ``ValueError`` (from
    ``list.index``).
    """
    position = encoder_mapping.index(c)
    return one_hot[position]

def decode(arr):
    """Map a score vector back to a character.

    Takes the position of the largest entry of ``arr`` (``np.argmax``) and
    returns the character stored at that position in ``encoder_mapping``.
    """
    best_index = np.argmax(arr)
    return encoder_mapping[best_index]

# Round-trip check: encode one character, show its vector, then decode it back.
vec_of_ng = encode("ŋ")
print(vec_of_ng, decode(vec_of_ng), sep="\n")

tf.Tensor(
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.], shape=(52,), dtype=float32)
ŋ


In [24]:
# Number of character slots per encoded word. text_encoder right-pads shorter
# words with spaces up to this length, and text_decoder reads exactly this
# many rows from each matrix.
WORD_MAX_LEN = 128

def text_encoder(doc):
    """Encode every word in ``doc`` as a matrix of one-hot rows.

    Each word is right-padded with spaces to ``WORD_MAX_LEN`` characters and
    every character is passed through ``encode``; the rows are stacked with
    ``tf.stack``.

    Returns:
        A Python list with one stacked tensor per word (not a single batched
        tensor).
    """
    def word_to_matrix(word):
        padded = word.ljust(WORD_MAX_LEN, " ")
        return tf.stack([encode(ch) for ch in padded])

    return [word_to_matrix(word) for word in doc]

def text_decoder(matrix_list):
    """Turn a sequence of per-word matrices back into strings.

    For each matrix the first ``WORD_MAX_LEN`` rows are decoded one by one
    with ``decode`` and joined. Whitespace is then stripped from BOTH ends of
    the result, which removes the padding (and any leading spaces too).

    Returns:
        list[str]: one decoded word per input matrix.
    """
    def matrix_to_word(matrix):
        chars = (decode(matrix[row]) for row in range(WORD_MAX_LEN))
        return "".join(chars).strip()

    return [matrix_to_word(matrix) for matrix in matrix_list]

# Demo: encode the first five "pan" words and decode them again.
print(corpus_df["pan"][:5])

doc_encoding_example = text_encoder(corpus_df["pan"][:5])
print("\nencode batch of words into matrix", doc_encoding_example, sep="\n")

doc_decoding_example = text_decoder(doc_encoding_example)
print("\ninterpret the guess matrix back into word", doc_decoding_example, sep="\n")

0     aba
1    abaŋ
2     adu
3    aduq
4    agas
Name: pan, dtype: object

encode batch of words into matrix
[<tf.Tensor: shape=(128, 52), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>, <tf.Tensor: shape=(128, 52), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>, <tf.Tensor: shape=(128, 52), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0.

In [30]:
# Per-position character classifier.
# Input: one word as a (WORD_MAX_LEN, alphabet size) one-hot matrix.
# Dense layers act on the last axis only, so every character position is
# mapped independently with shared weights (see the (None, 128, k) shapes in
# the summary) — the model sees no context from neighbouring characters.
model = keras.Sequential()
model.add(keras.Input(shape=(WORD_MAX_LEN, len(encoder_mapping))))
model.add(keras.layers.Dense(32, activation='relu'))
model.add(keras.layers.Dense(16, activation='relu'))
# Output is decoded with argmax over the alphabet, i.e. it is a per-position
# class distribution: use softmax rather than ReLU (a ReLU output unit can
# get stuck at 0 with zero gradient) ...
model.add(keras.layers.Dense(len(encoder_mapping), activation='softmax'))
# ... and pair it with categorical cross-entropy, the matching loss for
# one-hot targets, instead of mean squared error.
model.compile(optimizer='sgd', loss='categorical_crossentropy')
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 128, 32)           1696      
                                                                 
 dense_13 (Dense)            (None, 128, 16)           528       
                                                                 
 dense_14 (Dense)            (None, 128, 52)           884       
                                                                 
Total params: 3,108
Trainable params: 3,108
Non-trainable params: 0
_________________________________________________________________


In [53]:
from sklearn.utils import shuffle

# Reproducible 80/20 train/test split over the rows that have no missing word.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

# Drop incomplete rows first, then shuffle with a fixed seed so that a
# Restart & Run All yields the same split.
shuffled_corpus = shuffle(corpus_df.dropna(), random_state=SPLIT_SEED)

# The split point must come from the cleaned frame: using the row count of the
# original corpus_df (before dropna) over-sizes train and shrinks test.
split_at = int(TRAIN_FRACTION * len(shuffled_corpus))
train = shuffled_corpus.iloc[:split_at]
test = shuffled_corpus.iloc[split_at:]

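# TODO(review): training call is disabled. text_encoder returns a Python list
# of per-word tensors; Keras treats a list passed as x / y as multiple
# inputs / outputs, so each argument probably needs tf.stack(...) first — confirm.
# model.fit(
#     text_encoder(train["malay"]),
#     text_encoder(train["pan"]),
#     epochs=4, validation_split=0.2)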

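# Scratch experiment (disabled): uses the first encoded Malay word as the
# input and the second encoded Malay word as the target.
# model.fit(
#     text_encoder(train["malay"])[0],
#     text_encoder(train["malay"])[1]
# )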

<tf.Tensor: shape=(128, 52), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>