In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from nltk.translate.bleu_score import corpus_bleu

In [8]:
df = pd.read_csv("eng-bassa.csv", encoding='latin-1')

In [9]:
df.head()

Unnamed: 0,english,bassa
0,"According to Exodus 23:9, how were Gods ancie...","Inoñnaga ni kaat Manyodi 23:9, lelaa litén li ..."
1,What has Shebnas experience taught you about ...,Kii yom i bi pémél Sébna i niiga we inyu bikod...
2,And he has such tender love for all who recei...,A gwé nlélém gwéha inyu ba bobasôna ba nleege...
3,TREASURES FROM GODS WORD | MARK 13-14,MASÔÔ MA NKUS MA BIBEL | MARKÔ 13-14
4,What does this prophecy teach us about Gods K...,Kii mbañ ini i niiga bés inyu Ane Djob?


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   english  49 non-null     object
 1   bassa    49 non-null     object
dtypes: object(2)
memory usage: 912.0+ bytes


In [11]:
# Split the data into training and testing sets
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42
)

In [12]:
len(train_df)

39

In [13]:
len(test_df)

10

In [14]:
#Checking for missing values
train_df.isnull().sum()

Unnamed: 0,0
english,0
bassa,0


In [15]:
test_df.isnull().sum()

Unnamed: 0,0
english,0
bassa,0


In [16]:
def preprocessing(text):
    # lowercase
    text = text.lower()
    # removing special cbassaracters
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [17]:
train_df['english'] = train_df['english'].apply(preprocessing)
train_df['bassa'] = train_df['bassa'].apply(preprocessing)

In [18]:
test_df['english'] = test_df['english'].apply(preprocessing)
test_df['bassa'] = test_df['bassa'].apply(preprocessing)

In [19]:
# Spliting the test set into testing and validation sets
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)

In [20]:
len(test_df)

5

In [21]:
len(val_df)

5

In [22]:
# tokenizing and converting to sequences
tokenizer_eng = Tokenizer()
tokenizer_bassa = Tokenizer()

tokenizer_eng.fit_on_texts(train_df['english'])
tokenizer_bassa.fit_on_texts(train_df['bassa'])

train_sequences_eng = tokenizer_eng.texts_to_sequences(train_df['english'])
train_sequences_bassa = tokenizer_bassa.texts_to_sequences(train_df['bassa'])

val_sequences_eng = tokenizer_eng.texts_to_sequences(val_df['english'])
val_sequences_bassa = tokenizer_bassa.texts_to_sequences(val_df['bassa'])

test_sequences_eng = tokenizer_eng.texts_to_sequences(test_df['english'])
test_sequences_bassa = tokenizer_bassa.texts_to_sequences(test_df['bassa'])

In [23]:
# Getting the max seq, then doing post-padding in order to obtain thesame length
max_eng = max(max(len(seq) for seq in train_sequences_eng), max(len(seq) for seq in val_sequences_eng), max(len(seq) for seq in test_sequences_eng))
max_bassa = max(max(len(seq) for seq in train_sequences_bassa), max(len(seq) for seq in val_sequences_bassa), max(len(seq) for seq in test_sequences_bassa))

train_padded_eng = pad_sequences(train_sequences_eng, maxlen=max_eng, padding='post')
train_padded_bassa = pad_sequences(train_sequences_bassa, maxlen=max_bassa, padding='post')

val_padded_eng = pad_sequences(val_sequences_eng, maxlen=max_eng, padding='post')
val_padded_bassa = pad_sequences(val_sequences_bassa, maxlen=max_bassa, padding='post')

test_padded_eng = pad_sequences(test_sequences_eng, maxlen=max_eng, padding='post')
test_padded_bassa = pad_sequences(test_sequences_bassa, maxlen=max_bassa, padding='post')

In [24]:
# Getting the vocabulary sizes for the english and the bassausa words ie the unique words
vocab_size_eng = len(tokenizer_eng.word_index) + 1
vocab_size_bassa = len(tokenizer_bassa.word_index) + 1

print("English Vocabulary Size: ", vocab_size_eng)
print("Bassa Vocabulary Size: ", vocab_size_bassa)

English Vocabulary Size:  286
Bassa Vocabulary Size:  265


In [25]:
embedding_dim = 256
latent_dim = 512

In [26]:
# for the encoder
encoder_inputs = Input(sbassape=(max_eng,), name='encoder_inputs')
encoder_embedding = Embedding(input_dim=vocab_size_eng, output_dim=embedding_dim, mask_zero=True, name='encoder_embedding')(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True, name='encoder_lstm')
encoder_outputs, hidden_state, cell_state = encoder_lstm(encoder_embedding)
encoder_states = [hidden_state, cell_state]

# for the decoder
decoder_inputs = Input(sbassape=(max_bassa,), name='decoder_inputs')
decoder_embedding = Embedding(input_dim=vocab_size_bassa, output_dim=embedding_dim, mask_zero=True, name='decoder_embedding')(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_bassa, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

In [27]:
# Creation of Seq2Seq model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [28]:
# Model compilation
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [29]:
model.summary()

In [30]:
# pre-padding the target data ie adding 0.0 before in order to ensure equal length
train_target_bassa = tf.keras.preprocessing.sequence.pad_sequences(
    train_padded_bassa[:, 1:], maxlen=max_bassa, padding='pre', value=0.0
)
val_target_bassa = tf.keras.preprocessing.sequence.pad_sequences(
    val_padded_bassa[:, 1:], maxlen=max_bassa, padding='pre', value=0.0
)
test_target_bassa = tf.keras.preprocessing.sequence.pad_sequences(
    test_padded_bassa[:, 1:], maxlen=max_bassa, padding='pre', value=0.0
)

In [35]:

history = model.fit(
    [train_padded_eng, train_padded_bassa],
    train_target_bassa,
    epochs=15,
    batch_size=16,
    validation_data=([val_padded_eng, val_padded_bassa], val_target_bassa)
)

Epoch 1/15
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 768ms/step - accuracy: 0.3486 - loss: 0.0186 - val_accuracy: 0.1545 - val_loss: 2.9044
Epoch 2/15
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 760ms/step - accuracy: 0.3845 - loss: 0.0179 - val_accuracy: 0.1545 - val_loss: 2.9131
Epoch 3/15
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 791ms/step - accuracy: 0.3919 - loss: 0.0171 - val_accuracy: 0.1545 - val_loss: 2.9081
Epoch 4/15
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1s/step - accuracy: 0.3486 - loss: 0.0164 - val_accuracy: 0.1500 - val_loss: 2.8836
Epoch 5/15
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 759ms/step - accuracy: 0.3692 - loss: 0.0162 - val_accuracy: 0.1545 - val_loss: 2.9018
Epoch 6/15
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 747ms/step - accuracy: 0.3658 - loss: 0.0158 - val_accuracy: 0.1545 - val_loss: 2.8714
Epoch 7/15
[1m3/3[0m [32m━━━━━━━━━━━━━━━

In [36]:
# evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(
    [test_padded_eng, test_padded_bassa],
    test_target_bassa
)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - accuracy: 0.1955 - loss: 3.5534
Test Loss: 3.5533623695373535
Test Accuracy: 0.19545455276966095
