In [1]:
import pandas as pd


# Load the dataset
dataset = pd.read_csv("verse_pairs.csv")

# Display the first few rows of the dataset
print(dataset.head(6))
dataset.shape

                                             lingala  \
0    Na ebandeli Nzambe azalisaki likoló mpe mabele.   
1  Kasi mabele ezalaki kaka bongobongo mpe ezalak...   
2  Mpe Nzambe alobaki ete: “Pole ezala.” Bongo po...   
3  Nsima na yango, Nzambe amonaki ete pole ezalak...   
4  Nzambe abengaki pole Moi, kasi abengaki molili...   
5  Na nsima, Nzambe alobaki ete: “Etando ezala ka...   

                                             english  
0  In the beginning God created the heavens and t...  
1  Now the earth was formless and desolate, and t...  
2  And God said: “Let there be light.” Then there...  
3  After that God saw that the light was good, an...  
4  God called the light Day, but the darkness he ...  
5  Then God said: “Let there be an expanse betwee...  


(31194, 2)

In [2]:
print(dataset)

                                                 lingala  \
0        Na ebandeli Nzambe azalisaki likoló mpe mabele.   
1      Kasi mabele ezalaki kaka bongobongo mpe ezalak...   
2      Mpe Nzambe alobaki ete: “Pole ezala.” Bongo po...   
3      Nsima na yango, Nzambe amonaki ete pole ezalak...   
4      Nzambe abengaki pole Moi, kasi abengaki molili...   
...                                                  ...   
31189  Elimo ná mwasi ya libala bazali kaka koloba et...   
31190  “Nazali koyebisa moto nyonso oyo azali koyoka ...   
31191  mpe soki moto alongoli liloba moko na maloba y...   
31192  “Moto oyo azali kotatola makambo oyo alobi ete...   
31193      Boboto monene ya Nkolo Yesu ezala na basantu.   

                                                 english  
0      In the beginning God created the heavens and t...  
1      Now the earth was formless and desolate, and t...  
2      And God said: “Let there be light.” Then there...  
3      After that God saw that the light wa

In [3]:
#TOKENIZATION
!pip install tensorflow

from tensorflow.keras.preprocessing.text import Tokenizer




In [4]:
# Initialize Tokenizer for Lingala and English
lingala_tokenizer = Tokenizer()
english_tokenizer = Tokenizer()

# Fit tokenizers on text data
lingala_tokenizer.fit_on_texts(dataset['lingala'])
english_tokenizer.fit_on_texts(dataset['english'])

# Convert text data to sequences of tokens
lingala_sequences = lingala_tokenizer.texts_to_sequences(dataset['lingala'])
english_sequences = english_tokenizer.texts_to_sequences(dataset['english'])


In [6]:
# PADDING SEQUENCES - To ensure all sequences have the same length, you'll need to pad the sequences. 
# Keras provides utilities for padding sequences using the pad_sequences function

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define maximum sequence length
max_sequence_length = 50  # Adjust as needed

# Pad sequences
padded_lingala_sequences = pad_sequences(lingala_sequences, maxlen=max_sequence_length, padding='post')
padded_english_sequences = pad_sequences(english_sequences, maxlen=max_sequence_length, padding='post')


In [8]:
#Train-Validation-Test Split
#Split the dataset into training, validation, and test sets

from sklearn.model_selection import train_test_split

# Split the dataset into train, validation, and test sets
train_lingala, test_lingala, train_english, test_english = train_test_split(padded_lingala_sequences, 
                                                                            padded_english_sequences, 
                                                                            test_size=0.1, 
                                                                            random_state=42)
train_lingala, val_lingala, train_english, val_english = train_test_split(train_lingala, 
                                                                          train_english, 
                                                                          test_size=0.1, 
                                                                          random_state=42)

# Check the shapes of the datasets
print("Train set shapes:", train_lingala.shape, train_english.shape)
print("Validation set shapes:", val_lingala.shape, val_english.shape)
print("Test set shapes:", test_lingala.shape, test_english.shape)



Train set shapes: (25266, 50) (25266, 50)
Validation set shapes: (2808, 50) (2808, 50)
Test set shapes: (3120, 50) (3120, 50)


Model Architecture

In [9]:
#Implement positional encoding to provide positional information to the model.

import numpy as np

def positional_encoding(max_seq_len, d_model):
    pos_enc = np.zeros((max_seq_len, d_model))
    for pos in range(max_seq_len):
        for i in range(0, d_model, 2):
            pos_enc[pos, i] = np.sin(pos / (10000 ** ((2 * i)/d_model)))
            pos_enc[pos, i+1] = np.cos(pos / (10000 ** ((2 * (i+1))/d_model)))
    return pos_enc

# Define maximum sequence length and model dimension
max_seq_len = 50  # Same as the padded sequence length
d_model = 128  # Adjust as needed

# Generate positional encodings
positional_encoding_matrix = positional_encoding(max_seq_len, d_model)


In [10]:
#Transformer architecture using Keras layers

from tensorflow.keras.layers import Input, Embedding, Dropout, Dense, LayerNormalization, MultiHeadAttention, Add
from tensorflow.keras.models import Model

# Define input shape
input_shape = (max_seq_len,)

# Input layers for Lingala and English sequences
lingala_input = Input(shape=input_shape, name='lingala_input')
english_input = Input(shape=input_shape, name='english_input')

# Embedding layers for Lingala and English
lingala_embedding = Embedding(input_dim=len(lingala_tokenizer.word_index)+1, output_dim=d_model, input_length=max_seq_len, name='lingala_embedding')(lingala_input)
english_embedding = Embedding(input_dim=len(english_tokenizer.word_index)+1, output_dim=d_model, input_length=max_seq_len, name='english_embedding')(english_input)

# Add positional encoding to the embeddings
lingala_embedding_with_pos = Add()([lingala_embedding, positional_encoding_matrix])
english_embedding_with_pos = Add()([english_embedding, positional_encoding_matrix])

# Encoder layers
num_layers = 4  # Adjust as needed
encoder_output = lingala_embedding_with_pos
for _ in range(num_layers):
    # Self-attention layer
    self_attention = MultiHeadAttention(num_heads=8, key_dim=d_model//8)(encoder_output, encoder_output)
    self_attention = Dropout(0.1)(self_attention)
    self_attention = Add()([encoder_output, self_attention])
    self_attention = LayerNormalization(epsilon=1e-6)(self_attention)
    
    # Feed-forward neural network
    ffn = Dense(512, activation='relu')(self_attention)
    ffn = Dense(d_model)(ffn)
    ffn = Dropout(0.1)(ffn)
    ffn = Add()([self_attention, ffn])
    encoder_output = LayerNormalization(epsilon=1e-6)(ffn)

# Decoder layers
decoder_output = english_embedding_with_pos
for _ in range(num_layers):
    # Self-attention layer
    self_attention = MultiHeadAttention(num_heads=8, key_dim=d_model//8)(decoder_output, decoder_output)
    self_attention = Dropout(0.1)(self_attention)
    self_attention = Add()([decoder_output, self_attention])
    self_attention = LayerNormalization(epsilon=1e-6)(self_attention)
    
    # Encoder-decoder attention layer
    enc_dec_attention = MultiHeadAttention(num_heads=8, key_dim=d_model//8)(self_attention, encoder_output)
    enc_dec_attention = Dropout(0.1)(enc_dec_attention)
    enc_dec_attention = Add()([self_attention, enc_dec_attention])
    enc_dec_attention = LayerNormalization(epsilon=1e-6)(enc_dec_attention)
    
    # Feed-forward neural network
    ffn = Dense(512, activation='relu')(enc_dec_attention)
    ffn = Dense(d_model)(ffn)
    ffn = Dropout(0.1)(ffn)
    ffn = Add()([enc_dec_attention, ffn])
    decoder_output = LayerNormalization(epsilon=1e-6)(ffn)

# Output layer
output = Dense(len(english_tokenizer.word_index)+1, activation='softmax')(decoder_output)

# Define the model
model = Model(inputs=[lingala_input, english_input], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()





Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 lingala_input (InputLayer)  [(None, 50)]                 0         []                            
                                                                                                  
 lingala_embedding (Embeddi  (None, 50, 128)              2269056   ['lingala_input[0][0]']       
 ng)                                                                                              
                                                                                                  
 add (Add)                   (50, 50, 128)                0         ['lingala_embedding[0][0]']   
                                                                                                  
 multi_head_attention (Mult  (50, 50, 128)                66048     ['add[0][0]',           

TRAINING

In [11]:
#Define Loss Function
#Since this is a Multiclass classification task (predicting the next word in the translated sequence), 
# you can use categorical cross-entropy as the loss function.


# Define the loss function
loss_function = 'sparse_categorical_crossentropy'


In [12]:
#Compile the Model
#Compile the model with an optimizer like Adam and the defined loss function. 
#Specify also additional metrics to monitor during training, such as accuracy.


from tensorflow.keras.optimizers import Adam

# Define the optimizer
optimizer = Adam(learning_rate=0.0001)  # You can adjust the learning rate as needed

# Compile the model
model.compile(optimizer=optimizer, loss=loss_function, metrics=['accuracy'])


In [16]:
#Train the model
#Train the Transformer model on the training data. Monitor training progress using metrics like validation loss. 
#Specify also the batch size and number of epochs for training.


# Define batch size and number of epochs
batch_size = 64
num_epochs = 10  # You can adjust the number of epochs as needed




In [18]:
import tensorflow as tf

# Enable eager execution
tf.config.run_functions_eagerly(True)

# Train the model
history = model.fit(
    {'lingala_input': train_lingala, 'english_input': train_english},
    train_english,  # Keep the target data as is
    validation_data=({'lingala_input': val_lingala, 'english_input': val_english}, val_english),  # Keep the validation target data as is
    batch_size=batch_size,
    epochs=num_epochs,
    verbose=1
)



Epoch 1/10


InvalidArgumentError: Exception encountered when calling layer 'add' (type Add).

{{function_node __wrapped__AddV2_device_/job:localhost/replica:0/task:0/device:CPU:0}} Incompatible shapes: [64,50,128] vs. [50,1,128] [Op:AddV2] name: 

Call arguments received by layer 'add' (type Add):
  • inputs=['tf.Tensor(shape=(64, 50, 128), dtype=float32)', 'tf.Tensor(shape=(50, 128), dtype=float32)']