In [None]:
import sys
import keras
import numpy as np
import pandas as pd
import tensorflow as tf
from read_file import ReadFile
from schedule import CustomSchedule
from transformer import Transformer
from tokenizer import FrontmanTokenizer
from metrics import masked_loss, masked_accuracy

2025-05-18 23:24:08.185640: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747610648.342429      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747610648.389944      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Seed every RNG the pipeline may touch. `set_random_seed` seeds Python's
# `random`, NumPy and TensorFlow in one call, so Python-level shuffling is
# covered as well as weight initialisation and dropout.
seed = 42
tf.keras.utils.set_random_seed(seed)

In [4]:
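# Debugging aids (disabled). Uncomment to run tf.function-compiled code and
# tf.data pipelines eagerly so they can be stepped through; much slower.
# tf.config.run_functions_eagerly(True)
# tf.data.experimental.enable_debug_mode()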

In [5]:
# Record the framework and interpreter versions alongside the results.
print(
    f'Tensorflow version: {tf.__version__}',
    f'Python version: {sys.version}',
    sep='\n',
)

Tensorflow version: 2.18.0
Python version: 3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0]


Tensorflow version: 2.17.1


Python version: 3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]

In [None]:
# GNOME_file = 'data/train/GNOME_yo_en.tsv'

# Corpus files that are merged into the training/validation frames.
Government = 'data/Train.csv'
opus_file = 'data/train/opus_yo_en.tsv'
raw_file = 'data/raw_yo_en.tsv'
bible = 'data/train/bible_yo_en.csv'
JW300 = 'data/JW300_en-yo.csv'

# Reader configured to drop missing rows and duplicates.
reader = ReadFile(dropna=True, drop_duplicates=True)

# NOTE(review): delimiter=',' is passed although two of the inputs end in
# `.tsv` — confirm that ReadFile picks the right separator for those files.
corpus_files = [Government, opus_file, raw_file, bible, JW300]
train_df, valid_df = reader.read_file(
    corpus_files,
    delimiter=',',
    shuffle=True,
    split_ratio=0.1,
)

# Extra shuffle of the training split on top of `shuffle=True` above.
train_df = reader.shuffle_df(train_df)

In [7]:
# Show the training frame as the cell output to eyeball the Yoruba/English pairs.
train_df

Unnamed: 0,Yoruba,English
0,Pọ́ọ̀lù sọ pé: “A ti gbé Kristi dìde kúrò nínú...,Paul wrote that “Christ has been raised from t...
1,Fún àlàyé síwájú sí i lórí ìgbàgbọ́ Tertullian...,For further information on Tertullian’s belief...
2,"Torí náà, a ronú pé ó yẹ ká kọ́kọ́ tọ́jú owó p...",It was then that we had the discussion with th...
3,Bermuda amends ile-ifowopamọ igbese lati se oj...,Bermuda amends banking act to favor blockchain...
4,littlehorn wí pé,littlehorn says
...,...,...
541859,Gbogbo ẹ niyẹn. gbadun Iwakọ Booster Pro <UNK>...,That’s all. Enjoy Driver Booster Pro <UNK>Full...
541860,"Àìjẹ́ bẹ́ẹ̀, ìṣòro wa lè gbà wá lọ́kàn débi pé...","Otherwise, personal problems may cloud our spi..."
541861,"Òun àti Anny ìyàwó rẹ̀, àtàwọn ọmọbìnrin wọn m...","He and his wife, Anny, often served as auxilia..."
541862,"Ẹ gbọ́ ọ̀rọ̀ ẹnu mi ní ìfarabalẹ̀, jẹ́ kí ọ̀rọ...","Listen carefully to my speech, And to my decla..."


In [None]:
# Load the tokenizer from its saved model file; the same tokenizer instance is
# used for both the Yoruba and the English side further down.
tokenizer_model_path = 'tokenizer/model/yo_en_bpe.model'
tokenizer = FrontmanTokenizer(model_path=tokenizer_model_path)

In [None]:
def create_dataset(df, tokenizer, max_length=128, batch_size=128, drop_remainder=False, shuffle_size=False, cache=False,
                   source_col='Yoruba', target_col='English'):
    """Build a teacher-forcing ``tf.data`` pipeline from a parallel-text frame.

    Every element of the returned dataset has the structure
    ``((encoder_input, decoder_input), decoder_target)``:

    * ``encoder_input``  -- ``source_col`` sentences, encoded as-is.
    * ``decoder_input``  -- ``target_col`` sentences encoded with ``add_bos=True``.
    * ``decoder_target`` -- ``target_col`` sentences encoded with ``add_eos=True``.

    With the defaults the model learns Yoruba -> English. Swap ``source_col``
    and ``target_col`` to train the opposite direction.

    Args:
        df: Table indexed by column name; ``df[col].values.tolist()`` must
            return the list of sentences for that column.
        tokenizer: Object exposing ``special_encode(texts, max_length=...,
            truncation=..., padding=..., add_bos=/add_eos=...)``. Its return
            value is handed directly to ``from_tensor_slices``.
        max_length: Forwarded to the tokenizer as ``max_length`` (used together
            with ``truncation=True`` and ``padding=True``).
        batch_size: Batch size; a falsy value leaves the dataset unbatched.
        drop_remainder: Forwarded to ``Dataset.batch``.
        shuffle_size: Shuffle buffer size; a falsy value disables shuffling.
        cache: When truthy, cache the dataset before shuffling and batching.
        source_col: Column fed to the encoder.
        target_col: Column fed to the decoder (both input and target).

    Returns:
        The dataset with ``prefetch(tf.data.AUTOTUNE)`` applied last.
    """
    source_texts = df[source_col].values.tolist()
    target_texts = df[target_col].values.tolist()

    # Options shared by all three encodings.
    encode_kwargs = dict(max_length=max_length, truncation=True, padding=True)

    encoder_input = tokenizer.special_encode(source_texts, **encode_kwargs)
    # The decoder reads the BOS-prefixed target and is trained to emit the
    # EOS-terminated target (teacher forcing).
    decoder_input = tokenizer.special_encode(target_texts, add_bos=True, **encode_kwargs)
    decoder_target = tokenizer.special_encode(target_texts, add_eos=True, **encode_kwargs)

    dataset = tf.data.Dataset.from_tensor_slices(
        ((encoder_input, decoder_input), decoder_target)
    )

    # Cache first, so the shuffle below still produces a fresh order each epoch.
    if cache:
        dataset = dataset.cache()

    if shuffle_size:
        dataset = dataset.shuffle(shuffle_size)

    if batch_size:
        dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)

    return dataset.prefetch(tf.data.AUTOTUNE)

In [10]:
# Training pipeline: cached, then shuffled through a 100k-element buffer.
train_set = create_dataset(train_df, tokenizer, shuffle_size=100_000, cache=True)

# Validation pipeline: cached only; the order does not matter for evaluation.
valid_set = create_dataset(valid_df, tokenizer, cache=True)

# Display the element spec as the cell output.
train_set

I0000 00:00:1747610685.877050      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


<_PrefetchDataset element_spec=((TensorSpec(shape=(None, 128), dtype=tf.int32, name=None), TensorSpec(shape=(None, 128), dtype=tf.int32, name=None)), TensorSpec(shape=(None, 128), dtype=tf.int32, name=None))>

In [11]:
# Architecture hyperparameters.
# NOTE(review): `N` is presumably the number of encoder/decoder layers —
# confirm against transformer.py.
N, d_model, d_ffn = 4, 128, 512
num_heads, dropout_rate = 8, 0.2

# Source and target share the tokenizer, hence the same vocabulary size.
vocab_size = tokenizer.get_piece_size()

model = Transformer(
    N=N,
    d_model=d_model,
    d_ffn=d_ffn,
    num_heads=num_heads,
    dropout_rate=dropout_rate,
    input_vocab_size=vocab_size,
    target_vocab_size=vocab_size,
)

# Learning-rate schedule parameterised by the model width.
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

model.compile(
    optimizer=optimizer,
    loss=masked_loss,
    metrics=[masked_accuracy],
)

In [None]:
# Write a weights-only checkpoint at the end of every epoch.
weights_cbk = tf.keras.callbacks.ModelCheckpoint(
    filepath='models/weights/weight_epoch_{epoch:02d}.weights.h5',
    save_freq='epoch',
    save_weights_only=True,
)

# Keep the full model of the epoch with the lowest validation loss.
best_model_cbk = tf.keras.callbacks.ModelCheckpoint(
    filepath='models/pretrained/EnYo BestModel.keras',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
    verbose=1,
)

# Stop once validation accuracy has not improved by at least 0.01 for five
# epochs. Note that this watches accuracy while the checkpoint above watches loss.
early_stopping_cbk = tf.keras.callbacks.EarlyStopping(
    monitor='val_masked_accuracy',
    mode='max',
    min_delta=0.01,
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

In [13]:
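# Debugging aid (disabled): a 512-example subset of train_set, re-batched into
# four batches of 128, for quick smoke runs of model.fit.
# debug_dataset = train_set.unbatch().take(512).batch(128)
# debug_dataset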

In [None]:
# Keep the History object returned by fit(): it holds the per-epoch loss and
# metric values, so curves can be plotted later without repeating this long run.
history = model.fit(
    train_set,
    epochs=30,
    validation_data=valid_set,
    callbacks=[weights_cbk, best_model_cbk, early_stopping_cbk],
)

# Persist the model as it stands in memory once training has finished.
model.save("models/pretrained/EnYo BaseModel.keras")

Epoch 1/30


I0000 00:00:1747610770.436427      70 service.cc:148] XLA service 0x7fb770001f80 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1747610770.437437      70 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
W0000 00:00:1747610772.845551      70 assert_op.cc:38] Ignoring Assert operator compile_loss/masked_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
I0000 00:00:1747610774.266965      70 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1747610794.835181      70 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m4233/4234[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 321ms/step - loss: 6.8047 - masked_accuracy: 0.1532

W0000 00:00:1747612157.931432      70 assert_op.cc:38] Ignoring Assert operator compile_loss/masked_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m4234/4234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 327ms/step - loss: 6.8043 - masked_accuracy: 0.1532

W0000 00:00:1747612181.455124      72 assert_op.cc:38] Ignoring Assert operator compile_loss/masked_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
W0000 00:00:1747612230.342265      70 assert_op.cc:38] Ignoring Assert operator compile_loss/masked_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert



Epoch 1: val_loss improved from inf to 3.67855, saving model to best_model.keras
[1m4234/4234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1514s[0m 340ms/step - loss: 6.8040 - masked_accuracy: 0.1532 - val_loss: 3.6785 - val_masked_accuracy: 0.3937
Epoch 2/30
[1m4234/4234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 320ms/step - loss: 3.6213 - masked_accuracy: 0.3988
Epoch 2: val_loss improved from 3.67855 to 2.82996, saving model to best_model.keras
[1m4234/4234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1406s[0m 332ms/step - loss: 3.6212 - masked_accuracy: 0.3988 - val_loss: 2.8300 - val_masked_accuracy: 0.5029
Epoch 3/30
[1m4234/4234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 320ms/step - loss: 2.9608 - masked_accuracy: 0.4827
Epoch 3: val_loss improved from 2.82996 to 2.57344, saving model to best_model.keras
[1m4234/4234[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1405s[0m 332ms/step - loss: 2.9608 - masked_accuracy: 0.4827 - val_loss: 2.5734 

In [16]:
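# Sanity check (disabled): compare the trained model's weights with those of a
# reloaded copy. NOTE: `loaded_model` is not defined in the cells shown here;
# load a saved model into it before uncommenting.
# for i, (w1, w2) in enumerate(zip(model.weights, loaded_model.weights)):
#      layer_name = model.weights[i].name  # Get the layer name
#      print(f"\nLayer {i}: {layer_name}, w1.shape = {w1.shape}, w2.shape = {w2.shape}")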