In [1]:
!pip install datasets



In [2]:
from datasets import load_dataset
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from nltk.translate.bleu_score import corpus_bleu

2024-06-14 07:16:44.640119: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-14 07:16:44.640172: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-14 07:16:44.799953: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-14 07:16:45.123816: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# load dataset

dataset = load_dataset("ccibeekeoc42/english_to_igbo")

In [4]:
# show dataset details

dataset

DatasetDict({
    train: Dataset({
        features: ['English', 'Igbo'],
        num_rows: 522322
    })
    test: Dataset({
        features: ['English', 'Igbo'],
        num_rows: 3296
    })
})

In [5]:
# convert training set to a pandas dataframe
train_df = dataset['train'].to_pandas()

# convert test set to a pandas dataframe
test_df = dataset['test'].to_pandas()

In [6]:
# display the first few rows of the training set
print(train_df.head())

                                             English  \
0  All this was happening amidst a barrage of reg...   
1  Soon after Jota was denied by the recovering D...   
2  Friday’s rout equalled Manchester United’s 9-0...   
3  There were over 70 million Nigerians on the hi...   
4  Diet and Dementia: How you will cure dementia ...   

                                                Igbo  
0  Ihe a niile na-eme n' etiti ọtụtụ twiit nke si...  
1  Ozigbo David Sanchez na-agbake agbake gọnahara...  
2  Asọmụmpi nke Fụraịde nke Manchester United ji ...  
3  E nwere ihe karịrị nde ndị Naịjirịa iri asaa n...  
4  Diet and Dementia: Etu ị ga-esi jiri chocolate...  


In [7]:
# display the first few rows of the test set
print(test_df.head())

                                             English  \
0  The latest report reaching us is that it's rem...   
1       Why did you leave your former place of work?   
2  Majozi is a politics and international affairs...   
3                Saraki: The police plan has changed   
4  'Ekechi said that they had about 40 videos whi...   

                                                Igbo  
0  Nke ọhụrụ na-eru anyị ntị ugbua na-ekwu na ọ o...  
1        Gịnị mere i ji hapụ ebe ị na-arụ n'oge mbu?  
2  Majozi bụ onye nyocha ndọrọ ndọrọ ọchịchị na o...  
3               Saraki: egwu ndị uweojii adagharịala  
4  Ekechi kwuru na ha nwere ihe onyonyo ruru 40 g...  


# Preprocessing

In [8]:
# check for missing values

print(train_df.isnull().sum())
print(test_df.isnull().sum())

English    0
Igbo       0
dtype: int64
English    0
Igbo       0
dtype: int64


In [9]:
def preprocess_text(text):
    # lowercasing
    text = text.lower()
    # removing special characters
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [10]:
train_df['English'] = train_df['English'].apply(preprocess_text)
train_df['Igbo'] = train_df['Igbo'].apply(preprocess_text)

In [11]:
test_df['English'] = test_df['English'].apply(preprocess_text)
test_df['Igbo'] = test_df['Igbo'].apply(preprocess_text)

In [12]:
# split the training set into training and validation set
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [13]:
# tokenize and convert to sequences
tokenizer_eng = Tokenizer()
tokenizer_igbo = Tokenizer()

tokenizer_eng.fit_on_texts(train_df['English'])
tokenizer_igbo.fit_on_texts(train_df['Igbo'])

train_sequences_eng = tokenizer_eng.texts_to_sequences(train_df['English'])
train_sequences_igbo = tokenizer_igbo.texts_to_sequences(train_df['Igbo'])

val_sequences_eng = tokenizer_eng.texts_to_sequences(val_df['English'])
val_sequences_igbo = tokenizer_igbo.texts_to_sequences(val_df['Igbo'])

test_sequences_eng = tokenizer_eng.texts_to_sequences(test_df['English'])
test_sequences_igbo = tokenizer_igbo.texts_to_sequences(test_df['Igbo'])

In [14]:
# pad sequences to the same length
max_len_eng = max(max(len(seq) for seq in train_sequences_eng), max(len(seq) for seq in val_sequences_eng), max(len(seq) for seq in test_sequences_eng))
max_len_igbo = max(max(len(seq) for seq in train_sequences_igbo), max(len(seq) for seq in val_sequences_igbo), max(len(seq) for seq in test_sequences_igbo))

train_padded_eng = pad_sequences(train_sequences_eng, maxlen=max_len_eng, padding='post')
train_padded_igbo = pad_sequences(train_sequences_igbo, maxlen=max_len_igbo, padding='post')

val_padded_eng = pad_sequences(val_sequences_eng, maxlen=max_len_eng, padding='post')
val_padded_igbo = pad_sequences(val_sequences_igbo, maxlen=max_len_igbo, padding='post')

test_padded_eng = pad_sequences(test_sequences_eng, maxlen=max_len_eng, padding='post')
test_padded_igbo = pad_sequences(test_sequences_igbo, maxlen=max_len_igbo, padding='post')

In [15]:
# define vocabulary sizes
vocab_size_eng = len(tokenizer_eng.word_index) + 1
vocab_size_igbo = len(tokenizer_igbo.word_index) + 1

print(f'Vocabulary size (English): {vocab_size_eng}')
print(f'Vocabulary size (Igbo): {vocab_size_igbo}')

Vocabulary size (English): 63474
Vocabulary size (Igbo): 78827


In [16]:
# define model parameters
embedding_dim = 256
latent_dim = 512

# Model building

In [17]:
# encoder
encoder_inputs = Input(shape=(max_len_eng,), name='encoder_inputs')
encoder_embedding = Embedding(input_dim=vocab_size_eng, output_dim=embedding_dim, mask_zero=True, name='encoder_embedding')(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True, name='encoder_lstm')
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# decoder
decoder_inputs = Input(shape=(max_len_igbo,), name='decoder_inputs')
decoder_embedding = Embedding(input_dim=vocab_size_igbo, output_dim=embedding_dim, mask_zero=True, name='decoder_embedding')(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_igbo, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)


2024-06-14 07:17:34.015743: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [18]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [19]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [20]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_inputs (InputLayer  [(None, 1132)]               0         []                            
 )                                                                                                
                                                                                                  
 decoder_inputs (InputLayer  [(None, 2071)]               0         []                            
 )                                                                                                
                                                                                                  
 encoder_embedding (Embeddi  (None, 1132, 256)            1624934   ['encoder_inputs[0][0]']      
 ng)                                                      4                                   

# Model Training

In [21]:
# prep target data
train_target_igbo = tf.keras.preprocessing.sequence.pad_sequences(
    train_padded_igbo[:, 1:], maxlen=max_len_igbo, padding='pre', value=0.0
)
val_target_igbo = tf.keras.preprocessing.sequence.pad_sequences(
    val_padded_igbo[:, 1:], maxlen=max_len_igbo, padding='pre', value=0.0
)
test_target_igbo = tf.keras.preprocessing.sequence.pad_sequences(
    test_padded_igbo[:, 1:], maxlen=max_len_igbo, padding='pre', value=0.0
)

In [None]:
# train model
history = model.fit(
    [train_padded_eng, train_padded_igbo], train_target_igbo,
    epochs=20,
    batch_size=16,
    validation_data=([val_padded_eng, val_padded_igbo], val_target_igbo)
)

Epoch 1/20


2024-06-14 07:18:29.193383: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 10448045888 exceeds 10% of free system memory.
2024-06-14 07:18:32.665674: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 10448045888 exceeds 10% of free system memory.
2024-06-14 07:18:32.665741: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 10448045888 exceeds 10% of free system memory.


# Model evaluation

In [None]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(
    [test_padded_eng, test_padded_igbo[:, :-1]],
    test_target_igbo
)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")