In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from nltk.translate.bleu_score import corpus_bleu

In [2]:
df = pd.read_csv("en-ha.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,source_sentence,target_sentence
0,0,And We have placed in the earth firm mountains...,Kuma Mun sanya tabbatattun duwãtsu a cikin ƙas...
1,1,"“ YOU saw a lot of casual clothing , especiall...",WATA jarida da aka rubuta da yaren Dutch ta kw...
2,2,How will God relieve mankind of suffering ?,Ta yaya Allah zai cire wahalar da ’ yan adam s...
3,3,Jesus felt compassion for the people he preach...,Yesu ya ji tausayin mutanen da yake yi wa wa’a...
4,4,Every soul shall have a taste of death : And o...,Kõwane rai mai ɗanɗanar mutuwa ne . Kuma ana c...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351024 entries, 0 to 351023
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Unnamed: 0       351024 non-null  int64 
 1   source_sentence  347383 non-null  object
 2   target_sentence  346963 non-null  object
dtypes: int64(1), object(2)
memory usage: 8.0+ MB


In [5]:

# Droping the unnamed column
df = df.drop(columns=['Unnamed: 0'])

# Renaming the columns
df = df.rename(columns={
    'source_sentence': 'english',
    'target_sentence': 'hausa'
})

# Droping rows where 'english' or 'hausa' have missing values
df = df.dropna(subset=['english', 'hausa'])

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 343323 entries, 0 to 351023
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   english  343323 non-null  object
 1   hausa    343323 non-null  object
dtypes: object(2)
memory usage: 7.9+ MB


In [7]:
#Head
df.head()

Unnamed: 0,english,hausa
0,And We have placed in the earth firm mountains...,Kuma Mun sanya tabbatattun duwãtsu a cikin ƙas...
1,"“ YOU saw a lot of casual clothing , especiall...",WATA jarida da aka rubuta da yaren Dutch ta kw...
2,How will God relieve mankind of suffering ?,Ta yaya Allah zai cire wahalar da ’ yan adam s...
3,Jesus felt compassion for the people he preach...,Yesu ya ji tausayin mutanen da yake yi wa wa’a...
4,Every soul shall have a taste of death : And o...,Kõwane rai mai ɗanɗanar mutuwa ne . Kuma ana c...


In [8]:
# Split the data into training and testing sets
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42
)

In [9]:
len(train_df)

274658

In [10]:
len(test_df)

68665

In [11]:
#Checking for missing values
train_df.isnull().sum()

Unnamed: 0,0
english,0
hausa,0


In [12]:
test_df.isnull().sum()

Unnamed: 0,0
english,0
hausa,0


In [13]:
def preprocessing(text):
    # lowercase
    text = text.lower()
    # removing special characters
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [14]:
train_df['english'] = train_df['english'].apply(preprocessing)
train_df['hausa'] = train_df['hausa'].apply(preprocessing)

In [15]:
test_df['english'] = test_df['english'].apply(preprocessing)
test_df['hausa'] = test_df['hausa'].apply(preprocessing)

In [16]:
# Spliting the test set into testing and validation sets
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)

In [17]:
len(test_df)

34332

In [18]:
len(val_df)

34333

In [19]:
# tokenizing and converting to sequences
tokenizer_eng = Tokenizer()
tokenizer_ha = Tokenizer()

tokenizer_eng.fit_on_texts(train_df['english'])
tokenizer_ha.fit_on_texts(train_df['hausa'])

train_sequences_eng = tokenizer_eng.texts_to_sequences(train_df['english'])
train_sequences_ha = tokenizer_ha.texts_to_sequences(train_df['hausa'])

val_sequences_eng = tokenizer_eng.texts_to_sequences(val_df['english'])
val_sequences_ha = tokenizer_ha.texts_to_sequences(val_df['hausa'])

test_sequences_eng = tokenizer_eng.texts_to_sequences(test_df['english'])
test_sequences_ha = tokenizer_ha.texts_to_sequences(test_df['hausa'])

In [20]:
# Getting the vocabulary sizes for the english and the hausa words ie the unique words
vocab_size_eng = len(tokenizer_eng.word_index) + 1
vocab_size_ha = len(tokenizer_ha.word_index) + 1

print("English Vocabulary Size: ", vocab_size_eng)
print("Hausa Vocabulary Size: ", vocab_size_ha)

English Vocabulary Size:  38457
Hausa Vocabulary Size:  33920


In [21]:
embedding_dim = 64
latent_dim = 128

In [None]:
# Getting the max seq, then doing post-padding in order to obtain thesame length
# max_eng = 100
# max_ha = 100
max_eng = max(len(seq) for seq in train_sequences_eng)
max_ha = max(len(seq) for seq in train_sequences_ha)

train_padded_eng = pad_sequences(train_sequences_eng, maxlen=max_eng, padding='post')
train_padded_ha = pad_sequences(train_sequences_ha, maxlen=max_ha, padding='post')

val_padded_eng = pad_sequences(val_sequences_eng, maxlen=max_eng, padding='post')
val_padded_ha = pad_sequences(val_sequences_ha, maxlen=max_ha, padding='post')

test_padded_eng = pad_sequences(test_sequences_eng, maxlen=max_eng, padding='post')
test_padded_ha = pad_sequences(test_sequences_ha, maxlen=max_ha, padding='post')

In [38]:
# for the encoder
encoder_inputs = Input(shape=(max_eng,), name='encoder_inputs')
encoder_embedding = Embedding(input_dim=vocab_size_eng, output_dim=embedding_dim, mask_zero=True, name='encoder_embedding')(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True, name='encoder_lstm')
encoder_outputs, hidden_state, cell_state = encoder_lstm(encoder_embedding)
encoder_states = [hidden_state, cell_state]

# for the decoder
decoder_inputs = Input(shape=(max_ha,), name='decoder_inputs')
decoder_embedding = Embedding(input_dim=vocab_size_ha, output_dim=embedding_dim, mask_zero=True, name='decoder_embedding')(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_ha, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

In [23]:
# Creation of Seq2Seq model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [24]:
# Model compilation
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [25]:
model.summary()

In [36]:
# pre-padding the target data ie adding 0.0 before in order to ensure equal length
train_target_ha = tf.keras.preprocessing.sequence.pad_sequences(
    train_padded_ha[:, 1:], maxlen=max_ha, padding='post'
)
val_target_ha = tf.keras.preprocessing.sequence.pad_sequences(
    val_padded_ha[:, 1:], maxlen=max_ha, padding='post'
)
test_target_ha = tf.keras.preprocessing.sequence.pad_sequences(
    test_padded_ha[:, 1:], maxlen=max_ha, padding='post'
)

In [40]:
encoder_lstm = LSTM(latent_dim, return_state=True, name='encoder_lstm', use_cudnn=False)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm', use_cudnn=False)


In [41]:
# training the seq2seq_model
history = model.fit(
    [train_padded_eng, train_padded_ha], train_target_ha,
    epochs=15,
    batch_size=32,
    validation_data=([val_padded_eng, val_padded_ha], val_target_ha)
)

Epoch 1/15
[1m2618/8584[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m11:24[0m 115ms/step - accuracy: 0.8551 - loss: 4.7777

InvalidArgumentError: Graph execution error:

Detected at node functional_1/encoder_lstm_1/Assert/Assert defined at (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code

  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start

  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start

  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 685, in <lambda>

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 738, in _run_callback

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 825, in inner

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 786, in run

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 361, in process_one

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 539, in execute_request

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/ipkernel.py", line 302, in do_execute

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/zmqshell.py", line 539, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "<ipython-input-27-4d89e91c565c>", line 2, in <cell line: 2>

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/backend/tensorflow/trainer.py", line 318, in fit

  File "/usr/local/lib/python3.10/dist-packages/keras/src/backend/tensorflow/trainer.py", line 121, in one_step_on_iterator

  File "/usr/local/lib/python3.10/dist-packages/keras/src/backend/tensorflow/trainer.py", line 108, in one_step_on_data

  File "/usr/local/lib/python3.10/dist-packages/keras/src/backend/tensorflow/trainer.py", line 51, in train_step

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/layers/layer.py", line 882, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/ops/operation.py", line 46, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/models/functional.py", line 175, in call

  File "/usr/local/lib/python3.10/dist-packages/keras/src/ops/function.py", line 171, in _run_through_graph

  File "/usr/local/lib/python3.10/dist-packages/keras/src/models/functional.py", line 556, in call

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/layers/layer.py", line 882, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/ops/operation.py", line 46, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/layers/rnn/lstm.py", line 570, in call

  File "/usr/local/lib/python3.10/dist-packages/keras/src/layers/rnn/rnn.py", line 406, in call

  File "/usr/local/lib/python3.10/dist-packages/keras/src/layers/rnn/lstm.py", line 537, in inner_loop

  File "/usr/local/lib/python3.10/dist-packages/keras/src/backend/tensorflow/rnn.py", line 841, in lstm

  File "/usr/local/lib/python3.10/dist-packages/keras/src/backend/tensorflow/rnn.py", line 874, in _cudnn_lstm

  File "/usr/local/lib/python3.10/dist-packages/keras/src/backend/tensorflow/rnn.py", line 557, in _assert_valid_mask

assertion failed: [You are passing a RNN mask that does not correspond to right-padded sequences, while using cuDNN, which is not supported. With cuDNN, RNN masks can only be used for right-padding, e.g. `[[True, True, False, False]]` would be a valid mask, but any mask that isn\'t just contiguous `True`\'s on the left and contiguous `False`\'s on the right would be invalid. You can pass `use_cudnn=False` to your RNN layer to stop using cuDNN (this may be slower).]
	 [[{{node functional_1/encoder_lstm_1/Assert/Assert}}]] [Op:__inference_one_step_on_iterator_3073]