In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model

In [2]:


file_path = 'EWE_ENGLISH.csv'

try:
    data = pd.read_csv(file_path)
except pd.errors.ParserError as e:
    print(f"Error reading CSV: {e}")

    chunksize = 1000
    for chunk in pd.read_csv(file_path, chunksize=chunksize, error_bad_lines=False):
        print(chunk.head())

In [3]:
data = pd.read_csv(file_path)

data = data.drop(columns=['Unnamed: 0'])

data.head()

Unnamed: 0,EWE,ENGLISH
0,Ne nyɔnu aɖe le evi dzim eye wo le kukum nɛ la...,﻿If a woman often loss his baby after he is bo...
1,Ŋkɔ sia nye na ŋkɔ si ke ame bubu tsɔna na ɖev...,"This name comes from another person, which mea..."
2,Ame si hɔ ɖevi la ƒlela tsona ƒome bubu me alo...,This person must not be part of the whole fami...
3,Kɔnua wo yina ale: evinɔ si ga dzi ɖevi bubu a...,The ceremony is done as follow: the family of ...
4,Ne ame aɖe vayina to afimagodzi he kɔ ɖevia la...,When somebody passes through the road and find...


In [4]:



# Separate input (English) and target (Ewe) texts
input_texts = data['ENGLISH'].astype(str).tolist()
target_texts = data['EWE'].astype(str).tolist()

# Tokenizer setup
input_tokenizer = tf.keras.preprocessing.text.Tokenizer()
target_tokenizer = tf.keras.preprocessing.text.Tokenizer()

input_tokenizer.fit_on_texts(input_texts)
target_tokenizer.fit_on_texts(target_texts)

input_sequences = input_tokenizer.texts_to_sequences(input_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)

max_input_len = max(len(seq) for seq in input_sequences)
max_target_len = max(len(seq) for seq in target_sequences)

input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

# Pad sequences
input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=max_input_len, padding='post')
target_sequences = tf.keras.preprocessing.sequence.pad_sequences(target_sequences, maxlen=max_target_len, padding='post')

# Define the model
def build_model(input_vocab_size, target_vocab_size, embedding_dim=256, units=512):
    encoder_inputs = Input(shape=(None,))
    enc_emb = Embedding(input_vocab_size, embedding_dim)(encoder_inputs)
    encoder_lstm = LSTM(units, return_state=True)
    encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
    encoder_states = [state_h, state_c]

    decoder_inputs = Input(shape=(None,))
    dec_emb_layer = Embedding(target_vocab_size, embedding_dim)
    dec_emb = dec_emb_layer(decoder_inputs)
    decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
    decoder_dense = Dense(target_vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return model

model = build_model(input_vocab_size, target_vocab_size)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()



Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 256)            5924864   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 256)            6206976   ['input_2[0][0]']             
                                                                                              

In [None]:
# Training
batch_size = 64
epochs = 50

target_sequences_input = target_sequences[:, :-1]
target_sequences_output = target_sequences[:, 1:]
target_sequences_output = np.expand_dims(target_sequences_output, -1)

history = model.fit([input_sequences, target_sequences_input], target_sequences_output,
                    batch_size=batch_size, epochs=epochs, validation_split=0.2)

# Save model
model.save('nmt_model.h5')

# Function to translate new sentences
def translate(sentence, model, input_tokenizer, target_tokenizer, max_input_len, max_target_len):
    sequence = input_tokenizer.texts_to_sequences([sentence])
    sequence = tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=max_input_len, padding='post')
    prediction = model.predict([sequence, sequence])
    target_sequence = [np.argmax(word) for word in prediction[0]]
    target_text = target_tokenizer.sequences_to_texts([target_sequence])[0]
    return target_text

# Example usage
sentence = "Hello, how are you?"
translation = translate(sentence, model, input_tokenizer, target_tokenizer, max_input_len, max_target_len)
print(f'Translation: {translation}')


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
  7/358 [..............................] - ETA: 4:57 - loss: 0.0769