# In [None]:

import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

# Load the English/French parallel corpus from CSV.
data = pd.read_csv('eng_-french.csv')
data.tail()  # preview of the last rows; no visible effect unless it is the last expression of a notebook cell

# Extract the English/French columns as two parallel lists of strings.
# read_csv converts empty cells (and markers such as "NA") to float NaN, which
# the Keras Tokenizer cannot process as text. Drop every pair that is missing
# either side in a single step so the two lists stay aligned index-for-index.
sentence_pairs = data[['English words/sentences', 'French words/sentences']].dropna()
english_sentences = sentence_pairs['English words/sentences'].astype(str).tolist()
french_sentences = sentence_pairs['French words/sentences'].astype(str).tolist()

# Text -> integer-id pipeline, run stage by stage for both languages.
# Each language gets its own Tokenizer, so the two vocabularies are independent.
english_tokenizer, french_tokenizer = Tokenizer(), Tokenizer()

# Stage 1: build each vocabulary from its own corpus.
english_tokenizer.fit_on_texts(english_sentences)
french_tokenizer.fit_on_texts(french_sentences)

# Stage 2: encode every sentence as a list of word indices.
english_sequences = english_tokenizer.texts_to_sequences(english_sentences)
french_sequences = french_tokenizer.texts_to_sequences(french_sentences)

# Stage 3: pad at the end ('post') so each language becomes one rectangular array.
english_padded_sequences = pad_sequences(english_sequences, padding='post')
french_padded_sequences = pad_sequences(french_sequences, padding='post')

# Model architecture: encoder LSTM -> repeated context vector -> decoder LSTM
# -> per-time-step softmax over the French vocabulary.

# Vocabulary sizes are len(word_index) + 1 because index 0 is used for padding.
english_vocab_size = len(english_tokenizer.word_index) + 1
french_vocab_size = len(french_tokenizer.word_index) + 1

# Number of decoder time steps. This name was previously referenced without
# ever being defined (NameError). It must equal the second dimension of the
# padded French targets so the model output lines up with the labels.
max_french_sentence_length = french_padded_sequences.shape[1]

model = tf.keras.Sequential([
    # Map English word ids to dense 256-d vectors.
    tf.keras.layers.Embedding(input_dim=english_vocab_size, output_dim=256),
    # Encoder: only the final output is kept (return_sequences is left at its default).
    tf.keras.layers.LSTM(256),
    # Feed the encoder output to the decoder once per target time step.
    tf.keras.layers.RepeatVector(max_french_sentence_length),
    # Decoder: emit one vector per target time step.
    tf.keras.layers.LSTM(256, return_sequences=True),
    # Per-step probability distribution over French word ids.
    tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(french_vocab_size, activation='softmax')
    ),
])

# Compile the model.
# The targets are integer word ids (not one-hot), hence the sparse loss.
# 'accuracy' must be registered here: without any metric, model.evaluate()
# returns only the scalar loss and the `loss, accuracy = ...` unpacking below fails.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the model; validation_split=0.2 reserves 20% of the samples for validation.
model.fit(
    english_padded_sequences,
    french_padded_sequences,
    batch_size=64,
    epochs=10,
    validation_split=0.2,
)

# Evaluate the model.
# NOTE(review): this scores the full dataset, including the samples used for
# training, so the numbers are optimistic — consider a separate held-out test set.
loss, accuracy = model.evaluate(english_padded_sequences, french_padded_sequences)




