# Install libraries and datasets

## Install libraries
Google's framework to build, train and deploy machine learning models at scale.

In [None]:
import numpy as np
!pip install --upgrade tensorflow

Libraries for natural language processing (NLTK, Gensim, spaCy) and visualization (Plotly).

In [None]:
!pip install nltk
!pip install gensim
!pip install spacy
!pip install plotly

### Import libraries

In [None]:
import os
import re
from collections import Counter

import pandas as pd
import plotly.express as px

from keras import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.src.layers import Embedding, LSTM, Dense, RepeatVector, TimeDistributed

from sklearn.model_selection import train_test_split

## Load datasets

In [None]:
# Every row is read into a single text column, so the separator only has to be
# something that never occurs inside a sentence. The tab escape is used here;
# the previous '/t' was parsed by the python engine as the regex "/t" (a slash
# followed by the letter t) rather than as a tab.
df_english = pd.read_csv('datasets/small_vocab_en.csv', header=None, sep='\t', names=['english'], engine='python')
df_french = pd.read_csv('datasets/small_vocab_fr.csv', header=None, sep='\t', names=['french'], engine='python')

In [None]:
# Display the English DataFrame (single 'english' column) loaded above
df_english

In [None]:
# Display the French DataFrame (single 'french' column) loaded above
df_french

### Check datasets
1. Check how many samples are in each dataset.

In [None]:
# The number of rows in each frame is the number of samples for that language
number_of_eng_samples, number_of_fr_samples = len(df_english), len(df_french)

print('Number of English samples: {}'.format(number_of_eng_samples))
print('Number of French samples: {}'.format(number_of_fr_samples))

2. Check if there are any Null values.

In [None]:
# isnull() flags every missing cell; the first sum counts per column,
# the second collapses those counts into one total per frame
english_null_total = df_english.isnull().sum().sum()
french_null_total = df_french.isnull().sum().sum()

print("Number of Null values in English dataset: " + str(english_null_total))
print("Number of Null values in French dataset: " + str(french_null_total))

3. Check memory usage of each dataset in bytes.

In [None]:
# Column / dtype summaries of both frames
df_english.info()
df_french.info()

# Total footprint of each frame: per-column usage summed up
english_memory_bytes = df_english.memory_usage().sum()
french_memory_bytes = df_french.memory_usage().sum()

print("Memory usage of English dataset: " + str(english_memory_bytes) + ' bytes')
print("Memory usage of French dataset: " + str(french_memory_bytes) + ' bytes')

4. Concatenate both datasets.

In [None]:
# Put the two single-column frames side by side, aligned on the row index,
# so each row holds an English entry and the French entry with the same index
df_both = pd.concat([df_english, df_french], axis='columns')
df_both

# Preprocess data

## Remove non-alphabetical characters

In [None]:
def remove_non_alphabetical_characters(text):
    """Return ``text`` without characters that are neither word characters nor whitespace.

    Despite the name, digits and underscores are kept, because the regex
    word-character class matches them. Whitespace is left untouched, so
    removing a symbol that stood between two spaces leaves both spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    unwanted_characters = re.compile(r'[^\w\s]')
    return unwanted_characters.sub('', text)

In [None]:
# Clean both language columns, overwriting each column with its cleaned version
for column_name in ('english', 'french'):
    df_both[column_name] = df_both[column_name].apply(remove_non_alphabetical_characters)

In [None]:
# Display the combined frame after the character-removal step
df_both

### Check how many unique words are in each dataset.

In [None]:
def get_unique_words(text, set_of_words):
    """Add every whitespace-separated word of ``text`` to ``set_of_words``.

    The set is modified in place and also returned, so the same set can be
    passed in repeatedly to accumulate the vocabulary of many sentences.

    Args:
        text: Sentence to split on whitespace.
        set_of_words: Set that collects the words.

    Returns:
        The same ``set_of_words`` object, now including the words of ``text``.
    """
    set_of_words.update(text.split())
    return set_of_words

In [None]:
set_of_unique_eng_words = set()
set_of_unique_fr_words = set()

# Feed every sentence through get_unique_words so each set accumulates
# the vocabulary of its language
for sentence in df_both['english']:
    get_unique_words(sentence, set_of_unique_eng_words)
for sentence in df_both['french']:
    get_unique_words(sentence, set_of_unique_fr_words)

print('Number of unique words in English dataset: {}'.format(len(set_of_unique_eng_words)))
print('Number of unique words in French dataset: {}'.format(len(set_of_unique_fr_words)))

Print unique words in each dataset.

In [None]:
# Print the full vocabulary sets collected above (sets are unordered,
# so the words appear in no particular order)
print('Unique words in English dataset: {}'.format(set_of_unique_eng_words))
print('Unique words in French dataset: {}'.format(set_of_unique_fr_words))

# Visualize data

Get all words in each dataset with their frequency.

In [None]:
# Flatten every sentence into one list of word occurrences per language
english_words = [word for sentence in df_both['english'] for word in sentence.split()]
french_words = [word for sentence in df_both['french'] for word in sentence.split()]

# Tally how often each word occurs in each language
english_words_counts = Counter(english_words)
french_words_counts = Counter(french_words)

Sort words in each dataset by their frequency.

In [None]:
# Rank words from most to least frequent. dict() accepts both the Counter
# built in the previous cell and the list of (word, count) pairs this cell
# leaves behind, so the cell can be re-run without failing on a list that
# has no .items().
english_words_counts = sorted(dict(english_words_counts).items(), key=lambda x: x[1], reverse=True)
french_words_counts = sorted(dict(french_words_counts).items(), key=lambda x: x[1], reverse=True)

# Split the ranked (word, count) pairs into parallel lists for plotting
english_words = [word for word, _ in english_words_counts]
english_words_freq = [count for _, count in english_words_counts]

french_words = [word for word, _ in french_words_counts]
french_words_freq = [count for _, count in french_words_counts]

Plot frequency of words in each dataset.

In [None]:
# Bar chart of English word frequencies (words are already sorted by frequency)
fig = px.bar(
    x=english_words,
    y=english_words_freq,
    title='Frequency of words in English dataset',
    labels={'x':'Words', 'y':'Frequency'},
)
fig.show()

# Same chart for the French words
fig = px.bar(
    x=french_words,
    y=french_words_freq,
    title='Frequency of words in French dataset',
    labels={'x':'Words', 'y':'Frequency'},
)
fig.show()

# Tokenize data

In [None]:
# Longest sentence (in whitespace-separated words) per language; these values
# are later used as the padding length and the model's sequence lengths
max_eng_sentence_length = int(max(len(sentence.split()) for sentence in df_both['english']))
max_fr_sentence_length = int(max(len(sentence.split()) for sentence in df_both['french']))

print('Maximum length of a sentence in English dataset: {}'.format(max_eng_sentence_length))
print('Maximum length of a sentence in French dataset: {}'.format(max_fr_sentence_length))


In [None]:
def tokenize_and_pad(text, max_length):
    """Fit a word-level tokenizer on ``text`` and encode ``text`` with it.

    Args:
        text: Collection of sentences; used both to fit the tokenizer and as
            the data that gets encoded.
        max_length: Length every encoded sequence is padded to.

    Returns:
        A tuple ``(tokenizer, sequences, padded_text)``: the fitted tokenizer,
        the integer sequences of varying length, and the same sequences after
        ``pad_sequences`` with ``padding='post'``.
    """
    word_tokenizer = Tokenizer(char_level=False)
    word_tokenizer.fit_on_texts(text)

    # Encode the sentences, then bring every sequence to the same length
    encoded = word_tokenizer.texts_to_sequences(text)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')

    return word_tokenizer, encoded, padded

In [None]:
# English is the model input (x), French is the target (y); each language gets
# its own tokenizer and is padded to its own maximum sentence length
x_tokenizer, x_sequences, x_padded_text = tokenize_and_pad(df_both['english'], max_eng_sentence_length)
y_tokenizer, y_sequences, y_padded_text = tokenize_and_pad(df_both['french'], max_fr_sentence_length)

Check tokenized and padded data of one sentence from each dataset

In [None]:
# Show the first sample of each language as text, as token ids, and padded
print('English sentence: {}'.format(df_both['english'][0]))
print('English sentence after tokenization: {}'.format(x_sequences[0]))
print('English sentence after padding: {}'.format(x_padded_text[0]))

print('\nFrench sentence: {}'.format(df_both['french'][0]))
print('French sentence after tokenization: {}'.format(y_sequences[0]))
print('French sentence after padding: {}'.format(y_padded_text[0]))

# Split data into training and testing sets

In [None]:
# Hold out 10% of the sentence pairs for testing; the fixed random_state
# makes the split reproducible across runs
x_train, x_test, y_train, y_test = train_test_split(x_padded_text, y_padded_text, test_size=0.1, random_state=42)

# Build and train the model

In [None]:
# Vocabulary sizes: one entry per word known to the tokenizer, plus one for
# index 0, which no word uses
english_vocab_size = len(x_tokenizer.word_index) + 1
french_vocab_size = len(y_tokenizer.word_index) + 1

# Encoder-decoder stack, declared in one go
model = Sequential([
    Embedding(english_vocab_size, 256, input_length=max_eng_sentence_length, mask_zero=True),
    # Encoder
    LSTM(256),
    # Decoder - change the dimensionality of the input from 2D to 3D
    RepeatVector(max_fr_sentence_length),
    LSTM(256, return_sequences=True),
    # One softmax over the French vocabulary per output position
    TimeDistributed(Dense(french_vocab_size, activation='softmax')),
])

# Integer (non one-hot) targets, hence the sparse categorical loss
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

### Change the dimensionality of the target from 2D to 3D

In [None]:
# Give the targets a trailing axis of size 1. Reshaping from the first two
# dimensions (instead of np.expand_dims) keeps this cell idempotent:
# re-running it leaves an already 3D array unchanged rather than adding
# yet another axis.
y_train = y_train.reshape(y_train.shape[0], y_train.shape[1], 1)
y_train.shape

## Train the model

In [None]:
# Train for 10 epochs in batches of 1024 samples; validation_split=0.1 sets
# aside a tenth of the training data for validation
model.fit(x_train, y_train, batch_size=1024, epochs=10, validation_split=0.1)

In [None]:
# Make sure the target directory exists so saving does not fail on a fresh checkout
os.makedirs('models', exist_ok=True)
model.save('models/weights.keras')

# Assess model performance

In [None]:
# Run the trained model on the held-out English test sequences
y_predict = model.predict(x_test)

In [None]:
def make_prediction(sentence, _y_tokenizer = y_tokenizer):
    """Translate one encoded sentence and return the predicted words as text.

    Args:
        sentence: Input passed straight to ``model.predict``; only the first
            item of the resulting predictions is decoded.
        _y_tokenizer: Tokenizer whose ``word_index`` maps target-language
            words to ids. Defaults to the ``y_tokenizer`` that exists when
            this function is defined.

    Returns:
        The most probable word of every output position, joined by single
        spaces. Id 0 is decoded as an empty string.
    """
    # Probabilities for every output position of the first (only) sample
    token_probabilities = model.predict(sentence)[0]

    # Invert the word -> id vocabulary; id 0 belongs to no word
    vocabulary = _y_tokenizer.word_index
    index_to_word = dict(zip(vocabulary.values(), vocabulary.keys()))
    index_to_word[0] = ''

    # Pick the highest-scoring id per position and turn it back into a word
    best_ids = np.argmax(token_probabilities, 1)
    return ' '.join(index_to_word[token_id] for token_id in best_ids)

In [None]:
def pad_to_text(padded_text, tokenizer):
    """Turn a sequence of token ids back into a space-separated string.

    Args:
        padded_text: Iterable of integer token ids.
        tokenizer: Object whose ``word_index`` maps words to ids.

    Returns:
        The words for the given ids joined by single spaces. Id 0 is decoded
        as an empty string, so each 0 still contributes a separator.
    """
    # Invert the word -> id vocabulary; id 0 belongs to no word
    vocabulary = tokenizer.word_index
    index_to_word = dict(zip(vocabulary.values(), vocabulary.keys()))
    index_to_word[0] = ''
    return ' '.join(index_to_word[token_id] for token_id in padded_text)

In [None]:
# Compare source, reference and predicted translation for the first 5 test samples
for sample_index in range(5):
    english_text = pad_to_text(x_test[sample_index], x_tokenizer)
    print('English sentence: {}'.format(english_text))

    french_text = pad_to_text(y_test[sample_index], y_tokenizer)
    print('French sentence: {}'.format(french_text))

    # Slice instead of index so the model receives a batch holding one sample
    predicted_text = make_prediction(x_test[sample_index:sample_index + 1])
    print('Predicted French sentence: {}'.format(predicted_text))
    print()