# Machine Translation Project (English to French)

In [32]:
import collections
import numpy as np
import json

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Embedding, GRU, LSTM, Bidirectional, Dropout, Activation, TimeDistributed, RepeatVector
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

### Verify access to the GPU

In [2]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 5592121452488144900
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 2238133044
locality {
  bus_id: 1
  links {
  }
}
incarnation: 2611008541144377426
physical_device_desc: "device: 0, name: NVIDIA GeForce GTX 1650, pci bus id: 0000:01:00.0, compute capability: 7.5"
xla_global_id: 416903419
]


## Dataset

### Load Data

In [3]:
def load_data(path):
    input_file = path
    with open(input_file, "r") as f:
        data = f.read()
    return data.split('\n')

english_sentences = load_data('data/english')
french_sentences = load_data('data/french')

### Sample Data

In [4]:
english_sentences[:5]

['new jersey is sometimes quiet during autumn , and it is snowy in april .',
 'the united states is usually chilly during july , and it is usually freezing in november .',
 'california is usually quiet during march , and it is usually hot in june .',
 'the united states is sometimes mild during june , and it is cold in september .',
 'your least liked fruit is the grape , but my least liked is the apple .']

Structure of the Dataset

In [5]:
english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])
french_words_counter = collections.Counter([word for sentence in french_sentences for word in sentence.split()])

print('{} English words.'.format(len([word for sentence in english_sentences for word in sentence.split()])))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
print('"' + '" "'.join(list(zip(*english_words_counter.most_common(10)))[0]) + '"')

print()
print('{} French words.'.format(len([word for sentence in french_sentences for word in sentence.split()])))
print('{} unique French words.'.format(len(french_words_counter)))
print('10 Most common words in the French dataset:')
print('"' + '" "'.join(list(zip(*french_words_counter.most_common(10)))[0]) + '"')

1823250 English words.
227 unique English words.
10 Most common words in the English dataset:
"is" "," "." "in" "it" "during" "the" "but" "and" "sometimes"

1961295 French words.
355 unique French words.
10 Most common words in the French dataset:
"est" "." "," "en" "il" "les" "mais" "et" "la" "parfois"


### Preprocess
1. Tokenize the words into ids
2. Add padding to make all the sequences the same length.

In [6]:
def tokenize(x):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    return tokenizer.texts_to_sequences(x), tokenizer

In [7]:
def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen=length, padding='post')

In [8]:
def preprocess(x,y):
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    
    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)
    
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)
    
    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer = preprocess(english_sentences, french_sentences)

max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 344


## Models
- Model 1 is a simple RNN
- Model 2 is a Bidirectional RNN
- Model 3 is an Embedding RNN

### Ids Back to Text
The neural network will be translating the input to words ids, which isn't the final form we want.  We want the French translation.  The function `logits_to_text` will bridge the gab between the logits from the neural network to the French translation.  You'll be using this function to better understand the output of the neural network.

In [9]:
def logits_to_text(logits, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'
    
    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

### Model 1: RNN

In [10]:
def simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    
    #Hyperparameters
    learning_rate = 0.005
    
    # Build the layers
    model = Sequential()
    model.add(GRU(256, input_shape=input_shape[1:], return_sequences=True))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))
    
    # Compile model
    model.compile(loss = sparse_categorical_crossentropy,
                  optimizer = Adam(learning_rate),
                  metrics = ['accuracy'])
    
    return model

tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

#Train the neural network
simple_rnn_model = simple_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)

simple_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

Epoch 1/10

KeyboardInterrupt: 

In [None]:
# Print prediction(s)
print("Prediciton:")
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(french_sentences[:1])

print('\nOriginal text:')
print(english_sentences[:1])

Prediciton:
new jersey est parfois calme en l' et il est il est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."]

Original text:
['new jersey is sometimes quiet during autumn , and it is snowy in april .']


### Model 2: Bidirectional RNNs

In [None]:
def bd_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    
    #Hyperparameters
    learning_rate = 0.005
    
    # Build the layers
    model = Sequential()
    model.add(Bidirectional(GRU(128, return_sequences=True), input_shape=input_shape[1:]))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))
    
    # Compile model
    model.compile(loss = sparse_categorical_crossentropy,
                  optimizer = Adam(learning_rate),
                  metrics = ['accuracy'])
    
    return model

tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

# Train the neural network
bd_rnn_model = bd_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)

print(bd_rnn_model.summary())

bd_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 21, 256)          100608    
 l)                                                              
                                                                 
 time_distributed_2 (TimeDis  (None, 21, 1024)         263168    
 tributed)                                                       
                                                                 
 dropout_1 (Dropout)         (None, 21, 1024)          0         
                                                                 
 time_distributed_3 (TimeDis  (None, 21, 344)          352600    
 tributed)                                                       
                                                                 
Total params: 716,376
Trainable params: 716,376
Non-trainable params: 0
________________________________________________

<keras.callbacks.History at 0x1220e9f6c10>

In [None]:
# Print prediction(s)
print("Prediciton:")
print(logits_to_text(bd_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(french_sentences[:1])

print('\nOriginal text:')
print(english_sentences[:1])

Prediciton:
new jersey est parfois chaud en l' et il et il est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."]

Original text:
['new jersey is sometimes quiet during autumn , and it is snowy in april .']


### Model 3: Embedding

In [128]:
def bidirectional_embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    
    # Hyperparameters
    learning_rate = 0.005
    
    # Build the layers
    model = Sequential()
    model.add(Embedding(english_vocab_size, 256, input_length=input_shape[1], input_shape=input_shape[1:]))
    model.add(Bidirectional(GRU(256, return_sequences=True)))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))
    
    # Compile model
    model.compile(loss = sparse_categorical_crossentropy,
                  optimizer = Adam(learning_rate),
                  metrics = ['accuracy'])
    
    return model

tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2]))

# Build the model
embed_rnn_model = bidirectional_embed_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)

print(embed_rnn_model.summary())

embed_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)
    

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 21, 256)           50944     
                                                                 
 bidirectional_3 (Bidirectio  (None, 21, 512)          789504    
 nal)                                                            
                                                                 
 time_distributed_8 (TimeDis  (None, 21, 1024)         525312    
 tributed)                                                       
                                                                 
 dropout_4 (Dropout)         (None, 21, 1024)          0         
                                                                 
 time_distributed_9 (TimeDis  (None, 21, 344)          352600    
 tributed)                                                       
                                                      

<keras.callbacks.History at 0x268a8948f40>

In [129]:
print("Prediciton:")
print(logits_to_text(embed_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(french_sentences[:1])

print('\nOriginal text:')
print(english_sentences[:1])

Prediciton:
new jersey est parfois calme pendant l'automne automne et est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."]

Original text:
['new jersey is sometimes quiet during autumn , and it is snowy in april .']


In [130]:
embed_rnn_model.save('english_to_french_model')
# Serialize English Tokenizer to JSON
with open('english_tokenizer.json', 'w', encoding='utf8') as f:
    f.write(json.dumps(english_tokenizer.to_json(), ensure_ascii=False))
    
# Serialize French Tokenizer to JSON
with open('french_tokenizer.json', 'w', encoding='utf8') as f:
    f.write(json.dumps(french_tokenizer.to_json(), ensure_ascii=False))
    
# Save max lengths
max_french_sequence_length_json = max_french_sequence_length
with open('sequence_length.json', 'w', encoding='utf8') as f:
    f.write(json.dumps(max_french_sequence_length_json, ensure_ascii=False))



INFO:tensorflow:Assets written to: english_to_french_model\assets


INFO:tensorflow:Assets written to: english_to_french_model\assets


# Internship Tasks:

## Task 1: Write a Python function to implement a basic tokenization algorithm for a given language.

### We tokenize the english text corpus above

In [None]:
import string

: 

In [15]:
tokens = []

for sentence in english_sentences:
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    words_in_sentence = sentence.lower().split()

    for word in words_in_sentence:
        tokens.append(word)

In [16]:
vocab, index = {}, 1  # start indexing from 1
vocab['<pad>'] = 0  # add a padding token
for token in tokens:
  if token not in vocab:
    vocab[token] = index
    index += 1
vocab_size = len(vocab)

In [17]:
inverse_vocab = {index: token for token, index in vocab.items()}
inverse_vocab

{0: '<pad>',
 1: 'new',
 2: 'jersey',
 3: 'is',
 4: 'sometimes',
 5: 'quiet',
 6: 'during',
 7: 'autumn',
 8: 'and',
 9: 'it',
 10: 'snowy',
 11: 'in',
 12: 'april',
 13: 'the',
 14: 'united',
 15: 'states',
 16: 'usually',
 17: 'chilly',
 18: 'july',
 19: 'freezing',
 20: 'november',
 21: 'california',
 22: 'march',
 23: 'hot',
 24: 'june',
 25: 'mild',
 26: 'cold',
 27: 'september',
 28: 'your',
 29: 'least',
 30: 'liked',
 31: 'fruit',
 32: 'grape',
 33: 'but',
 34: 'my',
 35: 'apple',
 36: 'his',
 37: 'favorite',
 38: 'orange',
 39: 'paris',
 40: 'relaxing',
 41: 'december',
 42: 'busy',
 43: 'spring',
 44: 'never',
 45: 'our',
 46: 'lemon',
 47: 'january',
 48: 'warm',
 49: 'lime',
 50: 'her',
 51: 'banana',
 52: 'he',
 53: 'saw',
 54: 'a',
 55: 'old',
 56: 'yellow',
 57: 'truck',
 58: 'india',
 59: 'rainy',
 60: 'that',
 61: 'cat',
 62: 'was',
 63: 'most',
 64: 'loved',
 65: 'animal',
 66: 'dislikes',
 67: 'grapefruit',
 68: 'limes',
 69: 'lemons',
 70: 'february',
 71: 'china',


## Task 2: Use a pre-trained Word2Vec model to generate word embeddings for a given text corpus.

### We use google's pre trained word2vec model to vectorize the english words given

In [18]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [19]:
import gensim.downloader as api
from gensim.models import KeyedVectors

In [20]:
model_path = api.load("word2vec-google-news-300", return_path=True) #takes too long to load 3-4min
model = KeyedVectors.load_word2vec_format(model_path, binary=True) #otherwise after downloading it takes 48seconds

In [21]:
def get_word_vector(word):
    if word in model:
        return model[word]
    else:
        return None  # or return a vector of zeros

In [22]:
vectorized_corpus = []
for word in tokens:
    vector = get_word_vector(word)
    if vector is not None:
        vectorized_corpus.append(vector)

## Task 3: Load a pre-trained LSTM-based NMT model and use it to translate a sentence from one language to another.

### We use hugging face's transformer module to load a pre trained LSTM model (Helsinki) to translate from english to french

In [23]:
!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to c:\users\adity\appdata\local\temp\pip-req-build-sv3mzihz
  Resolved https://github.com/huggingface/transformers to commit a3fb96a42a9ee473de61ac01a860251123042943
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers 'C:\Users\adity\AppData\Local\Temp\pip-req-build-sv3mzihz'


In [24]:
from transformers import MarianMTModel, MarianTokenizer

In [25]:
def translate_text(text, src_lang, tgt_lang):
    model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Generate translation
    translated_tokens = model.generate(**inputs)

    # Decode the translated tokens
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    return translated_text

In [26]:
source_sentence = "The whistling of the leaves is utterly calming"
source_language = "en"  # English
target_language = "fr"  # French

translated_sentence = translate_text(source_sentence, source_language, target_language)

In [27]:
print(translated_sentence)

Le sifflement des feuilles est totalement apaisant


## Task 4: Implement beam search decoding for an NMT model to improve translation quality.

### We implement beam search decoding for the NMT model we used in the course curriculum

## Task 5: Create a feature to translate the language from French to Tamil

#### It should predict if the french word has only five letter if the french word has more than five letters or less than five letters the model should not translate the word

### We use facebook/m2m100_418m model, which allows translation directly from french to tamil

In [None]:
!pip install sentencepiece



In [None]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

In [None]:
fr_text = "La vie est comme une boîte de chocolat."

In [None]:
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")

In [None]:
def filter_five_letter_words(text):
    words = text.split()
    filtered_words = [word for word in words if len(word) == 5]
    return ' '.join(filtered_words)

filtered_fr_text = filter_five_letter_words(fr_text)

In [None]:
if filtered_fr_text:
    tokenizer.src_lang = "fr"
    encoded_fr = tokenizer(filtered_fr_text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_fr, forced_bos_token_id=tokenizer.get_lang_id("ta"))
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
else:
    translation = []

print("Original text:", fr_text)
print("Filtered text:", filtered_fr_text)
print("Translation:", translation)

Original text: La vie est comme une boîte de chocolat.
Filtered text: comme boîte
Translation: ['கோடு போல்']


## Task 6: Create a feature to throw an error if we enter the wrong word

#### If we enter a word which is not available the program should throw an error saying like this “word is not available” and provide some suggestion related to the word which is incorrect . If the user enter continuously 2 wrong word it should show list of wrong words which we enter so far in the error notification as well as it should give some suggestions related with wrong word .