<a href="https://colab.research.google.com/github/AmanPriyanshu/Natural-Language-Processing/blob/master/MachineTranslation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## IMPORTS:

In [1]:
import tensorflow as tf
import os
import unicodedata
import re
import numpy as np
import matplotlib.pyplot as plt
import string
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier

## GETTING DATA:

In [2]:
# Download the English-French archive through Keras' get_file helper (extract=True).
path_to_zip = tf.keras.utils.get_file(
    fname='fra-eng.zip',
    origin='http://download.tensorflow.org/data/fra-eng.zip',
    extract=True,
)

# The corpus file is looked up in the same directory as the downloaded archive.
path_to_file = os.path.dirname(path_to_zip) + "/fra.txt"

Downloading data from http://download.tensorflow.org/data/fra-eng.zip


## PREPROCESSING:

In [3]:
def unicode_to_ascii(s):
    """Return `s` in NFD form with every character of Unicode category 'Mn' removed.

    Characters in any other category are kept unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
    
def preprocess_sentence(w):
    """Normalise one sentence and wrap it in '<start>' / '<end>' markers.

    Steps: lower-case and strip, drop accents via unicode_to_ascii, put spaces
    around ? . ! , ¿, collapse runs of spaces/double quotes, replace every run of
    characters other than ASCII letters and ? . ! , ¿ with a space, then drop
    tokens that occur in string.punctuation.

    Returns the remaining tokens joined by single spaces between the markers.
    """
    text = unicode_to_ascii(w.lower().strip())

    # Applied in this exact order; each pass works on the output of the previous one.
    substitutions = (
        (r"([?.!,¿])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # `in string.punctuation` is a substring test against the punctuation string.
    kept = [token for token in text.strip().split() if token not in string.punctuation]
    return '<start> ' + ' '.join(kept) + ' <end>'

## GENERATING DATASET:

In [4]:
def create_dataset(path, num_examples=None):
    """Read a tab-separated corpus file and preprocess every field of its first lines.

    Args:
        path: Path of a UTF-8 text file with one example per line, fields separated by tabs.
        num_examples: Number of leading lines to keep; None (the default) keeps all lines.

    Returns:
        A list with one entry per kept line; each entry is the list of that line's
        tab-separated fields after preprocess_sentence has been applied.
    """
    # Context manager so the file handle is closed even if reading fails.
    with open(path, encoding='UTF-8') as handle:
        lines = handle.read().strip().split('\n')

    return [
        [preprocess_sentence(field) for field in line.split('\t')]
        for line in lines[:num_examples]
    ]

In [5]:
# Keep only the first 1500 lines of the corpus file.
pairs = create_dataset(path_to_file, num_examples=1500)

In [6]:
# Turn the nested list into a NumPy array, then preview its first five rows.
pairs = np.array(pairs)
pairs[0:5]

array([['<start> go <end>', '<start> va <end>'],
       ['<start> hi <end>', '<start> salut <end>'],
       ['<start> run <end>', '<start> cours <end>'],
       ['<start> run <end>', '<start> courez <end>'],
       ['<start> who <end>', '<start> qui <end>']], dtype='<U45')

## WORD VECTORIZING:

In [7]:
def _register_words(sentence, vocab, index):
  """Append each not-yet-seen word of `sentence` to `vocab` and record its position in `index`."""
  for word in sentence.split():
    if word not in index:
      index[word] = len(vocab)
      vocab.append(word)


def _count_neighbours(sentence, index, embed):
  """For each interior word of `sentence`, add 1 to its row at the columns of both adjacent words."""
  ids = [index[word] for word in sentence.split()]
  # The first and last token are skipped as centre words; they only act as neighbours.
  for pos in range(1, len(ids) - 1):
    embed[ids[pos], ids[pos - 1]] += 1
    embed[ids[pos], ids[pos + 1]] += 1


def word_vec(pairs):
  """Build vocabularies and neighbour-count matrices for both columns of `pairs`.

  Args:
    pairs: NumPy array whose transpose has the English sentences at index 0 and
      the French sentences at index 1; sentences are whitespace-separated tokens.

  Returns:
    (french_word_embed, english_word_embed, vocab_english, vocab_french).
    Note the order: the French matrix comes first, but the English vocabulary
    comes first. Each vocabulary lists words in order of first appearance; each
    matrix is square (vocabulary size x vocabulary size), and entry [i][j] counts
    how often word j was directly left or right of word i when i was not the
    first or last token of its sentence.
  """
  english = pairs.T[0]
  french = pairs.T[1]
  vocab_english, vocab_french = [], []
  # word -> position dictionaries give constant-time membership tests and lookups
  # instead of scanning the vocabulary lists for every token.
  index_english, index_french = {}, {}
  for e, f in tqdm(zip(english, french), total=len(english), desc='Generating a Vocabulary'):
    _register_words(e, vocab_english, index_english)
    _register_words(f, vocab_french, index_french)

  english_word_embed = np.zeros((len(vocab_english), len(vocab_english)))
  french_word_embed = np.zeros((len(vocab_french), len(vocab_french)))

  for e, f in tqdm(zip(english, french), total=len(english), desc='Generating the Word Embeddings'):
    _count_neighbours(e, index_english, english_word_embed)
    _count_neighbours(f, index_french, french_word_embed)
  return french_word_embed, english_word_embed, vocab_english, vocab_french

In [8]:
# word_vec returns the French matrix first, then the English matrix, then the two vocabularies.
(french_word_embed,
 english_word_embed,
 vocab_english,
 vocab_french) = word_vec(pairs)

Generating a Vocabulary: 100%|██████████| 1500/1500 [00:00<00:00, 63322.36it/s]
Generating the Word Embeddings: 100%|██████████| 1500/1500 [00:00<00:00, 15812.88it/s]


## TAKING A LOOK AT BOTH SPACES AND PREPARING TO MAP THEM:

In [9]:
# Show the dimensions of both embedding matrices.
print("French", np.shape(french_word_embed))
print("English", np.shape(english_word_embed))

French (948, 948)
English (512, 512)


## SENTENCE TO VECTOR:

In [10]:
def _first_positions(vocab):
  """Map each word to the position of its first occurrence in `vocab`."""
  positions = {}
  for pos, word in enumerate(vocab):
    positions.setdefault(word, pos)
  return positions


def _mean_embedding(words, embed, positions):
  """Average the rows of `embed` selected by `words`; a zero vector when `words` is empty."""
  if not words:
    # Averaging zero rows would give a scalar NaN and make the stacked output ragged.
    return np.zeros(embed.shape[1])
  try:
    rows = [positions[word] for word in words]
  except KeyError as missing:
    # Keep the exception type that a list.index lookup raises for unknown words.
    raise ValueError(f"{missing.args[0]!r} is not in list") from None
  return embed[rows].mean(axis=0)


def sentence2vector(pairs, french_word_embed, english_word_embed, vocab_english, vocab_french):
  """Turn every sentence of `pairs` into the mean of its words' embedding rows.

  The first and last token of each sentence are dropped before averaging.

  Args:
    pairs: NumPy array whose transpose has the English sentences at index 0 and
      the French sentences at index 1.
    french_word_embed: 2-D array with one row per entry of `vocab_french`.
    english_word_embed: 2-D array with one row per entry of `vocab_english`.
    vocab_english: List of English words; a word's position selects its row.
    vocab_french: List of French words; a word's position selects its row.

  Returns:
    (sentences_english, sentences_french): two arrays with one row per pair.
    A sentence with no tokens left after dropping the first and last one gets
    an all-zero row.

  Raises:
    ValueError: If a sentence contains a word missing from its vocabulary.
  """
  english = pairs.T[0]
  french = pairs.T[1]
  english_word_embed = np.asarray(english_word_embed)
  french_word_embed = np.asarray(french_word_embed)
  # Build the word -> row lookups once instead of scanning the lists per token.
  positions_english = _first_positions(vocab_english)
  positions_french = _first_positions(vocab_french)

  sentences_english = []
  sentences_french = []
  for e, f in tqdm(zip(english, french), total=len(english), desc='Generating the Sentence Embeddings'):
    sentences_english.append(_mean_embedding(e.split()[1:-1], english_word_embed, positions_english))
    sentences_french.append(_mean_embedding(f.split()[1:-1], french_word_embed, positions_french))
  return np.array(sentences_english), np.array(sentences_french)

In [11]:
# Sentence-level vectors for both languages, one row per sentence pair.
sentences_english, sentences_french = sentence2vector(
    pairs,
    french_word_embed=french_word_embed,
    english_word_embed=english_word_embed,
    vocab_english=vocab_english,
    vocab_french=vocab_french,
)

Generating the Sentence Embeddings: 100%|██████████| 1500/1500 [00:00<00:00, 15773.32it/s]


## MODEL FOR MAPPING:

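The goal is to learn a mapping $R$ that sends the English sentence vectors $X$ to the French sentence vectors $Y$, i.e. $XR = Y$; below, $R$ is approximated by a small feed-forward network.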

In [12]:
# Stack of fully connected ReLU layers mapping an English vector to a French vector.
# The output width is read from the French embedding matrix instead of being
# hard-coded, so the network still matches its targets when the number of
# examples (and therefore the French vocabulary size) changes.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(french_word_embed.shape[1], activation='relu'),
])

In [13]:
# Mean-squared-error objective with the Adam optimizer; mean absolute error is reported as well.
model.compile(
    loss='mse',
    optimizer='adam',
    metrics=['mae'],
)

In [14]:
# English sentence vectors are the inputs, French sentence vectors the targets.
model.fit(x=sentences_english, y=sentences_french, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7fb88ecc8a90>

## HUMAN TESTING:

In [15]:
# Look up one English word and push its embedding row through the trained model.
word = "hi"
index_word = vocab_english.index(word)
# Selecting with a one-element list keeps the leading batch axis: shape (1, vocabulary size).
english_vector = english_word_embed[[index_word]]
predicted_french_vector = model.predict(english_vector)

In [16]:
# 1-nearest-neighbour lookup: each French embedding row is labelled with its word,
# so predicting on the model output gives the French word with the closest row.
neigh = KNeighborsClassifier(n_neighbors=1).fit(french_word_embed, vocab_french)
print(neigh.predict(predicted_french_vector))

['fantastique']
