In [None]:
import string
import re
import numpy as np
import pandas as pd
from keras.models import Sequential,load_model
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras import optimizers
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_colwidth',200)


In [None]:
# Mount Google Drive at /content/drive so the dataset path used below is readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the tab-separated sentence-pair file on the mounted Drive.
data_path = "/content/drive/MyDrive/Model_Data/French_to_English/fra.txt"

with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read()

# Show only the start of the file: printing the whole corpus exceeds the
# notebook's IOPub data-rate limit and produces no usable output.
print(lines[:500])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
def to_lines(text):
    """Split raw text into rows.

    Leading/trailing whitespace is stripped from `text`, which is then cut on
    newlines; each resulting line is split on tabs.

    Returns a list with one list of fields per line.
    """
    return [row.split('\t') for row in text.strip().split('\n')]

In [None]:
# Parse the raw text into rows of tab-separated fields and preview the first five.
fra_eng = to_lines(lines)
fra_eng[:5]

[['Go.',
  'Va !',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)'],
 ['Hi.',
  'Salut !',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)'],
 ['Hi.',
  'Salut.',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)'],
 ['Run!',
  'Cours\u202f!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906331 (sacredceltic)'],
 ['Run!',
  'Courez\u202f!',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906332 (sacredceltic)']]

In [None]:
# Convert the parsed rows to a NumPy array so rows/columns can be sliced; show its shape.
fra_eng = np.array(fra_eng)
fra_eng.shape

(177210, 3)

In [None]:
# Keep rows 30000-49999 and only columns 0 and 1 (English, French), dropping the
# third (attribution) column.
# NOTE(review): fra_eng is rebound to a slice of itself, so re-running this cell
# slices the already-reduced array again — restart and run from the top instead.
fra_eng = fra_eng[30000:50000]
fra_eng = fra_eng[:,[0,1]]
fra_eng[:5]

array([['I think he likes me.', "Je pense qu'il m'apprécie."],
       ['I think she is sick.', "Je crois qu'elle est malade."],
       ["I think she's forty.", "Je pense qu'elle a quarante ans."],
       ["I think that's best.", "Je pense que c'est mieux."],
       ["I think that's fair.", "Je pense que c'est juste."]],
      dtype='<U325')

In [None]:
# Build the punctuation-removal table once instead of once per sentence.
punct_table = str.maketrans('', '', string.punctuation)

# string.punctuation is ASCII-only, so the narrow no-break space that precedes
# French "!" / "?" (U+202F) survives the translate step. str.split() with no
# argument splits on any Unicode whitespace, so split + join collapses those
# characters into single ASCII spaces and drops leading/trailing ones; otherwise
# they stay attached to the neighbouring word when text is later split on ' '.
for col in (0, 1):
    fra_eng[:, col] = [' '.join(s.translate(punct_table).split()) for s in fra_eng[:, col]]
fra_eng[:5]

array([['I think he likes me', 'Je pense quil mapprécie'],
       ['I think she is sick', 'Je crois quelle est malade'],
       ['I think shes forty', 'Je pense quelle a quarante ans'],
       ['I think thats best', 'Je pense que cest mieux'],
       ['I think thats fair', 'Je pense que cest juste']], dtype='<U325')

In [None]:
# Lower-case both sentence columns in place, one column at a time.
for col in (0, 1):
    fra_eng[:, col] = [sentence.lower() for sentence in fra_eng[:, col]]


In [None]:
# Inspect the cleaned, lower-cased sentence pairs.
fra_eng

array([['i think he likes me', 'je pense quil mapprécie'],
       ['i think she is sick', 'je crois quelle est malade'],
       ['i think shes forty', 'je pense quelle a quarante ans'],
       ...,
       ['may i have a timetable', 'puisje disposer dun horaire '],
       ['may i introduce myself', 'puisje me présenter '],
       ['may i look at the menu', 'puisje jeter un coup dœil au menu ']],
      dtype='<U325')

In [None]:
def tokenization(lines):
    """Create a Keras `Tokenizer`, fit it on `lines`, and return it."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# Fit the English tokenizer on column 0. Word indices start at 1, so the
# vocabulary size is len(word_index) + 1, leaving id 0 free for padding.
eng_tokenizer = tokenization(fra_eng[:,0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1

# Fixed English sequence length used when encoding sentences.
eng_length = 8
print("English Vocabulary Size : " , eng_vocab_size)

Englise Vocabulary Size :  4245


In [None]:
# Fit the French tokenizer on column 1. Word indices start at 1, so the
# vocabulary size is len(word_index) + 1, leaving id 0 free for padding.
fra_tokenizer = tokenization(fra_eng[:,1])
fra_vocab_size = len(fra_tokenizer.word_index) + 1

# Fixed French sequence length used when encoding sentences.
fra_length = 8
# This is the French vocabulary; the label previously said "Englise" (English).
print("French Vocabulary Size : " , fra_vocab_size)

Englise Vocabulary Size :  8679


In [None]:
from keras.utils import pad_sequences
def encode_sequences(tokenizer,length,lines):
    """Turn texts into fixed-length integer sequences.

    `lines` are converted to id sequences with `tokenizer`, then passed to
    `pad_sequences` with `maxlen=length` and padding added at the end ("post").
    """
    token_ids = tokenizer.texts_to_sequences(lines)
    return pad_sequences(token_ids, padding="post", maxlen=length)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentence pairs for testing; random_state is fixed so the split is repeatable.
train, test = train_test_split(fra_eng,test_size=0.2,random_state=12)

In [None]:
# French sentences (column 1) are the model inputs; English sentences (column 0) are the targets.
x_train, x_test = (
    encode_sequences(fra_tokenizer, fra_length, part[:, 1]) for part in (train, test)
)
y_train, y_test = (
    encode_sequences(eng_tokenizer, eng_length, part[:, 0]) for part in (train, test)
)

In [None]:
def define_model(in_vocab,out_vocab,in_timesteps,out_timesteps,units):
    """Build an uncompiled sequence-to-sequence model.

    Input ids are embedded (with `mask_zero=True`) and fed to an LSTM that
    returns a single vector; that vector is repeated `out_timesteps` times and
    passed through a second LSTM that returns a sequence, followed by a softmax
    Dense layer over `out_vocab` classes.
    """
    encoder = [
        Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True),
        LSTM(units),
    ]
    decoder = [
        RepeatVector(out_timesteps),
        LSTM(units, return_sequences=True),
        Dense(out_vocab, activation="softmax"),
    ]
    return Sequential(encoder + decoder)

In [None]:
# Build the French-to-English model with 512 units and compile it with RMSprop.
model = define_model(
    in_vocab=fra_vocab_size,
    out_vocab=eng_vocab_size,
    in_timesteps=fra_length,
    out_timesteps=eng_length,
    units=512,
)
rms = optimizers.RMSprop(learning_rate=0.001)
model.compile(loss='sparse_categorical_crossentropy', optimizer=rms)

In [None]:
# Targets are passed with a trailing axis of length 1: (samples, timesteps, 1).
history = model.fit(
    x_train,
    y_train[..., np.newaxis],
    epochs=10,
    batch_size=512,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Take the model's per-position softmax output and keep the most likely token id at each position.
preds = np.argmax(model.predict(x_test), axis=-1)



In [None]:
# One row of predicted token ids per test sentence.
preds.shape

(4000, 8)

In [None]:
def get_word(n,tokenizer):
    """Return the word whose index in `tokenizer.word_index` equals `n`, or None if there is none."""
    matches = (candidate for candidate, idx in tokenizer.word_index.items() if idx == n)
    return next(matches, None)


In [None]:
# Invert the vocabulary once so each id lookup is a dict access rather than a
# linear scan over every word (previously done up to twice per token).
index_to_word = {index: word for word, index in eng_tokenizer.word_index.items()}

preds_text = []
for token_ids in preds:
    decoded = []
    previous_word = None  # word for the preceding position (None at the start)
    for token_id in token_ids:
        word = index_to_word.get(token_id)
        # Emit an empty string for ids with no vocabulary entry (e.g. padding id 0)
        # and for a word that merely repeats the previous position's word.
        if word is None or word == previous_word:
            decoded.append('')
        else:
            decoded.append(word)
        previous_word = word
    preds_text.append(' '.join(decoded))


In [None]:
# Side-by-side table of the reference English sentences and the decoded predictions.
pred_df = pd.DataFrame({"actual" : test[:,0] , "predicted" : preds_text})

In [None]:
# Fixed random_state so the same 15 rows are shown on every run.
pred_df.sample(15, random_state=12)

Unnamed: 0,actual,predicted
1191,i was heavily sedated,i need to of
1645,how much is the ticket,do you the
335,those are toms cats,where is your
1307,tom is in there alone,tom is not
296,my gums are swollen,the is
1549,she has a funny face,he has a
2352,why are you even here,why are you so
464,show me how it works,do you
3113,he is a famous artist,this is a
3448,a magnet attracts iron,he a


In [None]:
# Loss on the held-out set; targets get the same trailing length-1 axis used for training.
model.evaluate(x_test, y_test[..., np.newaxis])



2.7273478507995605

In [None]:
# First five encoded reference (English) sequences, for comparison with the predictions.
y_test[:5]

array([[   2,   15,  717,   29,   49,    0,    0,    0],
       [   9,   99,    2,  310,    8,    0,    0,    0],
       [   1,   24,   13, 2816,    0,    0,    0,    0],
       [ 280,    1,  181,  133,    0,    0,    0,    0],
       [  16,    2,   50,   78,    7,    0,    0,    0]], dtype=int32)

In [None]:
# First five predicted token-id sequences.
preds[:5]

array([[33, 20, 20,  0,  0,  0,  0,  0],
       [ 1, 46, 25,  2,  2,  0,  0,  0],
       [ 1, 32,  4, 23,  0,  0,  0,  0],
       [41,  1,  2,  2,  0,  0,  0,  0],
       [16,  2,  2,  2,  0,  0,  0,  0]])

In [None]:
# NOTE(review): preds is reassigned to the argmax over the last axis and has shape
# (4000, 8), so this should display (8,). The recorded output (4245,) equals the
# English vocabulary size and looks stale — presumably from a run before the argmax;
# re-run top to bottom to confirm.
preds[0].shape

(4245,)

In [None]:
# Tabulate the English vocabulary: one row per word with its tokenizer index.
words = list(eng_tokenizer.word_index.keys())
indices = list(eng_tokenizer.word_index.values())

word_index = pd.DataFrame({"word" : words, "index" : indices})
word_index.head(10)

Unnamed: 0,word,index
0,i,1
1,you,2
2,a,3
3,to,4
4,is,5
5,the,6
6,tom,7
7,it,8
8,im,9
9,he,10


In [None]:
# Show the reference and predicted sentence for a single test row.
test_num = 200
print(pred_df.iloc[test_num])

actual       whose house is this
predicted     this your the     
Name: 200, dtype: object
