# Using seq2seq model to reconstruct english sentences into Yodish sentences.

In [36]:
import numpy as np
import pandas as pd
import os

In [37]:
Training_File_Path = "./Training_data/"

## Data Cleaning
### Collecting Tokenized Sentences

In [38]:
with open(os.path.join(Training_File_Path,'Training_from_clean_en.txt')) as f:
    eng_sents = f.readlines()

eng_sents[:5]

['59 19 2359 27\n',
 '18506 882 882 2558 59 19 12022 27\n',
 '65 59177 428 14 2269 10 96825 72439 550 7\n',
 '422 23 161 27\n',
 '843 593 161 1256 39 27\n']

In [39]:
for i in range(len(eng_sents)):
    eng_sents[i] = eng_sents[i][:-1]

for i in range(len(eng_sents)):
    eng_sents[i] = list(map(int,eng_sents[i].split()))
eng_sents[:5]

[[59, 19, 2359, 27],
 [18506, 882, 882, 2558, 59, 19, 12022, 27],
 [65, 59177, 428, 14, 2269, 10, 96825, 72439, 550, 7],
 [422, 23, 161, 27],
 [843, 593, 161, 1256, 39, 27]]

In [40]:
eng_sents = np.array(eng_sents)
eng_sents.shape

  eng_sents = np.array(eng_sents)


(224028,)

In [41]:
with open(os.path.join(Training_File_Path,'Training_to_clean_yoda.txt')) as f:
    yodish_sents = f.readlines()

yodish_sents[:5]

['7690 5 68 22 5 36 34\n',
 '17935 5 13647 930 930 987 68 22 5 36 34 12 5 14 4\n',
 '25 1729 7 89627 66587 362 55 23813 204 9\n',
 '622 5 273 27 5 36 34 13 4\n',
 '1326 5 491 492 92 1205 5 36 34\n']

In [42]:
for i in range(len(eng_sents)):
    yodish_sents[i] = yodish_sents[i][:-1]

for i in range(len(yodish_sents)):
    yodish_sents[i] = list(map(int,yodish_sents[i].split()))
yodish_sents[:5]

[[7690, 5, 68, 22, 5, 36, 34],
 [17935, 5, 13647, 930, 930, 987, 68, 22, 5, 36, 34, 12, 5, 14, 4],
 [25, 1729, 7, 89627, 66587, 362, 55, 23813, 204, 9],
 [622, 5, 273, 27, 5, 36, 34, 13, 4],
 [1326, 5, 491, 492, 92, 1205, 5, 36, 34]]

In [43]:
yodish_sents = np.array(yodish_sents)
yodish_sents.shape

  yodish_sents = np.array(yodish_sents)


(224028,)

### Collecting Word-Index tables

In [44]:
with open(os.path.join(Training_File_Path,"vocab250000_from.txt"), encoding="utf-8") as f:
    eng_tokens = f.readlines()
eng_tokens[10:20]

['the\n',
 '0\n',
 ')\n',
 '(\n',
 'to\n',
 'in\n',
 '00\n',
 'The\n',
 '0000\n',
 'is\n']

In [45]:
for i in range(len(eng_tokens)):
    eng_tokens[i] = eng_tokens[i][:-1]

eng_word_index = pd.DataFrame({"token" : eng_tokens})


In [46]:
eng_word_index.sample(5)

Unnamed: 0,token
75046,LAVALIN
62753,R-Squared
10934,witness
32251,Paton
87879,printfair


In [47]:
eng_word_index.iloc[59]

token    What
Name: 59, dtype: object

In [48]:
with open(os.path.join(Training_File_Path,"vocab250000_to.txt"), encoding="utf-8") as f:
    yodish_tokens = f.readlines()
yodish_tokens[10:20]

['and\n',
 'Hmmmmmm\n',
 'Yes\n',
 'Yeesssssss\n',
 'hmmm\n',
 'Herh\n',
 '0\n',
 'Of\n',
 ')\n',
 '(\n']

In [49]:
for i in range(len(yodish_tokens)):
    yodish_tokens[i] = yodish_tokens[i][:-1]

yodish_word_index = pd.DataFrame({"token" : yodish_tokens})


In [50]:
yodish_word_index.sample(5)

Unnamed: 0,token
45543,caso
47247,Postulated
5705,Cover
61941,expressway
51937,fiab


In [51]:
yodish_word_index.iloc[7690]

token    Light
Name: 7690, dtype: object

## Creating the Model

In [52]:
from keras.models import Sequential,load_model
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras import optimizers

def define_model(in_vocab,out_vocab,in_timesteps,out_timesteps,units):
    model = Sequential()
    model.add(Embedding(in_vocab,units,input_length=in_timesteps,mask_zero=True))
    model.add(LSTM(units))
    model.add(RepeatVector(out_timesteps))
    model.add(LSTM(units,return_sequences=True))
    model.add(Dense(out_vocab,activation="softmax"))
    return model

In [53]:
eng_timesteps_length = 15
yodish_timesteps_length = 15
reduced_eng_sents = []
reduced_yodish_sents = []
for i in range(len(eng_sents)):
    if len(eng_sents[i]) <= eng_timesteps_length and len(yodish_sents[i]) <= yodish_timesteps_length :
        reduced_eng_sents.append(eng_sents[i])
        reduced_yodish_sents.append(yodish_sents[i])
yodish_timesteps_length = max([len(i) for i in reduced_yodish_sents])
print(len(reduced_eng_sents))

204171


In [54]:
eng_vocab_size = eng_word_index.shape[0]
yodish_vocab_size = yodish_word_index.shape[0]

In [55]:
model = define_model(eng_vocab_size,yodish_vocab_size,eng_timesteps_length,yodish_timesteps_length,512)
rms = optimizers.RMSprop(learning_rate = 0.01)
model.compile(optimizer=rms,loss='sparse_categorical_crossentropy')

In [56]:
from keras.utils import pad_sequences
def padder(sent,length):
    sent = pad_sequences(sent,maxlen=length,padding="post")
    return sent
reduced_eng_sents = padder(reduced_eng_sents, eng_timesteps_length)
reduced_yodish_sents = padder(reduced_yodish_sents, yodish_timesteps_length)


In [57]:

reduced_yodish_sents.shape

(204171, 15)

In [58]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(np.array(reduced_eng_sents[:20000]),np.array(reduced_yodish_sents[:20000]),test_size=0.2)


In [59]:
x_train.shape

(16000, 15)

In [60]:
with open('data.npy', 'wb') as f:
    np.save(f, x_train)
    np.save(f, x_test)
    np.save(f, y_train)
    np.save(f, y_test)

In [26]:
model.fit(x_train,y_train,batch_size=512,epochs=2)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x2122b676800>

In [65]:
def get_word(n):
  try:
    return yodish_word_index.iloc[n].token
  except:
    return None

In [28]:
model.save("model_2.keras")

In [56]:
preds = model.predict(x_test[:20])
print(preds,preds.shape)

[[[7.3029059e-03 4.4489057e-06 4.4394033e-06 ... 4.5469092e-06
   4.3572468e-06 4.6465307e-06]
  [2.8178504e-01 1.9009400e-06 1.9176150e-06 ... 2.0117582e-06
   2.0160912e-06 1.9902195e-06]
  [4.5211458e-01 9.1844623e-07 9.2665834e-07 ... 9.7055283e-07
   9.8806834e-07 9.6528288e-07]
  ...
  [6.2330139e-01 3.6240289e-07 3.7096376e-07 ... 3.8983330e-07
   3.9673014e-07 3.8405787e-07]
  [6.3015640e-01 3.4650549e-07 3.5495307e-07 ... 3.7322675e-07
   3.7955783e-07 3.6735523e-07]
  [6.3660115e-01 3.3188147e-07 3.4021448e-07 ... 3.5792078e-07
   3.6375178e-07 3.5198150e-07]]

 [[3.9951112e-03 6.3916136e-06 6.4060150e-06 ... 6.5089371e-06
   6.2646300e-06 6.6832818e-06]
  [2.8626901e-01 1.8916631e-06 1.9092161e-06 ... 1.9994584e-06
   2.0047892e-06 1.9802671e-06]
  [4.9374646e-01 7.6256202e-07 7.7061480e-07 ... 8.0855057e-07
   8.2189217e-07 8.0358768e-07]
  ...
  [7.3329031e-01 1.5175885e-07 1.5734770e-07 ... 1.6551564e-07
   1.6733210e-07 1.6189594e-07]
  [7.4118721e-01 1.4027395e-07 1.455

In [57]:
preds = np.argmax(preds,axis=-1)
preds.shape

(20, 15)

In [66]:
preds_text = []
for i in preds:
  temp = []
  for j in range(len(i)):
    t = get_word(i[j])
    # print(t)
    if(t == None):
      temp.append('')
    else:
      temp.append(t)
  preds_text.append(' '.join(temp))

In [67]:
y_actual = []
for i in y_test[:20]:
  temp = []
  for j in range(len(i)):
    t = get_word(i[j])
    # print(t)
    if(t == None):
      temp.append('')
    else:
      temp.append(t)
  y_actual.append(' '.join(temp))


In [68]:
pred_df = pd.DataFrame({"actual" : y_actual , "predicted" : preds_text})

In [70]:
pred_df.sample(10)

Unnamed: 0,actual,predicted
9,"Created to promote section 00 , several tools ...",• the _PAD _PAD _PAD _PAD _PAD _PAD _PAD _PAD ...
6,Fruit . Hmmmmmm . _PAD _PAD _PAD _PAD _PAD _PA...,• _PAD _PAD _PAD _PAD _PAD _PAD _PAD _PAD _PAD...
2,Of the sv a model . _PAD _PAD _PAD _PAD _PAD _...,• the _PAD _PAD _PAD _PAD _PAD _PAD _PAD _PAD ...
7,More information . . . Www . canfax . ca . _PA...,• _PAD _PAD _PAD _PAD _PAD _PAD _PAD _PAD _PAD...
13,• export requirements - netherlands usda . Hmm...,• the _PAD _PAD _PAD _PAD _PAD _PAD _PAD _PAD ...
17,E-mail fraud alert [ 0000-00-00 ] health cana...,The _PAD _PAD _PAD _PAD _PAD _PAD _PAD _PAD _P...
4,Mexico export preparedness guide : _PAD _PAD _...,• _PAD _PAD _PAD _PAD _PAD _PAD _PAD _PAD _PAD...
5,Of agriculture and land reclamation in egypt •...,• the _PAD _PAD _PAD _PAD _PAD _PAD _PAD _PAD ...
15,Remove the marinated bison from the marinade a...,• the _PAD _PAD _PAD _PAD _PAD _PAD _PAD _PAD ...
18,For biotechnology from the us government • con...,• _PAD _PAD _PAD _PAD _PAD _PAD _PAD _PAD _PAD...
