In [None]:
import numpy as np
import pandas as pd

from keras import callbacks
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding, GRU
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.utils import Sequence, to_categorical

In [None]:
!pip -q install --upgrade --no-cache-dir gdown

In [None]:
!gdown --id  1hpjYFmIxHZeTQt4VHovfQGAJ9oLY8xbm

Downloading...
From: https://drive.google.com/uc?id=1hpjYFmIxHZeTQt4VHovfQGAJ9oLY8xbm
To: /content/ferdousi.txt
100% 4.54M/4.54M [00:00<00:00, 136MB/s]


In [None]:
def tokenize(sentences):
  """Fit a fresh Keras Tokenizer on `sentences` and encode them.

  Returns a tuple `(sequences, tokenizer)`: the integer-encoded form of
  `sentences` and the fitted Tokenizer that produced it.
  """
  fitted = Tokenizer()
  fitted.fit_on_texts(sentences)
  encoded = fitted.texts_to_sequences(sentences)
  return encoded, fitted

In [None]:
# Read the corpus explicitly as UTF-8 (the text is Persian) and skip the first
# two lines of the file. The `with` block closes the file on exit.
with open("ferdousi.txt", encoding="utf-8") as f:
  raw_data = f.read().splitlines()[2:]

# Merge consecutive lines (0,1), (2,3), ... into a single token list each,
# with a ' ' token separating the two halves. A trailing unpaired line is
# ignored.
pairs = []
for first_half, second_half in zip(raw_data[0::2], raw_data[1::2]):
  pairs.append(first_half.split(' ') + [' '] + second_half.split(' '))

text_tokenized, text_tokenizer = tokenize(pairs)

max_len = max(len(seq) for seq in text_tokenized)
# +1 leaves room for index 0, which is not a key of word_index.
vocab = len(text_tokenizer.word_index) + 1
data_size = len(pairs)

# Right-pad every sequence to max_len.
pad_sentence = pad_sequences(text_tokenized, max_len, padding = "post")

# Targets: the same rows shifted up by one (row k is paired with row k+1,
# and the last row wraps around to the first).
pad_sentence_cyc = np.concatenate((pad_sentence[1:], pad_sentence[:1]))

# Add a trailing axis of size 1 to both arrays.
pad_sentence = pad_sentence.reshape(*pad_sentence.shape, 1)
pad_sentence_cyc = pad_sentence_cyc.reshape(*pad_sentence_cyc.shape, 1)

In [None]:
# LSTM encoder-decoder graph.
# Input: a sequence of max_len token ids.
input_sequence = Input(shape=(max_len,))
# Map each token id to a 256-dimensional embedding vector.
embedding = Embedding(input_dim=vocab, output_dim=256,)(input_sequence)
# Encoder: three stacked LSTMs; the last one returns only its final output
# (return_sequences=False), i.e. one 128-dimensional vector per input sequence.
encoder = LSTM(256, return_sequences=True)(embedding)
encoder2 = LSTM(128, return_sequences=True)(encoder)
encoder3 = LSTM(128, return_sequences=False)(encoder2)
# Repeat the encoder vector max_len times so the decoder receives it at every timestep.
r_vec = RepeatVector(max_len)(encoder3)
# Decoder: a single LSTM emitting one 512-dimensional vector per timestep.
decoder = LSTM(512, return_sequences=True, dropout=0.2)(r_vec)
# Per-timestep projection: hidden ReLU layer, then one score per vocabulary entry.
logits = TimeDistributed(Dense(512, activation = 'relu'))(decoder)
logits = TimeDistributed(Dense(vocab))(logits)

In [None]:
# Wrap the graph in a Model whose output is the softmax of `logits`
# (a distribution over the vocabulary at each timestep).
enc_dec_model = Model(input_sequence, Activation('softmax')(logits))
# Sparse categorical cross-entropy loss, Adam with learning rate 1e-4,
# accuracy reported as a metric.
enc_dec_model.compile(loss=sparse_categorical_crossentropy,
              optimizer=Adam(1e-4),
              metrics=['accuracy'])
enc_dec_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 21)]              0         
                                                                 
 embedding (Embedding)       (None, 21, 256)           4611328   
                                                                 
 lstm (LSTM)                 (None, 21, 256)           525312    
                                                                 
 lstm_1 (LSTM)               (None, 21, 128)           197120    
                                                                 
 lstm_2 (LSTM)               (None, 128)               131584    
                                                                 
 repeat_vector (RepeatVector  (None, 21, 128)          0         
 )                                                               
                                                             

In [None]:
# Early stopping: end training after 4 epochs in which the monitored metric
# (Keras' default, since no `monitor` is passed) improves by less than 1e-4,
# then restore the best weights seen.
earlyStop = callbacks.EarlyStopping(
    patience=4,
    verbose=1,
    restore_best_weights=True,
    min_delta=1e-4,
)

# Train the LSTM model; 25% of the data is held out for validation.
model_results = enc_dec_model.fit(
    pad_sentence,
    pad_sentence_cyc,
    shuffle=True,
    batch_size=30,
    epochs=20,
    validation_split=.25,
    callbacks=[earlyStop],
)

In [None]:
def logits_to_sentence(indices, tokenizer):
    """Convert a sequence of vocabulary indices into a space-joined string.

    Args:
        indices: iterable of integer vocabulary indices.
        tokenizer: object exposing a `word_index` dict (word -> index).

    Returns:
        The words for `indices` joined by single spaces; index 0 is rendered
        as '<empty>'.

    Raises:
        KeyError: if an index is neither 0 nor a value of `word_index`.
    """
    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    index_to_words[0] = '<empty>'

    return ' '.join([index_to_words[prediction] for prediction in indices])


def get_sentence(predict, tokenizer, threshold=.001):
    """Sample one word per timestep from a model prediction and join them.

    For every timestep of the first batch element, the vocabulary entries
    whose probability exceeds `threshold` are kept and one of them is drawn
    at random, with probability proportional to its predicted value.

    Args:
        predict: array indexed as [batch, timestep, vocabulary index]; only
            batch element 0 is used.
        tokenizer: object exposing a `word_index` dict (word -> index).
        threshold: minimum probability for a vocabulary entry to be a
            sampling candidate.

    Returns:
        The sampled words joined by single spaces.
    """
    step_probs = np.asarray(predict)[0]
    output = []
    for probs in step_probs:
        # Candidates are VOCABULARY indices. The previous code recorded the
        # timestep index here, so the result was always tokens 0, 1, 2, ...
        candidates = np.flatnonzero(probs > threshold)
        if candidates.size == 0:
            # Nothing clears the threshold (e.g. a very flat distribution):
            # fall back to the most likely entry instead of raising.
            output.append(int(np.argmax(probs)))
            continue
        # Renormalise in float64 so the weights sum to 1 within
        # np.random.choice's tolerance.
        weights = probs[candidates].astype(np.float64)
        output.append(int(np.random.choice(candidates, p=weights / weights.sum())))

    return logits_to_sentence(output, tokenizer)



In [None]:
# Run the LSTM model on the first row of pad_sentence; [:,:,0] drops the
# trailing size-1 axis so the input is a plain (1, max_len) array of token ids.
a=enc_dec_model.predict(pad_sentence[:1][:,:,0])

print("LSTM result:\n")

# Show the two raw text lines that make up the first input pair.
print("input:")
print(raw_data[0], raw_data[1])

# Show a sentence sampled from the predicted per-timestep distributions.
print('predict:')
print(get_sentence(a, text_tokenizer))

LSTM result:

input:
به نام خداوند جان و خرد کزین برتر اندیشه برنگذرد
predict:
<empty>   و به که ز از بر را چو با همی گفت شد شاه تو بود او یکی همه آن


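As we can see, the prediction contains only the most frequent words of the vocabulary, listed in order of decreasing frequency (i.e. the tokens with indices 0, 1, 2, …), rather than a verse that depends on the input.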

In [None]:
# GRU encoder-decoder graph.
# Input: a sequence of max_len token ids.
input_sequence = Input(shape=(max_len,))
# Map each token id to a 256-dimensional embedding vector.
embedding = Embedding(input_dim=vocab, output_dim=256,)(input_sequence)
# Encoder: three stacked GRUs; the last one returns only its final output
# (return_sequences=False), i.e. one 128-dimensional vector per input sequence.
encoder = GRU(256, return_sequences=True)(embedding)
encoder2 = GRU(128, return_sequences=True)(encoder)
encoder3 = GRU(128, return_sequences=False)(encoder2)
# Repeat the encoder vector max_len times so the decoder receives it at every timestep.
r_vec = RepeatVector(max_len)(encoder3)
# Decoder: two stacked GRUs (512 then 256 units), both emitting one vector per timestep.
decoder1 = GRU(512, return_sequences=True, dropout=0.2)(r_vec)
decoder = GRU(256, return_sequences=True, dropout=0.2)(decoder1)
# Per-timestep projection: hidden ReLU layer, then one score per vocabulary entry.
logits = TimeDistributed(Dense(512, activation = 'relu'))(decoder)
logits = TimeDistributed(Dense(vocab))(logits)

In [None]:
# Wrap the GRU graph in a Model whose output is the softmax of `logits`
# (a distribution over the vocabulary at each timestep).
enc_dec_model_GRU = Model(input_sequence, Activation('softmax')(logits))
# Sparse categorical cross-entropy loss, Adam with learning rate 1e-4,
# accuracy reported as a metric.
enc_dec_model_GRU.compile(loss=sparse_categorical_crossentropy,
              optimizer=Adam(1e-4),
              metrics=['accuracy'])
enc_dec_model_GRU.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 21)]              0         
                                                                 
 embedding (Embedding)       (None, 21, 256)           4611328   
                                                                 
 gru (GRU)                   (None, 21, 256)           394752    
                                                                 
 gru_1 (GRU)                 (None, 21, 128)           148224    
                                                                 
 gru_2 (GRU)                 (None, 128)               99072     
                                                                 
 repeat_vector (RepeatVector  (None, 21, 128)          0         
 )                                                               
                                                             

In [None]:
# Train the GRU model with the same data, hyper-parameters and early-stopping
# callback instance (`earlyStop`) as the LSTM run.
model_results_GRU = enc_dec_model_GRU.fit(
    pad_sentence,
    pad_sentence_cyc,
    shuffle=True,
    batch_size=30,
    epochs=20,
    validation_split=.25,
    callbacks=[earlyStop],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 6: early stopping


In [None]:
# Run the GRU model on the first row of pad_sentence; [:,:,0] drops the
# trailing size-1 axis so the input is a plain (1, max_len) array of token ids.
a = enc_dec_model_GRU.predict(pad_sentence[:1][:,:,0])

print("GRU result:\n")

# Show the two raw text lines that make up the first input pair.
print("input:")
print(raw_data[0], raw_data[1])

# Show a sentence sampled from the predicted per-timestep distributions.
print('predict:')
print(get_sentence(a, text_tokenizer))

GRU result:

input:
به نام خداوند جان و خرد کزین برتر اندیشه برنگذرد
predict:
<empty>   و به که ز از بر را چو با همی گفت شد شاه تو بود او یکی همه آن
