In [None]:
from __future__ import print_function
import collections
import os
import tensorflow as tf
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Embedding, Dropout, TimeDistributed
from keras.layers import LSTM
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import numpy as np
import argparse

In [None]:
# Mount Google Drive at /content/gdrive (Colab runtime only); the dataset
# paths used by load_data() live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Read a text file and split it into whitespace-separated word tokens
def read_words(filename):
  """Return the list of whitespace-separated tokens found in ``filename``.

  The file is opened in binary mode through ``tf.io.gfile`` and decoded as
  UTF-8. Line breaks are treated like any other whitespace, so the last word
  of one line is never fused with the first word of the next line.

  Args:
    filename: path of the text file to read.

  Returns:
    A list of word strings in file order (empty for an empty file).
  """
  with tf.io.gfile.GFile(filename, "rb") as f:
    # str.split() with no separator already splits on any run of whitespace,
    # newlines included. Deleting "\n" beforehand would glue neighbouring
    # words together whenever a line has no leading/trailing space.
    return f.read().decode("utf-8").split()

In [None]:
# Function to create a dictionary of words and a corresponding ID for each word
def build_vocab(filename):
  """Build a word -> integer ID mapping from the words in ``filename``.

  IDs are assigned in order of decreasing frequency, with ties broken
  alphabetically, so the most frequent word receives ID 0.

  Args:
    filename: path of the text file whose words define the vocabulary.

  Returns:
    A dict mapping each distinct word to its ID; an empty dict when the file
    contains no words.
  """
  data = read_words(filename)

  counter = collections.Counter(data)
  # Sort by (-count, word): most frequent first, alphabetical among equals.
  count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

  # Enumerating the sorted pairs works for an empty corpus as well, whereas
  # unpacking zip(*count_pairs) raises ValueError when there are no pairs.
  return {word: idx for idx, (word, _) in enumerate(count_pairs)}

In [None]:
# Translate the words of a file into their vocabulary IDs
def file_to_word_ids(filename, word_to_id):
  """Return the IDs of the words in ``filename``, in file order.

  Words that are not keys of ``word_to_id`` are skipped, so the result can be
  shorter than the number of words in the file.
  """
  ids = []
  for token in read_words(filename):
    if token in word_to_id:
      ids.append(word_to_id[token])
  return ids

In [None]:
def load_data(data_path="/content/gdrive/MyDrive/ptbdataset"):
  """Load the PTB train/valid/test splits as lists of word IDs.

  The vocabulary is built from the training split only; the validation and
  test splits are encoded with that same vocabulary.

  Args:
    data_path: directory containing ``ptb.train.txt``, ``ptb.valid.txt`` and
      ``ptb.test.txt``. Defaults to the Google Drive folder used by this
      notebook.

  Returns:
    A tuple ``(train_data, valid_data, test_data, vocabulary,
    reversed_dictionary)`` where the first three items are lists of word IDs,
    ``vocabulary`` is the number of distinct training words and
    ``reversed_dictionary`` maps each ID back to its word.
  """
  train_path = os.path.join(data_path, "ptb.train.txt")
  valid_path = os.path.join(data_path, "ptb.valid.txt")
  test_path = os.path.join(data_path, "ptb.test.txt")

  # Build vocabulary then convert text to int list
  word_to_id = build_vocab(train_path)
  train_data = file_to_word_ids(train_path, word_to_id)
  valid_data = file_to_word_ids(valid_path, word_to_id)
  test_data = file_to_word_ids(test_path, word_to_id)
  vocabulary = len(word_to_id)
  reversed_dictionary = {idx: word for word, idx in word_to_id.items()}

  # Sanity-check output: a few IDs, the vocabulary size and the decoded start
  # of the corpus. The full word_to_id dict is deliberately not printed - it
  # has one entry per vocabulary word and would flood the cell output.
  print(train_data[:5])
  print(vocabulary)
  print(" ".join([reversed_dictionary[x] for x in train_data[:10]]))
  return train_data, valid_data, test_data, vocabulary, reversed_dictionary

In [None]:
# Load the three PTB splits as word-ID lists, together with the vocabulary
# size and the ID -> word lookup table.
train_data, valid_data, test_data, vocabulary, reversed_dictionary = load_data()

[9969, 9970, 9971, 9973, 9974]
9999
aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec


In [None]:
class KerasBatchGenerator(object):
  """Endless source of (input, one-hot target) batches for next-word training.

  Each sample is a window of ``num_steps`` consecutive word IDs taken from
  ``data``; its target is the same window shifted one position to the right,
  one-hot encoded over ``vocabulary`` classes.
  """

  def __init__(self, data, num_steps, batch_size, vocabulary, skip_step=5):
    """Store the data set and the batching parameters.

    Args:
      data: sequence of integer word IDs.
      num_steps: number of words in each sample window.
      batch_size: number of windows per yielded batch.
      vocabulary: number of classes used for the one-hot targets.
      skip_step: how far the window start advances after each sample.
    """
    self.data = data
    self.num_steps = num_steps
    self.batch_size = batch_size
    self.vocabulary = vocabulary
    # Start position of the next window. It moves sequentially through the
    # data set and is reset to zero once a full window (plus the one extra
    # word needed for the shifted target) no longer fits.
    self.current_idx = 0
    # skip_step is the number of words skipped before the next window is
    # taken from the data set.
    self.skip_step = skip_step

  def generate(self):
    """Yield ``(x, y)`` batches forever.

    Yields:
      x: array of shape (batch_size, num_steps) holding word IDs.
      y: array of shape (batch_size, num_steps, vocabulary) holding the
        one-hot encoded IDs of the words that follow each input position.

    Raises:
      ValueError: on the first ``next()`` if ``data`` has fewer than
        ``num_steps + 1`` items, i.e. not even one window and its shifted
        target fit.
    """
    if len(self.data) <= self.num_steps:
      raise ValueError(
          "data must contain at least num_steps + 1 items "
          "(got %d items for num_steps=%d)" % (len(self.data), self.num_steps))
    while True:
      # Allocate fresh arrays for every batch. Yielding the same buffers each
      # time would let the next batch overwrite one a consumer still holds
      # (for example batches buffered in a prefetch queue).
      x = np.zeros((self.batch_size, self.num_steps))
      y = np.zeros((self.batch_size, self.num_steps, self.vocabulary))
      for i in range(self.batch_size):
        if self.current_idx + self.num_steps >= len(self.data):
          # reset the index back to the start of the data set
          self.current_idx = 0
        start = self.current_idx
        stop = start + self.num_steps
        x[i, :] = self.data[start:stop]
        temp_y = self.data[start + 1:stop + 1]
        # convert all of temp_y into a one hot representation
        y[i, :, :] = to_categorical(temp_y, num_classes=self.vocabulary)
        self.current_idx += self.skip_step
      yield x, y

In [None]:
# Hyper-parameters
num_steps = 32       # words per training window
batch_size = 64      # windows per batch
hidden_size = 2048   # embedding width and LSTM units per layer
use_dropout = True   # set to False to build the network without Dropout layers

# skip_step=num_steps makes each window start where the previous one ended,
# so consecutive samples do not overlap.
train_data_generator = KerasBatchGenerator(train_data, num_steps, batch_size, vocabulary,
                                           skip_step=num_steps)
valid_data_generator = KerasBatchGenerator(valid_data, num_steps, batch_size, vocabulary,
                                           skip_step=num_steps)

# Embedding -> two stacked LSTMs (optionally followed by dropout) -> per-timestep
# softmax over the vocabulary. return_sequences=True keeps one output per
# input word so every position gets its own next-word prediction.
model = Sequential()
model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
model.add(LSTM(hidden_size, return_sequences=True))
if use_dropout:
  model.add(Dropout(0.4))
model.add(LSTM(hidden_size, return_sequences=True))
if use_dropout:
  model.add(Dropout(0.4))
model.add(TimeDistributed(Dense(vocabulary)))
model.add(Activation('softmax'))
# metrics is passed as a list, the form documented for Model.compile, instead
# of a bare string.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 32, 2048)          20477952  
                                                                 
 lstm_12 (LSTM)              (None, 32, 2048)          33562624  
                                                                 
 dropout_12 (Dropout)        (None, 32, 2048)          0         
                                                                 
 lstm_13 (LSTM)              (None, 32, 2048)          33562624  
                                                                 
 dropout_13 (Dropout)        (None, 32, 2048)          0         
                                                                 
 time_distributed_6 (TimeDis  (None, 32, 9999)         20487951  
 tributed)                                                       
                                                      

In [None]:
num_epochs = 50

# The generators never terminate, so Keras must be told how many batches make
# up one epoch: the number of full (batch_size x num_steps) blocks in each split.
steps_per_epoch = len(train_data) // (batch_size * num_steps)
validation_steps = len(valid_data) // (batch_size * num_steps)

# Model.fit accepts Python generators directly; Model.fit_generator is
# deprecated. Keyword arguments keep steps_per_epoch and epochs from being
# mixed up positionally. The returned History is kept for later inspection of
# the per-epoch metrics.
history = model.fit(train_data_generator.generate(),
                    steps_per_epoch=steps_per_epoch,
                    epochs=num_epochs,
                    validation_data=valid_data_generator.generate(),
                    validation_steps=validation_steps)

Epoch 1/50


  model.fit_generator(train_data_generator.generate(), len(train_data)//(batch_size*num_steps), num_epochs,


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f6525850eb0>