In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import regularizers

In [2]:
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)

from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

config = ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.8
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

IndexError: list index out of range

In [3]:
tokenizer = Tokenizer()

!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sonnets.txt \
    -O /tmp/sonnets.txt

data = open('/tmp/sonnets.txt').read()

corpus = data.lower().split('\n')

tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1
print(total_words)

--2021-07-14 06:17:07--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sonnets.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.31.208, 172.217.160.144, 216.58.196.176, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.31.208|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 93578 (91K) [text/plain]
Saving to: ‘/tmp/sonnets.txt’


2021-07-14 06:17:08 (1.24 MB/s) - ‘/tmp/sonnets.txt’ saved [93578/93578]

3211


In [4]:
input_seq = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(len(token_list)):
        n_gram_seq = token_list[:i+1]
        input_seq.append(n_gram_seq)

In [5]:
max_seq_len = max([len(x) for x in input_seq])
input_seq = np.array(pad_sequences(input_seq, padding = 'pre', maxlen = total_words -1))

predictions, labels = input_seq[:,:-1], input_seq[:,-1]
labels = tf.keras.utils.to_categorical(labels, num_classes = total_words)

In [6]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length = max_seq_len))
model.add(Bidirectional(LSTM(150, return_sequences = True)))
model.add(Dropout(0.4))
model.add(Bidirectional(LSTM(100)))
model.add(Dense(total_words/2, activation = 'relu', kernel_regularizer = regularizers.l2(0.01)))
model.add(Dense(total_words, activation = 'softmax'))

adam = Adam(lr = 0.001)
model.compile(optimizer = adam, loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [7]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 11, 100)           321100    
_________________________________________________________________
bidirectional (Bidirectional (None, 11, 300)           301200    
_________________________________________________________________
dropout (Dropout)            (None, 11, 300)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               320800    
_________________________________________________________________
dense (Dense)                (None, 1605)              322605    
_________________________________________________________________
dense_1 (Dense)              (None, 3211)              5156866   
Total params: 6,422,571
Trainable params: 6,422,571
Non-trainable params: 0
______________________________________________

In [None]:
history = model.fit(predictions, labels, epochs = 500, verbose = 1)

Epoch 1/500
  5/551 [..............................] - ETA: 30:55 - loss: 11.2777 - accuracy: 0.0000e+00

In [None]:
acc = history.history['accuracy']
loss = history.history['loss']
epochs = range(len(acc))
plt.plot(epochs, acc, label = "accuracy")
plt.title('Accuracy')
plt.figure()

plt.plot(epochs, loss, label = "Loss")
plt.title('Loss')
plt.figure()