In [1]:
import tensorflow as tf
import numpy as np
import os
import pickle
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from string import punctuation
import requests

In [2]:
## Download the raw text from Project Gutenberg and cache it on disk
FILE_PATH = "wonderland.txt"
response = requests.get("http://www.gutenberg.org/cache/epub/11/pg11.txt", timeout=30)
## Fail loudly on an HTTP error instead of saving an error page as the corpus
response.raise_for_status()
content = response.text
## The context manager guarantees the file is flushed and closed
with open(FILE_PATH, "w", encoding="utf-8") as corpus_file:
    corpus_file.write(content)

167516

In [3]:
## Hyper-parameters / configuration
sequence_length = 50  ## number of characters in each model input window
BATCH_SIZE = 128
EPOCHS = 30
BASENAME = os.path.basename(FILE_PATH)  ## used as a prefix for saved artifacts

### Read The Data
## The context manager closes the file handle as soon as the text is loaded
with open(FILE_PATH, encoding="utf-8") as corpus_file:
    text = corpus_file.read()

### Clean The Data
## Remove caps
text = text.lower()
## Remove punctuation
text = text.translate(str.maketrans("", "", punctuation))
## NOTE(review): the leading byte-order mark ('\ufeff') is not punctuation, so it
## survives cleaning and becomes a vocabulary entry; consider reading the file
## with encoding="utf-8-sig" if that extra class is unwanted.

In [4]:
## Character-level summary of the cleaned corpus
## The vocabulary is every distinct character in the text, sorted and joined into one string
vocab = ''.join(sorted(set(text)))
n_chars = len(text)
n_unique_chars = len(vocab)

for label, value in (
    ("Unique characters:", vocab),
    ("Number of characters:", n_chars),
    ("Number of unique characters:", n_unique_chars),
):
    print(label, value)

Unique characters:
 0123456789abcdefghijklmnopqrstuvwxyz﻿
Number of characters: 158596
Number of unique characters: 39


In [6]:
## Show the first two vocabulary entries (vocab is sorted, so these are the two smallest characters)
vocab[0], vocab[1]

('\n', ' ')

In [9]:
## Lookup table: character -> integer index (position in the sorted vocabulary)
char2int = {c: i for i, c in enumerate(vocab)}
print(char2int, '\n')

## Inverse lookup table: integer index -> character
int2char = {i: c for i, c in enumerate(vocab)}
print(int2char)

## Save both tables so text generation can reuse the exact same encoding later.
## Context managers make sure each pickle file is flushed and closed.
with open(f"{BASENAME}-char2int.pickle", "wb") as char2int_file:
    pickle.dump(char2int, char2int_file)
with open(f"{BASENAME}-int2char.pickle", "wb") as int2char_file:
    pickle.dump(int2char, int2char_file)

{'\n': 0, ' ': 1, '0': 2, '1': 3, '2': 4, '3': 5, '4': 6, '5': 7, '6': 8, '7': 9, '8': 10, '9': 11, 'a': 12, 'b': 13, 'c': 14, 'd': 15, 'e': 16, 'f': 17, 'g': 18, 'h': 19, 'i': 20, 'j': 21, 'k': 22, 'l': 23, 'm': 24, 'n': 25, 'o': 26, 'p': 27, 'q': 28, 'r': 29, 's': 30, 't': 31, 'u': 32, 'v': 33, 'w': 34, 'x': 35, 'y': 36, 'z': 37, '\ufeff': 38} 

{0: '\n', 1: ' ', 2: '0', 3: '1', 4: '2', 5: '3', 6: '4', 7: '5', 8: '6', 9: '7', 10: '8', 11: '9', 12: 'a', 13: 'b', 14: 'c', 15: 'd', 16: 'e', 17: 'f', 18: 'g', 19: 'h', 20: 'i', 21: 'j', 22: 'k', 23: 'l', 24: 'm', 25: 'n', 26: 'o', 27: 'p', 28: 'q', 29: 'r', 30: 's', 31: 't', 32: 'u', 33: 'v', 34: 'w', 35: 'x', 36: 'y', 37: 'z', 38: '\ufeff'}


In [8]:
## Encode the corpus: look up the integer id of every character
char_ids = [char2int[ch] for ch in text]
encoded_text = np.array(char_ids)
print(encoded_text, '\n')

## Wrap the id array in a tf.data pipeline that emits one id per element
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)
print(char_dataset)

[38 27 29 ... 30  0  0] 

<TensorSliceDataset shapes: (), types: tf.int32>


In [10]:
## Show the first 8 elements of the dataset as (id, character) pairs
for char in char_dataset.take(8):
    char_id = char.numpy()
    print(char_id, int2char[char_id])

38 ﻿
27 p
29 r
26 o
21 j
16 e
14 c
31 t


In [11]:
## Cut the character stream into fixed-size chunks of 2*sequence_length + 1 ids;
## a trailing chunk that is too short is dropped
chunk_size = 2*sequence_length + 1
sequences = char_dataset.batch(chunk_size, drop_remainder=True)
print(sequences, '\n____________________________________________________\n')

## Decode and display the first two chunks
for sequence in sequences.take(2):
    decoded_chunk = ''.join(int2char[i] for i in sequence.numpy())
    print(decoded_chunk)

<BatchDataset shapes: (201,), types: tf.int32> 
____________________________________________________
﻿project gutenbergs alices adventures in wonderland by lewis carroll



this ebook is for the use of anyone anywhere at no cost and with

almost no restrictions whatsoever  you may copy it give it away
 or

reuse it under the terms of the project gutenberg license included

with this ebook or online at wwwgutenbergorg





title alices adventures in wonderland



author lewis carroll



posting date 


In [27]:
## Prepare inputs (x) and targets (y)
def split_sample(sample):
    """Split one chunk of character ids into (input window, next character) pairs.

    For every offset ``i`` in ``range((len(sample) - 1) // 2)`` the result
    contains ``(sample[i:i + sequence_length], sample[i + sequence_length])``,
    in increasing order of ``i``.

    Args:
        sample: 1-D tensor of character ids. The notebook passes chunks of
            ``2*sequence_length + 1`` ids, for which the window count equals
            ``sequence_length``.

    Returns:
        A ``tf.data.Dataset`` of ``(input, target)`` pairs, where ``input``
        holds ``sequence_length`` ids and ``target`` is a scalar id.
    """
    ## At least one window is always produced, matching the original first sample
    n_windows = max(1, (len(sample) - 1) // 2)

    ## Row i of the index matrix is [i, i+1, ..., i+sequence_length-1], so a single
    ## gather builds every input window at once instead of chaining one tiny
    ## dataset per window through repeated concatenate() calls.
    window_idx = tf.range(n_windows)[:, tf.newaxis] + tf.range(sequence_length)[tf.newaxis, :]
    inputs = tf.gather(sample, window_idx)

    ## The target of window i is the id right after it: sample[i + sequence_length]
    targets = sample[sequence_length:sequence_length + n_windows]

    ## NOTE(review): with chunks of 2*sequence_length + 1 ids, the final id of each
    ## chunk is never used as a target (offset == sequence_length is excluded).
    ## Kept as-is so the number of samples per chunk is unchanged.
    return tf.data.Dataset.from_tensor_slices((inputs, targets))


## Report how many chunks there are, then expand each chunk into its (input, target) samples
n_sequences = len(sequences)
print(n_sequences)
dataset = sequences.flat_map(split_sample)
dataset

789


<FlatMapDataset shapes: ((50,), ()), types: (tf.int32, tf.int32)>

In [28]:
def one_hot_samples(input_, target):
    """One-hot encode a sample over the character vocabulary.

    Both ``input_`` and ``target`` are passed through ``tf.one_hot`` with a
    depth of ``n_unique_chars``; the encoded pair is returned as a tuple.
    """
    encoded_input = tf.one_hot(input_, n_unique_chars)
    encoded_target = tf.one_hot(target, n_unique_chars)
    return encoded_input, encoded_target


## Apply the one-hot encoding to every (input, target) sample.
## NOTE: this rebinds `dataset`, so re-running this cell without re-running the
## cell that builds `dataset` would one-hot encode already-encoded samples.
dataset = dataset.map(one_hot_samples)
dataset

<MapDataset shapes: ((50, 39), (39,)), types: (tf.float32, tf.float32)>

In [30]:
## Decode and print the first 5 samples together with their tensor shapes
separator = "="*80
print(separator)
for sample_input, sample_target in dataset.take(5):
    ## argmax turns each one-hot vector back into its character id
    decoded_input = ''.join(int2char[np.argmax(char_vector)] for char_vector in sample_input.numpy())
    decoded_target = int2char[np.argmax(sample_target.numpy())]
    print("Input:", decoded_input)
    print("Target:", decoded_target)
    print("Input shape:", sample_input.shape)
    print("Target shape:", sample_target.shape)

    print(separator)

Input: ﻿project gutenbergs alices adventures in wonderlan
Target: d
Input shape: (50, 39)
Target shape: (39,)
Input: project gutenbergs alices adventures in wonderland
Target:  
Input shape: (50, 39)
Target shape: (39,)
Input: roject gutenbergs alices adventures in wonderland 
Target: b
Input shape: (50, 39)
Target shape: (39,)
Input: oject gutenbergs alices adventures in wonderland b
Target: y
Input shape: (50, 39)
Target shape: (39,)
Input: ject gutenbergs alices adventures in wonderland by
Target:  
Input shape: (50, 39)
Target shape: (39,)


In [31]:
## Training input pipeline: repeat indefinitely, shuffle through a 1024-element
## buffer, then group into full batches (a short final batch is dropped)
ds = (
    dataset
    .repeat()
    .shuffle(1024)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
## Build The Model: two stacked LSTM layers followed by a softmax over the vocabulary
model = Sequential([
    ## return_sequences=True so the second LSTM receives the full sequence of outputs
    LSTM(256, input_shape=(sequence_length, n_unique_chars), return_sequences=True),
    Dropout(0.3),
    LSTM(256),
    Dense(n_unique_chars, activation="softmax"),## The Output Layer --> No. Of Classes: n_unique_chars, Activation: softmax
])

## Train the model
model.compile(loss='categorical_crossentropy', optimizer='adam')
## `ds` is built with repeat(), so steps_per_epoch is what defines the length of one epoch
steps_per_epoch = (len(encoded_text) - sequence_length) // BATCH_SIZE
## Use the EPOCHS constant from the configuration cell rather than a hard-coded value
model.fit(ds, steps_per_epoch=steps_per_epoch, epochs=EPOCHS)

## Make sure the "results" folder exists (no error if it is already there)
os.makedirs("results", exist_ok=True)

## Save the model
model.save(f"results/{BASENAME}-{sequence_length}.h5")