# Buchstabenvorhersage zum Erstellen von Shakespeare-Texten

In [1]:
# get data
!wget "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

--2021-01-23 10:42:44--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2021-01-23 10:42:44 (48.9 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [2]:
# Read the whole corpus into memory as a single string.
PATH = "input.txt"

# Pass the encoding explicitly: without it open() falls back to the platform's
# locale encoding, which makes the decoded text machine-dependent.
with open(PATH, encoding="utf-8") as f:
    text = f.read()

In [4]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

# Map characters (not words) to integer ids: char_level=True makes every
# character its own token.
tokenizer = Tokenizer(char_level=True)
# NOTE(review): `text` is passed as a bare string rather than a list of texts.
# The character count read from `tokenizer.document_count` further down
# appears to rely on this -- confirm before changing it to `[text]`.
tokenizer.fit_on_texts(text)

### Testing/playing with Tokenizer

In [5]:
# Encode a sample string to sanity-check the mapping. The result is a list
# holding one id sequence per input text (see the output below).
hallo_welt = tokenizer.texts_to_sequences(["Hallo Welt!"])
hallo_welt

[[7, 5, 12, 12, 4, 1, 17, 2, 12, 3, 31]]

In [6]:
# Map the sequence back to text. The decoded string comes out lower-cased and
# with the characters separated by spaces (see the output below).
# NOTE(review): this rebinds `hallo_welt` from id sequences to decoded strings,
# so the cell is not idempotent -- re-run the previous cell before re-running it.
hallo_welt = tokenizer.sequences_to_texts(hallo_welt)
hallo_welt

['h a l l o   w e l t !']

In [7]:
# Number of distinct characters known to the tokenizer (its vocabulary size).
max_id = len(tokenizer.word_index)
max_id

39

In [8]:
# Total number of characters in the corpus. `document_count` is the number of
# texts the tokenizer was fitted on; it matches the character count here
# because fit_on_texts received the raw string, i.e. one "text" per character.
dataset_size = tokenizer.document_count
dataset_size

1115394

### Preprocessing the Shakespeare Text

In [9]:
import numpy as np

# Encode the whole corpus as a single id sequence. The tokenizer's ids start
# at 1, so subtract 1 to obtain 0-based indices.
encoded = np.array(tokenizer.texts_to_sequences([text]))[0] - 1
encoded, len(encoded)

(array([19,  5,  8, ..., 20, 26, 10]), 1115394)

In [10]:
# Build the training dataset from the first 90% of the encoded characters;
# the remaining 10% is not part of `dataset`.
train_size = int(dataset_size * 0.9)
# Slicing [:train_size] keeps indices 0 .. train_size - 1; every dataset
# element is a single character id.
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
""" dataset is one long sequence now """

' dataset is one long sequence now '

In [11]:
# Cut the long sequence into overlapping windows. Each window holds
# n_steps + 1 ids so that it can later be split into n_steps inputs and
# n_steps targets shifted by one position.
n_steps = 100
window_length = n_steps + 1
# shift=1: consecutive windows start one character apart.
# drop_remainder=True: windows shorter than window_length are discarded.
dataset = dataset.window(window_length, shift=1, drop_remainder=True)
""" dataset is a nested dataset now """

' dataset is a nested dataset now '

In [12]:
# Flatten the nested dataset into a normal one: every window is itself a
# dataset, so batching it with window_length yields one tensor per window,
# and flat_map chains those tensors into a single flat dataset.
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [13]:
# Shuffle the windows (buffer of 10000), group them into batches, then split
# every batch into inputs (all ids but the last) and targets (all ids but the
# first), i.e. the targets are the inputs shifted by one character.
batch_size = 32
dataset = dataset.shuffle(10000)
dataset = dataset.batch(batch_size)
dataset = dataset.map(lambda batch: (batch[:, :-1], batch[:, 1:]))

In [14]:
# One-hot encode the inputs only; with just max_id (39) distinct characters
# the vectors stay small. The targets are passed through as integer ids.
dataset = dataset.map(lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets))

In [15]:
# Prefetch with a buffer of 1 element (one batch at this point of the pipeline).
dataset = dataset.prefetch(1)

### Neural Net

In [17]:
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.models import Sequential

# Character-level language model:
#   * two stacked GRU layers with 128 units each; return_sequences=True keeps
#     the output of every time step, so a prediction is made per position,
#   * a Dense softmax over the max_id characters, applied at every time step.
# The input is a variable-length sequence of one-hot vectors of size max_id.
#
# NOTE(review): according to the Keras GRU documentation the fast cuDNN kernel
# is only used when recurrent_dropout == 0. If training runs on a GPU, the
# recurrent_dropout=0.2 below may be the reason for slow steps -- confirm
# before changing, since removing it alters the regularisation.
model = Sequential([
    GRU(128, return_sequences=True, input_shape=[None, max_id], dropout=0.2, recurrent_dropout=0.2),
    GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    TimeDistributed(Dense(max_id, activation="softmax")),
])

# The targets are integer character ids (not one-hot), hence the sparse
# variant of the cross-entropy loss.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")



In [None]:
# Train for 20 epochs on the windowed training dataset; the object returned
# by fit() is kept in `history`.
# NOTE(review): no validation data is passed, so only the training loss is
# reported during training.
history = model.fit(dataset, epochs=20)

Epoch 1/20
    762/Unknown - 502s 652ms/step - loss: 2.5767