In [1]:
import tensorflow as tf

import numpy as np
import os
import time


In [2]:
import urllib.request

# Training corpus (German-language legal text, judging by the URL and the
# preview printed below).
url = "https://raw.githubusercontent.com/CalvinDO/AITextGenerator/main/LawLanguageRNN/dataText.txt"

# Open the response as a context manager so the HTTP connection is closed as
# soon as the body has been read, instead of lingering until garbage collection.
with urllib.request.urlopen(url) as response:
    text = response.read().decode(encoding='utf-8')

# The recorded cell output starts with this line; keep code and output in sync.
print(f'Length of text: {len(text)} characters')

# Take a look at the first 250 characters in text
print(text[:250])


# The unique characters in the file
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')
print(vocab)


Length of text: 1525970 characters
Buch 1
Allgemeiner Teil
Abschnitt 1
Personen
Titel 1
Natuerliche Personen, Verbraucher, Unternehmer

§ 1 Beginn der Rechtsfaehigkeit
Die Rechtsfaehigkeit des Menschen beginnt mit der Vollendung der Geburt.

§ 2 Eintritt der Volljaehrigkeit
75 unique characters
['\n', '\r', ' ', '"', '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '§']


In [32]:
# Two toy strings used to demonstrate the character <-> id round trip.
example_texts = ['abcdefg', 'xyz']

# Split each string into its individual UTF-8 characters.
# NOTE: the bare expressions in this cell (`chars`, `ids`, ...) are not the
# last statement of the cell, so they display nothing; no repr for them
# appears in the recorded output.
chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')
chars

# Forward lookup: character -> integer id, built from the corpus vocabulary.
# `get_vocabulary()` of this layer has one more entry than `vocab`
# (76 vs. 75 in the recorded outputs); the extra entry is consistent with the
# '[UNK]' token that the OneStep generator masks out later.
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)

ids = ids_from_chars(chars)
ids


# Inverse lookup: integer id -> character. It reuses the forward layer's
# vocabulary so both directions agree on the id assignment.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)


# Round trip: `chars` is rebound here to the characters recovered from `ids`.
chars = chars_from_ids(ids)
chars

# Join the recovered characters back into whole strings.
tf.strings.reduce_join(chars, axis=-1).numpy()


def text_from_ids(ids):
  """Turn a tensor of character ids back into joined text.

  Each id is looked up with the module-level `chars_from_ids` layer and the
  resulting characters are concatenated along the last axis.
  """
  looked_up_chars = chars_from_ids(ids)
  joined_text = tf.strings.reduce_join(looked_up_chars, axis=-1)
  return joined_text


# Encode the whole corpus as one long vector of character ids.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
all_ids


# Stream of single ids, one dataset element per character.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

# Print the first ten characters, one per line.
# NOTE: the loop variable rebinds the global name `ids` used earlier in this cell.
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))

# Each training example covers `seq_length` characters. Chunks are cut one
# id longer (seq_length + 1) because split_input_target later derives the
# input and the one-step-shifted target from the same chunk.
seq_length = 100
# Number of full chunks in the corpus; not referenced again in the visible
# part of the notebook.
examples_per_epoch = len(text)//(seq_length+1)

# Group the id stream into fixed-size chunks; the incomplete tail is dropped.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# Show the first chunk as a tensor of individual characters.
for seq in sequences.take(1):
  print(chars_from_ids(seq))

# Show the first five chunks as joined byte strings.
for seq in sequences.take(5):
    print(text_from_ids(seq).numpy())


def split_input_target(sequence):
    """Derive an (input, target) pair from one sequence.

    The input is the sequence without its last element; the target is the
    sequence without its first element, i.e. the input shifted by one step.
    Works on anything that supports slicing (lists, strings, tensors).
    """
    return sequence[:-1], sequence[1:]


B
u
c
h
 
1



A
l
tf.Tensor(
[b'B' b'u' b'c' b'h' b' ' b'1' b'\r' b'\n' b'A' b'l' b'l' b'g' b'e' b'm'
 b'e' b'i' b'n' b'e' b'r' b' ' b'T' b'e' b'i' b'l' b'\r' b'\n' b'A' b'b'
 b's' b'c' b'h' b'n' b'i' b't' b't' b' ' b'1' b'\r' b'\n' b'P' b'e' b'r'
 b's' b'o' b'n' b'e' b'n' b'\r' b'\n' b'T' b'i' b't' b'e' b'l' b' ' b'1'
 b'\r' b'\n' b'N' b'a' b't' b'u' b'e' b'r' b'l' b'i' b'c' b'h' b'e' b' '
 b'P' b'e' b'r' b's' b'o' b'n' b'e' b'n' b',' b' ' b'V' b'e' b'r' b'b'
 b'r' b'a' b'u' b'c' b'h' b'e' b'r' b',' b' ' b'U' b'n' b't' b'e' b'r'
 b'n' b'e' b'h'], shape=(101,), dtype=string)
b'Buch 1\r\nAllgemeiner Teil\r\nAbschnitt 1\r\nPersonen\r\nTitel 1\r\nNatuerliche Personen, Verbraucher, Unterneh'
b'mer\r\n\r\n\xc2\xa7 1 Beginn der Rechtsfaehigkeit\r\nDie Rechtsfaehigkeit des Menschen beginnt mit der Vollendung '
b'der Geburt.\r\n\r\n\xc2\xa7 2 Eintritt der Volljaehrigkeit\r\nDie Volljaehrigkeit tritt mit der Vollendung des 18.'
b' Lebensjahres ein.\r\n\r\n\xc2\xa7\xc2\xa7 3 bis 6 (weggefallen)

In [33]:
def split_input_target(sequence):
    """Split a sequence into a next-element prediction pair.

    Returns a tuple `(input, target)` where `input` is everything except the
    final element and `target` is everything except the first element, so
    `target[i]` is the element that follows `input[i]`.
    """
    everything_but_last = sequence[:-1]
    everything_but_first = sequence[1:]
    return everything_but_last, everything_but_first

# Quick sanity check on a plain Python list. Nothing is displayed for it
# because the expression is not the last statement of the cell.
split_input_target(list("Tensorflow"))

# Turn every chunk of ids into an (input, target) pair.
dataset = sequences.map(split_input_target)

# Show the first pair as text: the target is the input shifted by one character.
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())


# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Shuffle the pairs, group them into fixed-size batches (the incomplete last
# batch is dropped) and prefetch. `tf.data.AUTOTUNE` is the core-API name of
# the constant formerly reached through `tf.data.experimental.AUTOTUNE`.
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE))

dataset


Input : b'Buch 1\r\nAllgemeiner Teil\r\nAbschnitt 1\r\nPersonen\r\nTitel 1\r\nNatuerliche Personen, Verbraucher, Unterne'
Target: b'uch 1\r\nAllgemeiner Teil\r\nAbschnitt 1\r\nPersonen\r\nTitel 1\r\nNatuerliche Personen, Verbraucher, Unterneh'


<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [34]:
# Length of the vocabulary in chars
# NOTE(review): the visible `MyModel(...)` construction does not use this
# value; it passes len(ids_from_chars.get_vocabulary()) instead (76 vs. 75 in
# the recorded outputs). Confirm whether `vocab_size` is still needed.
vocab_size = len(vocab)
# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024


In [52]:
class MyModel(tf.keras.Model):
  """Character-level sequence model: Embedding -> GRU -> Dense.

  The final dense layer has `vocab_size` outputs, i.e. one score per
  vocabulary entry for every position of the input sequence.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Create the three layers.

    Args:
      vocab_size: number of distinct ids (embedding rows and output scores).
      embedding_dim: size of each embedding vector.
      rnn_units: number of GRU units.
    """
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # The GRU returns the full output sequence plus its final state so that
    # callers can feed the state back in on the next call.
    self.gru = tf.keras.layers.GRU(
        rnn_units, return_sequences=True, return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of id sequences.

    Args:
      inputs: batch of character-id sequences.
      states: optional GRU state to start from; `None` is handed to the GRU
        unchanged, so the layer falls back to its own default initial state.
      return_state: when true, also return the GRU state after the last step.
      training: forwarded to every layer.

    Returns:
      The dense-layer output, or `(output, states)` if `return_state` is set.
    """
    embedded = self.embedding(inputs, training=training)
    rnn_out, states = self.gru(
        embedded, initial_state=states, training=training)
    scores = self.dense(rnn_out, training=training)

    if return_state:
      return scores, states
    return scores


# Instantiate the model. The vocabulary size comes from the lookup layer
# (not from `vocab_size`), so it includes the layer's extra entry.
model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
)


# Forward one batch through the model to check the output shape.
# NOTE: `input_example_batch`, `target_example_batch` and
# `example_batch_predictions` leak out of this loop and are used below.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape,
          "# (batch_size, sequence_length, vocab_size)")

model.summary()


# Sample one id per position from the predicted distribution of the first
# example in the batch, then drop the trailing sample axis.
sampled_indices = tf.random.categorical(
    example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

sampled_indices


# Disabled debug output: input text vs. sampled next-character predictions.
#print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
#print()
#print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())


# Loss on raw (un-normalised) model outputs; reused when compiling the model.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
# Disabled debug output: prediction shape and mean loss.
#print("Prediction shape: ", example_batch_predictions.shape,
#      " # (batch_size, sequence_length, vocab_size)")
#print("Mean loss:        ", example_batch_mean_loss)

# exp(mean loss). The recorded value (76.04) is close to the output size (76),
# which is what near-uniform predictions over 76 classes would give.
tf.exp(example_batch_mean_loss).numpy()


(64, 100, 76) # (batch_size, sequence_length, vocab_size)
Model: "my_model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     multiple                  19456     
                                                                 
 gru_3 (GRU)                 multiple                  3938304   
                                                                 
 dense_3 (Dense)             multiple                  77900     
                                                                 
Total params: 4,035,660
Trainable params: 4,035,660
Non-trainable params: 0
_________________________________________________________________


76.04339

In [55]:
# ONLY RUN WHEN LOADING A PRE-TRAINED MODEL
# NOTE(review): this path has no file suffix, whereas the checkpoint callback
# configured further down writes files named `ckpt_{epoch}.weights.h5`.
# Confirm that a checkpoint with exactly this prefix exists before running.
checkpoint_path = "./training_checkpoints/ckpt_100"

# Restore the weights into the already-constructed `model`.
model.load_weights(checkpoint_path)


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1d4ca5f46c8>

In [57]:
# Configure training: SGD with momentum 0.9, the from-logits cross-entropy
# `loss` defined earlier, and accuracy as the reported metric.
model.compile(optimizer=tf.keras.optimizers.SGD(momentum=0.9),
              metrics=['accuracy'], loss=loss)



# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
# (`{epoch}` is a placeholder left in the string for the callback to fill in.)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}.weights.h5")

# Callback that writes the model weights only (not the full model) to
# `checkpoint_prefix`.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

# Number of passes over the dataset in the training cell.
EPOCHS = 1

In [59]:
# Train for EPOCHS epochs with the checkpoint callback attached.
# NOTE: `history` is only bound if this call returns; if training is
# interrupted (as in the recorded run), later cells that read `history` fail.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])



KeyboardInterrupt: 

In [43]:
import matplotlib.pyplot as plt

# Plot training accuracy per epoch. Requires `history` from a completed
# `model.fit(...)` call.
# A marker is drawn at every epoch because a line through a single data point
# (EPOCHS = 1 yields exactly one) is otherwise invisible.
plt.plot(history.history['accuracy'], label='accuracy', marker='o')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
# Fixed y-range chosen by the author; values outside it are not shown.
plt.ylim([0.75, 0.9])
plt.legend(loc='lower right')


NameError: name 'history' is not defined

In [50]:
class OneStep(tf.keras.Model):
  """Wraps a trained model to generate text one character at a time.

  Holds the model, both lookup layers, a sampling temperature and an
  additive mask that rules out the "[UNK]" token when sampling.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    """Store the collaborators and build the "[UNK]" mask.

    Args:
      model: model called with `inputs`, `states` and `return_state=True`.
      chars_from_ids: layer mapping ids back to characters.
      ids_from_chars: layer mapping characters to ids.
      temperature: divisor applied to the logits before sampling.
    """
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Additive mask over the vocabulary: -inf at the id of "[UNK]", 0 at
    # every other position, so "[UNK]" can never be sampled.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    vocabulary_length = len(ids_from_chars.get_vocabulary())
    unk_mask_sparse = tf.SparseTensor(
        indices=unk_ids,
        values=[-float('inf')]*len(unk_ids),
        dense_shape=[vocabulary_length])
    self.prediction_mask = tf.sparse.to_dense(unk_mask_sparse)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for every string in `inputs`.

    Args:
      inputs: batch of strings to continue.
      states: model state from the previous step, or `None` on the first call.

    Returns:
      `(predicted_chars, states)`: the sampled characters and the model state
      to pass into the next call.
    """
    # Strings -> padded tensor of character ids.
    split_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    char_ids = self.ids_from_chars(split_chars).to_tensor()

    # Model output is indexed [batch, char, next_char_logits].
    all_logits, states = self.model(inputs=char_ids, states=states,
                                    return_state=True)

    # Keep the last position only, scale by temperature, then mask "[UNK]".
    last_logits = all_logits[:, -1, :]
    scaled_logits = last_logits/self.temperature
    masked_logits = scaled_logits + self.prediction_mask

    # Draw one id per batch entry and drop the sample axis.
    sampled_ids = tf.random.categorical(masked_logits, num_samples=1)
    sampled_ids = tf.squeeze(sampled_ids, axis=-1)

    # Ids -> characters, returned together with the updated state.
    return self.chars_from_ids(sampled_ids), states





In [51]:
# Wrap the model and lookup layers in the single-step generator
# (default temperature).
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

# Time the whole generation run.
start = time.time()
# No state on the first step; the seed text is a single space.
states = None
next_char = tf.constant([' '])
result = [next_char]

# Generate 2000 characters, feeding each sampled character and the returned
# state back into the next step.
for n in range(2000):
  next_char, states = one_step_model.generate_one_step(
      next_char, states=states)
  result.append(next_char)

# Concatenate seed and generated characters; `result` is rebound from a list
# of tensors to the joined string tensor.
result = tf.strings.join(result)
end = time.time()
# Print the generated text followed by an 80-character underscore rule.
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)


 Anspruch auf den Eigentuemer zwischen Leistung abweichend von Absatz 1 Nr. 3 kredn Gellen unmilvergang des Darlehens sind dem Erben als 5 uewertet und erste Bauschsystetvot, wenn dies im Zweifel als auf den Ersatz der Sache geltend zu machen.
Die spaeilierrerichen).

§ 1224 
(weggefallen)

§ 266H Pfandrechte

§ 1637 Gesamtwordungsbereich das Vorliegen

§ 1594 Anlegung von Euer die Hereitstelann diese Veraeusserungunge nurden, die sich aus der Mutter fuer einen Vertrag ueber den Vertragsschluss beind,uchen mitten, so erlischt zum Gegenstand hat, verweigern, wenn die Verwaltung des Nachlasses Teilanzhiedung unter Beruecksichtigung des Notar auf die Zuwendung vgl. § 578 Abs. 2 Satz 1 +++)
(+++ § 559c: dem Annehmender urbansgigt oder sich bestimmt werden, dem Darlehensnehmer geeignete Modernis des Gericht. Der Niessbrauch an einer Teil, um dem Verkahren in betruegerischer Absicht gehandelt oder
3.
vor aenderungen nach den §§ 471, 2859 Abs. 1, § 173 Abs. 2 zu ueberpruefung.
(2) Das Gleiche

In [774]:
# Export the single-step generator to the directory 'one_step' and load it
# back, so generation can run from the saved artefact.
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')





INFO:tensorflow:Assets written to: one_step\assets


INFO:tensorflow:Assets written to: one_step\assets


In [773]:
# Same generation loop as for `one_step_model`, but driven by the reloaded
# export and run for 10000 steps. Starts from a single space with no state.
states = None
next_char = tf.constant([' '])
result = [next_char]

# Feed each sampled character and the returned state into the next step.
for n in range(10000):
  next_char, states = one_step_reloaded.generate_one_step(
      next_char, states=states)
  result.append(next_char)

# Join seed and generated characters and print them as text.
print(tf.strings.join(result)[0].numpy().decode("utf-8"))


 Abt. 2 Satz 1 +++)
:a+kt der Dritten in durch § 1310 Abs. 2 Satz 2, wenn sie dem Anfang der Hypothek, eine Weise als mit dem Monat sowohl der Eigentuemer das Vermoegen uebernames Teilhabers abhaengt. Der Markei, so kann er nur unter den Erben oder eines Mangelhafter Ersicherung einer Vormunks, die den Vertragschliessenden oder des Mietverhaeltnisses zwischen der zweiter Dritter insoweit
(1) Wei einem eigenheit eine notarielle fuer die Lieferung einer Hyyothek, die zuzuegel hierung untirure, wilde gilt die Vorschrift des § 1378 Abs. 1 hinverlassenen Unterventsprich. Der Kuendigungspreis sechs Monate und ueber die Ehegatten anzubichen, es sei denn, dass, nach der Reiseveranstalter nach solchen dem in der Regel vorbiegen. Das Gesamtgut gekoendig anerklaeren, es sei denn, dass er zur Leistung bewinnt oder in den Faellen des § 144 entsprechende Anwendung, wenn die Fehlerhaftigkeit nicht vorziehen, gissanspauchsichtigt. Ist die Auflage sowie leendente kann nicht flei von Umpauf-Vollsichtlic