In [57]:
import urllib.request

import numpy as np
import random
import tensorflow as tf
import keras

from keras.models import Sequential
from keras import layers

# Fix the random seed (42) up front so that repeated runs of the notebook are comparable
keras.utils.set_random_seed(42)



In [58]:
path = 'frankenstein.txt'
url = 'https://gutenberg.org/ebooks/84.txt.utf-8'

# Number of leading characters dropped from the raw file (presumably the
# Project Gutenberg preamble: the printed text starts at "letter 1").
# NOTE(review): this offset was presumably tuned on the local copy; verify it if
# the file is re-downloaded, since the preamble served by the site may differ.
intro_length = 1375

# Read the local copy of the book. On a fresh checkout the file does not exist
# yet, so download it once and cache it at `path`. The download is stored on
# disk and then read through the very same open() call, so both cases are
# decoded (and newline-translated) identically.
try:
	f = open(path, encoding='utf-8')
except FileNotFoundError:
	urllib.request.urlretrieve(url, path)
	f = open(path, encoding='utf-8')

with f:
	text = f.read().lower()  # Convert everything to lowercase

# Skip the preamble; the slice end (-1) also drops the very last character of the file.
# NOTE(review): the vocabulary printed further down contains symbols such as '™'
# and '•', which suggests the Project Gutenberg license footer is still part of
# `text` - confirm whether it should be trimmed as well.
text = text[intro_length:-1]

# Length of characters in the text
print('Text length: ', len(text))
print('--------------------------')
print(text[0:1000]) # First 1000 characters



Text length:  437431
--------------------------



letter 1

_to mrs. saville, england._


st. petersburgh, dec. 11th, 17—.


you will rejoice to hear that no disaster has accompanied the
commencement of an enterprise which you have regarded with such evil
forebodings. i arrived here yesterday, and my first task is to assure
my dear sister of my welfare and increasing confidence in the success
of my undertaking.

i am already far north of london, and as i walk in the streets of
petersburgh, i feel a cold northern breeze play upon my cheeks, which
braces my nerves and fills me with delight. do you understand this
feeling? this breeze, which has travelled from the regions towards
which i am advancing, gives me a foretaste of those icy climes.
inspirited by this wind of promise, my daydreams become more fervent
and vivid. i try in vain to be persuaded that the pole is the seat of
frost and desolation; it ever presents itself to my imagination as the
region of beauty and delight. there, ma

Dobbiamo convertire il testo (parole) in un insieme di numeri per permettere alla rete di gestirlo.

### Utilizziamo One-Hot Encoding
Per ogni carattere abbiamo un array con il flag attivo nella posizione corrispondente al carattere.

In [59]:
# Vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))

print('total chars: ', len(chars))

# Lookup table: integer index -> character
indices_chars = dict(enumerate(chars))
# Lookup table: character -> integer index (inverse of the table above)
char_indices = {character: index for index, character in indices_chars.items()}

print(char_indices)
print(indices_chars)

total chars:  66
{'\n': 0, ' ': 1, '!': 2, '$': 3, '%': 4, '(': 5, ')': 6, '*': 7, ',': 8, '-': 9, '.': 10, '/': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, ':': 22, ';': 23, '?': 24, '[': 25, ']': 26, '_': 27, 'a': 28, 'b': 29, 'c': 30, 'd': 31, 'e': 32, 'f': 33, 'g': 34, 'h': 35, 'i': 36, 'j': 37, 'k': 38, 'l': 39, 'm': 40, 'n': 41, 'o': 42, 'p': 43, 'q': 44, 'r': 45, 's': 46, 't': 47, 'u': 48, 'v': 49, 'w': 50, 'x': 51, 'y': 52, 'z': 53, 'æ': 54, 'è': 55, 'é': 56, 'ê': 57, 'ô': 58, '—': 59, '‘': 60, '’': 61, '“': 62, '”': 63, '•': 64, '™': 65}
{0: '\n', 1: ' ', 2: '!', 3: '$', 4: '%', 5: '(', 6: ')', 7: '*', 8: ',', 9: '-', 10: '.', 11: '/', 12: '0', 13: '1', 14: '2', 15: '3', 16: '4', 17: '5', 18: '6', 19: '7', 20: '8', 21: '9', 22: ':', 23: ';', 24: '?', 25: '[', 26: ']', 27: '_', 28: 'a', 29: 'b', 30: 'c', 31: 'd', 32: 'e', 33: 'f', 34: 'g', 35: 'h', 36: 'i', 37: 'j', 38: 'k', 39: 'l', 40: 'm', 41: 'n', 42: 'o', 43: 'p', 44: 'q', 

### Training set preparation

In [60]:
# Given the previous `maxlen` characters of context, predict the next one
maxlen = 30
step = 2

# Training windows start inside the first 300,000 characters of the text.
# The bound is clamped so that text[i + maxlen] always exists, even for a shorter corpus.
train_end = min(300_000, len(text) - maxlen)
window_starts = range(0, train_end, step)

# Each sample is a window of `maxlen` characters; its label is the character that follows it
sentences = [text[i: i+maxlen] for i in window_starts]
next_chars = [text[i+maxlen] for i in window_starts]

print('number  of sentences: ', len(sentences))

print(sentences[11])
print(next_chars[11])

# create training set & labels (one-hot encoded)
# x: (n_sentences, maxlen, n_chars), y: (n_sentences, n_chars)
# float32 instead of NumPy's default float64 halves the memory taken by these large arrays
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.float32)
y = np.zeros((len(sentences), len(chars)), dtype=np.float32)

# Integer code of every character of every window, and of every label
window_ids = np.array(
	[[char_indices[char] for char in sentence] for sentence in sentences],
	dtype=np.intp,
).reshape(-1, maxlen)
target_ids = np.array([char_indices[char] for char in next_chars], dtype=np.intp)

# spatially organize data (matrices): set the flag at (sample, position, character)
rows = np.arange(len(sentences))
x[rows[:, None], np.arange(maxlen), window_ids] = 1
y[rows, target_ids] = 1

print('training set shapes: ', x.shape, y.shape)


number  of sentences:  150000
saville, england._


st. peter
s
training set shapes:  (150000, 30, 66) (150000, 66)


### Test Set preparation

In [61]:
# Test windows start where the training windows stop and run to the end of the text.
# NOTE: `sentences` / `next_chars` are rebound here and from now on hold the test windows.
test_start = 300_000
window_starts = range(test_start, len(text)-maxlen, step)

# Each sample is a window of `maxlen` characters; its label is the character that follows it
sentences = [text[i: i+maxlen] for i in window_starts]
next_chars = [text[i+maxlen] for i in window_starts]

print('number  of sentences: ', len(sentences))

print(sentences[11])
print(next_chars[11])

# create test set & labels (one-hot encoded)
# x_test: (n_sentences, maxlen, n_chars), y_test: (n_sentences, n_chars)
# float32 instead of NumPy's default float64 halves the memory taken by these arrays
x_test = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.float32)
y_test = np.zeros((len(sentences), len(chars)), dtype=np.float32)

# Integer code of every character of every window, and of every label
window_ids = np.array(
	[[char_indices[char] for char in sentence] for sentence in sentences],
	dtype=np.intp,
).reshape(-1, maxlen)
target_ids = np.array([char_indices[char] for char in next_chars], dtype=np.intp)

# spatially organize data (matrices): set the flag at (sample, position, character)
rows = np.arange(len(sentences))
x_test[rows[:, None], np.arange(maxlen), window_ids] = 1
y_test[rows, target_ids] = 1

print('test set shapes: ', x_test.shape, y_test.shape)


number  of sentences:  68701
ne in which i was engaged, my 
s
training set shapes:  (68701, 30, 66) (68701, 66)


In [62]:
from keras.callbacks import LambdaCallback
import sys

# We only test after the tenth epoch, because we can't expect good performances before
def testAfterEpoch(epoch, _):
	if epoch < 29:
		return
		
	print()
	print()
	print("---------------------- Generating text after epoch 29")

	start_index = random.randint(0, len(text)-maxlen-1)

	generated = ''
	sentence = text[start_index: start_index+maxlen]
	generated += sentence
	print('***** starting sentence *****')
	print(sentence)
	print('*****************************')
	sys.stdout.write(generated)

	for i in range(200):
		x_pred = np.zeros((1, maxlen, len(chars)))
		for t, char in enumerate(sentence):
			x_pred[0, t, char_indices[char]] = 1
		preds = model.predict(x_pred, verbose=0)[0]
		next_index = np.argmax(preds)
		next_char = indices_chars[next_index]

		sentence = sentence[1:] + next_char
		sys.stdout.write(next_char)
		sys.stdout.flush()
	print()

# Wrap testAfterEpoch in a callback object; it is registered as the `on_epoch_end` hook
# and handed to model.fit() through its `callbacks` argument in the training cells
print_callback = LambdaCallback(on_epoch_end=testAfterEpoch)


## CNN Network

### CNN Setup

In [66]:
# 2D CNN over the (position, character) one-hot matrix of a window
model = keras.Sequential([
	# The data is fed as (maxlen, n_chars) matrices; the single channel axis that
	# Conv2D needs is added explicitly instead of relying on implicit rank adjustment
	keras.Input(shape=(maxlen, len(chars))),
	layers.Reshape((maxlen, len(chars), 1)),
	layers.Conv2D(32, kernel_size=(3,3), activation='relu'),
	layers.MaxPooling2D(pool_size=(2,2)),
	layers.Conv2D(64, kernel_size=(3,3), activation='relu'),
	layers.MaxPooling2D(pool_size=(2,2)),
	layers.Flatten(),
	layers.Dropout(0.5),                            # To avoid overfitting
	layers.Dense(len(chars), activation="softmax")  # Probability of the next character
])

optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model.summary()

^ The `None` in the output shapes above is the batch dimension: it is left unspecified because the batch size is not known until training.

### CNN Training

In [67]:
# Keep the History object (per-epoch loss/accuracy) instead of discarding it;
# binding it to a name also keeps its repr out of the cell output
cnn_history = model.fit(x, y, batch_size=2048, epochs=30, callbacks=[print_callback])

Epoch 1/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 489ms/step - accuracy: 0.1477 - loss: 3.0948
Epoch 2/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 490ms/step - accuracy: 0.1740 - loss: 2.9072
Epoch 3/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 494ms/step - accuracy: 0.1990 - loss: 2.8060
Epoch 4/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 525ms/step - accuracy: 0.2100 - loss: 2.7627
Epoch 5/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 535ms/step - accuracy: 0.2159 - loss: 2.7376
Epoch 6/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 491ms/step - accuracy: 0.2209 - loss: 2.7245
Epoch 7/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 484ms/step - accuracy: 0.2237 - loss: 2.7098
Epoch 8/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 488ms/step - accuracy: 0.2272 - loss: 2.7026
Epoch 9/30
[1m74/74[0m [32m━━

<keras.src.callbacks.history.History at 0x1e1995c89e0>

### Compute the performance (accuracy) on the test set

In [68]:
# The model is compiled with metrics=['accuracy'], so evaluate() returns
# [loss, accuracy]: unpack it so that `accuracy` really holds the accuracy
test_loss, accuracy = model.evaluate(x_test, y_test)

[1m2147/2147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.2924 - loss: 2.5230


# RNN Network

Change the Neural model from CNN to RNN 
- e.g. LSTM
- more or less same amount of parameters

## RNN (LSTM)

### Setup

In [63]:
# Single-layer LSTM reading the window one character (one-hot vector) at a time
model = Sequential([
	keras.Input(shape=(maxlen,len(chars))),
	# NOTE(review): a non-zero recurrent_dropout presumably rules out the fused
	# cuDNN LSTM kernel on GPU - confirm if training speed matters
	layers.LSTM(256, dropout=0.2, recurrent_dropout=0.2),
	layers.Dense(len(chars), activation="softmax")  # Probability of the next character
])

optimizer = keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model.summary()

### Training

In [64]:
# Keep the History object (per-epoch loss/accuracy) instead of discarding it;
# binding it to a name also keeps its repr out of the cell output
lstm_history = model.fit(x, y, batch_size=2048, epochs=30, callbacks=[print_callback])

Epoch 1/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 353ms/step - accuracy: 0.2067 - loss: 2.8411
Epoch 2/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 341ms/step - accuracy: 0.2924 - loss: 2.4278
Epoch 3/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 338ms/step - accuracy: 0.3376 - loss: 2.2490
Epoch 4/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 337ms/step - accuracy: 0.3753 - loss: 2.1052
Epoch 5/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 349ms/step - accuracy: 0.3983 - loss: 2.0161
Epoch 6/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 349ms/step - accuracy: 0.4171 - loss: 1.9474
Epoch 7/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 353ms/step - accuracy: 0.4327 - loss: 1.8897
Epoch 8/30
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 384ms/step - accuracy: 0.4456 - loss: 1.8393
Epoch 9/30
[1m74/74[0m [32m━━

<keras.src.callbacks.history.History at 0x1e18ba571d0>

### Evaluation

In [65]:
# The model is compiled with metrics=['accuracy'], so evaluate() returns
# [loss, accuracy]: unpack it so that `accuracy` really holds the accuracy
test_loss, accuracy = model.evaluate(x_test, y_test)

[1m2147/2147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 7ms/step - accuracy: 0.5408 - loss: 1.5521
