In [1]:
import urllib.request

import numpy as np
import random
import tensorflow as tf
import keras

from keras.models import Sequential
from keras import layers

keras.utils.set_random_seed(42)



In [2]:
path = 'frankenstein.txt'
url = 'https://gutenberg.org/ebooks/84.txt.utf-8'

with urllib.request.urlopen(url) as file:
	text = file.read().decode('utf-8').lower()

#with open(path, encoding='utf-8') as f:
#	text = f.read().lower()			# Convert everything to lowercase

# Length of characters in the text
print('Text length: ', len(text))
print('--------------------------')
print(text[0:1000]) # First 1000 characters



Text length:  446544
--------------------------
﻿the project gutenberg ebook of frankenstein; or, the modern prometheus
    
this ebook is for the use of anyone anywhere in the united states and
most other parts of the world at no cost and with almost no restrictions
whatsoever. you may copy it, give it away or re-use it under the terms
of the project gutenberg license included with this ebook or online
at www.gutenberg.org. if you are not located in the united states,
you will have to check the laws of the country where you are located
before using this ebook.

title: frankenstein; or, the modern prometheus

author: mary wollstonecraft shelley

release date: october 1, 1993 [ebook #84]
                most recently updated: september 8, 2025

language: english

credits: judith boss, christy phillips, lynn hanninen and david meltzer. html version by al haines.
        further corrections by menno de leeuw.


*** start of the project gutenberg ebook frankenstein; or, the modern promethe

Dobbiamo convertire il testo (parole) in un insieme di numeri per permettere alla rete di gestire.

### Utilizziamo One-Hot Encoding
Per ogni carattere abbiamo un'array con la flag attiva nella posizione del character.

In [3]:
chars = sorted(list(set(text)))

print('total chars: ', len(chars))

# Dictionary to convert from chars to numbers
char_indices = dict((c,i) for i, c in enumerate(chars))
# Dictionary to convert from numbers to chars
indices_chars = dict((i,c) for i, c in enumerate(chars))

print(char_indices)
print(indices_chars)

total chars:  69
{'\n': 0, '\r': 1, ' ': 2, '!': 3, '#': 4, '$': 5, '%': 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, '[': 27, ']': 28, '_': 29, 'a': 30, 'b': 31, 'c': 32, 'd': 33, 'e': 34, 'f': 35, 'g': 36, 'h': 37, 'i': 38, 'j': 39, 'k': 40, 'l': 41, 'm': 42, 'n': 43, 'o': 44, 'p': 45, 'q': 46, 'r': 47, 's': 48, 't': 49, 'u': 50, 'v': 51, 'w': 52, 'x': 53, 'y': 54, 'z': 55, 'æ': 56, 'è': 57, 'é': 58, 'ê': 59, 'ô': 60, '—': 61, '‘': 62, '’': 63, '“': 64, '”': 65, '•': 66, '™': 67, '\ufeff': 68}
{0: '\n', 1: '\r', 2: ' ', 3: '!', 4: '#', 5: '$', 6: '%', 7: '(', 8: ')', 9: '*', 10: ',', 11: '-', 12: '.', 13: '/', 14: '0', 15: '1', 16: '2', 17: '3', 18: '4', 19: '5', 20: '6', 21: '7', 22: '8', 23: '9', 24: ':', 25: ';', 26: '?', 27: '[', 28: ']', 29: '_', 30: 'a', 31: 'b', 32: 'c', 33: 'd', 34: 'e', 35: 'f', 36: 'g', 37: 'h', 38: 'i', 39: 'j', 40: 'k', 41

### Training set perparation

In [4]:
# Dati i primi 30 caratteri di contesto prevedi quello successivo
maxlen = 60
step = 2

sentences = []
next_chars = []

# from the beginning to about half the text
for i in range(0, 222269, step):
	sentences.append(text[i: i+maxlen])
	next_chars.append(text[i+maxlen])

print('number  of sentences: ', len(sentences))

print(sentences[11])
print(next_chars[11])

# create training set & labels
x = np.zeros((len(sentences), maxlen, len(chars))) # 15000x30x65
y = np.zeros((len(sentences), len(chars)))  # 15000x65

# spatially organize data (matrices)
for i, (sentence, next_char) in enumerate(zip(sentences, next_chars)):
	for t, char in enumerate(sentence):
		x[i, t, char_indices[char]] = 1
	y[i, char_indices[next_char]] = 1

print('training set shapes: ', x.shape, y.shape)


number  of sentences:  111135
 ebook of frankenstein; or, the modern prometheus
    
thi
s
training set shapes:  (111135, 60, 69) (111135, 69)


### Test Set preparation

In [5]:
sentences = []
next_chars = []

# from the beginning to about half the text
for i in range(222269, len(text)-maxlen, step):
	sentences.append(text[i: i+maxlen])
	next_chars.append(text[i+maxlen])

print('number  of sentences: ', len(sentences))

print(sentences[11])
print(next_chars[11])

# create training set & labels
x_test = np.zeros((len(sentences), maxlen, len(chars)))
y_test = np.zeros((len(sentences), len(chars)))

# spatially organize data (matrices)
for i, (sentence, next_char) in enumerate(zip(sentences, next_chars)):
	for t, char in enumerate(sentence):
		x_test[i, t, char_indices[char]] = 1
	y_test[i, char_indices[next_char]] = 1

print('training set shapes: ', x_test.shape, y_test.shape)


number  of sentences:  112108
ng days, while the preparations were going forward for
the 
e
training set shapes:  (112108, 60, 69) (112108, 69)


In [6]:
from keras.callbacks import LambdaCallback
import sys

# We only test after the tenth epoch, because we can't expect good performances before
def testAfterEpoch(epoch, _):
	if epoch < 30:
		return
		
	print()
	print()
	print("---------------------- Generating text after epoch 10")

	start_index = random.randint(0, len(text)-maxlen-1)

	generated = ''
	sentence = text[start_index: start_index+maxlen]
	generated += sentence
	print('***** starting sentence *****')
	print(sentence)
	print('*****************************')
	sys.stdout.write(generated)

	for i in range(200):
		x_pred = np.zeros((1, maxlen, len(chars)))
		for t, char in enumerate(sentence):
			x_pred[0, t, char_indices[char]] = 1
		preds = model.predict(x_pred, verbose=0)[0]
		next_index = np.argmax(preds)
		next_char = indices_chars[next_index]

		sentence = sentence[1:] + next_char
		sys.stdout.write(next_char)
		sys.stdout.flush()
	print()

print_callback = LambdaCallback(on_epoch_end=testAfterEpoch)


## CNN Network

### CNN Setup

In [34]:
model = keras.Sequential([
	keras.Input(shape=(maxlen, len(chars),1)),
	layers.Conv2D(32, kernel_size=(3,3), activation='relu'),
	layers.MaxPooling2D(pool_size=(2,2)),
	layers.Conv2D(64, kernel_size=(3,3), activation='relu'),
	layers.MaxPooling2D(pool_size=(2,2)),
	layers.Flatten(),
	layers.Dropout(0.5),							# To avoid overfitting
	layers.Dense(len(chars), activation="softmax")  # Probability of the next character
])

optmizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optmizer, metrics=['accuracy'])

model.summary()

^ none is due to the fact we don't know batch size yet.

### CNN Training

In [35]:
model.fit(x, y, batch_size=2048, epochs=30, callbacks=[print_callback])

Epoch 1/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 420ms/step - accuracy: 0.1464 - loss: 3.1205
Epoch 2/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 434ms/step - accuracy: 0.1612 - loss: 2.9744
Epoch 3/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 437ms/step - accuracy: 0.1757 - loss: 2.8953
Epoch 4/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 429ms/step - accuracy: 0.1918 - loss: 2.8354
Epoch 5/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 415ms/step - accuracy: 0.1994 - loss: 2.8057
Epoch 6/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 402ms/step - accuracy: 0.2027 - loss: 2.7878
Epoch 7/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 407ms/step - accuracy: 0.2050 - loss: 2.7790
Epoch 8/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 407ms/step - accuracy: 0.2058 - loss: 2.7707
Epoch 9/30
[1m55/55[0m [32m━━

<keras.src.callbacks.history.History at 0x1b4cdb10440>

### Compute the performance (accuracy) on the test set

In [36]:
accuracy = model.evaluate(x_test, y_test)

[1m3383/3383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.2637 - loss: 2.5955


# RNN Network

Change the Neural model from CNN to RNN 
- e.g. LSTM
- more or less same amount of parameters

## RNN (LSTM)

### Setup

In [7]:
model = keras.Sequential([
	keras.Input(shape=(maxlen,len(chars))),
	layers.LSTM(200, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
	layers.LayerNormalization(),
	layers.LSTM(150, dropout=0.2, recurrent_dropout=0.2),
	layers.LayerNormalization(),
	layers.Dense(len(chars), activation="softmax")  # Probability of the next character
])

optmizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optmizer, metrics=['accuracy'])

model.summary()

### Training

In [8]:
model.fit(x, y, batch_size=2048, epochs=30, callbacks=[print_callback])

: 

: 

: 

### Evaluation

In [83]:
accuracy = model.evaluate(x_test, y_test)

[1m3383/3383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 14ms/step - accuracy: 0.4843 - loss: 1.7231


## RNN (GRU)

### Setup

In [69]:
model = keras.Sequential([
	keras.Input(shape=(maxlen,len(chars))),
	layers.GRU(200, return_sequences=True),
	layers.GRU(150),
	layers.Dense(len(chars), activation="softmax")  # Probability of the next character
])

optmizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optmizer, metrics=['accuracy'])

model.summary()

### Training

In [70]:
model.fit(x, y, batch_size=2048, epochs=30, callbacks=[print_callback])

Epoch 1/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 484ms/step - accuracy: 0.3642 - loss: 2.3272
Epoch 2/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 481ms/step - accuracy: 0.4561 - loss: 1.8138
Epoch 3/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 479ms/step - accuracy: 0.4929 - loss: 1.6859
Epoch 4/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 479ms/step - accuracy: 0.5148 - loss: 1.6041
Epoch 5/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 478ms/step - accuracy: 0.5324 - loss: 1.5430
Epoch 6/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 476ms/step - accuracy: 0.5452 - loss: 1.4929
Epoch 7/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 491ms/step - accuracy: 0.5559 - loss: 1.4526
Epoch 8/30
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 494ms/step - accuracy: 0.5657 - loss: 1.4157
Epoch 9/30
[1m55/55[0m [32m━━

<keras.src.callbacks.history.History at 0x1b4ada638c0>

### Evaluation

In [71]:
accuracy = model.evaluate(x_test, y_test)

[1m3383/3383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 10ms/step - accuracy: 0.4747 - loss: 2.0115



### Submission
The notebook + short document with model, total number of parameters, accuracy of CNN, accuracy of your RNN model


### Can we use the model to generate text?  
è possibile fare completion del testo facendo predirre il prossimo character per sequenze di character (30) consecutive