In [None]:
import numpy as np
import random
import tensorflow as tf
import keras

from keras.models import Sequential
from keras import layers

keras.utils.set_random_seed(42)



In [None]:
path = 'the_twin_seven_shooters.txt'
with open(path, encoding='utf-8') as f:
	text = f.read().lower()			# Convert everything to lowercase

# Length of characters in the text
print('Text length: ', len(text))
print('--------------------------')
print(text[0:1000]) # First 1000 characters



Text length:  60063

﻿the project gutenberg ebook of the twin seven-shooters
    
this ebook is for the use of anyone anywhere in the united states and
most other parts of the world at no cost and with almost no restrictions
whatsoever. you may copy it, give it away or re-use it under the terms
of the project gutenberg license included with this ebook or online
at www.gutenberg.org. if you are not located in the united states,
you will have to check the laws of the country where you are located
before using this ebook.

title: the twin seven-shooters

author: charles frederick manderson

release date: december 1, 2025 [ebook #77379]

language: english

original publication: new york, ny: f. tennyson neely, 1902

credits: chenzw and the online distributed proofreading team at https://www.pgdp.net (this file was produced from images generously made available by the internet archive)


*** start of the project gutenberg ebook the twin seven-shooters ***




      the twin
      seven-shoo

Dobbiamo convertire il testo (parole) in un insieme di numeri per permettere alla rete di gestire.

### Utilizziamo One-Hot Encoding
Per ogni carattere abbiamo un'array con la flag attiva nella posizione del character.

In [None]:
chars = sorted(list(set(text)))

print('total chars: ', len(chars))

# Dictionary to convert from chars to numbers
char_indices = dict((c,i) for i, c in enumerate(chars))
# Dictionary to convert from numbers to chars
indices_chars = dict((i,c) for i, c in enumerate(chars))

print(char_indices)
print(indices_chars)

total chars:  65
{'\n': 0, ' ': 1, '!': 2, '#': 3, '$': 4, '%': 5, '(': 6, ')': 7, '*': 8, ',': 9, '-': 10, '.': 11, '/': 12, '0': 13, '1': 14, '2': 15, '3': 16, '4': 17, '5': 18, '6': 19, '7': 20, '8': 21, '9': 22, ':': 23, ';': 24, '?': 25, '[': 26, ']': 27, '_': 28, 'a': 29, 'b': 30, 'c': 31, 'd': 32, 'e': 33, 'f': 34, 'g': 35, 'h': 36, 'i': 37, 'j': 38, 'k': 39, 'l': 40, 'm': 41, 'n': 42, 'o': 43, 'p': 44, 'q': 45, 'r': 46, 's': 47, 't': 48, 'u': 49, 'v': 50, 'w': 51, 'x': 52, 'y': 53, 'z': 54, 'é': 55, 'ê': 56, '—': 57, '‘': 58, '’': 59, '“': 60, '”': 61, '•': 62, '™': 63, '\ufeff': 64}
{0: '\n', 1: ' ', 2: '!', 3: '#', 4: '$', 5: '%', 6: '(', 7: ')', 8: '*', 9: ',', 10: '-', 11: '.', 12: '/', 13: '0', 14: '1', 15: '2', 16: '3', 17: '4', 18: '5', 19: '6', 20: '7', 21: '8', 22: '9', 23: ':', 24: ';', 25: '?', 26: '[', 27: ']', 28: '_', 29: 'a', 30: 'b', 31: 'c', 32: 'd', 33: 'e', 34: 'f', 35: 'g', 36: 'h', 37: 'i', 38: 'j', 39: 'k', 40: 'l', 41: 'm', 42: 'n', 43: 'o', 44: 'p', 45: 

In [None]:
# Dati i primi 30 caratteri prevedi quello successivo
maxlen = 30
step = 2

sentences = []
next_chars = []

# from the beginning to about half the text
for i in range(0, 30000, step):
	sentences.append(text[i: i+maxlen])
	next_chars.append(text[i+maxlen])

print('number  of sentences: ', len(sentences))

print(sentences[11])
print(next_chars[11])


number  of sentences:  15000
 ebook of the twin seven-shoot
e


In [None]:

# create training set & labels
x = np.zeros((len(sentences), maxlen, len(chars))) # 15000x30x65
y = np.zeros((len(sentences), len(chars)))  # 15000x65

# spatially organize data (matrices)
for i, (sentence, next_char) in enumerate(zip(sentences, next_chars)):
	for t, char in enumerate(sentence):
		x[i, t, char_indices[char]] = 1
	y[i, char_indices[next_char]] = 1

print(x.shape)
print(y.shape)

(15000, 30, 65)
(15000, 65)


## CNN Network

In [13]:
model = keras.Sequential([
	keras.Input(shape=(30,65,1)),
	layers.Conv2D(32, kernel_size=(3,3), activation='relu'),
	layers.MaxPooling2D(pool_size=(2,2)),
	layers.Conv2D(64, kernel_size=(3,3), activation='relu'),
	layers.MaxPooling2D(pool_size=(2,2)),
	layers.Flatten(),
	layers.Dropout(0.5),							# To avoid overfitting
	layers.Dense(len(chars), activation="softmax")  # Probability of the next character
])

optmizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optmizer)

model.summary()

^ none is due to the fact we don't know batch size yet.

In [15]:
from keras.callbacks import LambdaCallback
import sys

# We only test after the tenth epoch, because we can't expect good performances before
def testAfterEpoch(epoch, _):
	if epoch < 10:
		return
		
	print()
	print()
	print("---------------------- Generating text after epoch 10")

	start_index = random.randint(0, len(text)-maxlen-1)

	generated = ''
	sentence = text[start_index: start_index+maxlen]
	generated += sentence
	print('***** starting sentence *****')
	print(sentence)
	print('*****************************')
	sys.stdout.write(generated)

	for i in range(200):
		x_pred = np.zeros((1, maxlen, len(chars)))
		for t, char in enumerate(sentence):
			x_pred[0, t, char_indices[char]] = 1
		preds = model.predict(x_pred, verbose=0)[0]
		next_index = np.argmax(preds)
		next_char = indices_chars[next_index]

		sentence = sentence[1:] + next_char
		sys.stdout.write(next_char)
		sys.stdout.flush()
	print()

print_callback = LambdaCallback(on_epoch_end=testAfterEpoch)


In [16]:
model.fit(x, y, batch_size=2048, epochs=30, callbacks=[print_callback])

Epoch 1/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 538ms/step - loss: 2.4052
Epoch 2/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 560ms/step - loss: 2.3931
Epoch 3/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 623ms/step - loss: 2.3933
Epoch 4/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 618ms/step - loss: 2.3854
Epoch 5/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 575ms/step - loss: 2.3958
Epoch 6/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 626ms/step - loss: 2.3819
Epoch 7/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 613ms/step - loss: 2.3807
Epoch 8/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 556ms/step - loss: 2.3883
Epoch 9/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 535ms/step - loss: 2.3826
Epoch 10/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 626ms/step - loss: 2.3655
Epoch 11/

<keras.src.callbacks.history.History at 0x276077fed20>

### TODO : Compute the performance (accuracy) on the test set

### TODO 2 : Change the Neural model from CNN to RNN (suggestion: LSTM) [more or less same amount of parameters]


### Submission
The notebook + short document with model, total number of parameters, accuracy of CNN, accuracy of your RNN model


### Can we use the model to generate text?  
è possibile fare completion del testo facendo predirre il prossimo character per sequenze di character (30) consecutive