In [1]:
!pip install -U spaCy
!pip install keras
!python -m spacy download it_core_news_lg

Collecting it-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_lg-3.8.0/it_core_news_lg-3.8.0-py3-none-any.whl (567.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m567.9/567.9 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: it-core-news-lg
Successfully installed it-core-news-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# Text categorization using Deep Learning (with Keras)

Now you are finally ready to experiment with Deep Learning and Keras. Keras supports two main types of models. You have the Sequential model API which you are going to see in use in this tutorial and the functional API which can do everything of the Sequential model but it can be also used for advanced models with complex network architectures.

The Sequential model is a linear stack of layers, where you can use the large variety of available layers in Keras. The most common layer is the Dense layer which is your regular densely connected neural network layer.

We need to prepare training and testing data.

Data available here: https://drive.google.com/drive/folders/1pQKHrUth2x3lR-W74LKwtke-kQRIbR6U?usp=drive_link

In [2]:
from keras.models import Sequential
from keras import layers
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
import json

df = open('dataset_05.json','r')
X_text = []
y = []
for line in df:
  j = json.loads(line)
  #text = j['title']
  text = j['title']+' '+j['desc']
  #text = j['title']+' '+j['desc']+' '+(j['text'] if 'text' in j  else '')
  X_text.append(text)
  y.append(j['topic']) # the labels

vectorizer = CountVectorizer()
vectorizer.fit(X_text)
X = vectorizer.transform(X_text)

X_f, X_test, y_f, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_f, y_f, test_size=0.1, random_state=42)

# Number of features
input_dim = X_train.shape[1]
print("Number of features: ",input_dim)

# binarize labels
encoder = LabelBinarizer()
y_train = encoder.fit_transform(y_train)
y_val = encoder.transform(y_val)
y_test = encoder.transform(y_test)
nc = encoder.classes_.size
print("Number of classes: ",nc)

Number of features:  19326
Number of classes:  14


We design the network and compile it.

In [3]:
model = Sequential()
model.add(layers.Dense(512, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(nc, activation='softmax')) # nc is the number of classes

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Now, we can fit the network on training data. We use testing data for validation.
The fit function requires the number of epochs and the batch size.

In [4]:
from keras.backend import clear_session
clear_session() # Make sure to call clear_session() before you start training the model again

model.fit(X_train, y_train, epochs=5, verbose=True, validation_data=(X_val, y_val), batch_size=64)

Epoch 1/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 224ms/step - accuracy: 0.4980 - loss: 1.7575 - val_accuracy: 0.8106 - val_loss: 0.6199
Epoch 2/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 229ms/step - accuracy: 0.9657 - loss: 0.1379 - val_accuracy: 0.8106 - val_loss: 0.6368
Epoch 3/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 223ms/step - accuracy: 0.9983 - loss: 0.0126 - val_accuracy: 0.8119 - val_loss: 0.6621
Epoch 4/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 226ms/step - accuracy: 0.9992 - loss: 0.0033 - val_accuracy: 0.8068 - val_loss: 0.6662
Epoch 5/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 232ms/step - accuracy: 0.9992 - loss: 0.0036 - val_accuracy: 0.8106 - val_loss: 0.6794


<keras.src.callbacks.history.History at 0x78e6bc6c8f10>

Compute accuracy on both training and test set.

In [5]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9997
Testing Accuracy:  0.8156


# Using word embeddings

We can directly use pretrained word embeddings in our model as input.

In [6]:
df = open('dataset_05.json','r')
X = []
y = []
for line in df:
  j = json.loads(line)
  #text = j['title']
  text = j['title']+' '+j['desc']
  #text = j['title']+' '+(j['text'] if 'text' in j  else '')
  X.append(text.lower())
  y.append(j['topic']) # the labels

Now we need to tokenize the data into a format that can be used by word embeddings. Keras offers a couple of convenience methods for text preprocessing and sequence preprocessing which you can employ to prepare your text.

You can start by using the Tokenizer utility class which can vectorize a text corpus into a list of integers. Each integer maps to a value in a dictionary that encodes the entire corpus, with the keys in the dictionary being the vocabulary terms themselves. You can add the parameter num_words, which is responsible for setting the size of the vocabulary. The most common num_words words will be then kept.

In [7]:
!pip install keras-preprocessing

Collecting keras-preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl.metadata (1.9 kB)
Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras-preprocessing
Successfully installed keras-preprocessing-1.1.2


In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

X_f, X_test, y_f, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_f, y_f, test_size=0.1, random_state=42)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_val = tokenizer.texts_to_sequences(X_val)

vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size:',vocab_size)

Vocabulary size: 21785


One problem that we have is that each text sequence has in most cases different length of words. To counter this, you can use pad_sequence() which simply pads the sequence of words with zeros. By default, it prepends zeros but we want to append them. Typically it does not matter whether you prepend or append zeros.

Additionally you would want to add a maxlen parameter to specify how long the sequences should be. This cuts sequences that exceed that number. In the following code, you can see how to pad sequences with Keras:

In [11]:
from keras.utils import pad_sequences

maxlen = 128
#Per le reti che gestiscono sequenze ovviamente le sequenze devono avere tutte la stessa sequenza. Per evitare problemi dunque si mette una lunghezza massima. Per quelle piu corte si aggiunge il padding alla fine. Quelle piu corte si tronca (ovviamente meglio evitare di troncare)
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
X_val = pad_sequences(X_val, padding='post', maxlen=maxlen)

print(X_train[0])

[ 267   28   12 1714  264 6645  936 6646 1464 6647   33   11   84  267
   28   12 1714  264 6645  936 6646 1464 6647   33   11   84    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


We need to binarize the labels.

In [13]:
# binarize labels
encoder = LabelBinarizer()
print(y_train[0])
y_train = encoder.fit_transform(y_train)
print(y_train[0])
y_test = encoder.transform(y_test)
y_val = encoder.transform(y_val)
nc = encoder.classes_.size
print('Number of classes:',nc)

salute
[0 0 0 0 0 0 0 0 0 1 0 0 0 0]
Number of classes: 14


You can see in the next example how you can load the embedding matrix. Each line in the file starts with the word and is followed by the embedding vector for the particular word.

This is a large file with each line representing a word followed by its vector as a stream of floats.

Since you don’t need all words, you can focus on only the words that we have in our vocabulary. Since we have only a limited number of words in our vocabulary.


Download pre-trained word embeddings for Italian from fasttext.

In [14]:
!wget 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.vec.gz'
!gunzip cc.it.300.vec.gz

--2025-05-16 07:02:08--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.173.166.74, 18.173.166.51, 18.173.166.31, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.173.166.74|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1272825284 (1.2G) [binary/octet-stream]
Saving to: ‘cc.it.300.vec.gz’


2025-05-16 07:02:22 (86.9 MB/s) - ‘cc.it.300.vec.gz’ saved [1272825284/1272825284]



In [15]:
!head -n 5 cc.it.300.vec

#2M di embedding lunghi 300

#per ogni riga abbiamo carattere [embedding]

2000000 300
, -0.0624 -0.0432 -0.3535 -0.0145 0.0690 0.0831 0.0784 0.0153 0.4491 0.1494 0.0392 0.0331 -0.0138 -0.0321 0.0813 0.0449 0.0506 -0.0302 -0.0460 -0.0900 0.0872 -0.0460 -0.0014 -0.0633 -0.0683 -0.0064 -0.0802 0.0366 -0.0948 0.0211 -0.0140 0.0504 -0.0243 -0.0205 -0.0424 -0.0105 -0.0013 -0.0270 0.0189 0.1892 0.0491 -0.0239 -0.0399 -0.0001 -0.0192 0.1326 0.0995 -0.0239 0.0485 0.1064 -0.0603 0.0197 -0.0582 -0.0168 0.0471 -0.0094 -0.0000 -0.0562 0.0642 0.0338 -0.0096 -0.0799 0.0620 -0.0072 -0.0635 -0.0803 0.0618 -0.0305 -0.0152 -0.0265 0.0226 -0.0361 0.0489 0.0985 0.1611 0.0050 0.1271 0.2563 -0.0871 0.0338 0.0617 0.0266 -0.0647 0.0704 -0.1108 -0.0088 0.0403 -0.0116 0.0528 0.0304 0.0558 0.0045 -0.0231 -0.1034 -0.6818 -0.2181 0.0567 0.0305 0.0937 -0.0283 -0.0449 -0.0081 -0.0211 0.0494 0.0552 0.1646 0.0341 -0.0076 0.0265 -0.0632 -0.1509 0.0787 0.2853 -0.0154 -0.0769 0.0390 -0.0053 0.0110 0.0239 0.0178 0.0164 -0.0123 -0.0115 -0.0396 -0.0072 -0.0153 -0.0343 -0.0875 0.0568 1.2602 -0.0168

In [16]:
import numpy as np
#prendiamo solo gli embedding che stanno nel nostro vocabolario, non tutti
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        f.readline # skip first line that contains word space info
        for line in f:
            word, *vector = line.split()
            if word in word_index: # load only word embeddings into the vocabulary
                idx = word_index[word]
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]
    return embedding_matrix

You can use this function now to retrieve the embedding matrix.
We use word embeddings from fasttext.

In [17]:
embedding_dim = 300
embedding_matrix = create_embedding_matrix('cc.it.300.vec', tokenizer.word_index, embedding_dim)

Vocabulary coverage.

In [19]:
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
print('Vocabulary coverage: ',nonzero_elements / vocab_size)

#vediamo quante valore del training hanno un embedding
#le altre verranno inizializzate con embedding a 0  (oppure Random, oppure centroide degli altri embedding)

Vocabulary coverage:  0.7539132430571495


We now design the new model with the word embeddings and the GlobalMaxPool layers.

In [20]:
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim,
                           weights=[embedding_matrix],
                           input_length=maxlen,
                           trainable=False,))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(nc, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()


#rispetto a prima ora abbiamo gli embedding. Quindi inseriamo un layer con tutte le caratteristiche dei nostri embedding

#nel layer dense non posso far entrare tanti vettori, ma uno solo. Si utilizza il Pooling



Fit the model.

In [21]:
clear_session()
model.fit(X_train, y_train,
                    epochs=50,
                    verbose=True,
                    validation_data=(X_val, y_val),
                    batch_size=64)

Epoch 1/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.2491 - loss: 2.5123 - val_accuracy: 0.2879 - val_loss: 2.3357
Epoch 2/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.3140 - loss: 2.2393 - val_accuracy: 0.3182 - val_loss: 2.1003
Epoch 3/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.3658 - loss: 2.0200 - val_accuracy: 0.4268 - val_loss: 1.8893
Epoch 4/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.4438 - loss: 1.8311 - val_accuracy: 0.4684 - val_loss: 1.7340
Epoch 5/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.4961 - loss: 1.6681 - val_accuracy: 0.5290 - val_loss: 1.6023
Epoch 6/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.5336 - loss: 1.5661 - val_accuracy: 0.5391 - val_loss: 1.5057
Epoch 7/50
[1m112/112

<keras.src.callbacks.history.History at 0x78e6bcae7890>

In [22]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.7890
Testing Accuracy:  0.7089


# Convolutional Neural Networks (CNN)

Convolutional neural networks or also called convnets.

They have revolutionized image classification and computer vision by being able to extract features from images and using them in neural networks. The properties that made them useful in image processing makes them also handy for sequence processing. You can imagine a CNN as a specialized neural network that is able to detect specific patterns.

If it is just another neural network, what differentiates it from what you have previously learned?

A CNN has hidden layers which are called convolutional layers. When you think of images, a computer has to deal with a two dimensional matrix of numbers and therefore you need some way to detect features in this matrix. These convolutional layers are able to detect edges, corners and other kinds of textures which makes them such a special tool. The convolutional layer consists of multiple filters which are slid across the image and are able to detect specific features.

This is the very core of the technique, the mathematical process of convolution. With each convolutional layer the network is able to detect more complex patterns.

When you are working with sequential data, like text, you work with one dimensional convolutions, but the idea and the application stays the same. You still want to pick up on patterns in the sequence which become more complex with each added convolutional layer.

Now let’s have a look how you can use this network in Keras. Keras offers again various Convolutional layers which you can use for this task. The layer you’ll need is the Conv1D layer. This layer has again various parameters to choose from. The ones you are interested in for now are the number of filters, the kernel size, and the activation function. You can add this layer in between the Embedding layer and the GlobalMaxPool1D layer:

In [23]:
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(layers.Conv1D(512, 5, activation='relu'))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(nc, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
clear_session()
model.fit(X_train, y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_val, y_val),
                    batch_size=64)

Epoch 1/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 1s/step - accuracy: 0.2607 - loss: 2.2907 - val_accuracy: 0.5657 - val_loss: 1.3348
Epoch 2/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 1s/step - accuracy: 0.6963 - loss: 0.9552 - val_accuracy: 0.7437 - val_loss: 0.8363
Epoch 3/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 1s/step - accuracy: 0.9086 - loss: 0.3269 - val_accuracy: 0.7374 - val_loss: 0.8992
Epoch 4/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 1s/step - accuracy: 0.9710 - loss: 0.1073 - val_accuracy: 0.7626 - val_loss: 0.9747
Epoch 5/5
[1m106/112[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m7s[0m 1s/step - accuracy: 0.9937 - loss: 0.0271

In [None]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_val, y_val, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

# Exercise 1
Try to use a Neural Network to classify the Haspeede and HODI datasets.