# Neural network Classifier x Toxic Content Detection
Il presente Notebook mostra l'addestramento ed il testing di un Classificatore basato su Neural network per il task di Toxic Content Detection.

I dati sono stati processati come segue:
1. Pulizia del testo (si veda, 'dataset_preprocessing.py')
2. Lemmatizzazione con NLTK
3. Vettorizzazione con TF-IDF

In [1]:
import pandas as pd
import pickle
import nltk
import re
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from sklearn.decomposition import PCA
from datetime import datetime
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Addestramento del Sistema
Il Sistema è ovviamente riaddestrabile a piacere. Si consiglia, tuttavia, dato il tempo necessario per riaddestrare il classificatore, di utilizzare il file pickle 'rf_classifier' per eseguire subito gli esperimenti.

## Caricamento del Training Set

In [2]:
training_set = pd.read_csv("./../../datasets/training_set.csv")
training_set_lem = pd.read_csv("./../../datasets/training_set_lemmatized.csv")

test_data = pd.read_csv("./../../datasets/test_set.csv")
test_data_lem = pd.read_csv("./../../datasets/test_set_lemmatized.csv")
test_data.dropna(inplace=True)
test_data_lem.dropna(inplace=True)


test_set = test_data[test_data['toxic']!=-1]
test_set_lem = test_data[test_data['toxic']!=-1]

# Osservazione: il Training Set è stato già ripulito


In [3]:
# Vettorizzazione con TF-IDF
vectorizer = TfidfVectorizer()
vectorizer_lem = TfidfVectorizer()

y_train = training_set['toxic']

X_train = vectorizer.fit_transform(training_set['comment_text'])
X_train_lem = vectorizer_lem.fit_transform(training_set_lem['comment_text'])

X_test = vectorizer.transform(test_set['comment_text'])
X_test_lem = vectorizer_lem.transform(test_set_lem['comment_text'])

print("y_train.shape: " + str(y_train.shape))

print("X_train.shape: " + str(X_train.shape))
print("X_train_lem.shape: " + str(X_train_lem.shape))


print("X_test.shape: " + str(X_test.shape))
print("X_test_lem.shape: " + str(X_test_lem.shape))

y_train.shape: (15282,)
X_train.shape: (15282, 39767)
X_train_lem.shape: (15282, 34238)
X_test.shape: (63842, 39767)
X_test_lem.shape: (63842, 34238)


## Addestramento del Modello

In [4]:
X_train = X_train.toarray()

In [5]:
# Building the CNN Model

model = tf.keras.models.Sequential([
tf.keras.layers.Reshape((39767, 1), input_shape=(39767,)),  
tf.keras.layers.Conv1D(filters=45, kernel_size=11, strides=4,
activation='relu',input_shape=(39767, 1)),
tf.keras.layers.MaxPool1D(pool_size=3, strides=2,padding='same'),
tf.keras.layers.Conv1D(filters=125, kernel_size=5, padding='same',
activation='relu'),
tf.keras.layers.MaxPool1D(pool_size=3, strides=2,padding='same'),
tf.keras.layers.Conv1D(filters=150, kernel_size=3, padding='same',
activation='relu'),
tf.keras.layers.Conv1D(filters=125, kernel_size=3, padding='same',
activation='relu'),
tf.keras.layers.Conv1D(filters=60, kernel_size=1, padding='same',
activation='relu'),
tf.keras.layers.Dense(80, activation='relu'),
tf.keras.layers.Dropout(0.5),
tf.keras.layers.Dense(40, activation='relu'),
tf.keras.layers.Dropout(0.5),
tf.keras.layers.Dense(1,activation='sigmoid')
])

2024-02-10 12:23:36.332196: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-02-10 12:23:36.332237: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-02-10 12:23:36.332250: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-02-10 12:23:36.332627: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-02-10 12:23:36.333046: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [6]:
model.build()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 39767, 1)          0         
                                                                 
 conv1d (Conv1D)             (None, 9940, 96)          1152      
                                                                 
 max_pooling1d (MaxPooling1  (None, 4970, 96)          0         
 D)                                                              
                                                                 
 conv1d_1 (Conv1D)           (None, 4970, 256)         123136    
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 2485, 256)         0         
 g1D)                                                            
                                                                 
 conv1d_2 (Conv1D)           (None, 2485, 384)         2

In [9]:
import tensorflow as tf

# Verifica il dispositivo attualmente utilizzato da TensorFlow
device_name = tf.test.gpu_device_name()

if device_name == '/device:GPU:0':
    print('GPU disponibile, TensorFlow sta utilizzando la GPU.')
else:
    print('GPU non disponibile, TensorFlow sta utilizzando la CPU.')


GPU disponibile, TensorFlow sta utilizzando la GPU.


2024-02-10 13:29:19.876800: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-02-10 13:29:19.877102: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [7]:
model.fit(X_train, y_train, epochs=10, batch_size=32)

Epoch 1/10


2024-02-10 12:23:48.962594: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
 43/478 [=>............................] - ETA: 4:25 - loss: 0.4840 - accuracy: 0.8161

KeyboardInterrupt: 

In [8]:
import platform
print(f"Python platform: {platform.platform()}")
print(f"Tensorflow version: {tf.__version__}")
print(tf.config.list_physical_devices('GPU'))

Python platform: macOS-14.0-arm64-arm-64bit
Tensorflow version: 2.15.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
# define model lem
model_lem = Sequential()
model_lem.add(Dense(, input_shape=(,), activation='relu'))
model_lem.add(Dense(, activation='relu'))
model_lem.add(Dense(, activation='relu'))
model_lem.add(Dense(1, activation='softmax'))

In [None]:
# Addestramento sul Dataset non-lemmatizzato
print("Training started on not-Lemmatized Dataset...")
start = datetime.now()
model.fit(X=X_train, y=y_train)
end = datetime.now()
print("Training completed! Required time: " + str(end-start))

with open(model_filename, 'wb') as f:
    pickle.dump(cl, f)

In [None]:
# Addestramento sul Dataset non-lemmatizzato
print("Training started on not-Lemmatized Dataset...")
start = datetime.now()
model_lem.fit(X=X_train, y=y_train)
end = datetime.now()
print("Training completed! Required time: " + str(end-start))

with open(model_filename, 'wb') as f:
    pickle.dump(cl, f)