# Neural network Classifier x Toxic Content Detection
Il presente Notebook mostra l'addestramento ed il testing di un Classificatore basato su Neural network per il task di Toxic Content Detection.

I dati sono stati processati come segue:
1. Pulizia del testo (si veda `dataset_preprocessing.py`)
2. Lemmatizzazione con NLTK
3. Vettorizzazione con TF-IDF

In [1]:
import pandas as pd
import pickle
import nltk
import re
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from sklearn.decomposition import PCA
from datetime import datetime
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Neural Network, Dataset "non-Lemmatizzato"

In [2]:
# Note: the training set has already been cleaned, so only the test split
# is post-processed in this cell.
training_set = pd.read_csv("./../../datasets/training_set.csv")

# Read the test split and discard rows containing missing values.
test_data = pd.read_csv("./../../datasets/test_set.csv").dropna()

# Keep only the rows whose 'toxic' value differs from -1.
test_set = test_data.loc[test_data['toxic'].ne(-1)]

In [3]:
# TF-IDF vectorization
vectorizer = TfidfVectorizer()
vectorizer_lem = TfidfVectorizer()

# Target column used for training.
y_train = training_set['toxic']

# Learn the vocabulary on the training comments only, then project the
# test comments onto that same vocabulary.
X_train = vectorizer.fit_transform(training_set['comment_text'])
X_test = vectorizer.transform(test_set['comment_text'])

# Report the resulting shapes.
print("y_train.shape: ", y_train.shape, sep="")

print("X_train.shape: ", X_train.shape, sep="")

print("X_test.shape: ", X_test.shape, sep="")


y_train.shape: (15282,)
X_train.shape: (15282, 39767)
X_train_lem.shape: (15282, 34238)
X_test.shape: (63842, 39767)
X_test_lem.shape: (63842, 34238)


## Addestramento del Modello

In [12]:
# Check whether a GPU is available
import tensorflow as tf

# Ask TensorFlow for the name of the GPU device it reports
device_name = tf.test.gpu_device_name()

# Anything other than the first GPU device name is reported as "CPU".
if device_name != '/device:GPU:0':
    print('GPU non disponibile, TensorFlow sta utilizzando la CPU.')
else:
    print('GPU disponibile, TensorFlow sta utilizzando la GPU.')


GPU disponibile, TensorFlow sta utilizzando la GPU.


2024-02-10 13:32:17.233395: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-02-10 13:32:17.233426: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [4]:
# Convert the sparse TF-IDF matrix into a dense array for training.
# The hasattr guard makes the cell safe to re-run: once X_train is already a
# dense ndarray it no longer has `.toarray()` and the original line would raise.
if hasattr(X_train, "toarray"):
    # Cast while still sparse: float32 is Keras' default float type, and it
    # halves the memory of the dense matrix compared with TF-IDF's float64.
    X_train = X_train.astype(np.float32).toarray()

In [36]:
# Building the CNN model
# The input width is read from the TF-IDF matrix instead of being hard-coded,
# so the network always matches the vocabulary size of the fitted vectorizer.
n_features = X_train.shape[1]

model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(n_features,)),
    # Conv1D consumes (steps, channels): expose each TF-IDF weight as one step
    # with a single channel.
    tf.keras.layers.Reshape((n_features, 1)),
    tf.keras.layers.Conv1D(filters=96, kernel_size=11, strides=4,
                           activation='relu'),
    tf.keras.layers.MaxPool1D(pool_size=3, strides=2, padding='same'),
    tf.keras.layers.Conv1D(filters=256, kernel_size=5, padding='same',
                           activation='relu'),
    tf.keras.layers.MaxPool1D(pool_size=3, strides=2, padding='same'),
    tf.keras.layers.Conv1D(filters=384, kernel_size=3, padding='same',
                           activation='relu'),
    tf.keras.layers.Conv1D(filters=256, kernel_size=1, padding='same',
                           activation='relu'),
    # Dense only transforms the last axis of a 3-D tensor, so without this
    # layer the network would emit one value per convolution step instead of
    # one probability per comment. Global max pooling collapses the step axis
    # (a Flatten here would make the first Dense layer extremely large).
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(155, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(40, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    # Single sigmoid unit: probability of the 'toxic' class.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [37]:
# Build the model, then configure training: Adam optimizer, binary
# cross-entropy loss, accuracy as the tracked metric.
model.build()
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print the layer-by-layer overview.
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape_11 (Reshape)        (None, 39767, 1)          0         
                                                                 
 conv1d_54 (Conv1D)          (None, 9940, 96)          1152      
                                                                 
 max_pooling1d_15 (MaxPooli  (None, 4970, 96)          0         
 ng1D)                                                           
                                                                 
 conv1d_55 (Conv1D)          (None, 4970, 256)         123136    
                                                                 
 max_pooling1d_16 (MaxPooli  (None, 2485, 256)         0         
 ng1D)                                                           
                                                                 
 conv1d_56 (Conv1D)          (None, 2485, 384)       

In [38]:
# Train for 10 epochs using mini-batches of 32 samples.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
)

Epoch 1/10

In [None]:
# Persist the trained network with Keras' own serializer.
# `model_filename` was never defined anywhere in the notebook, so the original
# cell raised NameError; the path below is a placeholder to adapt if needed.
model_filename = "./cnn_tfidf.keras"
model.save(model_filename)

In [None]:
# Reload the network through the Keras loader rather than pickle.load, which
# can execute arbitrary code when the file comes from an untrusted source.
# The path is (re)defined here so the cell also works on a fresh kernel;
# keep it identical to the path used when saving the model.
model_filename = "./cnn_tfidf.keras"
model = tf.keras.models.load_model(model_filename)

# Testing del Sistema, Dataset "Non Lemmatizzato"

In [None]:
# Read the test split again and discard rows containing missing values.
test_data = pd.read_csv("./../../datasets/test_set.csv").dropna()

# Keep only the rows whose 'toxic' value differs from -1.
test_set = test_data.loc[test_data['toxic'].ne(-1)]

In [None]:
# Ground-truth labels for the retained test rows.
y_test = test_set.loc[:, 'toxic']
print("y_test.shape: ", y_test.shape, sep="")

In [None]:
# Build the test features with the same fitted TF-IDF vectorizer used for
# training. The original cell loaded "X_test_bert.csv", i.e. features from a
# different representation whose columns do not correspond to the TF-IDF
# vocabulary the network was trained on.
# NOTE: the result is a scipy sparse matrix; densify it in slices rather than
# all at once, since the full dense test matrix is very large.
X_test = vectorizer.transform(test_set['comment_text'])
print("X_test.shape:", X_test.shape)

In [None]:
# Score the test set.
# A scipy sparse matrix is densified one slice of rows at a time, so the whole
# test matrix never has to exist in dense form; any other input (ndarray,
# DataFrame) is passed to Keras unchanged.
if hasattr(X_test, "toarray"):
    chunk_rows = 2048
    y_pred = np.concatenate([
        model.predict(X_test[start:start + chunk_rows].toarray(), verbose=0)
        for start in range(0, X_test.shape[0], chunk_rows)
    ])
else:
    y_pred = model.predict(X_test)

# Show only a small sample of the predicted probabilities.
print(y_pred[:5])

# Flatten the (n, 1) sigmoid output to 1-D so it lines up with y_test, then
# apply the 0.5 decision threshold.
y_pred_binary = (y_pred.ravel() > 0.5).astype(int)

# Metrics: accuracy, precision, recall
print("Accuracy: " + str(accuracy_score(y_test, y_pred_binary)))
print("Precision: " + str(precision_score(y_test, y_pred_binary)))
print("Recall: " + str(recall_score(y_test, y_pred_binary)))

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_binary)

# Class labels shown on both axes
classes = ['Classe Negativa', 'Classe Positiva']

# Plot the confusion matrix with scikit-learn's ConfusionMatrixDisplay, which
# the notebook already imports. The original cell used `plt` and `sns`, but
# neither matplotlib.pyplot nor seaborn is imported anywhere, so it raised
# NameError.
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=classes)
disp.plot(cmap="Blues", values_format="d")
disp.figure_.set_size_inches(6, 5)
disp.ax_.set_xlabel('Predicted')
disp.ax_.set_ylabel('True')
disp.ax_.set_title('Matrice di Confusione');

# Neural Network, Dataset "Lemmatizzato"

In [None]:
# Load the lemmatized splits.
training_set_lem = pd.read_csv("./../../datasets/training_set_lemmatized.csv")
test_data_lem = pd.read_csv("./../../datasets/test_set_lemmatized.csv")

# Drop incomplete rows first, then keep the rows whose 'toxic' value is not -1.
# The original cell filtered the NON-lemmatized `test_data` frame (and did so
# before dropna), so `test_set_lem` never contained lemmatized text.
# Assumes the lemmatized test CSV carries a 'toxic' column - TODO confirm.
test_data_lem.dropna(inplace=True)
test_set_lem = test_data_lem[test_data_lem['toxic'] != -1]

In [None]:
# TF-IDF vectorization of the lemmatized text.
# Only the lemmatized vectorizer is created here: re-creating `vectorizer`
# would replace the one fitted on the non-lemmatized data with an unfitted one.
vectorizer_lem = TfidfVectorizer()

# Take the labels from the same frame as the features so rows stay aligned.
# Assumes the lemmatized training CSV carries a 'toxic' column - TODO confirm.
y_train = training_set_lem['toxic']
X_train_lem = vectorizer_lem.fit_transform(training_set_lem['comment_text'])
X_test_lem = vectorizer_lem.transform(test_set_lem['comment_text'])

# Features and labels must describe the same number of comments.
assert X_train_lem.shape[0] == y_train.shape[0], "features/labels row mismatch"

print("y_train.shape: " + str(y_train.shape))
print("X_train_lem.shape: " + str(X_train_lem.shape))
print("X_test_lem.shape: " + str(X_test_lem.shape))


## Addestramento del Modello

In [None]:
# Check whether a GPU is available
import tensorflow as tf

# Ask TensorFlow for the name of the GPU device it reports
device_name = tf.test.gpu_device_name()

# Anything other than the first GPU device name is reported as "CPU".
if device_name != '/device:GPU:0':
    print('GPU non disponibile, TensorFlow sta utilizzando la CPU.')
else:
    print('GPU disponibile, TensorFlow sta utilizzando la GPU.')

In [None]:
# Convert the sparse lemmatized TF-IDF matrix into a dense array for training.
# The hasattr guard makes the cell safe to re-run: once X_train_lem is already
# a dense ndarray it no longer has `.toarray()` and the original line would raise.
if hasattr(X_train_lem, "toarray"):
    # Cast while still sparse: float32 is Keras' default float type, and it
    # halves the memory of the dense matrix compared with TF-IDF's float64.
    X_train_lem = X_train_lem.astype(np.float32).toarray()

In [None]:
# Building the CNN model for the lemmatized features
# The input width comes from the lemmatized TF-IDF matrix. The original cell
# hard-coded 39767, the width of the non-lemmatized vocabulary, which does not
# match the lemmatized feature matrix.
n_features_lem = X_train_lem.shape[1]

model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(n_features_lem,)),
    # Conv1D consumes (steps, channels): expose each TF-IDF weight as one step
    # with a single channel.
    tf.keras.layers.Reshape((n_features_lem, 1)),
    tf.keras.layers.Conv1D(filters=96, kernel_size=11, strides=4,
                           activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Conv1D(filters=120, kernel_size=5, padding='same',
                           activation='relu'),
    tf.keras.layers.Conv1D(filters=240, kernel_size=3, padding='same',
                           activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Conv1D(filters=300, kernel_size=3, padding='same',
                           activation='relu'),
    tf.keras.layers.Conv1D(filters=150, kernel_size=1, padding='same',
                           activation='relu'),
    tf.keras.layers.MaxPool1D(pool_size=3, strides=2, padding='same'),
    # Dense only transforms the last axis of a 3-D tensor, so without this
    # layer the network would emit one value per convolution step instead of
    # one probability per comment. Global max pooling collapses the step axis.
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(80, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(40, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    # Single sigmoid unit: probability of the 'toxic' class.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
# Build the model, then configure training: Adam optimizer, binary
# cross-entropy loss, accuracy as the tracked metric.
model.build()
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print the layer-by-layer overview.
model.summary()

In [None]:
# Train on the lemmatized features for 10 epochs, mini-batches of 32 samples.
model.fit(
    x=X_train_lem,
    y=y_train,
    batch_size=32,
    epochs=10,
)

In [None]:
# Persist the network trained on lemmatized text with Keras' own serializer.
# `model_filename` was never defined anywhere in the notebook, so the original
# cell raised NameError; the path below is a placeholder to adapt if needed.
model_filename = "./cnn_tfidf_lemmatized.keras"
model.save(model_filename)

In [None]:
# Reload the network through the Keras loader rather than pickle.load, which
# can execute arbitrary code when the file comes from an untrusted source.
# The path is (re)defined here so the cell also works on a fresh kernel;
# keep it identical to the path used when saving the lemmatized model.
model_filename = "./cnn_tfidf_lemmatized.keras"
model = tf.keras.models.load_model(model_filename)

# Testing del Sistema, Dataset "Lemmatizzato"