## Building a CNN model for the classification


In [None]:
# Disabled cell: every statement below is commented out, so running this cell defines nothing.
# NOTE(review): the following cells of this section read `X`, `X_noLemma` and `y`, none of which
# is assigned in this section - presumably they come from an earlier cell; confirm before a
# Restart & Run All.

# # Assigning the processed_lemmatized_words and preprocessed_comment columns from the data DataFrame to the X_cnn and X_cnn_noLemma variables, respectively.
# X_cnn = data.processed_lemmatized_words.values
# X_cnn_noLemma = data.preprocessed_comment.values

# # Assigning the toxic, severe_toxic, obscene, threat, insult, and identity_hate columns from the data DataFrame to the y_cnn DataFrame.
# y_cnn = data[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]

In [None]:
# Hyper-parameters for the CNN text pipeline.
# Every comment is padded/truncated to this many token ids.
MAX_SEQUENCE_LENGTH = 1_000
# Vocabulary cap handed to the Keras Tokenizer (and used as the Embedding input size).
MAX_NUM_WORDS = 20_000
# Intended embedding width.
# NOTE(review): the model cell in this section hard-codes 128 instead of reading this value.
EMBEDDING_DIM = 100
# Intended validation fraction.
# NOTE(review): the train_test_split call in this section does not pass it.
VALIDATION_SPLIT = 0.2

In [None]:
# Build one tokenizer per text variant; each keeps at most MAX_NUM_WORDS vocabulary entries
# when converting texts to integer sequences.

# Lemmatised variant: learn the word -> integer mapping from the texts in X.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(X)

# Non-lemmatised variant: same procedure on X_noLemma.
tokenizer_noLemma = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_noLemma.fit_on_texts(X_noLemma)

In [None]:
# Encode every text as a list of vocabulary indices, using the tokenizer fitted on the
# matching variant (lemmatised / non-lemmatised).
train_sequences, train_sequences_noLemma = (
    tokenizer.texts_to_sequences(X),
    tokenizer_noLemma.texts_to_sequences(X_noLemma),
)

In [None]:
# word_index maps every word seen during fitting to its integer id.
word_index = tokenizer.word_index
word_index_noLemma = tokenizer_noLemma.word_index

# Print the same three diagnostics for the lemmatised corpus first and the non-lemmatised
# corpus second, separated by blank lines:
#   * vocabulary size,
#   * the first five (word, id) entries of the mapping,
#   * the first comment encoded as integer ids.
for position, (vocab, sequences) in enumerate(
    [(word_index, train_sequences), (word_index_noLemma, train_sequences_noLemma)]
):
    if position > 0:
        print("\n")
    print("Length of word Index:", len(vocab))
    print("First 5 elements in the word_index dictionary:", dict(list(vocab.items())[:5]))
    print("First comment text in training set:\n", sequences[0])


Length of word Index: 67330
First 5 elements in the word_index dictionary: {'article': 1, 'page': 2, 'fuck': 3, 'wikipedia': 4, 'like': 5}
First comment text in training set:
 [1898, 46, 37, 1101, 46, 8]


Length of word Index: 72638
First 5 elements in the word_index dictionary: {'fuck': 1, 'article': 2, 'page': 3, 'wikipedia': 4, 'like': 5}
First comment text in training set:
 [1953, 39, 32, 2321, 39, 7]


In [None]:
# Bring both tokenised corpora to a fixed width of MAX_SEQUENCE_LENGTH ids per comment.
# The recorded output shows the padding zeros placed in front of the real tokens.
train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
train_data_noLemma = pad_sequences(train_sequences_noLemma, maxlen=MAX_SEQUENCE_LENGTH)
#test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Report the padded shape and the tail of the first comment (lemmatised first, then
# non-lemmatised); only the last 50 ids are shown because the rest is padding.
for padded in (train_data, train_data_noLemma):
    print("Shape of padded sequence list:\n", padded.shape)
    print("First comment text in training set - 0 for padding - only last 50 sequences as the rest are paddings:\n", padded[0][-50:])

Shape of padded sequence list:
 (32450, 1000)
First comment text in training set - 0 for padding - only last 50 sequences as the rest are paddings:
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0 1898   46   37 1101   46    8]
Shape of padded sequence list:
 (32450, 1000)
First comment text in training set - 0 for padding - only last 50 sequences as the rest are paddings:
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0 1953   39   32 2321   39    7]


In [None]:
# NOTE(review): this cell repeats the padding already done in the previous cell (same inputs,
# same variable names); it only adds blank lines between the two reports.

# Pad both tokenised corpora to MAX_SEQUENCE_LENGTH ids per comment.
train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
train_data_noLemma = pad_sequences(train_sequences_noLemma, maxlen=MAX_SEQUENCE_LENGTH)

# For each corpus (lemmatised, then non-lemmatised) print the array shape
# (number of samples, MAX_SEQUENCE_LENGTH) and the last 50 ids of the first comment.
for index, padded in enumerate((train_data, train_data_noLemma)):
    if index > 0:
        print("\n")
    print("Shape of padded sequence list:\n", padded.shape)
    print("First comment text in training set - 0 for padding - only last 50 sequences as the rest are paddings:\n", padded[0][-50:])


Shape of padded sequence list:
 (32450, 1000)
First comment text in training set - 0 for padding - only last 50 sequences as the rest are paddings:
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0 1898   46   37 1101   46    8]


Shape of padded sequence list:
 (32450, 1000)
First comment text in training set - 0 for padding - only last 50 sequences as the rest are paddings:
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0 1953   39   32 2321   39    7]


In [None]:
# Build the CNN text classifier as a linear stack of layers.
cnn_model = Sequential([
    # Maps each token id (vocabulary capped at MAX_NUM_WORDS) to a 128-dimensional vector.
    # NOTE(review): EMBEDDING_DIM (100) is defined in the constants cell but 128 is hard-coded
    # here - confirm which width is intended.
    Embedding(MAX_NUM_WORDS, 128),
    # First convolution block: 128 filters over windows of 5 tokens, ReLU, then max pooling
    # with pool size 5.
    Conv1D(filters=128, kernel_size=5, activation="relu"),
    MaxPooling1D(pool_size=5),
    # Second convolution block with the same settings.
    Conv1D(filters=128, kernel_size=5, activation="relu"),
    MaxPooling1D(pool_size=5),
    # Collapse the remaining time axis: output shape becomes (batch_size, 128).
    GlobalMaxPooling1D(),
    # Fully connected hidden layer.
    Dense(units=128, activation='relu'),
    # Output layer: 6 independent sigmoid units, one per toxicity label (multi-label output).
    Dense(units=6, activation='sigmoid'),
])

# summary() prints the layer table itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output.
cnn_model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         2560000   
                                                                 
 conv1d (Conv1D)             (None, None, 128)         82048     
                                                                 
 max_pooling1d (MaxPooling1D  (None, None, 128)        0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, None, 128)         82048     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, None, 128)        0         
 1D)                                                             
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0

In [None]:
# Training configuration: binary cross-entropy loss, Adam optimizer, AUC as the reported metric.
cnn_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["AUC"])

# Shuffled, seeded hold-out split of the padded lemmatised sequences and the labels `y`.
# NOTE(review): no test_size is passed, so VALIDATION_SPLIT (0.2) is not applied; the recorded
# shapes (24337 / 8113 rows) correspond to a 75/25 split - confirm which one is intended.
X_train, X_val, y_train, y_val = train_test_split(
    train_data,
    y,
    random_state=123,
    shuffle=True,
)

# Shapes in the order: train features, train labels, validation features, validation labels.
print(*(part.shape for part in (X_train, y_train, X_val, y_val)))

# Train for 10 epochs (10 full passes over the training split) in mini-batches of 32;
# the validation split is evaluated at the end of every epoch.
history = cnn_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    verbose=1,
)


(24337, 1000) (24337, 6) (8113, 1000) (8113, 6)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Evaluating the CNN model

In [None]:
# Score one hand-written comment with the trained CNN.

# texts_to_sequences expects a list of texts. Passing a bare string makes it iterate over the
# characters, producing one (mostly empty) sequence per character, so y_preds[0] would score
# only the first character "I". Wrapping the comment in a list yields a single sequence.
# NOTE(review): the training texts appear to be preprocessed/lemmatised; presumably the same
# preprocessing should be applied to this raw string - confirm.
test_sequences = tokenizer.texts_to_sequences(['I will kill you'])

# Pad the single sequence to the width the model was trained on.
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One row of 6 sigmoid outputs, in the label order printed below.
y_preds = cnn_model.predict(test_data)

# Print each label's probability as a whole-number percentage.
label_templates = [
    'Toxic:         {:.0%}',
    'Severe Toxic:  {:.0%}',
    'Obscene:       {:.0%}',
    'Threat:        {:.0%}',
    'Insult:        {:.0%}',
    'Identity Hate: {:.0%}',
]
for template, probability in zip(label_templates, y_preds[0]):
    print(template.format(probability))
print()

Toxic:         35%
Severe Toxic:  3%
Obscene:       17%
Threat:        4%
Insult:        19%
Identity Hate: 5%



Unfortunately, the above CNN model cannot detect Threat in the text message

In [None]:
# Helper function
def toxicity_level(string, model, maxlen=None):
    """
    Print and return the toxicity probabilities a model assigns to one string.

    The string is encoded with the notebook-level `tokenizer`, padded at the front to
    `maxlen` ids, and passed to `model.predict`.

    Args:
        string (str): The string to be analyzed.
        model (keras.Model): The trained model; its `predict` output is indexed as
            prediction[0][0..5] in the order Toxic, Severe Toxic, Obscene, Threat,
            Insult, Identity Hate.
        maxlen (int, optional): Padded sequence length. Defaults to MAX_SEQUENCE_LENGTH
            (the original draft referenced an undefined name `maxlen`).

    Returns:
        The value returned by `model.predict` for the single padded sequence.
    """
    if maxlen is None:
        maxlen = MAX_SEQUENCE_LENGTH

    # Process string
    # The tokenizer expects a list of texts, hence the one-element list.
    encoded = tokenizer.texts_to_sequences([string])
    padded = pad_sequences(encoded, maxlen=maxlen, padding='pre')

    # Predict
    prediction = model.predict(padded)

    # Print output
    print("Toxicity levels for '{}':".format(string))
    print('Toxic:         {:.0%}'.format(prediction[0][0]))
    print('Severe Toxic:  {:.0%}'.format(prediction[0][1]))
    print('Obscene:       {:.0%}'.format(prediction[0][2]))
    print('Threat:        {:.0%}'.format(prediction[0][3]))
    print('Insult:        {:.0%}'.format(prediction[0][4]))
    print('Identity Hate: {:.0%}'.format(prediction[0][5]))
    print()

    return prediction

## Building a CNN model for the classification - 3 epochs


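Convolutional Neural Networks (CNNs) are a type of deep learning neural network that is most commonly associated with image recognition and processing.
CNNs are inspired by the way the human visual cortex works, and they learn to recognize patterns by applying a series of convolution operations.
The same idea carries over to text: in this notebook, one-dimensional convolutions (`Conv1D`) slide over the sequence of word embeddings of a comment instead of over the pixels of an image.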


Convolution is a mathematical operation that takes two functions as input and produces a third function that expresses how the shape of one function is modified by the other function. In the context of CNNs, the two functions are the image and a filter. The filter is a small matrix of weights that is used to scan the image, and the convolution operation produces a new image that highlights the features that are detected by the filter.


CNNs typically have three types of layers: convolutional layers, pooling layers, and fully-connected layers. The convolutional layers are responsible for detecting features in the image, the pooling layers are responsible for reducing the size of the image while preserving the most important features, and the fully-connected layers are responsible for classifying the image.


Here are some of the benefits of using CNNs:

* They are able to learn to recognize patterns in images without being explicitly programmed to do so.
* They are able to generalize to new images that they have not seen before.
* They are able to process images very quickly.

However, CNNs also have some limitations:

* They require a large amount of training data.
* They can be computationally expensive to train.
* They can be difficult to interpret.

In [None]:
# Feature arrays taken from the `data` DataFrame: the lemmatised text column and the
# non-lemmatised preprocessed text column.
X_cnn = data["processed_lemmatized_words"].values
X_cnn_noLemma = data["preprocessed_comment"].values

# Label matrix: one column per toxicity label, in the order listed.
# NOTE(review): the tokenizer and train/validation-split cells below read `X`, `X_noLemma`
# and `y` rather than the three variables assigned here - confirm which set is intended.
y_cnn = data[
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
].values

In [None]:
# Hyper-parameters for the CNN text pipeline (same values as the 10-epoch run above).
# Every comment is padded/truncated to this many token ids.
MAX_SEQUENCE_LENGTH = 1_000
# Vocabulary cap handed to the Keras Tokenizer (and used as the Embedding input size).
MAX_NUM_WORDS = 20_000
# Intended embedding width.
# NOTE(review): the model cell in this section hard-codes 128 instead of reading this value.
EMBEDDING_DIM = 100
# Intended validation fraction.
# NOTE(review): the train_test_split call in this section does not pass it.
VALIDATION_SPLIT = 0.2

In [None]:
# Build one tokenizer per text variant; each keeps at most MAX_NUM_WORDS vocabulary entries
# when converting texts to integer sequences.
# NOTE(review): the fit calls use `X` / `X_noLemma`, not the `X_cnn` / `X_cnn_noLemma` arrays
# assigned at the top of this section - confirm this is intended.

# Lemmatised variant: learn the word -> integer mapping from the texts in X.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(X)

# Non-lemmatised variant: same procedure on X_noLemma.
tokenizer_noLemma = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_noLemma.fit_on_texts(X_noLemma)

In [None]:
# Encode every text as a list of vocabulary indices, using the tokenizer fitted on the
# matching variant (lemmatised / non-lemmatised).
train_sequences, train_sequences_noLemma = (
    tokenizer.texts_to_sequences(X),
    tokenizer_noLemma.texts_to_sequences(X_noLemma),
)

In [None]:
# word_index maps every word seen during fitting to its integer id.
word_index = tokenizer.word_index
word_index_noLemma = tokenizer_noLemma.word_index

# Print the same three diagnostics for the lemmatised corpus first and the non-lemmatised
# corpus second, separated by blank lines:
#   * vocabulary size,
#   * the first five (word, id) entries of the mapping,
#   * the first comment encoded as integer ids.
for position, (vocab, sequences) in enumerate(
    [(word_index, train_sequences), (word_index_noLemma, train_sequences_noLemma)]
):
    if position > 0:
        print("\n")
    print("Length of word Index:", len(vocab))
    print("First 5 elements in the word_index dictionary:", dict(list(vocab.items())[:5]))
    print("First comment text in training set:\n", sequences[0])


Length of word Index: 67669
First 5 elements in the word_index dictionary: {'article': 1, 'page': 2, 'fuck': 3, 'wikipedia': 4, 'like': 5}
First comment text in training set:
 [296, 888, 205, 67]


Length of word Index: 72886
First 5 elements in the word_index dictionary: {'fuck': 1, 'article': 2, 'page': 3, 'wikipedia': 4, 'like': 5}
First comment text in training set:
 [297, 956, 190, 76]


In [None]:
# Pad both tokenised corpora to MAX_SEQUENCE_LENGTH ids per comment.
# The recorded output shows the padding zeros placed in front of the real tokens.
train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
train_data_noLemma = pad_sequences(train_sequences_noLemma, maxlen=MAX_SEQUENCE_LENGTH)

# For each corpus (lemmatised, then non-lemmatised) print the array shape
# (number of samples, MAX_SEQUENCE_LENGTH) and the last 50 ids of the first comment;
# everything before those ids is padding.
for index, padded in enumerate((train_data, train_data_noLemma)):
    if index > 0:
        print("\n")
    print("Shape of padded sequence list:\n", padded.shape)
    print("First comment text in training set - 0 for padding - only last 50 sequences as the rest are paddings:\n", padded[0][-50:])


Shape of padded sequence list:
 (32450, 1000)
First comment text in training set - 0 for padding - only last 50 sequences as the rest are paddings:
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0 296 888 205  67]


Shape of padded sequence list:
 (32450, 1000)
First comment text in training set - 0 for padding - only last 50 sequences as the rest are paddings:
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0 297 956 190  76]


In [None]:
# Build the CNN text classifier as a linear stack of layers (fresh, untrained weights).
cnn_model = Sequential([
    # Maps each token id (vocabulary capped at MAX_NUM_WORDS) to a 128-dimensional vector.
    # NOTE(review): EMBEDDING_DIM (100) is defined in the constants cell but 128 is hard-coded
    # here - confirm which width is intended.
    Embedding(MAX_NUM_WORDS, 128),
    # First convolution block: 128 filters over windows of 5 tokens, ReLU, then max pooling
    # with pool size 5.
    Conv1D(filters=128, kernel_size=5, activation="relu"),
    MaxPooling1D(pool_size=5),
    # Second convolution block with the same settings.
    Conv1D(filters=128, kernel_size=5, activation="relu"),
    MaxPooling1D(pool_size=5),
    # Collapse the remaining time axis: output shape becomes (batch_size, 128).
    GlobalMaxPooling1D(),
    # Fully connected hidden layer.
    Dense(units=128, activation='relu'),
    # Output layer: 6 independent sigmoid units, one per toxicity label (multi-label output).
    Dense(units=6, activation='sigmoid'),
])

# summary() prints the layer table itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output.
cnn_model.summary()


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 128)         2560000   
                                                                 
 conv1d (Conv1D)             (None, None, 128)         82048     
                                                                 
 max_pooling1d (MaxPooling1D  (None, None, 128)        0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, None, 128)         82048     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, None, 128)        0         
 1D)                                                             
                                                                 
 global_max_pooling1d (Globa  (None, 128)             

In [None]:
# Training configuration: binary cross-entropy loss, Adam optimizer, AUC as the reported metric.
cnn_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["AUC"])

# Shuffled, seeded hold-out split of the padded lemmatised sequences and the labels `y`.
# NOTE(review): the labels come from `y`, not the `y_cnn` array assigned at the top of this
# section, and no test_size is passed, so VALIDATION_SPLIT (0.2) is not applied; the recorded
# shapes (24337 / 8113 rows) correspond to a 75/25 split - confirm both are intended.
X_train, X_val, y_train, y_val = train_test_split(
    train_data,
    y,
    random_state=123,
    shuffle=True,
)

# Shapes in the order: train features, train labels, validation features, validation labels.
print(*(part.shape for part in (X_train, y_train, X_val, y_val)))

# Train for 3 epochs (3 full passes over the training split) in mini-batches of 128;
# the validation split is evaluated at the end of every epoch.
history = cnn_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=3,
    batch_size=128,
    verbose=1,
)


(24337, 1000) (24337, 6) (8113, 1000) (8113, 6)
Epoch 1/3
Epoch 2/3
Epoch 3/3


### Evaluating the CNN model

In [None]:
# Score one hand-written comment with the CNN trained for 3 epochs.

# texts_to_sequences expects a list of texts. Passing a bare string makes it iterate over the
# characters, producing one (mostly empty) sequence per character, so y_preds[0] would score
# only the first character "I". Wrapping the comment in a list yields a single sequence.
# NOTE(review): the training texts appear to be preprocessed/lemmatised; presumably the same
# preprocessing should be applied to this raw string - confirm.
test_sequences = tokenizer.texts_to_sequences(['I will kill you'])

# Pad the single sequence to the width the model was trained on.
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One row of 6 sigmoid outputs, in the label order printed below.
y_preds = cnn_model.predict(test_data)

# Print each label's probability as a whole-number percentage.
label_templates = [
    'Toxic:         {:.0%}',
    'Severe Toxic:  {:.0%}',
    'Obscene:       {:.0%}',
    'Threat:        {:.0%}',
    'Insult:        {:.0%}',
    'Identity Hate: {:.0%}',
]
for template, probability in zip(label_templates, y_preds[0]):
    print(template.format(probability))
print()

Toxic:         52%
Severe Toxic:  4%
Obscene:       23%
Threat:        6%
Insult:        26%
Identity Hate: 8%



Unfortunately, the above CNN model cannot detect Threat in the text message