## CONVOLUTIONAL NEURAL NETWORKS

In [8]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm

import os

# Keras chooses its backend from KERAS_BACKEND when the package is first
# imported, so the variable has to be set *before* `import keras`; assigning it
# afterwards has no effect on the already-imported package.
os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
from keras import layers

import string
import re

#### Loading and preparing the data:

In [9]:
# Training split. quoting=3 is csv.QUOTE_NONE, i.e. quote characters inside the
# comments are kept as ordinary text instead of being parsed as field delimiters.
train_data = pd.read_csv("train_2024.csv", quoting=3)

@keras.saving.register_keras_serializable(name="preprocessing")
def preprocessing(input_data):
    """Standardize raw text: lowercase it, then delete ASCII punctuation.

    Registered with Keras under the name "preprocessing" so that a saved model
    whose TextVectorization layer uses this callable can be loaded again.

    Args:
        input_data: string tensor of raw text.

    Returns:
        A string tensor with the text lowercased and every character from
        `string.punctuation` removed.
    """
    # Character class matching any single punctuation mark.
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"
    return tf.strings.regex_replace(
        tf.strings.lower(input_data), punctuation_pattern, ""
    )

# Convert the 'text' and 'label' columns of the training frame to tf.Tensor
# objects; these are what the vectorizer and model.fit consume.
train_comments = tf.convert_to_tensor(train_data["text"])
train_labels = tf.convert_to_tensor(train_data["label"])

#### Building the model:

In [10]:
# Tunable settings shared by the vectorizer, the model and the training loop.
max_features = 20000     # vocabulary cap (TextVectorization max_tokens)
embedding_dim = 128      # size of each token embedding vector
sequence_length = 500    # fixed token length of every vectorized comment
epochs = 5               # number of training epochs passed to model.fit
seed = 42                # fixed seed so that a fresh "Run All" is repeatable

# Weight initialization, dropout and batch shuffling are all random; seeding
# Python, NumPy and the backend in one call before the model is built makes
# the training run reproducible.
keras.utils.set_random_seed(seed)

# Text -> integer-sequence layer. Each comment is standardized with
# `preprocessing`, mapped to integer token ids using a vocabulary of at most
# `max_features` entries, and emitted as a sequence of length `sequence_length`.
vectorize_layer = keras.layers.TextVectorization(
    max_tokens=max_features,
    standardize=preprocessing,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the training comments.
vectorize_layer.adapt(train_comments)

In [11]:
# Architecture, in order:
# (1) TextVectorization  (raw string -> token ids)
# (2) Embedding
# (3) Dropout
# (4) Conv1D
# (5) Conv1D
# (6) GlobalMaxPooling1D
# (7) Dense (ReLU) + Dropout
# (8) Dense (sigmoid) -> single probability

# Raw strings enter the model directly; vectorization is part of the graph.
text_input = keras.Input(shape=(1,), dtype=tf.string, name="text")
x = vectorize_layer(text_input)

# Learned token embeddings, regularized with dropout.
x = layers.Embedding(input_dim=max_features + 1, output_dim=embedding_dim)(x)
x = layers.Dropout(rate=0.5)(x)

# Two strided 1-D convolutions followed by a global max over the time axis.
x = layers.Conv1D(
    filters=128, kernel_size=7, strides=3, padding="valid", activation="relu"
)(x)
x = layers.Conv1D(
    filters=128, kernel_size=7, strides=3, padding="valid", activation="relu"
)(x)
x = layers.GlobalMaxPooling1D()(x)

# Fully connected hidden layer with dropout.
x = layers.Dense(units=128, activation="relu")(x)
x = layers.Dropout(rate=0.5)(x)

# Single-unit output squashed to (0, 1) by a sigmoid.
predictions = layers.Dense(units=1, activation="sigmoid", name="predictions")(x)

model = keras.Model(inputs=text_input, outputs=predictions)

# Binary cross-entropy loss, Adam optimizer, accuracy reported as a metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text (InputLayer)           [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 500)               0         
 Vectorization)                                                  
                                                                 
 embedding_1 (Embedding)     (None, 500, 128)          2560128   
                                                                 
 dropout_2 (Dropout)         (None, 500, 128)          0         
                                                                 
 conv1d_2 (Conv1D)           (None, 165, 128)          114816    
                                                                 
 conv1d_3 (Conv1D)           (None, 53, 128)           114816    
                                                           

#### Fitting the model:

In [None]:
# Train on the full training split for `epochs` epochs.
model.fit(x=train_comments, y=train_labels, epochs=epochs)

# Persist the trained model so it can be reloaded without retraining.
model.save("keras_model.keras")

Epoch 1/5
Epoch 2/5
Epoch 3/5

#### Evaluating the model:

In [None]:
# Development split, read the same way as the training data
# (quoting=3 is csv.QUOTE_NONE).
valid_data = pd.read_csv("dev_2024.csv", quoting=3)
valid_comments = tf.convert_to_tensor(valid_data["text"])
valid_labels = tf.convert_to_tensor(valid_data["label"])

# evaluate() yields the loss followed by the compiled metric (accuracy).
eval_loss, eval_acc = model.evaluate(x=valid_comments, y=valid_labels, verbose=0)
print(f"Loss of the model: {eval_loss}")
print(f"Accuracy of the model: {eval_acc}")

#### Feeding the model with the test data:

In [None]:
# Test split, read the same way as the other splits (quoting=3 is csv.QUOTE_NONE).
test_data = pd.read_csv("test_2024.csv", quoting=3)
test_comments = tf.convert_to_tensor(test_data["text"])

# Sigmoid outputs, flattened to 1-D and rounded to hard integer labels.
preds = model.predict(test_comments)
rounded_preds = np.round(preds.flatten()).astype(np.int_)

# Drop the 'text' and 'label' columns (drop returns a new frame, so test_data
# is untouched) and insert the predicted label at column position 1.
results = test_data.drop(columns=["text", "label"])
results.insert(1, "label", rounded_preds)

results.to_csv("results.csv", index=False)

#### In case the model has already been trained and saved, we can instead evaluate it as follows:

In [12]:
# Restore the previously trained model from disk instead of retraining.
model = keras.saving.load_model("keras_model.keras")

# Development split (quoting=3 is csv.QUOTE_NONE).
valid_data = pd.read_csv("dev_2024.csv", quoting=3)
valid_comments = tf.convert_to_tensor(valid_data["text"])
valid_labels = tf.convert_to_tensor(valid_data["label"])

# The two values returned by evaluate() are unpacked as loss and accuracy.
eval_loss, eval_acc = model.evaluate(x=valid_comments, y=valid_labels, verbose=0)
print("Evaluation of the model:")
print(f"Loss of the model: {eval_loss}")
print(f"Accuracy of the model: {eval_acc}")

Evaluation of the model:
Loss of the model: 0.2685219347476959
Accuracy of the model: 0.9204545617103577


In [7]:
# Test split, read the same way as the other splits (quoting=3 is csv.QUOTE_NONE).
test_data = pd.read_csv("test_2024.csv", quoting=3)
test_comments = tf.convert_to_tensor(test_data["text"])

# Sigmoid outputs, flattened to 1-D and rounded to hard integer labels.
preds = model.predict(test_comments)
rounded_preds = np.round(preds.flatten()).astype(np.int_)

# Drop the 'text' and 'label' columns (drop returns a new frame, so test_data
# is untouched) and insert the predicted label at column position 1.
results = test_data.drop(columns=["text", "label"])
results.insert(1, "label", rounded_preds)

results.to_csv("results_cnn.csv", index=False)

