# CNN baseline for MetaHate

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
import tensorflow as tf

from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten
from keras.layers import TextVectorization
from keras.preprocessing.sequence import pad_sequences
import numpy as np

  if not hasattr(np, "object"):


## Reading the data

In [2]:
data = pd.read_csv('../../../data/processed/within-dataset/metahate_train.tsv', sep='\t', names=['label', 'text'],header=0)
data = data.dropna(subset=['text'])

data['text'] = data['text'].astype(str)

texts_train = data['text'].tolist()
labels_train = data['label'].tolist()

data = pd.read_csv('../../../data/processed/within-dataset/metahate_test.tsv', sep='\t', names=['label', 'text'],header=0)

data = data.dropna(subset=['text'])

data['text'] = data['text'].astype(str)

texts_test = data['text'].tolist()
labels_test = data['label'].tolist()


## Standardize labels all to string and enerate validation set from train set

In [3]:
texts_train_noVal, texts_val, labels_train_noVal, labels_val = train_test_split(
    texts_train,
    labels_train,
    test_size=0.2,
    random_state=42
)


In [4]:

labels_train.count('1')

0

## Tokenizing the text data and convert to sequences

In [None]:
max_tokens = 10000
sequence_length = 512

vectorizer = TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=sequence_length
)
labels_train_noVal = np.array(labels_train_noVal, dtype=np.float32)
labels_val = np.array(labels_val, dtype=np.float32)

train_ds = tf.data.Dataset.from_tensor_slices(
    (texts_train_noVal, labels_train_noVal)
).batch(32)

val_ds = tf.data.Dataset.from_tensor_slices(
    (texts_val, labels_val)
).batch(32)
vectorizer.adapt(texts_train_noVal)



## Creating a simple neural network

In [12]:
# Creating a Sequential model
model = Sequential([
    vectorizer,  # <-- replaces Tokenizer + pad_sequences   
    Embedding(
        input_dim=vectorizer.vocabulary_size(),
        output_dim=64
    ),
    Flatten(),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid")
])




## Compiling and training the model

In [13]:
model.compile(
    optimizer='adam', # 'adam' is chosen as the optimization algorithm, known for its efficiency in training neural networks
    loss='binary_crossentropy', # 'binary_crossentropy' is selected as the loss function as we are performing a binary classification tasks
    metrics= ['accuracy'] # The model will be evaluated based on 'accuracy' during training
)
model.summary()



In [14]:
model.fit(train_ds, validation_data=val_ds, epochs=1)


[1m22024/22024[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m886s[0m 40ms/step - accuracy: 0.8523 - loss: 0.3320 - val_accuracy: 0.8619 - val_loss: 0.3137


<keras.src.callbacks.history.History at 0x20fe48fa010>

## Evaluating the model on the test set

In [16]:
# Obtaining raw predictions for the test set by thresholding the predictions at 0.5 and converting boolean values to integers (0 or 1)

predict_ds = tf.data.Dataset.from_tensor_slices(texts_test).batch(32)
predictions = (model.predict(predict_ds) > 0.5).astype(int).flatten()


[1m  86/6883[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:11[0m 10ms/step

[1m6883/6883[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 8ms/step


## Calculating the metrics

In [17]:
accuracy = accuracy_score(labels_test, predictions)
report = classification_report(labels_test, predictions)
weighted_f1 = f1_score(labels_test, predictions, average='weighted')
micro_f1 = f1_score(labels_test, predictions, average='micro')
macro_f1 = f1_score(labels_test, predictions, average='macro')

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)
print(f"Weighted F1 Score: {weighted_f1}")
print(f"Micro F1 Score: {micro_f1}")
print(f"Macro F1 Score: {macro_f1}")

Accuracy: 0.8623594102609509
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.91    173537
           1       0.72      0.58      0.64     46696

    accuracy                           0.86    220233
   macro avg       0.80      0.76      0.78    220233
weighted avg       0.86      0.86      0.86    220233

Weighted F1 Score: 0.8569224678167362
Micro F1 Score: 0.8623594102609509
Macro F1 Score: 0.7782966307894152
