In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### Datasets

In [2]:
def load_data_arrays():
    df_1 = pd.read_csv("datasets/1_no_stopwords.csv")
    df_3 = pd.read_csv("datasets/3_no_stopwords.csv")
    df_5 = pd.read_csv("datasets/5_no_stopwords.csv")
    df_6 = pd.read_csv("datasets/6_no_stopwords.csv")
    
    df = df_1.append(df_3)
    df = df.append(df_5)
    df = df.append(df_6)
    
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True, stratify=df["is_hatespeech"])
    
    return train_df, test_df

train_df, test_df = load_data_arrays()


# Convert the test- and train-DataFrames to Tensorflow Datasets

train_labels = np.eye(2)[train_df['is_hatespeech'].values]
train_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(np.asarray(train_df['text'].values, dtype=str), tf.string),
            tf.cast(train_labels, tf.int32)
        )
    )
)

test_labels = np.eye(2)[test_df['is_hatespeech'].values]
test_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(np.asarray(test_df['text'].values, dtype=str), tf.string),
            tf.cast(test_labels, tf.int32)
        )
    )
)

BUFFER_SIZE = 1000
BATCH_SIZE = 64

train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [3]:
# print(train_dataset)
print(next(iter(train_dataset)))
# print(test_dataset)

(<tf.Tensor: shape=(64,), dtype=string, numpy=
array([b'i committing vandalism are i hereby formally requesting stop harassing wikipedia if persist i everything power banned thank you',
       b'i already said i again',
       b'ah yes i forgot subst thanks care share',
       b'on twitter know talking to oh woman software dev oh lordy',
       b'yes sophia summary bullet points side dispute describes arguments excellent idea enables noninvolved editors understand nature dipute hopefully bring unexplored perspectives resolution \xe2\x80\xa2',
       b'rt we want skinny bitches get out we want big booty bitches',
       b'every good story starts so bitch lol',
       b'oh noes insult internets get moron being blocked affect slightest i life encouraging users like sp however might backfire whatever topics like edit here i would quite like see would deal various issues really understanding policy all blp npov or tpo fucking anything really especially hes learned trick misguided pompous ar

### Keywords feature

In [4]:
df_kw = pd.read_csv("datasets/bad_words.csv")
kw_vocab = set()
for idx, item in df_kw.iterrows():
  kw_vocab.add(item[0])
kw_vocab_len = len(kw_vocab) + 1
kw_layer = tf.keras.layers.TextVectorization(vocabulary=list(kw_vocab))
kw_layer.compile()

### Model

In [5]:
import tensorflow.keras.layers as layers

input_layer = layers.Input(shape=(1,), dtype=(tf.string))
seq_layer = kw_layer(input_layer)
seq_layer = layers.Embedding(input_dim=len(kw_layer.get_vocabulary()), output_dim=64, mask_zero=True)(seq_layer)
seq_layer = layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(seq_layer)
seq_layer = layers.Bidirectional(tf.keras.layers.LSTM(32))(seq_layer)
seq_layer = layers.Dense(64, activation='relu')(seq_layer)
seq_layer = layers.Dropout(0.5)(seq_layer)
seq_layer = layers.Dense(2, activation='softmax')(seq_layer)
output_layer = layers.Dense(2, activation='softmax')(seq_layer)

model = tf.keras.Model(name="hatespeech_keywords", inputs=input_layer, outputs=output_layer)
# print(model.summary())
model.compile(optimizer=tf.keras.optimizers.Adam(0.0001), loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics=['accuracy'])
history = model.fit(train_dataset, epochs=5, validation_data=test_dataset, validation_steps=10)

Epoch 1/5


  return dispatch_target(*args, **kwargs)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [9]:
print(model.summary())

Model: "hatespeech_keywords"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, None)             0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, None, 64)          183552    
                                                                 
 bidirectional (Bidirectiona  (None, None, 128)        66048     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                               

In [10]:
test_loss, test_acc = model.evaluate(test_dataset)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

Test Loss: 0.31142401695251465
Test Accuracy: 0.9259796142578125


In [11]:
# predict on a sample text without padding.
sample_text = ['You are such a stupid fucking whore',
               'I would not recommend this movie.']
predictions = model.predict(np.array(sample_text))
print(predictions)

[[0.11463743 0.8853626 ]
 [0.97191095 0.02808903]]
