In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### Datasets

In [2]:
def load_data_arrays():
    df_1 = pd.read_csv("datasets/1_no_stopwords.csv")
    df_3 = pd.read_csv("datasets/3_no_stopwords.csv")
    df_5 = pd.read_csv("datasets/5_no_stopwords.csv")
    df_6 = pd.read_csv("datasets/6_no_stopwords.csv")
    
    df = df_1.append(df_3)
    df = df.append(df_5)
    df = df.append(df_6)
    
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True, stratify=df["is_hatespeech"])
    
    return train_df, test_df

train_df, test_df = load_data_arrays()


# Convert the test- and train-DataFrames to Tensorflow Datasets

train_labels = np.eye(2)[train_df['is_hatespeech'].values]
train_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(np.asarray(train_df['text'].values, dtype=str), tf.string),
            tf.cast(train_labels, tf.int32)
        )
    )
)

test_labels = np.eye(2)[test_df['is_hatespeech'].values]
test_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(np.asarray(test_df['text'].values, dtype=str), tf.string),
            tf.cast(test_labels, tf.int32)
        )
    )
)

BUFFER_SIZE = 1000
BATCH_SIZE = 64

train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [3]:
# print(train_dataset)
print(next(iter(train_dataset)))
# print(test_dataset)

(<tf.Tensor: shape=(64,), dtype=string, numpy=
array([b'which even matter this foo talk here 13112511515',
       b'explanation toddst1 blocked completely clear community rejected onesided interaction ban completely ineffective1 2 3 4 5 behaviour completely one sided considering completely ignored similar behaviour gpms part here here even i brought attention talk page advised go ani 3 afterwards blocked following advice1 this massive abuse admin power content dispute favor proteges perfect example wpgame whats wrong wikipedia',
       b'rt annie sunrise im wrong show could get worse mkr',
       b'that certainly true however protests mainly highlight particular aspects carrolls addiction research tend draw sympathy if research stopped attention would probably diverted research using animal models particularly research monkeys would drop protests threshold notability iinans addiction research aspects experimentation seem quite unique important own possibly primate research ongoing umns

### Keywords feature

In [4]:
df_kw = pd.read_csv("datasets/bad_words.csv")
kw_vocab = set()
for idx, item in df_kw.iterrows():
  kw_vocab.add(item[0])
kw_vocab_len = len(kw_vocab) + 1
kw_layer = tf.keras.layers.TextVectorization(vocabulary=list(kw_vocab))
kw_layer.compile()

### Encoder feature

In [5]:
VOCAB_SIZE = 1000
ec_layer = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
ec_layer.adapt(train_dataset.map(lambda text, label: text))

In [6]:
# print(kw_layer(["You are a nice little idiot"]))
# print(ec_layer(["You are a nice little idiot"]))

### Model

In [8]:
import tensorflow.keras.layers as layers

input_layer = layers.Input(shape=(1,), dtype=(tf.string))
seq_layer = kw_layer(input_layer)
seq_layer = layers.Embedding(input_dim=len(kw_layer.get_vocabulary()), output_dim=64, mask_zero=True)(seq_layer)
seq_layer = layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(seq_layer)
seq_layer = layers.Bidirectional(tf.keras.layers.LSTM(32))(seq_layer)
seq_layer = layers.Dense(64, activation='relu')(seq_layer)
seq_layer = layers.Dropout(0.5)(seq_layer)
seq_layer = layers.Dense(2, activation='softmax')(seq_layer)
seq2_layer = ec_layer(input_layer)
seq2_layer = layers.Embedding(input_dim=len(ec_layer.get_vocabulary()), output_dim=64, mask_zero=True)(seq2_layer)
seq2_layer = layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(seq2_layer)
seq2_layer = layers.Bidirectional(tf.keras.layers.LSTM(32))(seq2_layer)
seq2_layer = layers.Dense(64, activation='relu')(seq2_layer)
seq2_layer = layers.Dropout(0.5)(seq2_layer)
seq2_layer = layers.Dense(2, activation='softmax')(seq2_layer)
concat_layer = layers.Concatenate(axis=1)([seq_layer, seq2_layer])
output_layer = layers.Dense(2, activation='softmax')(concat_layer)

model = tf.keras.Model(name="hatespeech_detector", inputs=input_layer, outputs=output_layer)
# print(model.summary())
model.compile(optimizer=tf.keras.optimizers.Adam(0.0001), loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics=['accuracy'])
history = model.fit(train_dataset, epochs=5, validation_data=test_dataset, validation_steps=10)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [9]:
print(model.summary())

Model: "hatespeech_detector"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 text_vectorization (TextVector  (None, None)        0           ['input_2[0][0]']                
 ization)                                                                                         
                                                                                                  
 text_vectorization_1 (TextVect  (None, None)        0           ['input_2[0][0]']                
 orization)                                                                                       
                                                                                

In [10]:
test_loss, test_acc = model.evaluate(test_dataset)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

Test Loss: 0.2095242589712143
Test Accuracy: 0.9322841167449951


In [11]:
# predict on a sample text without padding.
sample_text = ['You are such a stupid fucking whore',
               'I would not recommend this movie.']
predictions = model.predict(np.array(sample_text))
print(predictions)

[[0.00668358 0.9933164 ]
 [0.9894041  0.01059597]]


In [12]:
# Save the model
!mkdir models
model.save('models/binary_features')

Ein Unterverzeichnis oder eine Datei mit dem Namen "models" existiert bereits.


INFO:tensorflow:Assets written to: models/binary_features\assets


INFO:tensorflow:Assets written to: models/binary_features\assets
