In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### Datasets

In [2]:
def load_data_arrays():
    df_1 = pd.read_csv("datasets/1_no_stopwords.csv")
    df_3 = pd.read_csv("datasets/3_no_stopwords.csv")
    df_5 = pd.read_csv("datasets/5_no_stopwords.csv")
    df_6 = pd.read_csv("datasets/6_no_stopwords.csv")
    
    df = df_1.append(df_3)
    df = df.append(df_5)
    df = df.append(df_6)
    
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True, stratify=df["is_hatespeech"])
    
    return train_df, test_df

train_df, test_df = load_data_arrays()


# Convert the test- and train-DataFrames to Tensorflow Datasets

train_labels = np.eye(2)[train_df['is_hatespeech'].values]
train_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(np.asarray(train_df['text'].values, dtype=str), tf.string),
            tf.cast(train_labels, tf.int32)
        )
    )
)

test_labels = np.eye(2)[test_df['is_hatespeech'].values]
test_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(np.asarray(test_df['text'].values, dtype=str), tf.string),
            tf.cast(test_labels, tf.int32)
        )
    )
)

BUFFER_SIZE = 1000
BATCH_SIZE = 64

train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [3]:
# print(train_dataset)
print(next(iter(train_dataset)))
# print(test_dataset)

(<tf.Tensor: shape=(64,), dtype=string, numpy=
array([b'oh well go get malware yourself itll stop going wikipedia get real admins on eh good idea chump',
       b'ways improve rosie fellner hi im kudpung turskellies thanks creating rosie fellner ive tagged page using page curation tools issues fix please take moment return article address tagged issues the tags removed another editor issues mention addressed if questions leave comment talk page or editing help talk volunteers teahouse',
       b'pov some seems bit pov especially paragraph begins in march 1990 the person wrote obviously trying evenhanded delicate matters like this would probably better cite conflicting view responsible sources talk 0131 nov 18 2004 utc',
       b'i see disregarded request and based email copied editors talk page resubmitted content article i going revert edit ip editor got first please refrain posting inappropriate material wikipedia familiarize relevant policies links provided above wikipedia content m

### Keywords feature

In [4]:
df_kw = pd.read_csv("datasets/bad_words.csv")
kw_vocab = set()
for idx, item in df_kw.iterrows():
  kw_vocab.add(item[0])
kw_vocab_len = len(kw_vocab) + 1
kw_layer = tf.keras.layers.TextVectorization(vocabulary=list(kw_vocab))
kw_layer.compile()

### Encoder feature

In [5]:
VOCAB_SIZE = 1000
ec_layer = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
ec_layer.adapt(train_dataset.map(lambda text, label: text))

In [6]:
# print(kw_layer(["You are a nice little idiot"]))
# print(ec_layer(["You are a nice little idiot"]))

### Model

In [8]:
import tensorflow.keras.layers as layers

input_layer = layers.Input(shape=(1,), dtype=(tf.string))
seq_layer = kw_layer(input_layer)
seq2_layer = ec_layer(input_layer)
concat_layer = layers.Concatenate(axis=1)([seq_layer, seq2_layer])
concat_layer = layers.Embedding(input_dim=len(kw_layer.get_vocabulary() + ec_layer.get_vocabulary()), output_dim=64, mask_zero=True)(concat_layer)
concat_layer = layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(concat_layer)
concat_layer = layers.Bidirectional(tf.keras.layers.LSTM(32))(concat_layer)
concat_layer = layers.Dense(64, activation='relu')(concat_layer)
concat_layer = layers.Dropout(0.5)(concat_layer)
output_layer = layers.Dense(2, activation='softmax')(concat_layer)

model = tf.keras.Model(name="hatespeech_detector", inputs=input_layer, outputs=output_layer)
# print(model.summary())
model.compile(optimizer=tf.keras.optimizers.Adam(0.0001), loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
history = model.fit(train_dataset, epochs=5, validation_data=test_dataset, validation_steps=10)

Epoch 1/5
   4/2717 [..............................] - ETA: 7:11:46 - loss: 0.6912 - accuracy: 0.6523

KeyboardInterrupt: 

In [None]:
print(model.summary())

Model: "hatespeech_detector"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 text_vectorization (TextVector  (None, None)        0           ['input_2[0][0]',                
 ization)                                                         'input_2[0][0]']                
                                                                                                  
 embedding_2 (Embedding)        (None, None, 64)     183552      ['text_vectorization[2][0]']     
                                                                                                  
 embedding_3 (Embedding)        (None, None, 64)     183552      ['text_vectoriz

In [None]:
test_loss, test_acc = model.evaluate(test_dataset)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

Test Loss: 0.28492414951324463
Test Accuracy: 0.9172821640968323


In [None]:
# predict on a sample text without padding.
sample_text = ['You are such a stupid fucking whore',
               'I would not recommend this movie.']
predictions = model.predict(np.array(sample_text))
print(predictions)

[[0.06653643 0.9334636 ]
 [0.9860053  0.01399465]]


In [None]:
# Save the model
!mkdir models
model.save('models/binary_features')

Ein Unterverzeichnis oder eine Datei mit dem Namen "models" existiert bereits.


INFO:tensorflow:Assets written to: models/binary_features\assets


INFO:tensorflow:Assets written to: models/binary_features\assets
