<a href="https://colab.research.google.com/github/DaryaTereshchenko/HateSpeechDetection/blob/main/Emb_Roberta_Sm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence_transformers
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK data packages. 'punkt_tab' is the tokenizer data that recent NLTK
# releases load for `word_tokenize`; 'punkt' is kept for older releases.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, TextVectorization, Input
from tensorflow.keras.models import Model
from transformers import RobertaTokenizer, TFRobertaModel

In [None]:
# Load both ETHOS text files. The `label` column stored in each file is discarded
# and replaced by a constant: 1 for the hate file, 0 for the neutral file.
ethos_hate = (
    pd.read_csv("ethos_hate.csv", sep=",", encoding="unicode_escape")
    .drop(columns="label")
    .assign(label=1)
)
ethos_neutral = (
    pd.read_csv("ethos_neutral.csv", sep=",", encoding="unicode_escape")
    .drop(columns="label")
    .assign(label=0)
)

# Per-class 90/10 split: sample 90% for training, keep the remaining rows for testing.
train_hate = ethos_hate.sample(frac=0.9, random_state=0)
test_hate = ethos_hate.drop(index=train_hate.index)

train_n = ethos_neutral.sample(frac=0.9, random_state=0)
test_n = ethos_neutral.drop(index=train_n.index)

# Merge the two classes, then shuffle with a fixed seed and renumber the rows.
train = (
    pd.concat([train_n, train_hate])
    .sample(frac=1, random_state=300)
    .reset_index(drop=True)
)
test = (
    pd.concat([test_n, test_hate])
    .sample(frac=1, random_state=300)
    .reset_index(drop=True)
)

print(train.text[0])


In [None]:
# Both feature files are read with the same options: ';' separated, ',' as the
# decimal mark, file lines 1-4 skipped (0-indexed), malformed lines dropped.
sm_csv_options = dict(sep=";", decimal=",", skiprows=[1, 2, 3, 4], low_memory=False, on_bad_lines='skip')

# Rows are ordered by CODE before that column is removed.
# NOTE(review): the later positional pairing with the text frames presumably relies
# on this ordering matching the row order of the ETHOS text files — confirm.
hate_speech = (
    pd.read_csv('ethos_sm_hate.csv', **sm_csv_options)
    .sort_values(by=["CODE"])
    .drop(columns="CODE")
)
neutral_speech = (
    pd.read_csv('ethos_sm_neutral.csv', **sm_csv_options)
    .sort_values(by=["CODE"])
    .drop(columns="CODE")
)

In [None]:
# Same split recipe as for the text frames: 90% sample per class with seed 0,
# remaining rows form the test part.
train_sm_hate = hate_speech.sample(frac=0.9, random_state=0)
test_sm_hate = hate_speech.drop(index=train_sm_hate.index)

train_sm_n = neutral_speech.sample(frac=0.9, random_state=0)
test_sm_n = neutral_speech.drop(index=train_sm_n.index)

# Neutral rows first, then hate rows; shuffled with seed 300 and re-indexed.
TrainSM = (
    pd.concat([train_sm_n, train_sm_hate])
    .sample(frac=1, random_state=300)
    .reset_index(drop=True)
)
TestSM = (
    pd.concat([test_sm_n, test_sm_hate])
    .sample(frac=1, random_state=300)
    .reset_index(drop=True)
)

In [None]:
def clean_text(line):
  """Lower-case ``line``, tokenize it and drop punctuation tokens.

  Args:
    line: Raw text of one example.

  Returns:
    The remaining tokens joined by single spaces.
  """
  # Built once per call instead of once per token.
  punctuation = set(string.punctuation)
  tokens = word_tokenize(line.lower())
  # Only tokens that are exactly one ASCII punctuation character are removed;
  # multi-character tokens such as "..." are kept.
  return " ".join(token for token in tokens if token not in punctuation)

In [None]:
# Cleaned sentences that feed both the RoBERTa encoder and the string input of the model
train_sentence_list = np.asarray([clean_text(sentence) for sentence in train.text.values])
test_sentence_list = np.asarray([clean_text(sentence) for sentence in test.text.values])

In [None]:
# Tab-separated lexicon file; its distinct `lemma` values become the vocabulary
# handed to the TextVectorization layer in the model.
df = pd.read_csv('hurtlex_EN.tsv', delimiter='\t')
vocab = pd.unique(df["lemma"])

In [None]:
# Sequence length used for the RoBERTa inputs and for the TextVectorization output.
MAX_LEN = 128
# Vocabulary cap for TextVectorization and input size of the Embedding layer.
max_features = 6_000
# NOTE(review): not referenced by the other visible cells (the Embedding layer
# hard-codes output_dim=128) — confirm whether it is still needed.
embedding_dims = 20

In [None]:
# Checkpoint name shared by the tokenizer here and by the model built later.
MODEL_NAME = "roberta-base"
# Loads the tokenizer files for that checkpoint (downloaded on first use).
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def roberta_encode(texts, tokenizer, max_len=None):
    """Turn texts into fixed-length id / mask arrays for the RoBERTa inputs.

    Args:
        texts: Sized iterable of strings.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every output row. Defaults to the module-level
            ``MAX_LEN`` when omitted.

    Returns:
        Dict of three ``int32`` arrays of shape ``(len(texts), max_len)``:
        ``'input_word_ids'`` (padded with 1), ``'input_mask'`` (1 on real
        positions, 0 on padding) and ``'input_type_ids'`` (all zeros).

    Raises:
        ValueError: If ``max_len`` is smaller than 2, which leaves no room for
            the start and end tokens.
    """
    if max_len is None:
        max_len = MAX_LEN
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to hold the start and end tokens")

    ct = len(texts)
    # Rows start fully padded; 1 is the padding id in the roberta-base vocabulary.
    input_ids = np.ones((ct, max_len), dtype='int32')
    attention_mask = np.zeros((ct, max_len), dtype='int32')
    token_type_ids = np.zeros((ct, max_len), dtype='int32')  # Not used in text classification

    for k, text in enumerate(texts):
        tok_text = tokenizer.tokenize(text)

        # Truncate so the two special tokens still fit, then map tokens to ids.
        enc_text = tokenizer.convert_tokens_to_ids(tok_text[:max_len - 2])

        # Wrap with ids 0 and 2 (the <s> / </s> tokens of roberta-base).
        row = [0] + list(enc_text) + [2]
        input_ids[k, :len(row)] = row

        # Mark the real (non-padding) positions.
        attention_mask[k, :len(row)] = 1

    return {
        'input_word_ids': input_ids,
        'input_mask': attention_mask,
        'input_type_ids': token_type_ids
    }

In [None]:
# Each dict holds the three RoBERTa input arrays plus, under "np_sm", the rows of
# the matching feature frame cast to float32.
train_dict = {
    **roberta_encode(train_sentence_list, tokenizer),
    "np_sm": np.asarray(TrainSM.values, dtype="float32"),
}

test_dict = {
    **roberta_encode(test_sentence_list, tokenizer),
    "np_sm": np.asarray(TestSM.values, dtype="float32"),
}

In [None]:
# Model inputs are the feature dicts as-is (aliases, not copies).
X_train, X_test = train_dict, test_dict

# Targets as int32 column vectors of shape (n_samples, 1).
y_train = np.asarray(train["label"], dtype='int32')[:, np.newaxis]
y_test = np.asarray(test["label"], dtype='int32')[:, np.newaxis]

In [None]:
def build_model(sm_features=157):
  """Build and compile the three-branch binary classifier.

  Branches:
    * RoBERTa (``MODEL_NAME``) over the id / mask / type-id inputs,
      dropout 0.3, flattened, projected by a linear Dense(128).
    * The ``np_sm`` feature vector, used unchanged.
    * Raw sentences -> TextVectorization over ``vocab`` -> Embedding,
      dropout 0.5, flattened, projected by a linear Dense(128).
  The three branches are concatenated and passed through Dense(32, relu),
  BatchNormalization and a single sigmoid unit.

  Args:
    sm_features: Width of the ``np_sm`` input. Defaults to 157.
      NOTE(review): presumably the column count of TrainSM/TestSM — confirm.

  Returns:
    A compiled ``Model`` whose inputs are, in order: ``input_word_ids``,
    ``input_mask``, ``input_type_ids``, ``np_sm``, ``sents``.
  """
  input_word_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_word_ids')
  input_mask = Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_mask')
  input_type_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_type_ids')
  np_sm = Input(shape=(sm_features,), dtype=tf.float32, name='np_sm')
  # Named so the string input can also be fed by key.
  sents = Input(shape=(), dtype=tf.string, name='sents')

  roberta_model = TFRobertaModel.from_pretrained(MODEL_NAME)
  x = roberta_model(input_word_ids, attention_mask=input_mask, token_type_ids=input_type_ids)
  # First element of the model output; presumably the per-token hidden states — confirm.
  x = x[0]

  # NOTE(review): `vocab` must fit within `max_features` tokens, otherwise
  # TextVectorization is expected to reject it — confirm against the lexicon size.
  vectorize_layer = TextVectorization(max_tokens=max_features, output_mode='int', output_sequence_length=MAX_LEN, vocabulary=vocab)(sents)
  y = Embedding(input_dim=max_features, output_dim=128, input_length=MAX_LEN)(vectorize_layer)

  x = tf.keras.layers.Dropout(rate=0.3)(x)
  x = tf.keras.layers.Flatten()(x)
  x = tf.keras.layers.Dense(128)(x)

  y = tf.keras.layers.Dropout(rate=0.5)(y)
  y = tf.keras.layers.Flatten()(y)
  y = tf.keras.layers.Dense(128)(y)

  z = tf.keras.layers.concatenate([x, np_sm, y])
  z = tf.keras.layers.Dense(32, activation='relu')(z)
  z = tf.keras.layers.BatchNormalization()(z)
  z = tf.keras.layers.Dense(1, activation='sigmoid')(z)

  model = Model(inputs=[input_word_ids, input_mask, input_type_ids, np_sm, sents], outputs=z)

  model.compile(
      optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
      loss=tf.keras.losses.binary_crossentropy,
      metrics=['accuracy'])

  return model


In [None]:
# `build_model` is the builder defined in this notebook; no `build_model1` exists.
model = build_model()
model.summary()
tf.keras.utils.plot_model(model)

In [None]:
print('Training...')
# Feed the five inputs as a flat list in the order the model declares them
# (input_word_ids, input_mask, input_type_ids, np_sm, sentences), so each array
# reaches the intended input.
fit_input_order = ('input_word_ids', 'input_mask', 'input_type_ids', 'np_sm')
fit_train_inputs = [X_train[key] for key in fit_input_order] + [train_sentence_list]
fit_val_inputs = [X_test[key] for key in fit_input_order] + [test_sentence_list]

history = model.fit(fit_train_inputs,
                    y_train,
                    epochs=25,
                    batch_size=16,
                    verbose=1,
                    validation_data=(fit_val_inputs, y_test))

In [None]:
# Training vs. validation accuracy per epoch; the curves become more informative
# when the model is trained for more epochs.
accuracy_fig, accuracy_ax = plt.subplots(figsize=(10, 10))
accuracy_ax.set_title('Accuracy')

xaxis = np.arange(len(history.history['accuracy']))
for metric_key, curve_label in (('accuracy', 'Train set'), ('val_accuracy', 'Validation set')):
    accuracy_ax.plot(xaxis, history.history[metric_key], label=curve_label)
accuracy_ax.legend()


In [None]:
# The model has five inputs; X_test holds only four of them, so the cleaned test
# sentences are appended. The list follows the model's declared input order.
eval_inputs = [X_test[key] for key in ('input_word_ids', 'input_mask', 'input_type_ids', 'np_sm')]
eval_inputs.append(test_sentence_list)

scores = model.evaluate(eval_inputs, y_test, verbose=0)
# scores[0] is the loss, scores[1] the accuracy metric passed to compile().
print("Accuracy: %.2f%%" % (scores[1] * 100))

In [None]:
# Predict with all five model inputs in declared order (X_test lacks the sentences).
predict_inputs = [X_test[key] for key in ('input_word_ids', 'input_mask', 'input_type_ids', 'np_sm')]
predict_inputs.append(test_sentence_list)

# Round the sigmoid outputs to 0/1 in one vectorised step and flatten to a plain
# list of ints (avoids calling int() on one-element arrays).
y_pred = np.round(model.predict(predict_inputs)).astype(int).ravel().tolist()

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the rounded predictions on the test set.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
from sklearn.metrics import confusion_matrix
# Raw counts first; then every row (true class) is divided by its own total so
# the entries of a row sum to 1.
raw_confusion_counts = confusion_matrix(y_test, y_pred)
row_totals = raw_confusion_counts.sum(axis=1, keepdims=True)
con_mat_df = raw_confusion_counts.astype('float') / row_totals

In [None]:
import seaborn as sns
# Annotated heatmap of the row-normalised confusion matrix; the matrix is also
# printed as text.
figure = plt.figure(figsize=(10, 10))
heatmap_ax = sns.heatmap(con_mat_df, cmap=plt.cm.Blues, annot=True)
heatmap_ax.set_ylabel('True label')
heatmap_ax.set_xlabel('Predicted label')
print(con_mat_df)