<a href="https://colab.research.google.com/github/Drime648/KAR-P/blob/main/cupertino_hack_2021.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get the data

In [None]:
!wget https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv

In [None]:
import pandas as pd

In [None]:
# Load the CSV fetched by the wget cell. The path is relative because wget
# writes into the current working directory, which is only "/content" on Colab.
df = pd.read_csv("labeled_data.csv")

In [None]:
# Display the freshly loaded frame (all original columns).
df

In [None]:
# Drop the index and vote-count columns so only the remaining columns
# ("class" and "tweet" are the ones used below) are kept.
# errors="ignore" makes the cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
df = df.drop(columns=["Unnamed: 0", "count", 'hate_speech', 'offensive_language', "neither"], errors="ignore")

In [None]:
# Display the frame again after the unused columns were dropped.
df

In [None]:
# Human-readable label names; later cells index this list with the predicted
# class id, so the order matters.
class_names = [
    "hate speech",
    "offensive language",
    "none",
]
# Number of output units for every classifier head in this notebook.
num_classes = len(class_names)
num_classes

# Split data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the tweets for validation; the fixed seed keeps the split
# reproducible across runs.
train_text, val_text, train_labels, val_labels = train_test_split(
    df["tweet"].to_numpy(),
    df["class"].to_numpy(),
    test_size=0.1,
    random_state=42,
)

In [None]:
# Sanity check: sizes of the four arrays, in the order
# (train_text, train_labels, val_text, val_labels).
tuple(len(split) for split in (train_text, train_labels, val_text, val_labels))

In [None]:
# Display the training texts and their labels side by side.
train_text, train_labels

# Baseline

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
# Baseline: TF-IDF features fed into a multinomial Naive Bayes classifier.
model_0 = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", MultinomialNB()),
    ]
)

In [None]:
# Fit the baseline pipeline on the raw training tweets.
model_0.fit(train_text, train_labels)

In [None]:
# Score the baseline on the held-out validation split.
base_score = model_0.score(val_text, val_labels)

In [None]:
# Show the baseline score as a percentage.
base_score * 100

# Text Vectorization

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt

In [None]:
# Whitespace-separated word count of every training tweet.
text_lengths = [len(tweet.split()) for tweet in train_text]

In [None]:
# Histogram of tweet lengths (in words) using 10 bins.
plt.hist(text_lengths, bins=10)

In [None]:
# Sequence length (in word tokens) that the word-level vectorizer outputs.
# Despite the name, this is passed as output_sequence_length below,
# not as a vocabulary-size cap.
max_tokens = 32

In [None]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
# Word-level vectorizer: strings -> integer token ids with a fixed output
# length of max_tokens. No vocabulary-size limit is set.
text_vectorizer = TextVectorization(output_sequence_length=max_tokens)

In [None]:
# Build the vocabulary from the training tweets only.
text_vectorizer.adapt(train_text)

In [None]:
# Show how many tokens the adapted vocabulary contains.
len(text_vectorizer.get_vocabulary())

In [None]:
# Vocabulary size, used as the input dimension of the word embedding layer.
len_vocab = len(text_vectorizer.get_vocabulary())

# Embedding

In [None]:
from tensorflow.keras.layers import Embedding

In [None]:
# Trainable word embedding: len_vocab token ids -> 128-dimensional vectors.
# mask_zero=True marks token id 0 as padding.
# NOTE: this single layer object is reused by every word-level model below,
# so its weights are shared between them.
embedding_layer = Embedding(len_vocab, 128, mask_zero=True, name = "embedding_layer")

In [None]:
# Vectorize one example sentence to try out the preprocessing.
sample_sentence = text_vectorizer(["hello there larry, my face sucks."])

In [None]:
# Run the vectorized example through the embedding layer.
sample_embed = embedding_layer(sample_sentence)

In [None]:
# Display the embedded example tensor.
sample_embed

# Make Token Dataset

In [None]:
# Wrap the (text, label) arrays as tf.data datasets; the text stays as raw
# strings because vectorization happens inside the models.
train_dataset = tf.data.Dataset.from_tensor_slices((train_text, train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((val_text, val_labels))


In [None]:
# Batch into groups of 32 and prefetch.
# NOTE: this rebinds train_dataset / val_dataset in place, so re-running this
# cell without re-running the previous one batches the data a second time.
train_dataset = train_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(32).prefetch(tf.data.AUTOTUNE)


In [None]:
# Display the dataset's repr.
train_dataset

# Make Model 1

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

In [None]:
# Multiply the learning rate by 0.2 after 3 epochs without val_loss
# improvement, never going below 1e-6.
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.2,
    patience=3,
    min_lr=1e-6,
)


In [None]:
# Stop training after 6 epochs without improvement and roll back to the best
# weights. The monitored quantity is spelled out; "val_loss" is the Keras default.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=6,
    restore_best_weights=True,
)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

In [None]:
# Model 1: word-level 1D CNN classifier that takes raw strings as input.
inputs = layers.Input(shape = (1,), dtype = tf.string, name = "inputs")
text_vectors = text_vectorizer(inputs)  # strings -> integer token ids
embeds = embedding_layer(text_vectors)  # token ids -> embedding vectors

# Convolve over windows of 3 tokens, then keep the max response per filter.
x = layers.Conv1D(128, 3, padding = "same", activation="relu")(embeds)
x = layers.GlobalMaxPooling1D()(x)

# One softmax probability per class.
outputs = layers.Dense(num_classes, activation="softmax", name = "outputs")(x)

model_1 = tf.keras.Model(inputs, outputs, name = "Model_1")

# Sparse loss: labels are supplied as a single class-id column, not one-hot.
model_1.compile(loss = "sparse_categorical_crossentropy",
                optimizer = "Adam",
                metrics = ["accuracy"])

In [None]:
# Print the layer-by-layer summary of Model 1.
model_1.summary()

In [None]:
from tensorflow.keras.utils import plot_model
# Render the Model 1 architecture as a diagram.
plot_model(model_1)

In [None]:
# Train Model 1 for up to 30 epochs, going over every training and validation
# batch each epoch; the callbacks may stop early or lower the learning rate.
history_1 = model_1.fit(train_dataset, epochs = 30, steps_per_epoch=len(train_dataset),
                        validation_data=val_dataset,
                        validation_steps= len(val_dataset),
                        callbacks = [early_stopping, reduce_lr])

In [None]:
import matplotlib.pyplot as plt

def plot_loss_curves(history):
  """Plot training vs. validation curves for loss and accuracy.

  Args:
    history: object whose `.history` dict holds per-epoch lists under the
      keys 'loss', 'val_loss', 'accuracy' and 'val_accuracy' (as returned by
      the `fit` calls in this notebook).

  The loss curves are drawn on the current figure; the accuracy curves are
  drawn on a newly created figure.
  """
  logs = history.history

  # Read every series up front so a missing key fails before anything is drawn.
  train_loss, valid_loss = logs['loss'], logs['val_loss']
  train_acc, valid_acc = logs['accuracy'], logs['val_accuracy']

  epochs = range(len(logs['loss']))

  # Loss curves
  plt.plot(epochs, train_loss, label='training_loss')
  plt.plot(epochs, valid_loss, label='val_loss')
  plt.title('Loss')
  plt.xlabel('Epochs')
  plt.legend()

  # Accuracy curves, on their own figure
  plt.figure()
  plt.plot(epochs, train_acc, label='training_accuracy')
  plt.plot(epochs, valid_acc, label='val_accuracy')
  plt.title('Accuracy')
  plt.xlabel('Epochs')
  plt.legend();

In [None]:
# Plot the loss and accuracy curves recorded while training Model 1.
plot_loss_curves(history_1)

In [None]:
# Evaluate Model 1 on the full validation dataset.
model_1.evaluate(val_dataset)

In [None]:
# Save Model 1 to "hack_model", relative to the working directory.
tf.keras.models.save_model(model_1, "hack_model")

In [None]:
# Run Model 1 on one hand-written example sentence and show the raw
# per-class probabilities.
model_1_pred_probs = model_1.predict(["black people are stupid"])
model_1_pred_probs

In [None]:
# Map the highest-probability index back to its label name.
# np.argmax is taken over the flattened array, which is only correct while a
# single sentence is predicted at a time.
model_1_pred = class_names[np.argmax(model_1_pred_probs)]

In [None]:
# Display the predicted label name.
model_1_pred

In [None]:
import numpy as np
# Show the installed NumPy version.
np.__version__

In [None]:
# Save Model 1 again, this time to "CNN_model" (relative to the working
# directory); this is the copy reloaded in the next cell.
model_1.save('CNN_model')


In [None]:
# Reload the model saved above. The path is relative so it matches the
# relative path used by model_1.save(), instead of assuming the working
# directory is Colab's "/content".
new_cnn = tf.keras.models.load_model("CNN_model")

In [None]:
# Print the reloaded model's summary to confirm the architecture round-tripped.
new_cnn.summary()

# Model 2

In [None]:
def split_chars(text):
  """Return `text` with a single space inserted between every character.

  Example: "abc" -> "a b c". Existing spaces are characters too, so "a b"
  becomes "a   b".
  """
  spaced = " ".join(text)
  return spaced

In [None]:
# Character-spaced versions of the training and validation tweets.
train_chars = list(map(split_chars, train_text))
val_chars = list(map(split_chars, val_text))

In [None]:
# Peek at the first ten character-spaced tweets.
train_chars[:10]


In [None]:
# Length in characters of every training tweet, measured on the original
# text rather than on the character-spaced version.
char_lengths = [len(tweet) for tweet in train_text]

In [None]:
# 95th percentile of tweet length in characters, truncated to an int; used as
# the character-level sequence length.
cover_most_chars = int(np.percentile(char_lengths, 95))

In [None]:
# Display the chosen character sequence length.
cover_most_chars

In [None]:
# Vocabulary-size cap passed to the character-level vectorizer.
# NOTE(review): the rationale for 70 is not shown in this notebook — confirm.
num_char_tokens = 70

In [None]:
# Character-level vectorizer: at most num_char_tokens vocabulary entries, with
# every sequence padded or truncated to cover_most_chars tokens.
char_vectorizer = TextVectorization(
    max_tokens=num_char_tokens,
    output_sequence_length=cover_most_chars,
    name="char_vectorizing_layer",
)

In [None]:
# Build the character vocabulary from the character-spaced training tweets.
char_vectorizer.adapt(train_chars)

In [None]:
# Size of the adapted character vocabulary, used as the embedding input dimension.
char_len_vocab = len(char_vectorizer.get_vocabulary())


In [None]:
# Trainable character embedding: char ids -> 25-dimensional vectors, with
# id 0 masked as padding.
char_embedding_layer = Embedding(char_len_vocab, 25, mask_zero=True, name = "char_embedding_layer")

In [None]:
# Character-only datasets: (character-spaced text, label) pairs, batched by 32
# and prefetched.
char_train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_chars, train_labels))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

char_val_dataset = (
    tf.data.Dataset.from_tensor_slices((val_chars, val_labels))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Display the character dataset's repr.
char_train_dataset

In [None]:
import tensorflow_hub as hub

In [None]:
# Load the Universal Sentence Encoder (v4) from TF Hub as a frozen layer.
# NOTE: apart from the demo call in the next cell, no model in the visible
# part of this notebook uses this layer.
embed_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4", trainable=False, name = "encoder")

In [None]:
# Smoke-test the encoder on two example sentences and print the resulting
# embeddings.
embeddings = embed_layer([
    "The quick brown fox jumps over the lazy dog.",
    "I am a sentence for which I would like to get its embedding"])

print(embeddings)

In [None]:
# Word-level branch of the hybrid model: same Conv1D + global max pooling
# stack as Model 1, but without a classification head.
# NOTE: text_vectorizer and embedding_layer are the same layer objects used by
# Model 1, so the embedding weights are shared with it.
token_inputs = layers.Input(shape = (1,), dtype = tf.string, name = "token_inputs")
text_vectors = text_vectorizer(token_inputs)
embeds = embedding_layer(text_vectors)
x = layers.Conv1D(128, 3, padding = "same", activation="relu")(embeds)
x = layers.GlobalMaxPooling1D()(x)
token_model = tf.keras.Model(inputs = token_inputs, outputs = x)

In [None]:
# Character-level branch of the hybrid model: character-spaced strings ->
# char ids -> char embeddings -> bidirectional LSTM (24 units per direction).
char_inputs = layers.Input(shape = (1,), dtype = tf.string, name = "char_inputs")
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embedding_layer(char_vectors)
char_bi_lstm = layers.Bidirectional(layers.LSTM(24))(char_embeddings)
char_model = tf.keras.Model(char_inputs, char_bi_lstm)

In [None]:
# Join the word-level and character-level feature vectors into one tensor.
combined_embeddings = layers.Concatenate(name = "concat")([token_model.output, char_model.output])

In [None]:
# Classification head: dropout -> 128-unit ReLU layer -> dropout -> softmax
# over the classes.
hybrid_dropout = layers.Dropout(0.5)(combined_embeddings)
hybrid_dense = layers.Dense(128, activation = "relu")(hybrid_dropout)
end_dropout = layers.Dropout(0.5)(hybrid_dense)
output = layers.Dense(num_classes, activation = "softmax")(end_dropout)

In [None]:
# Hybrid model with two string inputs, in this order: (word-level text,
# character-spaced text). The datasets fed to it must use the same order.
model_2 = tf.keras.Model(inputs = [token_model.input, char_model.input],
                         outputs = output,
                         name = "Model_2")

In [None]:
# Print the layer-by-layer summary of the hybrid model.
model_2.summary()

In [None]:
# Same loss, optimizer and metric as Model 1.
model_2.compile(loss = "sparse_categorical_crossentropy",
                optimizer = "Adam",
                metrics = ["accuracy"])

In [None]:
# Training data for the hybrid model: ((text, char-spaced text), label).
# The feature order matches model_2's inputs [token_model.input, char_model.input].
hybrid_train_data = tf.data.Dataset.from_tensor_slices((train_text, train_chars))
hybrid_train_labels = tf.data.Dataset.from_tensor_slices(train_labels)
hybrid_train_dataset = tf.data.Dataset.zip((hybrid_train_data, hybrid_train_labels))

# Batch by 32 and prefetch, then display the dataset's repr.
hybrid_train_dataset = hybrid_train_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
hybrid_train_dataset

In [None]:
# Validation data for the hybrid model, built the same way as the training data:
# ((text, char-spaced text), label).
hybrid_val_data = tf.data.Dataset.from_tensor_slices((val_text, val_chars))
hybrid_val_labels = tf.data.Dataset.from_tensor_slices(val_labels)
hybrid_val_dataset = tf.data.Dataset.zip((hybrid_val_data, hybrid_val_labels))

# Batch by 32 and prefetch, then display the dataset's repr.
hybrid_val_dataset = hybrid_val_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
hybrid_val_dataset

In [None]:
# Train the hybrid model for up to 30 epochs.
# Validation runs over every validation batch, so the val_loss watched by
# early_stopping / reduce_lr reflects the whole validation split rather than
# a 10% slice of it, and matches the other fit() calls in this notebook.
history_2 = model_2.fit(hybrid_train_dataset, epochs = 30, steps_per_epoch=len(hybrid_train_dataset),
                        validation_data=hybrid_val_dataset,
                        validation_steps= len(hybrid_val_dataset),
                        callbacks = [early_stopping, reduce_lr])

In [None]:
# Evaluate the hybrid model on the full validation dataset.
model_2.evaluate(hybrid_val_dataset)

# Failed experiments

In [None]:
# Experiment: stacked-LSTM classifier on the word-level embeddings.
# NOTE: this cell rebinds model_2 / history_2, replacing the hybrid model.
inputs = layers.Input(shape = (1,), dtype = tf.string, name = "inputs")
text_vectors = text_vectorizer(inputs)
embeds = embedding_layer(text_vectors)

# The first recurrent layer must consume `embeds` so the graph is connected to
# `inputs`; a leftover `x` from an earlier cell belongs to a different model.
x = layers.LSTM(units = 64, return_sequences=True)(embeds) # inputs = 3 dimensions, output = 3 dimensions. Return sequences must be true when stacking RNNs
x = layers.LSTM(64)(x) # inputs = 3 dimensions, output = 2 dimensions

outputs = layers.Dense(num_classes, activation="softmax", name = "outputs")(x)

# NOTE(review): the Keras model name is "Model_1" although the variable is
# model_2 — looks like a copy-paste leftover; confirm before renaming.
model_2 = tf.keras.Model(inputs, outputs, name = "Model_1")

model_2.compile(loss = "sparse_categorical_crossentropy",
                optimizer = "Adam",
                metrics = ["accuracy"])

history_2 = model_2.fit(train_dataset, epochs = 30, steps_per_epoch=len(train_dataset),
                        validation_data=val_dataset,
                        validation_steps= len(val_dataset),
                        callbacks = [early_stopping, reduce_lr])


In [None]:
# Experiment: stacked-GRU classifier on the word-level embeddings.
# NOTE: this cell rebinds model_2 / history_2 once more.
inputs = layers.Input(shape = (1,), dtype = tf.string, name = "inputs")
text_vectors = text_vectorizer(inputs)
embeds = embedding_layer(text_vectors)

# The first recurrent layer must consume `embeds` so the graph is connected to
# `inputs`; a leftover `x` from an earlier cell belongs to a different model.
x = layers.GRU(units = 64, return_sequences=True)(embeds) # inputs = 3 dimensions, output = 3 dimensions. Return sequences must be true when stacking RNNs
x = layers.GRU(64)(x) # inputs = 3 dimensions, output = 2 dimensions

outputs = layers.Dense(num_classes, activation="softmax", name = "outputs")(x)

# NOTE(review): the Keras model name is "Model_1" although the variable is
# model_2 — looks like a copy-paste leftover; confirm before renaming.
model_2 = tf.keras.Model(inputs, outputs, name = "Model_1")

model_2.compile(loss = "sparse_categorical_crossentropy",
                optimizer = "Adam",
                metrics = ["accuracy"])

history_2 = model_2.fit(train_dataset, epochs = 30, steps_per_epoch=len(train_dataset),
                        validation_data=val_dataset,
                        validation_steps= len(val_dataset),
                        callbacks = [early_stopping, reduce_lr])


# Predictions

In [None]:
!wget https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/lexicons/refined_ngram_dict.csv

In [None]:
import pandas as pd
# Load the lexicon fetched by the wget cell. The path is relative because wget
# writes into the current working directory, which is only "/content" on Colab.
targets = pd.read_csv("refined_ngram_dict.csv")

In [None]:
# Remove the "prophate" column from the lexicon frame.
targets = targets.drop(columns="prophate")

In [None]:
# Keep only the "ngram" column; from here on `targets` is a Series rather
# than a DataFrame.
targets = targets["ngram"]

In [None]:
import pandas as pd
# Read the same lexicon straight from its URL and display it; the result is
# not assigned to any variable.
pd.read_csv("https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/lexicons/refined_ngram_dict.csv")

In [None]:
"allah akbar" == targets[0]