In [1]:
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
#####################################################################
#                           Set C                                   #
#####################################################################
# Testing a variety of NN architectures with Embeddings             #
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
import numpy as np
import time
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, KFold
from utilities.helping_functions import create_embedding_matrix
from utilities.attention_layer import Attention
from utilities.preprocess import Preproccesor
from keras.preprocessing.text import Tokenizer
from keras.layers.embeddings import Embedding
from keras.models import Sequential, clone_model, model_from_json
from keras.optimizers import Adam
from keras import Input, Model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.engine import Layer
from keras import backend as K
from keras import initializers, regularizers, constraints
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, SpatialDropout1D, CuDNNLSTM, Bidirectional, Dense, \
    LSTM, Conv1D, MaxPooling1D, Dropout, concatenate, Flatten, add
from keras.preprocessing.sequence import pad_sequences
from keras_preprocessing.sequence import pad_sequences
import pandas as pd
pd.set_option('max_colwidth', 400)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  return f(*args, **kwds)


In [2]:
# Download the WordNet and stop-word corpora through NLTK.
# NOTE(review): presumably required by utilities.preprocess -- confirm.
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/johnmollas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/johnmollas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the corpus and append a results-table header to setC.txt.
# NOTE(review): the meaning of the two boolean flags is defined inside
# Preproccesor.load_data and is not visible here -- confirm before changing.
X, y = Preproccesor.load_data(True, False)
class_names = ['noHateSpeech', 'hateSpeech']

# Build the header once so the console and the results file always agree.
header_line = "{:<10} | {:<7} {:<7} {:<7} {:<7} {:<7} {:<7}".format(
    'Method', 'F1score', 'Precisi', 'Recall', 'Accurac', 'Specifi', 'Sensiti')

# Append mode keeps rows from earlier runs; the context manager closes the
# handle even if one of the writes raises.
with open("setC.txt", "a+") as results_file:
    results_file.write(header_line + " \n")
    results_file.write("=========================================================================\n")
print(header_line)


Method     | F1score Precisi Recall  Accurac Specifi Sensiti


In [4]:
import zipfile
!wget 'https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip'
!wget 'http://nlp.stanford.edu/data/glove.42B.300d.zip'
with zipfile.ZipFile("/content/crawl-300d-2M.vec.zip", "r") as zip_ref:
    zip_ref.extractall()
    print(zip_ref.filelist)
with zipfile.ZipFile("/content/glove.42B.300d.zip", "r") as zip_ref:
    zip_ref.extractall()
    print(zip_ref.filelist)

del zip_ref


--2020-11-07 16:20:22--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 2606:4700:10::6816:4b8e, 2606:4700:10::6816:4a8e, 2606:4700:10::ac43:904, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|2606:4700:10::6816:4b8e|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1523785255 (1,4G) [application/zip]
Saving to: ‘crawl-300d-2M.vec.zip’


2020-11-07 16:25:58 (4,35 MB/s) - ‘crawl-300d-2M.vec.zip’ saved [1523785255/1523785255]

--2020-11-07 16:25:58--  http://nlp.stanford.edu/data/glove.42B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.42B.300d.zip [following]
--2020-11-07 16:25:59--  https://nlp.stanford.edu/data/glove.42B.300d.zip
Connecting to nlp.stanf

FileNotFoundError: [Errno 2] No such file or directory: '/content/crawl-300d-2M.vec.zip'

In [0]:
!rm '/content/crawl-300d-2M.vec.zip'
!rm '/content/glove.42B.300d.zip'

In [5]:
def specificity(y_true, y_pred):
    """Return the true-negative rate TN / (TN + FP) for binary labels.

    Labels are assumed to be encoded as 0 (negative) and 1 (positive), which
    is how predictions are thresholded elsewhere in this notebook.

    ``labels=[0, 1]`` pins the confusion matrix to 2x2. Without it, a split in
    which only one class appears in both ``y_true`` and ``y_pred`` yields a
    1x1 matrix and the four-way unpacking below raises ``ValueError``.

    Returns 0 when ``y_true`` contains no negative samples.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    negatives = tn + fp
    if negatives > 0:
        return tn / negatives
    return 0


def sensitivity(y_true, y_pred):
    """Return the true-positive rate TP / (TP + FN) for binary labels.

    Labels are assumed to be encoded as 0 (negative) and 1 (positive), which
    is how predictions are thresholded elsewhere in this notebook.

    ``labels=[0, 1]`` pins the confusion matrix to 2x2. Without it, a split in
    which only one class appears in both ``y_true`` and ``y_pred`` yields a
    1x1 matrix and the four-way unpacking below raises ``ValueError``.

    Returns 0 when ``y_true`` contains no positive samples.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    positives = tp + fn
    if positives > 0:
        return tp / positives
    return 0

In [6]:
# Locations of the pre-trained word-vector files and their dimensionality.
# NOTE(review): neither path is referenced by the cells visible in this
# notebook; presumably utilities.helping_functions reads its own copies -- confirm.
embedding_path1 = "/embeddings/crawl-300d-2M.vec" #FastText
embedding_path2 = "/embeddings/glove.42B.300d.txt" #Glove 300d
# Overwritten by the experiment sweep cell before any model is built.
embed_size = 300

In [7]:
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Args:
        word: The token, returned unchanged.
        *arr: The vector components; anything NumPy can convert to float32
            (for example the string fields of a split embedding-file line).

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 array.
    """
    vector = np.asarray(arr, dtype='float32')
    return (word, vector)

In [0]:
# Cross-validation setup shared by every experiment: 10 stratified folds,
# shuffled with a fixed seed so the splits are the same on every run.
n_fold = 10
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=7)

In [0]:
def build_model1(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix, lr=0.0, lr_d=0.0, spatial_dr=0.0, dense_units=128, conv_size=128, dr=0.2, patience=3, fold_id=1):
    """Train a CNN + attention classifier and return it with its best weights.

    A frozen embedding feeds a spatial dropout; from there an attention layer
    and a two-stage Conv1D/MaxPooling1D stack run in parallel, are
    concatenated, and pass through a dense layer, dropout and a sigmoid unit.

    The model is fitted for up to 20 epochs (batch size 16) with early
    stopping on ``val_accuracy``; the best epoch is checkpointed to
    ``best_model_fold_{fold_id}.hdf5`` and reloaded before returning.

    ``lr``, ``lr_d`` and ``dr`` are accepted but not used by this builder;
    the optimizer is ``Adam()`` with its default settings.

    Returns:
        A compiled Keras ``Model`` carrying the checkpointed weights.
    """
    weights_file = f"best_model_fold_{fold_id}.hdf5"
    checkpoint_cb = ModelCheckpoint(weights_file,
                                    monitor="val_accuracy",
                                    verbose=1,
                                    save_best_only=True,
                                    mode="max")
    stopper_cb = EarlyStopping(monitor="val_accuracy",
                               mode="max",
                               patience=patience)

    # Shared front end: token ids -> frozen embeddings -> spatial dropout.
    tokens_in = Input(shape=(max_len,))
    embedding_layer = Embedding(max_features + 1,
                                embed_size * 2,
                                weights=[embedding_matrix],
                                trainable=False)
    embedded = embedding_layer(tokens_in)
    dropped = SpatialDropout1D(spatial_dr)(embedded)

    # Attention branch and convolutional branch both read the dropped embeddings.
    attended = Attention(max_len)(dropped)
    conv = Conv1D(conv_size, 2, activation='relu', padding='same')(dropped)
    conv = MaxPooling1D(5, padding='same')(conv)
    conv = Conv1D(conv_size, 3, activation='relu', padding='same')(conv)
    conv = MaxPooling1D(5, padding='same')(conv)
    flat = Flatten()(conv)

    # Classification head.
    merged = concatenate([flat, attended])
    head_dropout = Dropout(0.5)
    dense_out = Dense(dense_units, activation='relu')(merged)
    regularised = head_dropout(dense_out)
    probability = Dense(1, activation="sigmoid")(regularised)

    trainer = Model(inputs=tokens_in, outputs=probability)
    trainer.compile(loss="binary_crossentropy",
                    optimizer=Adam(),
                    metrics=["accuracy"])

    # Second Model over the same graph; it receives the checkpointed weights.
    best = Model(inputs=tokens_in, outputs=probability)
    trainer.fit(X_train, y_train,
                batch_size=16,
                epochs=20,
                validation_data=(X_valid, y_valid),
                verbose=1,
                callbacks=[stopper_cb, checkpoint_cb])
    best.load_weights(weights_file)
    best.compile(loss="binary_crossentropy",
                 optimizer=Adam(),
                 metrics=["accuracy"])
    return best


def build_model3(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix, lr=0.0, lr_d=0.0, spatial_dr=0.0, dense_units=128, conv_size=128, dr=0.2, patience=3, fold_id=1):
    """Train a single-LSTM classifier and return it with its best weights.

    Architecture: frozen embedding -> LSTM(300) -> dropout -> Dense(200) ->
    dropout -> Dense(100) -> sigmoid unit.

    The model is fitted for up to 20 epochs (batch size 16) with early
    stopping on ``val_accuracy``; the best epoch is checkpointed to
    ``best_model_fold_{fold_id}.hdf5`` and reloaded before returning.

    Of the tuning keywords only ``patience`` and ``fold_id`` are used here;
    the optimizer is ``Adam()`` with its default settings.

    Returns:
        A compiled Keras ``Model`` carrying the checkpointed weights.
    """
    weights_file = f"best_model_fold_{fold_id}.hdf5"
    checkpoint_cb = ModelCheckpoint(weights_file,
                                    monitor="val_accuracy",
                                    verbose=1,
                                    save_best_only=True,
                                    mode="max")
    stopper_cb = EarlyStopping(monitor="val_accuracy",
                               mode="max",
                               patience=patience)

    sequence_in = Input(shape=(max_len,), name='main_input')
    embedding_layer = Embedding(max_features + 1,
                                embed_size * 2,
                                weights=[embedding_matrix],
                                trainable=False)
    embedded = embedding_layer(sequence_in)

    # Recurrent encoder followed by a two-layer dense head.
    features = LSTM(300)(embedded)
    features = Dropout(rate=0.5)(features)
    features = Dense(200, activation='relu')(features)
    features = Dropout(rate=0.5)(features)
    features = Dense(100, activation='relu')(features)
    probability = Dense(1, activation='sigmoid')(features)

    trainer = Model(inputs=[sequence_in], outputs=[probability])
    trainer.compile(loss="binary_crossentropy",
                    optimizer=Adam(),
                    metrics=["accuracy"])

    # Second Model over the same graph; it receives the checkpointed weights.
    best = Model(inputs=[sequence_in], outputs=[probability])
    trainer.fit(X_train, y_train,
                batch_size=16,
                epochs=20,
                validation_data=(X_valid, y_valid),
                verbose=1,
                callbacks=[stopper_cb, checkpoint_cb])
    best.load_weights(weights_file)
    best.compile(loss="binary_crossentropy",
                 optimizer=Adam(),
                 metrics=["accuracy"])
    return best


def build_model4(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix, lr=0.0, lr_d=0.0, spatial_dr=0.0, dense_units=128, conv_size=128, dr=0.2, patience=3, fold_id=1):
    """Train a multi-branch CNN/LSTM classifier and return its best weights.

    A frozen embedding feeds a chain of five Conv1D layers. Six branches
    (max-pool -> dropout -> LSTM(100)) tap that chain, one after each of the
    first four convolutions and two after the last. A seventh branch runs
    LSTM(500) -> Dense(250) -> dropout directly on the embeddings. All
    branches are concatenated and passed through a dense head ending in a
    sigmoid unit.

    The optimizer is ``Adam(lr=lr, decay=lr_d)``. Training runs for up to 20
    epochs (batch size 16) with early stopping on ``val_accuracy``; the best
    epoch is checkpointed to ``best_model_fold_{fold_id}.hdf5`` and reloaded
    before returning. ``spatial_dr``, ``dense_units``, ``conv_size`` and
    ``dr`` are accepted but not used by this builder.

    Returns:
        A compiled Keras ``Model`` carrying the checkpointed weights.
    """
    weights_file = f"best_model_fold_{fold_id}.hdf5"
    checkpoint_cb = ModelCheckpoint(weights_file,
                                    monitor="val_accuracy",
                                    verbose=1,
                                    save_best_only=True,
                                    mode="max")
    stopper_cb = EarlyStopping(monitor="val_accuracy",
                               mode="max",
                               patience=patience)

    sequence_in = Input(shape=(max_len,), dtype='int32', name='main_input')
    embedding_layer = Embedding(max_features + 1,
                                embed_size * 2,
                                input_length=max_len,
                                weights=[embedding_matrix],
                                trainable=False)
    embedded = embedding_layer(sequence_in)

    # Convolution chain; every intermediate feature map is kept for a branch.
    conv_specs = ((128, 10), (64, 5), (32, 4), (16, 3), (8, 5))
    conv_maps = []
    current = embedded
    for n_filters, kernel in conv_specs:
        current = Conv1D(n_filters, kernel, activation='relu')(current)
        conv_maps.append(current)

    def pooled_lstm(feature_map, pool):
        """Max-pool, apply dropout, and summarise a feature map with an LSTM."""
        pooled = MaxPooling1D(pool_size=pool)(feature_map)
        pooled = Dropout(rate=0.5)(pooled)
        return LSTM(100)(pooled)

    # (index into conv_maps, pool size), in the order the branches are built.
    branch_plan = ((4, 3), (0, 10), (1, 8), (2, 6), (3, 4), (4, 3))
    branches = []
    for map_index, pool in branch_plan:
        branches.append(pooled_lstm(conv_maps[map_index], pool))

    # Purely recurrent branch on the raw embeddings.
    recurrent = LSTM(500)(embedded)
    recurrent = Dense(250, activation='relu')(recurrent)
    recurrent = Dropout(rate=0.5)(recurrent)

    merged = concatenate(branches + [recurrent])

    # Dense classification head.
    head = Dense(400, activation='relu')(merged)
    head = Dropout(0.5)(head)
    head = Dense(200, activation='relu')(head)
    head = Dense(100, activation='relu')(head)
    head = Dropout(0.5)(head)
    head = Dense(50, activation='relu')(head)
    probability = Dense(1, activation='sigmoid')(head)

    trainer = Model(inputs=[sequence_in], outputs=[probability])
    trainer.compile(loss="binary_crossentropy",
                    optimizer=Adam(lr=lr, decay=lr_d),
                    metrics=["accuracy"])

    # Second Model over the same graph; it receives the checkpointed weights.
    best = Model(inputs=[sequence_in], outputs=[probability])
    trainer.fit(X_train, y_train,
                batch_size=16,
                epochs=20,
                validation_data=(X_valid, y_valid),
                verbose=1,
                callbacks=[stopper_cb, checkpoint_cb])
    best.load_weights(weights_file)
    best.compile(loss="binary_crossentropy",
                 optimizer=Adam(lr=lr, decay=lr_d),
                 metrics=["accuracy"])
    return best


def build_model5(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix, lr=0.0, lr_d=0.0, spatial_dr=0.0, dense_units=128, conv_size=128, dr=0.2, patience=3, fold_id=1):
    """Train a stacked BiLSTM + pooling/attention classifier.

    Architecture: frozen embedding -> spatial dropout (0.3) -> two
    bidirectional LSTM(128) layers with dropout between them -> concatenation
    of global max pooling, global average pooling and attention -> Dense(256)
    -> dropout -> Dense(128) -> dropout -> sigmoid unit.

    The model is fitted for up to 20 epochs (batch size 16) with early
    stopping on ``val_accuracy``; the best epoch is checkpointed to
    ``best_model_fold_{fold_id}.hdf5`` and reloaded before returning.

    Of the tuning keywords only ``patience`` and ``fold_id`` are used here;
    the optimizer is ``Adam()`` with its default settings.

    Returns:
        A compiled Keras ``Model`` carrying the checkpointed weights.
    """
    file_path = f"best_model_fold_{fold_id}.hdf5"
    check_point = ModelCheckpoint(
        file_path, monitor="val_accuracy", verbose=1, save_best_only=True, mode="max")
    early_stop = EarlyStopping(
        monitor="val_accuracy", mode="max", patience=patience)
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')
    x = (Embedding(max_features + 1, embed_size*2, input_length=max_len,
                   weights=[embedding_matrix], trainable=False))(main_input)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Dropout(0.5)(x)
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    # Summarise the sequence three ways and let the dense head combine them.
    hidden = concatenate([
        GlobalMaxPooling1D()(x),
        GlobalAveragePooling1D()(x),
        Attention(max_len)(x)
    ])
    hidden = Dense(256, activation='relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    hidden = Dense(128, activation='relu')(hidden)
    # This dropout used to be assigned to a misspelled name ("hideen"), so it
    # was created but never connected to the output layer.
    hidden = Dropout(0.5)(hidden)
    output_lay = Dense(1, activation='sigmoid')(hidden)
    model = Model(inputs=[main_input], outputs=[output_lay])
    model.compile(loss="binary_crossentropy",
                  optimizer=Adam(), metrics=["accuracy"])
    # Second Model over the same graph; it receives the checkpointed weights.
    model2 = Model(inputs=[main_input], outputs=[output_lay])
    model.fit(X_train, y_train, batch_size=16, epochs=20, validation_data=(
        X_valid, y_valid), verbose=1, callbacks=[early_stop, check_point])
    model2.load_weights(file_path)
    model2.compile(loss="binary_crossentropy",
                   optimizer=Adam(), metrics=["accuracy"])
    return model2

In [None]:
def run_model_on_fold(name, max_len, embed_size, embed, bulid_fun):
    """Cross-validate one architecture and log its fold-averaged metrics.

    For every split produced by the module-level ``folds`` over the
    module-level ``X`` and ``y``, a tokenizer is fitted on the training part
    only, both parts are padded to ``max_len``, an embedding matrix is built,
    and ``bulid_fun`` trains a model. Validation predictions are thresholded
    at 0.5 and scored.

    The averaged scores are printed and appended as one row to ``setC.txt``.

    Args:
        name: Label for the results row (only its first 7 characters are kept).
        max_len: Padded sequence length.
        embed_size: Forwarded to ``bulid_fun``.
        embed: Embedding selector forwarded to ``create_embedding_matrix``.
        bulid_fun: One of the ``build_model*`` functions.

    Returns:
        None.
    """
    max_features = 50000
    # Order of the columns in the results row.
    metric_keys = ('test_F1', 'test_Precision', 'test_Recall',
                   'test_Accuracy', 'test_Specificity', 'test_Sensitivity')
    scores = {key: [] for key in metric_keys}

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        # Fit the vocabulary on the training part only to avoid leakage.
        # NOTE(review): oov_token is given as True rather than a string token;
        # left unchanged because create_embedding_matrix may depend on it.
        tk = Tokenizer(lower=True, filters='',
                       num_words=max_features, oov_token=True)
        tk.fit_on_texts(X_train)
        X_train = pad_sequences(tk.texts_to_sequences(X_train), maxlen=max_len)
        X_valid = pad_sequences(tk.texts_to_sequences(X_valid), maxlen=max_len)
        embedding_matrix = create_embedding_matrix(embed, tk, max_features)

        model = bulid_fun(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix,
                          lr=1e-3, lr_d=0, spatial_dr=0.1, dense_units=128, conv_size=128, dr=0.1, patience=4,
                          fold_id=fold_n)

        # Sigmoid output -> hard 0/1 label.
        y_preds = [1 if row[0] >= 0.5 else 0 for row in model.predict(X_valid)]

        fold_accuracy = accuracy_score(y_valid, y_preds)
        print(fold_accuracy)
        scores['test_F1'].append(f1_score(y_valid, y_preds, average='macro'))
        scores['test_Precision'].append(
            precision_score(y_valid, y_preds, average='macro'))
        scores['test_Recall'].append(
            recall_score(y_valid, y_preds, average='macro'))
        scores['test_Accuracy'].append(fold_accuracy)
        scores['test_Specificity'].append(specificity(y_valid, y_preds))
        scores['test_Sensitivity'].append(sensitivity(y_valid, y_preds))

    # Average over the folds actually run instead of a hard-coded 10, so the
    # numbers stay correct if the number of splits is changed.
    averages = ['%.4f' % (sum(scores[key]) / len(scores[key]))
                for key in metric_keys]
    # NOTE(review): the label is cut to 7 characters although the column is 10
    # wide, so e.g. 'b1_1_100' is logged as 'b1_1_10' -- confirm this is intended.
    row = "{:<10} | {:<7} {:<7} {:<7} {:<7} {:<7} {:<7}".format(
        str(name)[:7], *averages)
    print(row)
    with open("setC.txt", "a+") as results_file:
        results_file.write(row + '\n')

In [13]:
# Sweep every embedding choice, sequence length and architecture.
# Each entry pairs the results-row prefix with the builder it labels.
model_builders = (
    ('b1_', build_model1),
    ('b3_', build_model3),
    ('b4_', build_model4),
    ('b5_', build_model5),
)

for emb_ma in [1, 2, 3]:
    # The builders double this value: 150 * 2 = 300 for matrices 1 and 2.
    # NOTE(review): matrix 3 uses 300 * 2 = 600, presumably a concatenation
    # of both embeddings -- confirm against create_embedding_matrix.
    embed_size = 300 if emb_ma == 3 else 150
    for max_len in [100, 150, 200, 250, 300]:
        for prefix, builder in model_builders:
            run_label = prefix + str(emb_ma) + '_' + str(max_len)
            run_model_on_fold(run_label, max_len, embed_size, emb_ma, builder)

Fold 0 started at Wed Aug 28 05:19:53 2019


W0828 05:25:08.278795 139950453757824 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0828 05:25:08.365013 139950453757824 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0828 05:25:08.368760 139950453757824 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0828 05:25:08.384673 139950453757824 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0828 05:25:08.385567 1399504537

Train on 897 samples, validate on 101 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.71287, saving model to best_model_fold_0.hdf5
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.71287
Epoch 3/10

Epoch 00003: val_acc improved from 0.71287 to 0.73267, saving model to best_model_fold_0.hdf5
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.73267
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.73267
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.73267
Epoch 7/10

Epoch 00007: val_acc improved from 0.73267 to 0.74257, saving model to best_model_fold_0.hdf5
Epoch 8/10

Epoch 00008: val_acc improved from 0.74257 to 0.78218, saving model to best_model_fold_0.hdf5
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.78218
Epoch 10/10

Epoch 00010: val_acc improved from 0.78218 to 0.79208, saving model to best_model_fold_0.hdf5
0.7920792079207921
Fold 1 started at Wed Aug 28 05:26:30 2019
Train on 897 samples, validate on 101 samples
Epoch 1/