In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

# Work from the Drive notebook folder so that files referenced by relative
# name are read from / written to that folder.
os.chdir('/content/drive/My Drive/Colab Notebooks')

# Last expression of the cell: displays the directory now in effect.
os.getcwd()

In [None]:
pip install tensorflow_gpu==2.2.0

In [None]:
pip install keras==2.4.3

In [None]:
pip install tensorflow==2.2.0

In [None]:
pip show tensorflow


In [21]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# from nltk.stem import SnowballStemmer
from textblob import TextBlob
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from keras.layers import Dense, Input, Embedding, Lambda, Dropout, SpatialDropout1D, GlobalAveragePooling1D, merge, Flatten, Bidirectional, GRU, GlobalMaxPooling1D
from keras.layers.merge import concatenate
from keras.models import Model
from keras import optimizers
from keras import initializers
from keras.engine import InputSpec, Layer
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import roc_auc_score
from sklearn.externals import joblib
from sklearn.model_selection import KFold

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Tokenizer / model dimensions.
max_sequence_length = 400   # length every token sequence is padded or truncated to
max_nb_words = 100000       # vocabulary cap handed to the tokenizer
embedding_size = 300        # width of one embedding vector

# Input files on the mounted Drive.
glove_embedding_file = '/content/drive/My Drive/Colab Notebooks/glove.840B.300d.txt'
train_data_file = '/content/drive/My Drive/Colab Notebooks/train.csv'
test_data_file = '/content/drive/My Drive/Colab Notebooks/test.csv'

# Read both CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_data_file, test_data_file)
)

# **Create embedding index**

In [None]:
def create_embedding_index(path):
    """Load a whitespace-separated word-embedding text file into a dict.

    Every line is split on whitespace; the first field is taken as the token
    and the remaining fields as the components of its vector.  Blank lines and
    lines whose remaining fields are not all numeric are skipped.

    Args:
        path: Path of the UTF-8 encoded embedding file.

    Returns:
        dict mapping each token (str) to a float32 numpy array.
    """
    embeddings_index = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line: there is no token to index.
                continue
            try:
                vector = np.asarray(fields[1:], dtype='float32')
            except ValueError:
                # A non-numeric field after the token (for example when the
                # token itself contains whitespace); skip only this line and
                # let any other kind of error propagate.
                continue
            embeddings_index[fields[0]] = vector

    return embeddings_index

# Build the token -> vector lookup from the GloVe file (reads the file line by line).
embeddings_index = create_embedding_index(glove_embedding_file)

# **Clean data**

In [9]:
# Stop-word set, filled on first use so the NLTK corpus is read once rather
# than once per comment.
_STOPWORDS_CACHE = None

# Ordered (pattern, replacement) pairs applied one after another.  The order is
# significant: specific contractions ("what's", "can't") are expanded before
# the generic "'s" / "n't" rules, and the space-sensitive rules near the end
# see the padding inserted by the punctuation rules above them.
_TEXT_SUBSTITUTIONS = (
    # contractions
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    # punctuation: dropped or padded with spaces
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\?", " ? "),
    # "!" is padded a second time; kept because the space-sensitive rules
    # below can match differently against single vs. doubled padding.
    (r"\!", " ! "),
    (r"\"", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    (r":", " : "),
    # re-join tokens split apart by the padding above
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    # NOTE(review): in a regex "\0" is the NUL character, so this only matches
    # NUL followed by "s" -- confirm whether a literal "0s" was intended.
    (r"\0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
)


def _english_stopwords():
    """Return the NLTK English stop words as a set, loading them on first use."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        _STOPWORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE


def preprocess_text(text):
    """Normalise one comment before tokenization.

    Steps, in order: lower-case; strip http(s) URLs and dotted-quad IP
    addresses; expand contractions and pad/drop punctuation; delete every
    character other than ``? ! . , :``, letters, digits and spaces; drop
    English stop words.

    Args:
        text: The raw comment (str).

    Returns:
        The cleaned comment as a single space-separated string.
    """
    text = text.lower()

    # Remove links and IP addresses.
    text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)
    text = re.sub(r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", "", text)

    # Spelling correction with TextBlob is deliberately not applied here
    # (it takes a really long time).

    for pattern, replacement in _TEXT_SUBSTITUTIONS:
        text = re.sub(pattern, replacement, text)

    # Remove remaining special characters.
    text = re.sub(r'[^?!.,:a-z\d ]', '', text, flags=re.IGNORECASE)

    # Stop-word removal; split()/join also collapses runs of whitespace.
    stop_words = _english_stopwords()
    return " ".join(word for word in text.split() if word not in stop_words)

# **Process Text in dataset**

In [10]:
# Label columns and the label matrix taken from the training frame.
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train_data[classes].values

# Missing comments are replaced by "no comment", then every comment is cleaned.
train_sentences = [
    preprocess_text(comment)
    for comment in train_data["comment_text"].fillna("no comment").values
]
test_sentences = [
    preprocess_text(comment)
    for comment in test_data["comment_text"].fillna("no comment").values
]

# The vocabulary is fitted on the train and test comments together.
tokenizer = Tokenizer(num_words=max_nb_words, filters='"#%&()+,-./:;<=>@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(train_sentences + test_sentences)

# Persist the fitted tokenizer so it can be reloaded for inference.
joblib_file = "Tokenizer.pkl"
joblib.dump(tokenizer, joblib_file)

word_index = tokenizer.word_index

train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (train_sentences, test_sentences)
)

# NOTE(review): from here on `train_data` / `test_data` hold the padded integer
# arrays instead of the DataFrames, so this cell cannot be re-run without
# reloading the CSV files first.
train_data = pad_sequences(train_sequences, maxlen=max_sequence_length)
test_data = pad_sequences(test_sequences, maxlen=max_sequence_length)

## **Prepare embedding matrix**

In [11]:
# The tokenizer was built with num_words=max_nb_words, so sequences only ever
# contain indices below max_nb_words.  Capping the row count at that value
# avoids allocating rows for vocabulary entries that are never looked up.
nb_words = min(max_nb_words, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embedding_size))

for word, i in word_index.items():
    if i >= nb_words:
        # Index outside the capped vocabulary: it has no row in the matrix.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words missing from the embedding index keep their all-zero row.
        embedding_matrix[i] = embedding_vector

# **Bidirectional Recurrent Neural Network**

In [12]:
def rnn(nb_words, embedding_size, embedding_matrix, max_sequence_length, out_size,
        recurrent_units=60, dense_units=144, learning_rate=1e-3):
    """Build and compile the two-layer bidirectional GRU classifier.

    Architecture: frozen embedding -> spatial dropout -> two stacked
    bidirectional GRU layers whose sequence outputs are concatenated -> the
    last time step, a global max-pool and a global average-pool of that
    sequence concatenated -> dropout -> dense ReLU -> sigmoid outputs.

    Args:
        nb_words: Number of rows of the embedding table.
        embedding_size: Width of one embedding vector.
        embedding_matrix: Initial embedding weights, shape
            (nb_words, embedding_size); the layer is not trainable.
        max_sequence_length: Length of the input token sequences.
        out_size: Number of sigmoid output units.
        recurrent_units: Units per direction in each GRU layer.
        dense_units: Units of the hidden dense layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled Keras model (binary cross-entropy loss, accuracy metric).
        The model summary is printed as a side effect.
    """
    input_layer = Input(shape=(max_sequence_length,))
    embedding_layer = Embedding(nb_words,
                                embedding_size,
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)(input_layer)
    embedding_layer = SpatialDropout1D(0.25)(embedding_layer)

    rnn_layer_1 = Bidirectional(GRU(recurrent_units, return_sequences=True))(embedding_layer)
    rnn_layer_2 = Bidirectional(GRU(recurrent_units, return_sequences=True))(rnn_layer_1)
    # Feed the pooling heads with the outputs of both recurrent layers.
    x = concatenate([rnn_layer_1, rnn_layer_2], axis=2)

    # Three fixed-size summaries of the sequence: last step, max and mean.
    last = Lambda(lambda t: t[:, -1], name='last')(x)
    maxpool = GlobalMaxPooling1D()(x)
    average = GlobalAveragePooling1D()(x)

    concatenated_layer = concatenate([last, maxpool, average], axis=1)
    x = Dropout(0.5)(concatenated_layer)
    x = Dense(dense_units, activation="relu")(x)
    output_layer = Dense(out_size, activation="sigmoid")(x)

    model = Model(inputs=input_layer, outputs=output_layer)
    adam_optimizer = optimizers.Adam(lr=learning_rate, decay=1e-6, clipvalue=5)
    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
    model.summary()
    return model

# **Train Model**

In [22]:
class RNNModel(object):
    """K-fold cross-validation trainer for models produced by a factory function.

    Attributes are documented in ``__init__``.
    """

    def __init__(self, model_stamp, epoch_num, learning_rate):
        """Store the training configuration.

        Args:
            model_stamp: Prefix of the per-fold checkpoint file names
                (``<model_stamp><fold_id>.h5``).
            epoch_num: Maximum number of epochs per fold.
            learning_rate: Stored on the instance; the methods of this class
                do not read it.
        """
        self.models = []          # trained model of every fold
        self.epoch_num = epoch_num
        self.learning_rate = learning_rate
        self.model_stamp = model_stamp
        self.val_loss = -1        # mean best validation loss, -1 until trained
        self.auc = -1             # mean validation AUC, -1 until trained

    def train_k_folds(self, X, y, n_folds, batch_size, get_model_func):
        """Train one model per fold and average the fold scores.

        Args:
            X: Feature array, indexable by an integer index array.
            y: Label array aligned with ``X``.
            n_folds: Number of cross-validation folds.
            batch_size: Batch size used for fitting.
            get_model_func: Zero-argument callable returning a fresh compiled
                model; called once per fold.

        Returns:
            Tuple ``(models, mean_val_loss, mean_auc, fold_predictions)`` where
            ``fold_predictions`` holds the validation predictions of each fold.
        """
        models = []
        fold_predictions = []
        total_val_loss = 0
        total_auc = 0

        # k-fold cross validation
        kf = KFold(n_splits=n_folds)

        for fold_id, (train_index, val_index) in enumerate(kf.split(X)):
            print("Train:", train_index, "Validation:", val_index)
            train_x, val_x = X[train_index], X[val_index]
            train_y, val_y = y[train_index], y[val_index]

            model, bst_val_loss, fold_prediction, auc = self._train_model(
                get_model_func(), batch_size, train_x, train_y, val_x, val_y, fold_id)

            total_val_loss += bst_val_loss
            total_auc += auc
            models.append(model)
            fold_predictions.append(fold_prediction)

        self.models = models
        self.val_loss = total_val_loss / n_folds
        self.auc = total_auc / n_folds
        return models, self.val_loss, self.auc, fold_predictions

    def _train_model(self, model, batch_size, train_x, train_y, val_x, val_y, fold_id):
        """Fit ``model`` on one fold with early stopping and checkpointing.

        Returns:
            Tuple ``(model, best_val_loss, val_predictions, val_auc)``.
        """
        import os

        early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=2)
        model_path = self.model_stamp + str(fold_id) + '.h5'
        model_checkpoint = ModelCheckpoint(model_path, save_best_only=True, save_weights_only=True)
        hist = model.fit(train_x, train_y,
                         validation_data=(val_x, val_y),
                         epochs=self.epoch_num, batch_size=batch_size, shuffle=True,
                         callbacks=[early_stopping, model_checkpoint])
        best_val_score = min(hist.history['val_loss'])
        print("Validation score", best_val_score)

        # After fit() the model holds the weights of the last epoch run, while
        # the checkpoint file holds those of the best epoch.  Restore the best
        # weights so the predictions, the AUC and the returned model correspond
        # to the validation loss reported above.
        if os.path.exists(model_path):
            model.load_weights(model_path)

        predictions = model.predict(val_x)
        auc = roc_auc_score(val_y, predictions)
        print("AUC Score", auc)
        return model, best_val_score, predictions, auc

In [None]:
def rnn_model():
    """Return a freshly built RNN with 6 outputs, using the notebook-level settings."""
    return rnn(
        nb_words=nb_words,
        embedding_size=embedding_size,
        embedding_matrix=embedding_matrix,
        max_sequence_length=max_sequence_length,
        out_size=6,
    )

# 3-fold cross-validated training; a new network is built for every fold.
model = RNNModel('kmax_text_rnn', 50, 1e-3)
trained_models, val_loss, auc, fold_predictions = model.train_k_folds(
    X=train_data,
    y=y,
    n_folds=3,
    batch_size=256,
    get_model_func=rnn_model,
)

print("Overall val-loss:", val_loss, "AUC", auc)

# **Save Model**

In [None]:
# load_model is needed in this cell; import it here so the cell also works on a
# top-to-bottom run.
from keras.models import load_model

# NOTE(review): the fold index is hard-coded; nothing in this cell compares the
# folds, so confirm that fold 1 is really the model to keep.
model = trained_models[1]

# Save the full model, then reload it from disk to check the file is usable.
model.save('RNN_Model.h5')
model = load_model('RNN_Model.h5')


# **Test Model**

In [None]:
from sklearn.externals import joblib
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
import os

# test_sentences = ["Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,", '== From RfC == \n\n The title is fine as it is, IMO.', '" \n\n == Sources == \n\n * Zawe Ashton on Lapland —  /  "', ":If you have a look back at the source, the information I updated was the correct form. I can only guess the source hadn't updated. I shall update the information once again but thank you for your message.", "I don't anonymously edit articles at all.", 'Thank you for understanding. I think very highly of you and would not revert without discussion.', 'Please do not add nonsense to Wikipedia. Such edits are considered vandalism and quickly undone. If you would like to experiment, please use the sandbox instead. Thank you.   -', ':Dear god this site is horrible.', '" \n Only a fool can believe in such numbers. \n The correct number lies between 10 000 to 15 000. \n Ponder the numbers carefully.  \n\n This error will persist for a long time as it continues to reproduce... The latest reproduction I know is from ENCYCLOPÆDIA BRITANNICA ALMANAC 2008 wich states \n Magnittude: 8.7 (fair enough) \n victims: 70 000 (today 10 000 to 15 000 is not ""a lot"" so I guess people just come out with a number that impresses enough, I don\'t know. But I know this: it\'s just a shameless lucky number that they throw in the air. \n GC \n\n "', "== Double Redirects == \n\n When fixing double redirects, don't just blank the outer one, you need edit it to point it to the final target, unless you think it's inappropriate, in which case, it needs to be nominated at WP:RfD", 'I think its crap that the link to roggenbier is to this article. 
Somebody that knows how to do things should change it.', '"::: Somebody will invariably try to add Religion?  Really??  You mean, the way people have invariably kept adding ""Religion"" to the Samuel Beckett infobox?  And why do you bother bringing up the long-dead completely non-existent ""Influences"" issue?  You\'re just flailing, making up crap on the fly. \n ::: For comparison, the only explicit acknowledgement in the entire Amos Oz article that he is personally Jewish is in the categories!    \n\n "', ", 25 February 2010 (UTC) \n\n :::Looking it over, it's clear that  (a banned sockpuppet of ) ignored the consensus (&, fwiw, policy-appropriate) choice to leave the page at Chihuahua (Mexico) and the current page should be returned there. Anyone have the time to fix the incoming links? -  18:24", '" \n\n It says it right there that it IS a type. The ""Type"" of institution is needed in this case because there are three levels of SUNY schools: \n -University Centers and Doctoral Granting Institutions \n -State Colleges \n -Community Colleges. \n\n It is needed in this case to clarify that UB is a SUNY Center. It says it even in Binghamton University, University at Albany, State University of New York, and Stony Brook University. Stop trying to say it\'s not because I am totally right in this case."', '" \n\n == Before adding a new product to the list, make sure it\'s relevant == \n\n Before adding a new product to the list, make sure it has a wikipedia entry already, ""proving"" it\'s relevance and giving the reader the possibility to read more about it. \n Otherwise it could be subject to deletion. See this article\'s revision history."', '==Current Position== \n Anyone have confirmation that Sir, Alfred is no longer at the airport and is hospitalised?', 'this other one from 1897', '== Reason for banning throwing == \n\n This article needs a section on /why/ throwing is banned. 
At the moment, to a non-cricket fan, it seems kind of arbitrary.', ":: Wallamoose was changing the cited material to say things the original source did not say. In response to his objections, I modified the article as we went along. I was not just reverting him. I repeatedly asked him to use the talk page. I've been trying to add to the article for a long time.  It's so thin on content. This is wrong.", '|blocked]] from editing Wikipedia.   |']

# Order of the six scores in every prediction row.
CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# The tokenizer was fitted on text cleaned by preprocess_text, so inference
# text has to go through the same cleaning before it is tokenized.
test_sentences = [preprocess_text(sentence) for sentence in ["Go back to your country"]]

tokenizer = joblib.load('Tokenizer.pkl')
test_sequences = tokenizer.texts_to_sequences(test_sentences)
# maxlen must equal the sequence length the model was trained with (400).
test_data = pad_sequences(test_sequences, maxlen=400)
model = load_model('RNN_Model.h5')
test_predicts = model.predict(test_data, batch_size=256, verbose=1)
print(test_predicts)

[[0.14269954 0.0015585  0.01939207 0.01231543 0.05094578 0.00594961]]
