In [2]:
# Author: Daniel Lee (dani.cmh.lee@gmail.com)
# Started: August 3, 2018
# Tensorflow LTSM/RNN network for detecting toxic comments

import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

# Load data
train_df = pd.read_csv('../../data/train.csv')
test_df = pd.read_csv('../../data/test.csv')

# Explore data
print(train_df.dtypes)
list_class = ['toxic', 'severe_toxic','obscene', 'threat', 'insult', 'identity_hate']

print("Toxic" + ": " + str(sum(train_df['toxic'])/float(len(train_df))))

id               object
comment_text     object
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object
Toxic: 0.0958444830201


In [3]:
### Load Google's pre-trained word2vec and save txt file
path_to_model = '/Volumes/bluelight/word2vec/GoogleNews-vectors-negative300.bin'
path_to_text  = '/Volumes/bluelight/word2vec/GoogleNews-vectors-negative300.txt'
model_wv = KeyedVectors.load_word2vec_format(path_to_model, binary=True)
model_wv.save_word2vec_format(path_to_text, binary=False)

In [4]:
# Explore Word2Vec Data
del model_wv

In [5]:
# Split data
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train_df[list_classes].values
list_sentences_train = train_df["comment_text"]
list_sentences_test = test_df["comment_text"]

In [6]:
# Text tokenization
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

# Pad tokenization
maxlen = 200
X_train = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test  = pad_sequences(list_tokenized_test,  maxlen=maxlen)


In [7]:
# Create word vector dictionary
embeddings_index = {}
wv_data = '/Volumes/bluelight/word2vec/GoogleNews-vectors-negative300.txt'
f=open(wv_data)
for line in f:
    values = line.split(' ')
    word = values[0]
    value = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = value

embedding_dimension = 300
word_index = tokenizer.word_index

In [8]:
# Embedding Matrix

embedding_matrix = np.zeros((len(word_index) + 1, embedding_dimension))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # embedding_vector = model_wv.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector[:embedding_dimension]

In [20]:
# Model Development: Embedding Layer
embed_size = 300
inp = inp = Input(shape=(maxlen, ))
x = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            input_length=200)(inp)
x = LSTM(80, return_sequences=True, name='lstm_layer')(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = Dense(60, activation='relu')(x)
x = Dropout(0.1)(x)
x = Dense(6, activation='sigmoid')(x)

model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 200, 300)          63166500  
_________________________________________________________________
lstm_layer (LSTM)            (None, 200, 80)           121920    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 80)                0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 80)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 60)                4860      
_________________________________________________________________
dropout_6 (Dropout)          (None, 60)                0         
__________

In [21]:
### Visualize Training
class PlotLoss(keras.callbacks.Callback):
    def on_train_begin(self, logs={}):
        self.i = 0
        self.x = []
        self.losses = []
        self.val_losses = []
        self.acc = []
        self.val_acc = []
        self.fig = plt.figure()
        
        self.logs = []

    def on_epoch_end(self, epoch, logs={}):
        
        self.logs.append(logs)
        self.x.append(self.i)
        self.losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))
        self.acc.append(logs.get('acc'))
        self.val_acc.append(logs.get('val_acc'))
        self.i += 1
        f, (ax1, ax2) = plt.subplots(1, 2, sharex=True)
        
        clear_output(wait=True)
        
        ax1.set_yscale('log')
        ax1.plot(self.x, self.losses, label="loss")
        ax1.plot(self.x, self.val_losses, label="val_loss")
        ax1.legend()
        
        ax2.plot(self.x, self.acc, label="accuracy")
        ax2.plot(self.x, self.val_acc, label="validation accuracy")
        ax2.legend()
        
        plt.show();
plot_losses = PlotLoss()


In [None]:
# Memory Management
del f, embeddings_index

In [None]:
# Train Model
from IPython.display import clear_output

batch_size = 32
epochs = 2
model.fit(X_train,y, 
          batch_size=batch_size, 
          epochs=epochs, 
          validation_split=0.1,
          callbacks=[plot_losses])