In [11]:
# LSTM->CNN Model implementation
# Inspired by: http://konukoii.com/blog/2018/02/19/twitter-sentiment-analysis-using-combined-lstm-cnn-models/

import numpy as np
import pandas as pd # Handle csv data and dataframes
import matplotlib.pyplot as plt
import gensim # Load Pre-trained word2vec embeddings

from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU
from keras.callbacks import Callback
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras.preprocessing import text, sequence
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Model
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# Read the labelled training set and the test set from disk
train_df = pd.read_csv('../../data/train.csv')
test_df  = pd.read_csv('../../data/test.csv')
print(train_df.dtypes)

# Label columns of the training frame (one binary column per class)
list_class = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Fraction (0..1, not a percentage) of training rows flagged as 'toxic'
n_rows = float(len(train_df))
n_toxic = train_df['toxic'].sum()
p_toxic = n_toxic / n_rows

print("Toxic" + ": " + str(p_toxic))

id               object
comment_text     object
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object
Toxic: 0.0958444830201


In [4]:
# Raw comment text for the training and test sets
sentences_train = train_df.comment_text
sentences_test = test_df.comment_text

# Target matrix: one column per label named in list_class
y_eval = train_df.loc[:, list_class].values

In [6]:
# Tokenize texts
# The vocabulary is fitted on the training comments only; the same fitted
# tokenizer is then applied to both the training and the test comments.
max_features = 300000
tokenizer = text.Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(sentences_train)

tokenized_train = tokenizer.texts_to_sequences(sentences_train)
tokenized_test  = tokenizer.texts_to_sequences(sentences_test)

# Pad tokenized sequences to a fixed length of `maxlen` tokens
maxlen = 200
X_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)
# FIX: X_test was previously built from tokenized_train, so the "test" matrix
# was just a copy of the training matrix and tokenized_test was never used.
X_test  = sequence.pad_sequences(tokenized_test, maxlen=maxlen)

In [8]:
# Create dictionary from pre-trained Google Word2Vec Embeddings
# Maps each word in the text-format vector file to its float32 vector.
embeddings_index = {}
path_to_wv = '/Volumes/bluelight/word2vec/GoogleNews-vectors-negative300.txt'

# `with` closes the file handle even if parsing raises part-way through
# (the original left the handle open for the lifetime of the kernel).
with open(path_to_wv) as f:
    for line in f:
        embedding = line.split()
        # Skip blank lines and two-token lines. NOTE(review): word2vec text
        # exports usually start with a "<vocab_size> <dim>" header; left in,
        # it would be stored as a one-element vector and later broadcast
        # across a whole embedding row. Confirm this file's format.
        if len(embedding) <= 2:
            continue
        word = embedding[0]
        vec = np.asarray(embedding[1:], dtype='float32')
        embeddings_index[word] = vec

embedding_dim=300 # customize embedding size

In [9]:
# Build the embedding weight matrix from the word -> vector dictionary.
# Row k holds the vector of the word whose tokenizer index is k; rows for
# words without a pre-trained vector (and row 0) stay all-zero.
word_index = tokenizer.word_index
n_rows = len(word_index) + 1
embedding_matrix = np.zeros((n_rows, embedding_dim))

for token, row in word_index.items():
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        continue
    # Keep only the first `embedding_dim` components of the stored vector
    embedding_matrix[row] = pretrained[:embedding_dim]

In [14]:
# Model: frozen pre-trained embeddings -> bidirectional LSTM -> 1D convolution
# -> concatenated global average/max pooling -> dense head with 6 sigmoid outputs.
inp = Input(shape=(maxlen, ))
# Embedding weights come from embedding_matrix and are not updated during training.
# FIX: input_length now follows `maxlen` instead of a hard-coded 200, so the
# layer stays consistent with the Input shape and the padded sequences.
x = Embedding(embedding_matrix.shape[0],
              embedding_matrix.shape[1],
              weights=[embedding_matrix],
              input_length=maxlen,
              trainable=False)(inp)
x = SpatialDropout1D(0.2)(x)
# return_sequences=True keeps the per-timestep outputs so Conv1D can slide over them
x = Bidirectional(LSTM(128, return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform")(x)
# Summarise the convolved sequence two ways and join the results
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool]) 
x = Dense(128, activation='relu')(x)
x = Dropout(0.1)(x)
# One independent sigmoid output per label
preds = Dense(6, activation="sigmoid")(x)
model = Model(inp, preds)
model.compile(loss='binary_crossentropy',optimizer=Adam(lr=1e-3),metrics=['accuracy'])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 200, 300)     63166500    input_4[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_4 (SpatialDro (None, 200, 300)     0           embedding_4[0][0]                
__________________________________________________________________________________________________
bidirectional_3 (Bidirectional) (None, 200, 256)     439296      spatial_dropout1d_4[0][0]        
__________________________________________________________________________________________________
conv1d_3 (

In [15]:
# Training hyper-parameters
epochs = 4
batch_size = 128

# Fit on the padded training sequences; validation_split=0.1 holds out 10%
# of the samples for validation.
model.fit(
    x=X_train,
    y=y_eval,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x12e9cab50>

In [16]:
import os
import pickle

# Persist the trained model together with the fitted tokenizer.
model_dir = '../../trained_model/lstm_cnn_w2v'

# Create the output directory first: otherwise a missing directory makes the
# save fail only after the whole training run has already completed.
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

model.save(os.path.join(model_dir, 'pc_net.h5'))

# Save Tokenizer
with open(os.path.join(model_dir, 'tokenizer.pickle'), 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)