In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

# import warnings
# warnings.filterwarnings('ignore')

import os
from os.path import join
#os.environ['OMP_NUM_THREADS'] = '4'


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# --- Configuration -----------------------------------------------------------
max_features = 30000  # passed to the tokenizer as num_words (vocabulary cap)
maxlen = 100          # width that every comment is padded/truncated to
embed_size = 300      # number of components per pre-trained word vector

# Directory holding the pre-trained word vectors. The original machine-specific
# location stays the default; set the WORDVEC_DIR environment variable to run
# the notebook on another machine without editing this cell.
path_wordvec = os.environ.get('WORDVEC_DIR', 'E://DM//NLP//WordVec')
EMBEDDING_FILE = join(path_wordvec, 'crawl-300d-2M.vec')
labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# --- Data --------------------------------------------------------------------
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/sample_submission.csv')

# Targets are the six label columns; inputs are the raw comment strings, with
# missing comments replaced by the literal placeholder text "fillna".
y_train = train[labels]
X_train = train["comment_text"].fillna("fillna").values
X_test = test["comment_text"].fillna("fillna").values

print('X_train: ' + str(X_train.shape))
print('y_train: ' + str(y_train.shape))
print('X_test: ' + str(X_test.shape))
print('y_train: ' + str(list(y_train.columns)))

X_train: (159571,)
y_train: (159571, 6)
X_test: (153164,)
y_train: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [3]:
# Fit a single tokenizer on the train and test comments together, with the
# vocabulary capped through num_words=max_features.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts([*X_train, *X_test])

# Comments -> lists of word ids.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Lists of word ids -> fixed-width matrices (maxlen columns).
X_train_pad = sequence.pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = sequence.pad_sequences(X_test_seq, maxlen=maxlen)

# Quick look at the un-padded lengths of the first ten training comments.
length_seq = list(map(len, X_train_seq[:10]))
print('length of X_train_seq:' + str(length_seq))
print('shape of X_train_pad: ' + str(X_train_pad.shape))

length of X_train_seq:[47, 17, 42, 114, 13, 11, 8, 19, 84, 11]
shape of X_train_pad: (159571, 100)


In [4]:
def get_coefs(word, *arr):
    """Pair a token with its coefficients as a float32 NumPy array.

    Parameters
    ----------
    word : the token itself; returned unchanged.
    *arr : the vector components, one positional argument per component.

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is ``arr`` converted with
        ``np.asarray(..., dtype='float32')``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Build {word: float32 vector} from the pre-trained vector file.
# A usable line has exactly one token followed by embed_size components, so the
# expected field count is derived from embed_size instead of a hard-coded 301;
# lines with any other field count are skipped (presumably this is what drops
# the file's header line -- TODO confirm against the file format).
expected_fields = embed_size + 1
with open(EMBEDDING_FILE, encoding='UTF-8') as vec_file:
    fields_per_line = (line.rstrip().split(' ') for line in vec_file)
    embeddings_index = dict(
        get_coefs(*fields)
        for fields in fields_per_line
        if len(fields) == expected_fields
    )

print('length of the index:' + str(len(embeddings_index)))
print((list(embeddings_index.keys())[:100]))

length of the index:2000000
['wotthefark', 'News-Record', 'homewear', 'anti-alcohol', 'Avantlink', 'microbevel', '116.49', 'Budokai', 'Armorial', '2013Its', 'Wagemans', 'DOUGHERTY', 'gecko', 'oneironaut', 'Alanne', 'Lee.The', 'ItalyCASA', 'Armatage', 'DarkPhoenix', 'come.Before', 'PhoneTrans', 'DoYourData', 'Kentucky.This', 'NSLP', '13,602', '170.9', 'drought-resistant', 'aRNA', 'Scooptram', 'storic', 'Codan', 'Casanovas', '---In', '3313', '0.1-1', 'owned.', 'Satinder', 'SeminarThe', 'wildlife-dependent', 'Streckfuss', 'Dv', 'TradingFloor.com', 'atrium-style', 'Acidity', 'Strone', 'escribi', 'Bbut', 'pasternak', 'Anti-Patterns', 'Twoje', 'ECSB', 'TeB', 't,', 'EQNext', 'Geordan', 'Hyperbiotics', 'Art3', 'Terredora', 'C-124', '01-09-2014', 'Savours', 'Yaquinto', 'Wag.com', 'Kerang', 'rôle', 'entomophila', 'AMlife', 'Flexible', 'Rhodophyceae', 'Constitucion', 'info-dump', 'ervaar', 'ReplyDeletem', 'Leiningers', '83501', '03.11.2008', 'orby', 'J.r', 'trachelectomy', 'Chwang', 'Brugnato', '

In [5]:
word_index = tokenizer.word_index

# Keras Tokenizer word ids start at 1 (0 is reserved), so a vocabulary of N
# words needs N + 1 rows for id N to be a valid row index. The previous
# min(max_features, len(word_index)) was one row short whenever the vocabulary
# was smaller than max_features; the result is unchanged when it is larger.
num_words = min(max_features, len(word_index) + 1)

# One row per word id, initialised to zeros.
embedding_matrix = np.zeros((num_words, embed_size))
print(len(embeddings_index))
print(len(word_index))
print(max_features)
print(embedding_matrix.shape)

2000000
394787
30000
(30000, 300)


In [6]:
# Copy the pre-trained vector of every in-vocabulary word (id < max_features)
# into the row of embedding_matrix given by its id, counting the words that
# have no pre-trained vector.
num_missed = 0
for word, i in word_index.items():
    if i < max_features:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            # No pre-trained vector: the row keeps its all-zeros initial value.
            num_missed += 1
        else:
            embedding_matrix[i] = embedding_vector
print('number of words that are not found in the embedding_vector: ' + str(num_missed))

number of words that are not found in the embedding_vector: 2340


In [7]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair used for scoring. The default empty tuple
        cannot be unpacked into two values, so a pair must always be supplied.
    interval : int
        The score is computed on epochs whose zero-based index is a multiple
        of ``interval`` (``1`` means every epoch).
    """

    def __init__(self, validation_data=(), interval=1):
        # Zero-argument super() resolves to the class after RocAucEvaluation in
        # the MRO, so Callback.__init__ actually runs. The previous
        # super(Callback, self) started the lookup *after* Callback and
        # therefore skipped the base-class initialisation.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the ROC-AUC score.

        ``epoch`` is the zero-based epoch index; it is printed one-based.
        ``logs`` is accepted for interface compatibility and is not used
        (``None`` replaces the former mutable ``{}`` default).
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


def GetModel():
    """Build and compile the bidirectional-GRU multi-label classifier.

    Architecture: embedding initialised from ``embedding_matrix`` ->
    spatial dropout (0.2) -> bidirectional GRU (80 units per direction,
    returning the full sequence) -> global average pooling and global max
    pooling over the sequence, concatenated -> one sigmoid unit per label.

    Reads the module-level ``maxlen``, ``embedding_matrix`` and ``labels``.

    Returns
    -------
    The compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
    accuracy metric).
    """
    # Size the embedding layer from the weight matrix it is initialised with,
    # so the layer shape and its weights cannot disagree when the matrix has
    # fewer rows than max_features.
    vocab_size, embed_dim = embedding_matrix.shape

    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab_size, embed_dim, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(80, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    # One independent sigmoid output per entry in labels (was a hard-coded 6).
    outp = Dense(len(labels), activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

# Build the compiled network and print its layer-by-layer summary.
model = GetModel()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 300)     9000000     input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 100, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 100, 160)     182880      spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
global_ave

In [12]:
# Display the shape of the raw (un-tokenized) training comment array.
X_train.shape

(159571,)

In [14]:
# Training hyper-parameters for this run.
batch_size, epochs = 32, 2

# Keep 95% of the padded training comments for fitting and hold out the rest
# for validation; the split is seeded.
X_tra, X_val, y_tra, y_val = train_test_split(
    X_train_pad,
    y_train,
    train_size=0.95,
    random_state=233,
)

# Print the validation ROC-AUC at the end of every epoch.
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
)







Train on 151592 samples, validate on 7979 samples
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.986903 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.987172 



In [15]:
# Predict on the padded test comments in batches of 1024.
y_pred = model.predict(
    X_test_pad,
    batch_size=1024,
)

# Put the predictions into the label columns of the sample submission and
# write it out without the index column.
submission[labels] = y_pred
submission.to_csv(
    'submission.csv',
    index=False,
)
