In [1]:
#import dependencies
from __future__ import division
import numpy as np
import pandas as pd
import re
import h5py
import unidecode
from tqdm import tqdm
import matplotlib
% matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from keras.layers import Input, Dense, Conv1D, Dropout, Flatten, BatchNormalization
from keras.models import Model
from keras import metrics
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from keras import backend as K
from scipy.sparse import vstack
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score

Using TensorFlow backend.


In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
levels = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

#summing the toxicity levels so that we can easily divide the train data to K folds.
train['sum_level'] = train[levels[0]] + train[levels[1]] + train[levels[2]] + train[levels[3]] + train[levels[4]] + train[levels[5]]

In [4]:
#we see there are many \n characters in text. lets just remove those first
good_text = []
for i in tqdm(train['comment_text']):
    i = re.sub(r'[\n]+', ' ', i)
    i = re.sub(r'\s+', ' ', i)
    good_text.append(i)
train['comment_text'] = good_text


#we see there are many \n characters in text. lets just remove those first
good_text_test = []
for i in tqdm(test['comment_text']):
    i = re.sub(r'[\n]+', ' ', i)
    i = re.sub(r'\s+', ' ', i)
    good_text_test.append(i)
test['comment_text'] = good_text_test


#let us strip the unicode accents

good_text = []
for i in tqdm(train['comment_text']):
    i = unicode(i, 'utf-8')
    i = unidecode.unidecode(i)
    good_text.append(i)
train['comment_text'] = good_text


good_text_test = []
for i in tqdm(test['comment_text']):
    i = unicode(i, 'utf-8')
    i = unidecode.unidecode(i)
    good_text_test.append(i)
test['comment_text'] = good_text_test


good_text = []
for i in tqdm(train['comment_text']):
    i = i.lower()
    i = re.sub(r'\\\'s', ' is', i)
    i = re.sub(r'\'s', ' is', i)
    
    i = re.sub(r'can\\\'t', 'can not', i)
    i = re.sub(r'can\'t', 'can not', i)
    
    i = re.sub(r'n\\\'t', ' not', i)
    i = re.sub(r'n\'t', ' not', i)
    
    i = re.sub(r'\\\'nt', ' not', i)
    i = re.sub(r'\'nt', ' not', i)
    
    i = re.sub(r'\\\'re', ' are', i)
    i = re.sub(r'\'re', ' are', i)
    
    i = re.sub(r'\s[w]\'d', ' would', i)
    i = re.sub(r'\\\'d', ' would', i)
    i = re.sub(r'\'d', ' would', i)
    
    i = re.sub(r'\\\'ll', ' will', i)
    i = re.sub(r'\'ll', ' will', i)
    
    i = re.sub(r'i\\\'m', ' i am ', i)
    i = re.sub(r'i\'m', ' i am ', i)
    
    i = re.sub(r'\\\'pedia', ' wikipedia ', i)
    i = re.sub(r'\'pedia', ' wikipedia ', i)
    
    i = re.sub(r'https://www\.', ' www ', i)
    i = re.sub(r'www\.', ' www ', i)
    
    i = re.sub(r'\.com', ' com ', i)
    
    i = re.sub(r'[-]+', ' ', i)
    
    i = re.sub(r'[\[ \] \. " # \$ % \^ \* \( \) \? \\ / @ < > _ : = \+ \{ } \| ~ ! , \']+', ' ', i)
    
    i = re.sub(r'\s+', ' ', i)
    
    i = i.strip()
    
    good_text.append(i)
train['comment_text'] = good_text




good_text_test= []
for i in tqdm(test['comment_text']):
    i = i.lower()
    i = re.sub(r'\\\'s', ' is', i)
    i = re.sub(r'\'s', ' is', i)
    
    i = re.sub(r'can\\\'t', 'can not', i)
    i = re.sub(r'can\'t', 'can not', i)
    
    i = re.sub(r'n\\\'t', ' not', i)
    i = re.sub(r'n\'t', ' not', i)
    
    i = re.sub(r'\\\'nt', ' not', i)
    i = re.sub(r'\'nt', ' not', i)
    
    i = re.sub(r'\\\'re', ' are', i)
    i = re.sub(r'\'re', ' are', i)
    
    i = re.sub(r'\s[w]\'d', ' would', i)
    i = re.sub(r'\\\'d', ' would', i)
    i = re.sub(r'\'d', ' would', i)
    
    i = re.sub(r'\\\'ll', ' will', i)
    i = re.sub(r'\'ll', ' will', i)
    
    i = re.sub(r'i\\\'m', ' i am ', i)
    i = re.sub(r'i\'m', ' i am ', i)
    
    i = re.sub(r'\\\'pedia', ' wikipedia ', i)
    i = re.sub(r'\'pedia', ' wikipedia ', i)
    
    i = re.sub(r'https://www\.', ' www ', i)
    i = re.sub(r'www\.', ' www ', i)
    
    i = re.sub(r'\.com', ' com ', i)
    
    i = re.sub(r'[-]+', ' ', i)
    
    i = re.sub(r'[\[ \] \. " # \$ % \^ \* \( \) \? \\ / @ < > _ : = \+ \{ } \| ~ ! , \']+', ' ', i)
    
    i = re.sub(r'\s+', ' ', i)
    
    i = i.strip()
    
    good_text_test.append(i)
test['comment_text'] = good_text_test

100%|██████████| 159571/159571 [00:07<00:00, 20927.67it/s]
100%|██████████| 153164/153164 [00:07<00:00, 20472.66it/s]
100%|██████████| 159571/159571 [00:09<00:00, 16448.93it/s]
100%|██████████| 153164/153164 [00:12<00:00, 12572.25it/s]
100%|██████████| 159571/159571 [00:40<00:00, 3987.37it/s]
100%|██████████| 153164/153164 [00:45<00:00, 3373.46it/s]


In [5]:
train['line_length'] = train['comment_text'].apply(lambda x:len(x.split()))

In [6]:
train['line_length'].mean()

69.2807465015573

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

all_text = pd.concat([train['comment_text'], test['comment_text']])



tokenizer = Tokenizer(num_words=1000000)




tokenizer.fit_on_texts(all_text)
train_seq = tokenizer.texts_to_sequences(train['comment_text'])
test_seq = tokenizer.texts_to_sequences(test['comment_text'])



word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))



train_seq = pad_sequences(train_seq, maxlen=150)
test_seq = pad_sequences(test_seq, maxlen=150)

Found 340522 unique tokens.


In [12]:
embeddings_index = {}
f = open('glove.840B.300d.txt')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))


embedding_matrix = np.zeros((len(word_index) + 1, 300))
ii=0
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        ii+=1
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print("words found in glove: " + str(ii))

2196017it [05:57, 6144.98it/s] 
  6%|▋         | 21708/340522 [00:00<00:01, 217067.27it/s]

Found 2196016 word vectors.


100%|██████████| 340522/340522 [00:01<00:00, 270227.49it/s]


In [13]:
np.savetxt('embedding_matrix.txt',embedding_matrix)

In [8]:
embedding_matrix = np.loadtxt('embedding_matrix.txt')

In [9]:
train_clean_index = train[train['sum_level']==0].index.values
train_toxic_index = train[train['sum_level']>0].index.values

In [23]:
x = np.concatenate((train_seq[train_clean_index[28670*3:28670*4],:], train_seq[train_toxic_index,:]), axis=0)
y = np.concatenate((np.array(train.iloc[train_clean_index[28670*3:28670*4],2:-2]), np.array(train.iloc[train_toxic_index,2:-2])), axis=0)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.2, random_state=35)

In [12]:
class roc_callback(Callback):
    def __init__(self,training_data):
        self.x = training_data[0]
        self.y = training_data[1]

    def on_train_begin(self, logs={}):
        return

    def on_train_end(self, logs={}):
        return

    def on_epoch_begin(self, epoch, logs={}):
        return

    def on_epoch_end(self, epoch, logs={}):
        y_pred = self.model.predict(self.x)
        roc = roc_auc_score(self.y, y_pred, average='weighted')
        print('\rroc-auc: %s' % (str(round(roc,4)))+' '+'\n')
        return

    def on_batch_begin(self, batch, logs={}):
        return

    def on_batch_end(self, batch, logs={}):
        return

In [39]:
from keras.layers import Conv1D,MaxPooling1D,Flatten,Embedding, BatchNormalization,AveragePooling1D,GlobalMaxPooling1D


inputs = Input(shape=(150,))

embedding_layer = Embedding(len(word_index) + 1, 300, weights=[embedding_matrix],input_length=150,trainable=False)(inputs)
x = Conv1D(512, 2, activation='tanh', padding='valid')(embedding_layer)
x = Dropout(0.5)(x)
x = MaxPooling1D(2)(x)
x = Conv1D(512, 2, activation='tanh')(x)
x = Dropout(0.5)(x)
#x = GlobalMaxPooling1D(2)(x)
x = Flatten()(x)
x = BatchNormalization()(x)
x = Dense(256, activation='tanh')(x)
x = Dropout(0.5)(x)
x = Dense(256, activation='tanh')(x)
x = Dropout(0.5)(x)
x = Dense(256, activation='tanh')(x)
x = Dropout(0.5)(x)
x = Dense(256, activation='tanh')(x)
x = Dropout(0.5)(x)
output= Dense(6, activation='sigmoid')(x)



model = Model(inputs=inputs, outputs=output)
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=[metrics.categorical_accuracy])

mck = ModelCheckpoint('sub_conv1d.h5', monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True, mode='auto', period=1)
estop = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, verbose=1, mode='auto')

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 300)          102156900 
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 149, 512)          307712    
_________________________________________________________________
dropout_1 (Dropout)          (None, 149, 512)          0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 74, 512)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 73, 512)           524800    
_________________________________________________________________
dropout_2 (Dropout)          (None, 73, 512)           0         
__________

In [38]:
model.fit(x_train,y_train,epochs=100, batch_size=1000, verbose=1, shuffle=True, validation_split=0.3, callbacks=[mck,estop,roc_callback(training_data=(x_train,y_train))])

Train on 25141 samples, validate on 10775 samples
Epoch 1/100
roc-auc: 0.6649 

Epoch 2/100
roc-auc: 0.7174 

Epoch 3/100
roc-auc: 0.7506 

Epoch 4/100
roc-auc: 0.766 

Epoch 5/100
roc-auc: 0.7922 

Epoch 6/100
roc-auc: 0.8197 

Epoch 7/100
roc-auc: 0.854 

Epoch 8/100
roc-auc: 0.8598 

Epoch 9/100
roc-auc: 0.8441 

Epoch 10/100
roc-auc: 0.8379 

Epoch 11/100
roc-auc: 0.863 

Epoch 12/100
roc-auc: 0.8625 

Epoch 13/100
roc-auc: 0.8467 

Epoch 14/100
roc-auc: 0.8451 

Epoch 15/100
roc-auc: 0.8603 

Epoch 16/100
roc-auc: 0.8331 

Epoch 17/100
roc-auc: 0.8784 

Epoch 18/100
roc-auc: 0.8324 

Epoch 19/100
roc-auc: 0.8169 

Epoch 20/100
roc-auc: 0.8392 

Epoch 21/100
roc-auc: 0.8255 

Epoch 22/100
roc-auc: 0.8097 

Epoch 23/100
roc-auc: 0.8104 

Epoch 24/100
roc-auc: 0.8328 

Epoch 25/100
roc-auc: 0.8276 

Epoch 26/100
roc-auc: 0.8339 

Epoch 27/100
roc-auc: 0.8478 

Epoch 28/100
roc-auc: 0.839 

Epoch 00027: early stopping


<keras.callbacks.History at 0x7f74a7147f50>

In [41]:
model.load_weights('sub_conv1d.h5')

y_pred = model.predict(x_test)
score = roc_auc_score(y_test,y_pred,average='weighted')
print(score)

0.877316403448


In [42]:
sub = pd.DataFrame(test['id'])

preds = model.predict(test_seq)

for i in range(len(levels)):
    sub[levels[i]]=preds[:,i]
sub.to_csv('final_pred.csv', index=False)

In [48]:
K.clear_session()

In [24]:
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.2, random_state=65)#7365

In [33]:
from keras.layers import Conv1D,MaxPooling1D,Flatten,Embedding, BatchNormalization,AveragePooling1D,GlobalMaxPooling1D,Bidirectional, LSTM


inputs = Input(shape=(150,))

embedding_layer = Embedding(len(word_index) + 1, 300, weights=[embedding_matrix],input_length=150,trainable=False)(inputs)
x = Bidirectional(LSTM(256, activation='tanh', dropout=0.3,recurrent_dropout=0.3),merge_mode='concat')(embedding_layer)
#x = BatchNormalization()(x)
x = Dense(256, activation='tanh')(x)
x = Dropout(0.5)(x)
x = Dense(256, activation='tanh')(x)
x = Dropout(0.5)(x)
x = Dense(256, activation='tanh')(x)
x = Dropout(0.5)(x)
x = Dense(256, activation='tanh')(x)
x = Dropout(0.5)(x)
x = Dense(256, activation='tanh')(x)
x = Dropout(0.5)(x)
output= Dense(6, activation='sigmoid')(x)



model = Model(inputs=inputs, outputs=output)
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=[metrics.categorical_accuracy])

mck = ModelCheckpoint('sub_conv1d_new.h5', monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True, mode='auto', period=1)
estop = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, verbose=1, mode='auto')

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 300)          102156900 
_________________________________________________________________
bidirectional_1 (Bidirection (None, 512)               1140736   
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
__________

In [34]:
model.fit(x_train, y_train, epochs=20, batch_size=200, verbose=1, shuffle=True, validation_split=0.3, callbacks=[mck,estop,roc_callback(training_data=(x_train,y_train))])

Train on 25141 samples, validate on 10775 samples
Epoch 1/20
roc-auc: 0.8295 

Epoch 2/20
roc-auc: 0.8363 

Epoch 3/20
roc-auc: 0.7121 

Epoch 4/20
roc-auc: 0.7504 

Epoch 5/20
roc-auc: 0.7956 

Epoch 6/20
roc-auc: 0.891 

Epoch 7/20
roc-auc: 0.8348 

Epoch 8/20
roc-auc: 0.6716 

Epoch 9/20
roc-auc: 0.7056 

Epoch 10/20
roc-auc: 0.8364 

Epoch 11/20
roc-auc: 0.7502 

Epoch 12/20
roc-auc: 0.8917 

Epoch 13/20
roc-auc: 0.812 

Epoch 14/20
roc-auc: 0.6749 

Epoch 15/20
roc-auc: 0.91 

Epoch 16/20
roc-auc: 0.8357 

Epoch 17/20
roc-auc: 0.8747 

Epoch 18/20
roc-auc: 0.7946 

Epoch 19/20
roc-auc: 0.9026 

Epoch 20/20
roc-auc: 0.8708 

Epoch 00019: early stopping


<keras.callbacks.History at 0x7f4dd6973e90>

In [38]:
K.clear_session()

In [35]:
model.load_weights('sub_conv1d_new.h5')

In [36]:
y_pred = model.predict(x_test)
score = roc_auc_score(y_test,y_pred,average='weighted')
print(score)

0.746647545169


In [37]:
sub = pd.DataFrame(test['id'])

preds = model.predict(test_seq)

for i in range(len(levels)):
    sub[levels[i]]=preds[:,i]
sub.to_csv('final_pred.csv', index=False)