In [121]:
#import dependencies
from __future__ import division
import numpy as np
import pandas as pd
import re
import h5py
import unidecode
from tqdm import tqdm
import matplotlib
% matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from keras.layers import Input, Dense, Conv1D, Dropout, Flatten, BatchNormalization
from keras.models import Model
from keras import metrics
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from keras import backend as K
from scipy.sparse import vstack

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
levels = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

#summing the toxicity levels so that we can easily divide the train data to K folds.
train['sum_level'] = train[levels[0]] + train[levels[1]] + train[levels[2]] + train[levels[3]] + train[levels[4]] + train[levels[5]]

In [None]:
#we see there are many \n characters in text. lets just remove those first
good_text = []
for i in tqdm(train['comment_text']):
    i = re.sub(r'[\n]+', ' ', i)
    i = re.sub(r'\s+', ' ', i)
    good_text.append(i)
train['comment_text'] = good_text


#we see there are many \n characters in text. lets just remove those first
good_text_test = []
for i in tqdm(test['comment_text']):
    i = re.sub(r'[\n]+', ' ', i)
    i = re.sub(r'\s+', ' ', i)
    good_text_test.append(i)
test['comment_text'] = good_text_test


#let us strip the unicode accents

good_text = []
for i in tqdm(train['comment_text']):
    i = unicode(i, 'utf-8')
    i = unidecode.unidecode(i)
    good_text.append(i)
train['comment_text'] = good_text


good_text_test = []
for i in tqdm(test['comment_text']):
    i = unicode(i, 'utf-8')
    i = unidecode.unidecode(i)
    good_text_test.append(i)
test['comment_text'] = good_text_test


good_text = []
for i in tqdm(train['comment_text']):
    i = i.lower()
    i = re.sub(r'\\\'s', ' is', i)
    i = re.sub(r'\'s', ' is', i)
    
    i = re.sub(r'can\\\'t', 'can not', i)
    i = re.sub(r'can\'t', 'can not', i)
    
    i = re.sub(r'n\\\'t', ' not', i)
    i = re.sub(r'n\'t', ' not', i)
    
    i = re.sub(r'\\\'nt', ' not', i)
    i = re.sub(r'\'nt', ' not', i)
    
    i = re.sub(r'\\\'re', ' are', i)
    i = re.sub(r'\'re', ' are', i)
    
    i = re.sub(r'\s[w]\'d', ' would', i)
    i = re.sub(r'\\\'d', ' would', i)
    i = re.sub(r'\'d', ' would', i)
    
    i = re.sub(r'\\\'ll', ' will', i)
    i = re.sub(r'\'ll', ' will', i)
    
    i = re.sub(r'i\\\'m', ' i am ', i)
    i = re.sub(r'i\'m', ' i am ', i)
    
    i = re.sub(r'\\\'pedia', ' wikipedia ', i)
    i = re.sub(r'\'pedia', ' wikipedia ', i)
    
    i = re.sub(r'https://www\.', ' www ', i)
    i = re.sub(r'www\.', ' www ', i)
    
    i = re.sub(r'\.com', ' com ', i)
    
    i = re.sub(r'[-]+', ' ', i)
    
    i = re.sub(r'[\[ \] \. " # \$ % \^ \* \( \) \? \\ / @ < > _ : = \+ \{ } \| ~ ! , \']+', ' ', i)
    
    i = re.sub(r'\s+', ' ', i)
    
    i = i.strip()
    
    good_text.append(i)
train['comment_text'] = good_text




good_text_test= []
for i in tqdm(test['comment_text']):
    i = i.lower()
    i = re.sub(r'\\\'s', ' is', i)
    i = re.sub(r'\'s', ' is', i)
    
    i = re.sub(r'can\\\'t', 'can not', i)
    i = re.sub(r'can\'t', 'can not', i)
    
    i = re.sub(r'n\\\'t', ' not', i)
    i = re.sub(r'n\'t', ' not', i)
    
    i = re.sub(r'\\\'nt', ' not', i)
    i = re.sub(r'\'nt', ' not', i)
    
    i = re.sub(r'\\\'re', ' are', i)
    i = re.sub(r'\'re', ' are', i)
    
    i = re.sub(r'\s[w]\'d', ' would', i)
    i = re.sub(r'\\\'d', ' would', i)
    i = re.sub(r'\'d', ' would', i)
    
    i = re.sub(r'\\\'ll', ' will', i)
    i = re.sub(r'\'ll', ' will', i)
    
    i = re.sub(r'i\\\'m', ' i am ', i)
    i = re.sub(r'i\'m', ' i am ', i)
    
    i = re.sub(r'\\\'pedia', ' wikipedia ', i)
    i = re.sub(r'\'pedia', ' wikipedia ', i)
    
    i = re.sub(r'https://www\.', ' www ', i)
    i = re.sub(r'www\.', ' www ', i)
    
    i = re.sub(r'\.com', ' com ', i)
    
    i = re.sub(r'[-]+', ' ', i)
    
    i = re.sub(r'[\[ \] \. " # \$ % \^ \* \( \) \? \\ / @ < > _ : = \+ \{ } \| ~ ! , \']+', ' ', i)
    
    i = re.sub(r'\s+', ' ', i)
    
    i = i.strip()
    
    good_text_test.append(i)
test['comment_text'] = good_text_test

In [4]:
tfidf_train_1 = np.loadtxt('train_tfidf_data_1.txt')
tfidf_test_1 = np.loadtxt('test_tfidf_data_1.txt')

tfidf_train_2 = np.loadtxt('train_tfidf_data_2.txt')
tfidf_test_2 = np.loadtxt('test_tfidf_data_2.txt')

We see that we have 10 times more non toxic data than toxic data. Let us divide the non toxic comments into 5 pars and train 5 models with each part each.

In [19]:
train[train['sum_level']==0].shape

(143346, 9)

In [8]:
train[train['sum_level']>0].shape

(16225, 9)

In [20]:
143346/5

28669

In [30]:
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score

In [5]:
train_clean_index = train[train['sum_level']==0].index.values
train_toxic_index = train[train['sum_level']>0].index.values

In [31]:
x = np.concatenate((tfidf_train_1[train_clean_index[:28670],:], tfidf_train_1[train_toxic_index,:]), axis=0)
y = np.concatenate((np.array(train.iloc[train_clean_index[:28670],2:-1]), np.array(train.iloc[train_toxic_index,2:-1])), axis=0)

In [32]:
assert(x.shape[0]==y.shape[0])

In [33]:
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.2, random_state=2017)

In [42]:
class roc_callback(Callback):
    def __init__(self,training_data):
        self.x = training_data[0]
        self.y = training_data[1]

    def on_train_begin(self, logs={}):
        return

    def on_train_end(self, logs={}):
        return

    def on_epoch_begin(self, epoch, logs={}):
        return

    def on_epoch_end(self, epoch, logs={}):
        y_pred = self.model.predict(self.x)
        roc = roc_auc_score(self.y, y_pred, average='weighted')
        print('\rroc-auc: %s' % (str(round(roc,4)))+' '+'\n')
        return

    def on_batch_begin(self, batch, logs={}):
        return

    def on_batch_end(self, batch, logs={}):
        return

In [102]:
#now we are ready for Deep learning. Wanted to start with boosting, but lets strt with DL. 
#we will start with a simple keras model

#considering text data let us start with a simple fully connected model


#starting keras model with tensorflow backend

inputs = Input(shape=(200,))

a = Dense(128,activation='tanh', input_shape=(None,200))(inputs)
a = Dropout(0.6)(a)
a = Dense(128,activation='tanh')(a)
a = Dropout(0.6)(a)
a = Dense(128,activation='tanh')(a)
a = Dropout(0.6)(a)
a = Dense(128,activation='tanh')(a)
a = Dropout(0.6)(a)
output= Dense(6, activation='sigmoid')(a)


model = Model(inputs=inputs, outputs=output)
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=[metrics.categorical_accuracy])

mck = ModelCheckpoint('sub_x1.h5', monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True, mode='auto', period=1)
estop = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, verbose=1, mode='auto')

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               25728     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
__________

In [44]:
model.fit(x_train,y_train,epochs=200, batch_size=80, verbose=1, shuffle=True, validation_split=0.3, callbacks=[mck,estop,roc_callback(training_data=(x_train,y_train))])

Train on 25141 samples, validate on 10775 samples
Epoch 1/200
roc-auc: 0.6644 

Epoch 2/200
roc-auc: 0.7303 

Epoch 3/200
roc-auc: 0.7013 

Epoch 4/200
roc-auc: 0.7215 

Epoch 5/200
roc-auc: 0.7811 

Epoch 6/200
roc-auc: 0.776 

Epoch 7/200
roc-auc: 0.7787 

Epoch 8/200
roc-auc: 0.742 

Epoch 9/200
roc-auc: 0.7698 

Epoch 10/200
roc-auc: 0.7849 

Epoch 11/200
roc-auc: 0.8311 

Epoch 12/200
roc-auc: 0.7231 

Epoch 13/200
roc-auc: 0.773 

Epoch 14/200
roc-auc: 0.8119 

Epoch 15/200
roc-auc: 0.7059 

Epoch 16/200
roc-auc: 0.7706 

Epoch 17/200
roc-auc: 0.699 

Epoch 18/200
roc-auc: 0.7046 

Epoch 19/200
roc-auc: 0.7665 

Epoch 20/200
roc-auc: 0.8004 

Epoch 21/200
roc-auc: 0.7652 

Epoch 22/200
roc-auc: 0.7629 

Epoch 23/200
roc-auc: 0.7624 

Epoch 00022: early stopping


<keras.callbacks.History at 0x7f3bcc36c090>

In [46]:
model.load_weights('sub_x1.h5')

y_pred = model.predict(x_test)
score = roc_auc_score(y_test,y_pred,average='weighted')
print(score)

0.763827757657


In [48]:
K.clear_session()

In [56]:
x = np.concatenate((tfidf_train_1[train_clean_index[28670:28670*2],:], tfidf_train_1[train_toxic_index,:]), axis=0)
y = np.concatenate((np.array(train.iloc[train_clean_index[28670:28670*2],2:-1]), np.array(train.iloc[train_toxic_index,2:-1])), axis=0)

In [57]:
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.2, random_state=1)

In [105]:
inputs = Input(shape=(200,))

a = Dense(128,activation='tanh', input_shape=(None,200))(inputs)
a = Dropout(0.6)(a)
a = Dense(128,activation='tanh')(a)
a = Dropout(0.6)(a)
a = Dense(128,activation='tanh')(a)
a = Dropout(0.6)(a)
output= Dense(6, activation='sigmoid')(a)


model = Model(inputs=inputs, outputs=output)
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=[metrics.categorical_accuracy])

mck = ModelCheckpoint('sub_x2.h5', monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True, mode='auto', period=1)
estop = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, verbose=1, mode='auto')

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               25728     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
__________

In [59]:
model.fit(x_train,y_train,epochs=200, batch_size=80, verbose=1, shuffle=True, validation_split=0.3, callbacks=[mck,estop,roc_callback(training_data=(x_train,y_train))])

Train on 25141 samples, validate on 10775 samples
Epoch 1/200
roc-auc: 0.6378 

Epoch 2/200
roc-auc: 0.6812 

Epoch 3/200
roc-auc: 0.7117 

Epoch 4/200
roc-auc: 0.7204 

Epoch 5/200
roc-auc: 0.7413 

Epoch 6/200
roc-auc: 0.7773 

Epoch 7/200
roc-auc: 0.7527 

Epoch 8/200
roc-auc: 0.7785 

Epoch 9/200
roc-auc: 0.7974 

Epoch 10/200
roc-auc: 0.7877 

Epoch 11/200
roc-auc: 0.7754 

Epoch 12/200
roc-auc: 0.7864 

Epoch 13/200
roc-auc: 0.7641 

Epoch 14/200
roc-auc: 0.8031 

Epoch 15/200
roc-auc: 0.8094 

Epoch 16/200
roc-auc: 0.7578 

Epoch 17/200
roc-auc: 0.8047 

Epoch 18/200
roc-auc: 0.8167 

Epoch 00017: early stopping


<keras.callbacks.History at 0x7f3b86b7c150>

In [60]:
model.load_weights('sub_x2.h5')

y_pred = model.predict(x_test)
score = roc_auc_score(y_test,y_pred,average='weighted')
print(score)

0.772809472792


In [61]:
K.clear_session()

In [62]:
x = np.concatenate((tfidf_train_1[train_clean_index[28670*2:28670*3],:], tfidf_train_1[train_toxic_index,:]), axis=0)
y = np.concatenate((np.array(train.iloc[train_clean_index[28670*2:28670*3],2:-1]), np.array(train.iloc[train_toxic_index,2:-1])), axis=0)

x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.2, random_state=634)

In [108]:
inputs = Input(shape=(200,))

a = Dense(128,activation='tanh', input_shape=(None,200))(inputs)
a = Dropout(0.5)(a)
a = Dense(128,activation='tanh')(a)
a = Dropout(0.5)(a)
a = Dense(128,activation='tanh')(a)
a = Dropout(0.5)(a)
a = Dense(128,activation='tanh')(a)
a = Dropout(0.5)(a)
output= Dense(6, activation='sigmoid')(a)


model = Model(inputs=inputs, outputs=output)
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=[metrics.categorical_accuracy])

mck = ModelCheckpoint('sub_x3.h5', monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True, mode='auto', period=1)
estop = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, verbose=1, mode='auto')

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               25728     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
__________

In [81]:
model.fit(x_train,y_train,epochs=200, batch_size=50, verbose=1, shuffle=True, validation_split=0.3, callbacks=[mck,estop,roc_callback(training_data=(x_train,y_train))])

Train on 25141 samples, validate on 10775 samples
Epoch 1/200
roc-auc: 0.6937 

Epoch 2/200
roc-auc: 0.7108 

Epoch 3/200
roc-auc: 0.7133 

Epoch 4/200
roc-auc: 0.8045 

Epoch 5/200
roc-auc: 0.8054 

Epoch 6/200
roc-auc: 0.8098 

Epoch 7/200
roc-auc: 0.7438 

Epoch 8/200
roc-auc: 0.8101 

Epoch 9/200
roc-auc: 0.8145 

Epoch 10/200
roc-auc: 0.7836 

Epoch 11/200
roc-auc: 0.7714 

Epoch 12/200
roc-auc: 0.7998 

Epoch 13/200
roc-auc: 0.7855 

Epoch 14/200
roc-auc: 0.7527 

Epoch 15/200
roc-auc: 0.7539 

Epoch 16/200
roc-auc: 0.7333 

Epoch 17/200
roc-auc: 0.7614 

Epoch 18/200
roc-auc: 0.7825 

Epoch 19/200
roc-auc: 0.7914 

Epoch 20/200
roc-auc: 0.7322 

Epoch 00019: early stopping


<keras.callbacks.History at 0x7f3b2461cf50>

In [82]:
model.load_weights('sub_x3.h5')

y_pred = model.predict(x_test)
score = roc_auc_score(y_test,y_pred,average='weighted')
print(score)

0.806991016792


In [83]:
K.clear_session()

In [84]:
x = np.concatenate((tfidf_train_1[train_clean_index[28670*3:28670*4],:], tfidf_train_1[train_toxic_index,:]), axis=0)
y = np.concatenate((np.array(train.iloc[train_clean_index[28670*3:28670*4],2:-1]), np.array(train.iloc[train_toxic_index,2:-1])), axis=0)

x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.2, random_state=1264)

In [111]:
inputs = Input(shape=(200,))

a = Dense(128,activation='tanh', input_shape=(None,200))(inputs)
a = Dropout(0.6)(a)
a = Dense(128,activation='tanh')(a)
a = Dropout(0.6)(a)
a = Dense(128,activation='tanh')(a)
a = Dropout(0.6)(a)
a = Dense(128,activation='tanh')(a)
a = Dropout(0.6)(a)
output= Dense(6, activation='sigmoid')(a)


model = Model(inputs=inputs, outputs=output)
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=[metrics.categorical_accuracy])

mck = ModelCheckpoint('sub_x4.h5', monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True, mode='auto', period=1)
estop = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, verbose=1, mode='auto')

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               25728     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
__________

In [90]:
model.fit(x_train,y_train,epochs=200, batch_size=50, verbose=1, shuffle=True, validation_split=0.3, callbacks=[mck,estop,roc_callback(training_data=(x_train,y_train))])

Train on 25141 samples, validate on 10775 samples
Epoch 1/200
roc-auc: 0.6472 

Epoch 2/200
roc-auc: 0.6804 

Epoch 3/200
roc-auc: 0.741 

Epoch 4/200
roc-auc: 0.6969 

Epoch 5/200
roc-auc: 0.7619 

Epoch 6/200
roc-auc: 0.7564 

Epoch 7/200
roc-auc: 0.7763 

Epoch 8/200
roc-auc: 0.7468 

Epoch 9/200
roc-auc: 0.7777 

Epoch 10/200
roc-auc: 0.7665 

Epoch 11/200
roc-auc: 0.7743 

Epoch 12/200
roc-auc: 0.8114 

Epoch 13/200
roc-auc: 0.8167 

Epoch 14/200
roc-auc: 0.7948 

Epoch 15/200
roc-auc: 0.7909 

Epoch 16/200
roc-auc: 0.7848 

Epoch 17/200
roc-auc: 0.7931 

Epoch 18/200
roc-auc: 0.8138 

Epoch 19/200
roc-auc: 0.7906 

Epoch 20/200
roc-auc: 0.8208 

Epoch 21/200
roc-auc: 0.7893 

Epoch 22/200
roc-auc: 0.781 

Epoch 23/200
roc-auc: 0.7853 

Epoch 00022: early stopping


<keras.callbacks.History at 0x7f3b449a6810>

In [91]:
model.load_weights('sub_x4.h5')

y_pred = model.predict(x_test)
score = roc_auc_score(y_test,y_pred,average='weighted')
print(score)

0.816510195348


In [92]:
K.clear_session()

In [93]:
x = np.concatenate((tfidf_train_1[train_clean_index[28670*4:],:], tfidf_train_1[train_toxic_index,:]), axis=0)
y = np.concatenate((np.array(train.iloc[train_clean_index[28670*4:],2:-1]), np.array(train.iloc[train_toxic_index,2:-1])), axis=0)

x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.2, random_state=2673)

In [114]:
inputs = Input(shape=(200,))

a = Dense(128,activation='tanh', input_shape=(None,200))(inputs)
a = Dropout(0.6)(a)
a = Dense(128,activation='tanh')(a)
a = Dropout(0.6)(a)
a = Dense(128,activation='tanh')(a)
a = Dropout(0.6)(a)
output= Dense(6, activation='sigmoid')(a)


model = Model(inputs=inputs, outputs=output)
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=[metrics.categorical_accuracy])

mck = ModelCheckpoint('sub_x5.h5', monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True, mode='auto', period=1)
estop = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, verbose=1, mode='auto')

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               25728     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
__________

In [99]:
model.fit(x_train,y_train,epochs=200, batch_size=50, verbose=1, shuffle=True, validation_split=0.3, callbacks=[mck,estop,roc_callback(training_data=(x_train,y_train))])

Train on 25138 samples, validate on 10774 samples
Epoch 1/200
roc-auc: 0.6555 

Epoch 2/200
roc-auc: 0.6983 

Epoch 3/200
roc-auc: 0.7153 

Epoch 4/200
roc-auc: 0.749 

Epoch 5/200
roc-auc: 0.7513 

Epoch 6/200
roc-auc: 0.7421 

Epoch 7/200
roc-auc: 0.7907 

Epoch 8/200
roc-auc: 0.7839 

Epoch 9/200
roc-auc: 0.7942 

Epoch 10/200
roc-auc: 0.8223 

Epoch 11/200
roc-auc: 0.7596 

Epoch 12/200
roc-auc: 0.8409 

Epoch 13/200
roc-auc: 0.7903 

Epoch 14/200
roc-auc: 0.7731 

Epoch 15/200
roc-auc: 0.7545 

Epoch 16/200
roc-auc: 0.7542 

Epoch 17/200
roc-auc: 0.7956 

Epoch 18/200
roc-auc: 0.8358 

Epoch 19/200
roc-auc: 0.7966 

Epoch 20/200
roc-auc: 0.772 

Epoch 21/200
roc-auc: 0.7585 

Epoch 22/200
roc-auc: 0.8235 

Epoch 23/200
roc-auc: 0.7812 

Epoch 24/200
roc-auc: 0.7977 



Epoch 25/200
roc-auc: 0.7492 

Epoch 26/200
roc-auc: 0.7822 

Epoch 27/200
roc-auc: 0.7801 

Epoch 28/200
roc-auc: 0.7359 

Epoch 00027: early stopping


<keras.callbacks.History at 0x7f3b116b9bd0>

In [100]:
model.load_weights('sub_x5.h5')

y_pred = model.predict(x_test)
score = roc_auc_score(y_test,y_pred,average='weighted')
print(score)

0.789489397956


In [113]:
K.clear_session()

In [103]:
model.load_weights('sub_x1.h5')
y_pred_1 = model.predict(tfidf_test_1)

In [106]:
model.load_weights('sub_x2.h5')
y_pred_2 = model.predict(tfidf_test_1)

In [109]:
model.load_weights('sub_x3.h5')
y_pred_3 = model.predict(tfidf_test_1)

In [112]:
model.load_weights('sub_x4.h5')
y_pred_4 = model.predict(tfidf_test_1)

In [115]:
model.load_weights('sub_x5.h5')
y_pred_5 = model.predict(tfidf_test_1)

In [122]:
bagging_pred = y_pred_1+y_pred_2+y_pred_3+y_pred_4+y_pred_5
bagging_pred = bagging_pred/5

In [124]:
sub = pd.DataFrame(test['id'])

for i in range(len(levels)):
    sub[levels[i]]=bagging_pred[:,i]
sub.to_csv('bagging_pred.csv', index=False)

In [116]:
sub = pd.DataFrame(test['id'])

for i in range(len(levels)):
    sub[levels[i]]=y_pred_1[:,i]
sub.to_csv('pred_1.csv', index=False)

In [117]:
sub = pd.DataFrame(test['id'])

for i in range(len(levels)):
    sub[levels[i]]=y_pred_2[:,i]
sub.to_csv('pred_2.csv', index=False)

In [118]:
sub = pd.DataFrame(test['id'])

for i in range(len(levels)):
    sub[levels[i]]=y_pred_3[:,i]
sub.to_csv('pred_3.csv', index=False)

In [119]:
sub = pd.DataFrame(test['id'])

for i in range(len(levels)):
    sub[levels[i]]=y_pred_4[:,i]
sub.to_csv('pred_4.csv', index=False)

#This is the best till now

In [120]:
sub = pd.DataFrame(test['id'])

for i in range(len(levels)):
    sub[levels[i]]=y_pred_5[:,i]
sub.to_csv('pred_5.csv', index=False)