In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files (train.csv, test.csv) are read from the "input/" directory
# relative to the notebook's working directory; see the pd.read_csv calls below.

import os
from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras.preprocessing import text, sequence
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Model
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Location of the pre-trained GloVe vectors. The default is the original
# author's local path; set the EMBEDDING_FILE environment variable to point
# somewhere else without editing the notebook.
EMBEDDING_FILE = os.environ.get(
    'EMBEDDING_FILE',
    '/media/ashish/New Volume/M.Tech/Thesis/Dataset/Thesis2/scripts/reader/data/embeddings/glove.840B.300d.txt',
)

# Raw competition data; both frames must contain a "comment_text" column,
# and train must also contain the six label columns used further down.
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

In [3]:
# Series.fillna returns a NEW Series; the original cell discarded that result,
# so missing comments stayed NaN. Chain the fill into the expression that is
# actually used, so every entry is a string before lower-casing.
# The source frames are left unmodified.
X_train = train["comment_text"].fillna("fillna").str.lower()
X_test = test["comment_text"].fillna("fillna").str.lower()

# Multi-label targets: one binary column per toxicity category.
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

In [4]:
# Hyper-parameters shared by the tokenizer, padding and embedding cells.
max_features = 110000  # passed to the tokenizer as num_words; caps embedding rows
maxlen = 200           # length every sequence is padded to
embed_size = 300       # width of the embedding matrix

In [5]:
# Build the vocabulary from the train and test comments together.
tok = text.Tokenizer(num_words=max_features, lower=True)
tok.fit_on_texts(list(X_train) + list(X_test))

# Replace the raw text with integer sequences (X_train / X_test are rebound
# here, so this cell is not safe to re-run without re-running the cell above).
X_train, X_test = tok.texts_to_sequences(X_train), tok.texts_to_sequences(X_test)

# Fixed-length model inputs.
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [6]:
# Map each token in the embedding file to its float32 vector.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        # A line is parsed as: first space-separated field = token,
        # remaining fields = vector components.
        word, *coefs = line.rstrip().split(' ')
        embeddings_index[word] = np.asarray(coefs, dtype='float32')

In [7]:
word_index = tok.word_index

# One row per token id, capped at max_features; rows stay all-zero for
# tokens that have no pre-trained vector.
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))

for word, i in word_index.items():
    if i < max_features:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [8]:
# Model graph: frozen pre-trained embeddings -> spatial dropout -> BiLSTM ->
# Conv1D -> concatenated global average/max pooling -> dropout -> 6 sigmoid
# outputs (one per label column of y_train).
sequence_input = Input(shape=(maxlen, ))

# input_dim must equal the number of rows of embedding_matrix. That matrix is
# built with num_words = min(max_features, len(word_index) + 1) rows, so using
# max_features here breaks weight loading whenever the vocabulary is smaller
# than max_features. Using num_words is identical when the cap is reached.
x = Embedding(num_words, embed_size, weights=[embedding_matrix], trainable=False)(sequence_input)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(LSTM(128, return_sequences=True, recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)

# Summarise the convolved sequence with both its mean and its max over time.
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
# x = Dense(128, activation='relu')(x)
x = Dropout(0.1)(x)
preds = Dense(6, activation="sigmoid")(x)

# The Model itself is instantiated and compiled in a later cell from
# (sequence_input, preds).
# model = Model(sequence_input, preds)
# model.compile(loss='binary_crossentropy',optimizer=Adam(lr=1e-3),metrics=['accuracy'])

In [9]:
# 4-way splitter; shuffling uses a fixed random_state so fold membership is
# the same on every run.
folds = KFold(shuffle=True, random_state=1, n_splits=4)

In [48]:
# Training settings (only used by the commented-out training loop below).
batch_size = 128
epochs = 5

# One checkpoint per fold; the names match what the training loop below writes
# ("weights_base.best" + str(n_fold) + ".hdf5").
filepath0 = "weights_base.best0.hdf5"
filepath1 = "weights_base.best1.hdf5"
filepath2 = "weights_base.best2.hdf5"
filepath3 = "weights_base.best3.hdf5"
fold_weight_files = [filepath0, filepath1, filepath2, filepath3]

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy', optimizer='adamax', metrics=['accuracy'])

# K-fold training loop, kept for reference (disabled in the original notebook).
# NOTE(review): presumably this is how the checkpoint files above were
# produced - confirm before relying on them.
# for n_fold, (trn_idx, val_idx) in enumerate(folds.split(x_train)):
#     print('n_fold ',n_fold)
#     model = Model(sequence_input, preds)
#     model.compile(loss='binary_crossentropy',optimizer='adamax',metrics=['accuracy'])
#
#     filepath="weights_base.best"+str(n_fold)+".hdf5"
#     checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
#     early = EarlyStopping(monitor="val_acc", mode="max", patience=6)
#     callbacks_list = [checkpoint, early]
#
#     model.fit(x_train[trn_idx], y_train[trn_idx], batch_size=batch_size, epochs=epochs, validation_data=(x_train[val_idx], y_train[val_idx]),callbacks = callbacks_list,verbose=1)

# Collect one test-set prediction per fold checkpoint.
# The original cells loaded a single checkpoint and appended to a `temp` list
# that was never initialised anywhere, so they raised NameError on a fresh
# kernel and could only ensemble folds by editing and re-running by hand.
# Initialising the list here and looping makes the cell reproducible.
temp = []
for fold_weights in fold_weight_files:
    model.load_weights(fold_weights)
    y_pred = model.predict(x_test, batch_size=1024, verbose=1)
    temp.append(y_pred)
# After the loop, `model` holds the filepath3 weights and `y_pred` is that
# fold's prediction, matching the end state of the original cells.

In [51]:
# Element-wise average of the collected predictions (axis 0 = over the list).
np.asarray(temp).mean(axis=0)

array([[9.95147705e-01, 4.03120250e-01, 9.86546636e-01, 4.92067561e-02,
        9.54477608e-01, 4.82635289e-01],
       [7.03774393e-04, 4.34793656e-05, 2.60125875e-04, 3.30480834e-05,
        2.57319713e-04, 6.25163448e-05],
       [1.67856750e-03, 3.50551418e-04, 5.70371456e-04, 2.38873181e-04,
        4.52925306e-04, 2.11344071e-04],
       ...,
       [6.05303445e-04, 1.87894639e-05, 1.65383055e-04, 3.09339084e-05,
        1.14635804e-04, 3.02304106e-05],
       [6.12700824e-04, 4.91729588e-05, 1.69422958e-04, 3.66934837e-05,
        1.87041165e-04, 8.06317141e-04],
       [9.87170696e-01, 7.63911707e-03, 8.22301507e-01, 2.93512293e-03,
        5.28440833e-01, 3.28193512e-03]], dtype=float32)

In [52]:
# Fill the sample submission's label columns with the averaged predictions
# and write the result to disk.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

submission = pd.read_csv('sample_submission.csv')
averaged_predictions = np.mean(temp, axis=0)
submission[label_columns] = averaged_predictions
submission.to_csv('submission_128.csv', index=False)