In [7]:
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd
import gc
import sys
import gensim
import time
import keras
from string import punctuation
from tflearn.data_utils import pad_sequences

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.advanced_activations import PReLU
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [12]:
def merged_lstm():
    """Build and compile the three-input network used for fold training.

    Two integer sequence inputs go through one shared, frozen ``Embedding``
    and one shared ``LSTM``; a third input of width ``ncols`` goes through a
    stack of Dense/PReLU blocks. The three branches are concatenated and
    reduced to a single sigmoid output.

    Reads the module-level names ``nb_words``, ``embedding_dim``,
    ``word_embedding_matrix``, ``seq_length`` and ``ncols``; they must be
    defined before this function is called.

    Returns
    -------
    A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
    accuracy metric).
    """
    # Layers shared by both sequence branches (same weights for both inputs).
    shared_embedding = Embedding(
        nb_words,
        embedding_dim,
        weights=[word_embedding_matrix],
        input_length=seq_length,
        trainable=False,
    )
    shared_lstm = LSTM(
        128,
        dropout=0.25,
        recurrent_dropout=0.2,
        go_backwards=False,
        implementation=2,
    )

    # First sequence branch.
    first_seq_in = Input(shape=(seq_length,), dtype='int32')
    first_seq_vec = shared_lstm(shared_embedding(first_seq_in))

    # Second sequence branch.
    second_seq_in = Input(shape=(seq_length,), dtype='int32')
    second_seq_vec = shared_lstm(shared_embedding(second_seq_in))

    # Dense-feature branch: (units, batch-norm?, dropout rate) per stage.
    # The last stage intentionally mirrors the original and has no batch norm.
    features_in = Input(shape=(ncols,))
    tower = features_in
    for units, with_batchnorm, drop_rate in ((256, True, 0.4),
                                             (512, True, 0.2),
                                             (512, False, 0.2)):
        tower = Dense(units, kernel_initializer='he_normal')(tower)
        tower = PReLU()(tower)
        if with_batchnorm:
            tower = BatchNormalization()(tower)
        tower = Dropout(drop_rate)(tower)

    # Join the three branches and classify.
    joined = concatenate([first_seq_vec, second_seq_vec, tower])
    joined = BatchNormalization()(Dropout(0.25)(joined))

    joined = PReLU()(Dense(256)(joined))
    joined = BatchNormalization()(Dropout(0.25)(joined))

    output = Dense(1, activation='sigmoid')(joined)

    net = Model(inputs=[first_seq_in, second_seq_in, features_in], outputs=output)
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return net

In [13]:
# --- Model configuration (read as globals by merged_lstm) --------------------
seq_length = 30
embedding_dim = 50
nb_words = 5000 + 2  # NOTE(review): presumably vocabulary size plus reserved indices -- confirm


def _parse_index_column(column):
    """Turn each space-separated string in ``column`` into a list of ints."""
    return [[int(token) for token in row.split(' ')] for row in list(column)]


def _load_feature_values(csv_path):
    """Read ``csv_path`` with pandas, print the array's shape, return the values."""
    values = pd.read_csv(csv_path).values
    print(values.shape)
    return values


# --- Embedding matrix and tokenised question pairs ---------------------------
word_embedding_matrix = np.load('../data/aux/fastvec_word.npy')

train_data_word = pd.read_csv('../data/aux/train_word_indexvec.csv')

MAX_SEQUENCE_LENGTH_WORD = 30 # char 40 word 30

# Columns 1 and 2 hold the index sequences of the two questions; column 3 is
# used as the target.
word_squence_ques1_word = pad_sequences(
    _parse_index_column(train_data_word.iloc[:, 1]),
    maxlen=MAX_SEQUENCE_LENGTH_WORD)
word_squence_ques2_word = pad_sequences(
    _parse_index_column(train_data_word.iloc[:, 2]),
    maxlen=MAX_SEQUENCE_LENGTH_WORD)

q1, q2 = word_squence_ques1_word, word_squence_ques2_word
y = train_data_word.iloc[:, 3]

# --- Hand-crafted feature matrices (each load prints its shape) --------------
lda_feas_char = _load_feature_values('../lda_features_char.csv')
ngram_feas_char = _load_feature_values('../ngram_features_char.csv')
simsummary_feas_char = _load_feature_values('../simsummary_features_char.csv')
lda_feas_word = _load_feature_values('../lda_features_word.csv')
ngram_feas_word = _load_feature_values('../ngram_features_word.csv')
simsummary_feas_word = _load_feature_values('../simsummary_features_word.csv')
tfidf_feas = _load_feature_values('../tfidf_features.csv')

# Only the char-level matrices and the tf-idf matrix are stacked here; the
# word-level matrices loaded above are not part of ``all_feas``.
all_feas = np.concatenate(
    [ngram_feas_char, lda_feas_char, simsummary_feas_char, tfidf_feas],
    axis=1)
X_train = all_feas


(98976, 2)
(98976, 16)
(98976, 5)
(98976, 2)
(98976, 16)
(98976, 5)
(98976, 4)


In [24]:
class Metrics(keras.callbacks.Callback):
    """Keras callback that prints the validation F1 score after every epoch.

    Predicted probabilities are binarised with a fixed 0.25 threshold before
    scoring. The per-epoch scores are kept in ``self.val_f1s``, which is
    reset each time training begins.
    """

    def on_train_begin(self, logs=None):
        """Start a fresh F1 history for this training run."""
        self.val_f1s = []

    def on_epoch_end(self, epoch, logs=None):
        """Score the model on the validation data and print/record the F1."""
        # NOTE(review): the slicing assumes ``self.validation_data`` is laid
        # out as [inputs..., targets, <two trailing entries>], so that the
        # targets sit at index -3. This depends on the installed Keras
        # version -- confirm before upgrading.
        val_inputs = self.validation_data[:-3]
        val_targ = self.validation_data[-3]

        val_scores = np.asarray(self.model.predict(val_inputs))
        val_predict = [1 if item > 0.25 else 0 for item in val_scores]

        _val_f1 = f1_score(val_targ, val_predict)
        self.val_f1s.append(_val_f1)
        # Function-call form works on both Python 2 and 3 and matches the
        # other print() calls in this notebook; text is unchanged.
        print('epoch end, f1-score:  {}'.format(_val_f1))


# Single shared instance; lstm_foldrun() passes it to every fit() call.
metrics = Metrics()

def lstm_foldrun(X, q1, q2, y, X_test = None, q1_test = None, q2_test = None, start_fold = 0,
                name = 'LSTM_merged866cols', save = True, class_weight = None):
    """Train ``merged_lstm()`` on stratified 10-fold splits and print each fold's F1.

    For every fold from ``start_fold`` to 9 a fresh model is built, fitted
    with early stopping on ``val_loss``, checkpointed (best ``val_loss`` only)
    and scored on the held-out part with a 0.25 probability threshold.

    Parameters
    ----------
    X : pandas.DataFrame or array
        Dense feature matrix (third model input). DataFrames are converted
        with ``.values``.
    q1, q2 : array
        Sequence inputs for the two questions; indexed with the fold's
        integer index arrays.
    y : pandas.DataFrame, pandas.Series or array
        Targets. A DataFrame is read through its ``is_duplicate`` column.
    X_test, q1_test, q2_test : optional
        Accepted for interface compatibility; ``X_test`` is converted to an
        array when it is a DataFrame, otherwise these are not used here.
    start_fold : int
        Index of the first fold to train (folds before it are skipped).
    name : str
        Prefix of the checkpoint file ``../data/checkpoints/<name>_fold<i>.h5``.
    save : bool
        Accepted for interface compatibility; not consulted by this function.
    class_weight : dict or None
        Forwarded to ``model.fit``; ``None`` (default) applies no class
        weighting. Example: ``{0: 1., 1: 5}``.

    Returns
    -------
    None
    """
    n_splits = 10
    skf = StratifiedKFold(n_splits = n_splits, random_state = 111, shuffle = True)

    # Normalise pandas containers to plain arrays so that integer index
    # arrays from the splitter can be used positionally.
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(X_test, pd.DataFrame):
        X_test = X_test.values
    if isinstance(y, pd.DataFrame):
        y = y.is_duplicate.values
    if isinstance(y, pd.Series):
        y = y.values

    # Materialise all splits up front so any fold can be addressed by index.
    folds = list(skf.split(X, y))

    for fold in range(start_fold, n_splits):
        tr_index, val_index = folds[fold]
        X_tr, X_val = X[tr_index], X[val_index]
        q1_tr, q1_val = q1[tr_index], q1[val_index]
        q2_tr, q2_val = q2[tr_index], q2[val_index]
        y_tr, y_val = y[tr_index], y[val_index]

        print('Start training on fold: {}'.format(fold))

        # One checkpoint file per fold. The original passed (name, name, i)
        # to a two-placeholder template, which dropped the fold index and made
        # every fold overwrite the same file.
        checkpoint_path = '../data/checkpoints/{}_fold{}.h5'.format(name, fold)
        callbacks = [ModelCheckpoint(checkpoint_path,
                                    monitor='val_loss',
                                    verbose = 0, save_best_only = True),
                 EarlyStopping(monitor='val_loss', patience = 7, verbose = 1), metrics]

        model = merged_lstm()
        model.fit([q1_tr, q2_tr, X_tr], y_tr, validation_data=([q1_val, q2_val, X_val], y_val),
                epochs=200, batch_size=512, callbacks = callbacks,
                class_weight = class_weight)

        # NOTE(review): this scores the weights from the last epoch, not the
        # best checkpoint written above -- confirm that is intended.
        val_pred = model.predict([q1_val, q2_val, X_val], batch_size = 64)

        val_predict = [1 if item > 0.25 else 0 for item in val_pred]
        print('val score {}'.format(f1_score(y_val, val_predict)))

        # Release the fold's slices before building the next model.
        del X_tr, X_val, q1_tr, q1_val, q2_tr, q2_val
        gc.collect()

    return

In [25]:
# Width of the dense feature matrix; merged_lstm() reads this global when it
# creates the dense-feature input.
ncols = X_train.shape[1]

# start_fold=9 skips folds 0-8, so only the last fold is trained here.
lstm_foldrun(
    X_train,
    q1,
    q2,
    y,
    start_fold=9,
    name='LSTM_merged',
)

Start training on fold: 9
Train on 89079 samples, validate on 9897 samples
Epoch 1/200
epoch end, f1-score:  0.4266949152542373
Epoch 2/200
epoch end, f1-score:  0.43754478146644377
Epoch 3/200
epoch end, f1-score:  0.43645213628988644
Epoch 4/200
epoch end, f1-score:  0.4384432088959491
Epoch 5/200
epoch end, f1-score:  0.43902439024390244
Epoch 6/200
epoch end, f1-score:  0.44610202117420594
Epoch 7/200
epoch end, f1-score:  0.44619836885030256
Epoch 8/200
epoch end, f1-score:  0.4484611626770884
Epoch 9/200
epoch end, f1-score:  0.4599275070479259
Epoch 10/200
epoch end, f1-score:  0.4612628750559785
Epoch 11/200
epoch end, f1-score:  0.4512771996215705
Epoch 12/200
epoch end, f1-score:  0.4513681929016527
Epoch 13/200
epoch end, f1-score:  0.44425675675675674
Epoch 14/200
epoch end, f1-score:  0.46424328819196964
Epoch 15/200
epoch end, f1-score:  0.4634713820381573
Epoch 16/200
epoch end, f1-score:  0.4611138986452584
Epoch 17/200
epoch end, f1-score:  0.4674035641777577
Epoch 18/