## RNN + leak features

Since an RNN alone can't yield a great result (this data set has many special features and distributions), we need to feed some of the extracted features into the RNN model as leak features. This improved the log-loss from 0.3 to 0.153, an almost 50% improvement!

In [3]:
import os
import re
import csv
import codecs
import pickle
import numpy as np
import pandas as pd

from string import punctuation
from collections import defaultdict

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import RMSprop, Nadam

from sklearn.preprocessing import StandardScaler

import sys

Using TensorFlow backend.


In [4]:
'''
set directories and parameters
'''
# Input files: pre-trained GloVe vectors and the Quora question-pairs CSVs.
EMBEDDING_FILE = '../dataset/glove/glove.6B.300d.txt'
TRAIN_DATA_FILE = '../dataset/quora-question-pairs/train.csv'
TEST_DATA_FILE = '../dataset/quora-question-pairs/test.csv'
# Length of each question sequence fed to the model (Input shape and
# Embedding input_length below).
MAX_SEQUENCE_LENGTH = 30
# Vocabulary cap used when the cached token sequences were built.
MAX_NB_WORDS = 200000
# Width of one word vector; passed to the Embedding layer.
EMBEDDING_DIM = 300
# Fraction of the shuffled training rows held out for validation.
VALIDATION_SPLIT = 0.1

# When True, class weights are passed to training and per-sample weights are
# applied to the validation loss.
re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

## Process texts in datasets (Cached)

<font color='red'>**Following codes are commented since their results are cached**</font>

In [68]:
# print('Processing text dataset')

# # The function "text_to_wordlist" is from
# # https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
# def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
#     # Clean the text, with the option to remove stopwords and to stem words.
    
#     # Convert words to lower case and split them
#     text = text.lower().split()

#     # Optionally, remove stop words
#     if remove_stopwords:
#         stops = set(stopwords.words("english"))
#         text = [w for w in text if not w in stops]
    
#     text = " ".join(text)

#     # Clean the text
    
#     text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
#     text = re.sub(r"what's", "what is ", text)
#     text = re.sub(r"\'s", " ", text)
#     text = re.sub(r"\'ve", " have ", text)
#     text = re.sub(r"can't", "cannot ", text)
#     text = re.sub(r"won't", " will not ", text)
#     text = re.sub(r"n't", " not ", text)
#     text = re.sub(r"i'm", "i am ", text)
#     text = re.sub(r"I'm", "i am ", text)
#     text = re.sub(r"\'re", " are ", text)
#     text = re.sub(r"\'d", " would ", text)
#     text = re.sub(r"\'ll", " will ", text)
#     text = re.sub(r",", " ", text)
#     text = re.sub(r"\.", " ", text)
#     text = re.sub(r"!", " ! ", text)
#     text = re.sub(r"\/", " ", text)
#     text = re.sub(r"\^", " ^ ", text)
#     text = re.sub(r"\+", " + ", text)
#     text = re.sub(r"\-", " - ", text)
#     text = re.sub(r"\=", " = ", text)
#     text = re.sub(r"'", " ", text)
#     text = re.sub(r":", " : ", text)
#     text = re.sub(r" e g ", " eg ", text)
#     text = re.sub(r" b g ", " bg ", text)
#     text = re.sub(r" u s ", " american ", text)
#     text = re.sub(r"\0s", "0", text)
#     text = re.sub(r" 9 11 ", "911", text)
#     text = re.sub(r"e - mail", "email", text)
#     text = re.sub(r"j k", "jk", text)
#     text = re.sub(r"\s{2,}", " ", text)
    
#     # Optionally, shorten words to their stems
#     if stem_words:
#         text = text.split()
#         stemmer = SnowballStemmer('english')
#         stemmed_words = [stemmer.stem(word) for word in text]
#         text = " ".join(stemmed_words)
    
#     # Return a list of words
#     return(text)

# texts_1 = [] 
# texts_2 = []
# labels = []
# with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
#     reader = csv.reader(f, delimiter=',')
#     header = next(reader)
#     for values in reader:
#         texts_1.append(text_to_wordlist(values[3]))
#         texts_2.append(text_to_wordlist(values[4]))
#         labels.append(int(values[5]))
# print('Found %s texts in train.csv' % len(texts_1))

# test_texts_1 = []
# test_texts_2 = []
# test_ids = []
# with codecs.open(TEST_DATA_FILE, encoding='utf-8') as f:
#     reader = csv.reader(f, delimiter=',')
#     header = next(reader)
#     for values in reader:
#         test_texts_1.append(text_to_wordlist(values[1]))
#         test_texts_2.append(text_to_wordlist(values[2]))
#         test_ids.append(values[0])
# print('Found %s texts in test.csv' % len(test_texts_1))

## Spell Checker with Glove (Cached)
<font color='red'>**Following codes are commented since their results are cached**</font>

In [69]:
# # Import spacy corpus, glove embeddings.
# import spacy
# import textacy

# print('Loading SpaCy `en_core_web_md` corpus...')
# nlp = spacy.load('en_core_web_md')

# def word_prob(word):
#     "Probability of word."
#     return nlp.vocab[word].prob

# def correction(word):
#     "Most probable spelling correction for word."
#     if nlp.vocab[word].prob > -15.0: 
#         return word, False
#     else:
#         return max(candidates(word), key=word_prob), True

# def candidates(word):
#     "Generate possible spelling corrections for word."
#     return set([word] + known([word]) + known(edits1(word)) + known(edits2(word)))

# def known(words):
#     "The subset of `words` that appear in the vocabulary."
#     return [w for w in words if w in nlp.vocab]

# def edits1(word):
#     "All edits that are one edit away from `word`."
#     letters    = 'abcdefghijklmnopqrstuvwxyz'
#     splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
#     deletes    = [L + R[1:]               for L, R in splits if R]
#     transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
#     replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
#     inserts    = [L + c + R               for L, R in splits for c in letters]
#     return set(deletes + transposes + replaces + inserts)

# def edits2(word):
#     "All edits that are two edits away from `word`."
#     return (e2 for e1 in edits1(word) for e2 in edits1(e1))

# abbr_dict = {
#     "\'s":" is",
#     "\'re":" are",

#     "i'm":"i am",
#     "it's":"it is",
#     "\'ve":" have",

#     "\'ll":" will",

#     "won't":"will not",
#     "can't":"can not",
#     "\'t":" not",
    
# }

# def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    
#     if type(text)!=str:
#         text = ''
    
#     # Remove spaces.
#     text = textacy.preprocess_text(text, lowercase=True)

#     # Handle Abbreviation.
#     for k, v in abbr_dict.items():
#         text = re.sub(r"{}".format(k), v, text)

#     # Spelling correction
#     corrected_words = []
#     modified_count = 0
#     for doc in nlp(text):
#         # Don't correct special words
#         if doc.ent_type_ is not '' or doc.pos_ in ['PROPN', 'PUNCT']:
#             corrected_words.append(doc.text)
#         else:
#             corrected, is_modified = correction(doc.text)
#             corrected_words.append(corrected)
#             if is_modified:
#                 modified_count += 1

# #     # Re-merge corrected words
# #     text = " ".join(corrected_words)

# #     # Text normalization
# #     text = re.sub('[\!\?\@\^\+\*\/\,\~\|\`\=\:\;\.\#\\\\(\)\[\]\{\}\<\>\'\"]', ' ', text)

# #     # Convert to lower case, remove punctuations and further text normalization.
# #     text = textacy.preprocess_text(text, lowercase=True, no_punct=True, no_numbers=True, no_currency_symbols=True)

#     return text, modified_count

In [70]:
# ########################################
# ## Implement multi thread
# ########################################

# from joblib import Parallel, delayed
# from math import ceil

# TOTAL_SPLITS = 64

# def batch_process(batch_data, is_train=False):
#     res1 = [] 
#     res2 = [] 
#     labels = []
#     mod_count1 = []
#     mod_count2 = []
    
#     for i,series in batch_data.iterrows():
#         q1_modified, q1_modified_count = text_to_wordlist(series['question1'])
#         q2_modified, q2_modified_count = text_to_wordlist(series['question2'])
#         res1.append(q1_modified)
#         res2.append(q2_modified)
#         mod_count1.append(q1_modified_count)
#         mod_count2.append(q2_modified_count)
#         if is_train:
#             labels.append(int(series['is_duplicate']))
#         else:
#             labels.append(int(series['test_id']))
    
#     return res1, res2, mod_count1, mod_count2, labels
    
# def join_jobs_result(results, is_train=False):
         
#     res1 = np.concatenate([results[i][0] for i in range(TOTAL_SPLITS)]).tolist()
#     res2 = np.concatenate([results[i][1] for i in range(TOTAL_SPLITS)]).tolist()
    
#     mod1 = np.concatenate([results[i][2] for i in range(TOTAL_SPLITS)]).tolist()
#     mod2 = np.concatenate([results[i][3] for i in range(TOTAL_SPLITS)]).tolist()
    
#     if is_train:
#         labels = np.concatenate([results[i][-1] for i in range(TOTAL_SPLITS)]).tolist()
#     else:
#         ids = np.concatenate([results[i][-1] for i in range(TOTAL_SPLITS)]).tolist()
    
#     if is_train:
#         return res1, res2, mod1, mod2, labels
#     else:
#         return res1, res2, mod1, mod2, ids

# ########################################
# ## process texts in datasets
# ########################################
# print('Processing text dataset')

# # df = pd.read_csv(TRAIN_DATA_FILE, encoding='utf-8')
# # SPLIT_SIZE = int(ceil(len(df)/TOTAL_SPLITS))
# # res = Parallel(n_jobs=-1, verbose=11)(delayed(batch_process)(df[i*SPLIT_SIZE:(i+1)*SPLIT_SIZE],is_train=True) for i in range(TOTAL_SPLITS))
# # texts_1, texts_2, mod1, mod2, labels = join_jobs_result(res, is_train=True)

# # print('Found %s texts in train.csv' % len(texts_1))

# df = pd.read_csv(TEST_DATA_FILE, encoding='utf-8')
# SPLIT_SIZE = int(ceil(len(df)/TOTAL_SPLITS))
# res = Parallel(n_jobs=-1, verbose=11)(delayed(batch_process)(df[i*SPLIT_SIZE:(i+1)*SPLIT_SIZE],is_train=False) for i in range(TOTAL_SPLITS))
# test_texts_1, test_texts_2, test_mod1, test_mod2, test_ids = join_jobs_result(res, is_train=False)

# print('Found %s texts in test.csv' % len(test_texts_1))

# del df
# del res

In [71]:
# ## dump modified count feature to file
# def write_modified_features(m1, m2, is_train):
#     if is_train:
#         path = './features_from_model/train/modified_count.csv'
#     else:
#         path = './features_from_model/test/modified_count.csv'
#     df = pd.DataFrame(np.array([m1, m2]).reshape(-1,2), columns=['mod1','mod2'])
#     df.to_csv(path)
    
# # write_modified_features(mod1, mod2, is_train=True)
# write_modified_features(test_mod1, test_mod2, is_train=False)

In [72]:
# pickle.dump([texts_1, texts_2, labels],open('./leaks_cache/train_text_processed.pkl','wb'))
# pickle.dump([test_texts_1, test_texts_2, test_ids],open('./leaks_cache/test_text_processed.pkl','wb'))

## Embed sentences into vectors (cached)
<font color='red'>**Following codes are commented since their results are cached**</font>

In [178]:
# # use NER (name entity recognition)
# is_use_NER = True 

# correct_words = True
# use_fasttext = False

# if is_use_NER:
#     [texts_1, texts_2, labels] = pickle.load(open('./leaks_cache/train_text_spacy_cleaned.pkl','rb'))
#     [test_texts_1, test_texts_2, test_ids] = pickle.load(open('./leaks_cache/test_text_spacy_cleaned.pkl','rb'))
# else:
#     if not correct_words:
#         [texts_1, texts_2, labels] = pickle.load(open('./leaks_cache/train_text_without_process.pkl','rb'))
#         [test_texts_1, test_texts_2, test_ids] = pickle.load(open('./leaks_cache/test_text_without_process.pkl','rb'))
#     else:
#         [texts_1, texts_2, labels] = pickle.load(open('./leaks_cache/train_text_processed.pkl','rb'))
#         [test_texts_1, test_texts_2, test_ids] = pickle.load(open('./leaks_cache/test_text_processed.pkl','rb'))


In [1]:
# print('Tokenizing ... ')
# tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
# tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

# sequences_1 = tokenizer.texts_to_sequences(texts_1)
# sequences_2 = tokenizer.texts_to_sequences(texts_2)
# test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
# test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

# word_index = tokenizer.word_index
# print('Found %s unique tokens' % len(word_index))

# data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
# data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
# labels = np.array(labels)
# print('Shape of data tensor:', data_1.shape)
# print('Shape of label tensor:', labels.shape)

# test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
# test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
# test_ids = np.array(test_ids)

In [2]:
# ########################################
# ## index word vectors
# ########################################
# print('Indexing word vectors')

# embeddings_index = {}
# f = open(EMBEDDING_FILE, encoding='utf-8')
# count = 0
# for line in f:
#     values = line.split()
#     word = values[0]
#     coefs = np.asarray(values[1:], dtype='float32')
#     embeddings_index[word] = coefs
# f.close()

# print('Found %d word vectors of glove.' % len(embeddings_index))

In [3]:
# ########################################
# ## prepare embeddings
# ########################################
# print('Preparing embedding matrix')

# if use_fasttext:
#     fasttext_model = pickle.load(open('../Howard/features/fasttext/fasttext_model.pkl','rb'))

# nb_words = min(MAX_NB_WORDS, len(word_index))+1

# embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
# for word, i in word_index.items():
#     if use_fasttext:
#         embedding_vector = fasttext_model.get(word, np.zeros((300,)))
#     else:
#         embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector
# print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

In [4]:
# print('Saving')
# if is_use_NER:
#     pickle.dump([data_1,data_2,labels,test_data_1,test_data_2,test_ids,embedding_matrix,nb_words],open('./leaks_cache/cache_NER.pkl','wb'))
# else:
#     if correct_words:
#         if use_fasttext:
#             pickle.dump([data_1,data_2,labels,test_data_1,test_data_2,test_ids,embedding_matrix,nb_words],open('./leaks_cache/cache_fasttext_text_correction.pkl','wb'))
#         else:
#             pickle.dump([data_1,data_2,labels,test_data_1,test_data_2,test_ids,embedding_matrix,nb_words],open('./leaks_cache/cache_text_correction.pkl','wb'))

#     else:
#         if use_fasttext:
#             pickle.dump([data_1,data_2,labels,test_data_1,test_data_2,test_ids,embedding_matrix,nb_words],open('./leaks_cache/cache_fasttext.pkl','wb'))
#         else:
#             pickle.dump([data_1,data_2,labels,test_data_1,test_data_2,test_ids,embedding_matrix,nb_words],open('./leaks_cache/cache.pkl','wb'))

## Load cached data

<font color='red'>**Choose which version of cache to be used**</font>

In [181]:
# '''no correction , use GloVe'''
# data_1,data_2,labels,test_data_1,test_data_2,test_ids,embedding_matrix, nb_words = pickle.load(open('./leaks_cache/cache.pkl','rb'))
# model_name = 'glove_without_word_correction'

# '''correction , use GloVe'''
# data_1,data_2,labels,test_data_1,test_data_2,test_ids,embedding_matrix, nb_words = pickle.load(open('./leaks_cache/cache_text_correction.pkl','rb'))
# model_name = 'glove_with_word_correction'


# '''no correction , use fasttext , 0.1602 -> 0.1607'''
# data_1,data_2,labels,test_data_1,test_data_2,test_ids,embedding_matrix, nb_words = pickle.load(open('./leaks_cache/cache_fasttext.pkl','rb'))
# model_name = 'fasttext_without_word_correction'

# '''correction , use fasttext , 0.1592 -> 0.1614'''
# data_1,data_2,labels,test_data_1,test_data_2,test_ids,embedding_matrix, nb_words = pickle.load(open('./leaks_cache/cache_fasttext_text_correction.pkl','rb'))
# model_name = 'fasttext_with_word_correction'

# Load the NER-cleaned text cache. The label that used to sit on this cell
# ("no correction , use fasttext , 0.1602 -> 0.1607") duplicated the fasttext
# option's label and did not describe the file actually loaded here.
# NOTE: unpickling can execute arbitrary code; only load caches you produced.
with open('./leaks_cache/cache_NER.pkl', 'rb') as cache_file:
    (data_1, data_2, labels, test_data_1, test_data_2, test_ids,
     embedding_matrix, nb_words) = pickle.load(cache_file)
model_name = 'use_NER'

## Gen or read features

Select which leak features are used in the training phase. The selection can be treated as a kind of hyper-parameter.

In [5]:
#### Add extra features to leak

# False: every feature file stays a separate matrix (and later gets its own
#        model input). True: all files are hstack-ed into one wide matrix.
is_all_features_in_single_dense = False

# Base names of the feature sets to load. Each name is resolved below to
# './features_from_model/{train,test}/<name>.csv' and cached as
# './leaks_cache/ss_cache/<name>.pkl'. Commented-out entries are disabled.
# NOTE(review): 'k_scrore' looks misspelled but is used as a file name --
# presumably it matches the file on disk; do not "fix" it without renaming.
feature_files = [
#     'lystdo_correctwords_lstm',
#     'lystdo_1234_loss017',
    'Abhishek_features',
    'magic_feature',
    'magic_feature_v1',
    'Howard_feature',
    'HubertLin_features_raw',
    'HubertLin_features_simple_tokenizer',
    'HubertLin_features_word_corrected',
    'pagerank',
    'word_match_share',
    'fasttext_distance',
#     'modified_count',
    'magic_v25_qid',
    'k_scrore',
    'pos_dist',
    'dep_dist',
    'lystdo_leaks'
]

def inf_nan_to_zero(arr):
    """Zero out every NaN and +/-inf entry of ``arr`` in place.

    The array is modified directly and the same object is returned so the
    call can be used in an assignment.
    """
    bad_entries = np.logical_or(np.isnan(arr), np.isinf(arr))
    arr[bad_entries] = 0
    return arr

def standardize(train,test):
    """Scale ``train`` and ``test`` with one shared ``StandardScaler``.

    The scaler is fitted on the rows of both matrices stacked together, then
    applied to each matrix separately. Returns ``(train, test)`` transformed.
    """
    scaler = StandardScaler().fit(np.vstack((train, test)))
    return scaler.transform(train), scaler.transform(test)

print('Loading original leaks')
# After this cell `leaks` / `test_leaks` are always LISTS of 2-D matrices:
# one matrix per feature file, or a single merged matrix wrapped in a list
# when is_all_features_in_single_dense is True.
leaks,test_leaks = None, None
if not is_all_features_in_single_dense:
    leaks = []
    test_leaks = []

for feature_file in feature_files:
    cache_path = './leaks_cache/ss_cache/'+feature_file+'.pkl'
    print('Loading '+feature_file)

    try:
        # Fast path: standardized features cached by a previous run.
        with open(cache_path, 'rb') as cache_in:
            train_features,test_features = pickle.load(cache_in)
    except (OSError, EOFError, pickle.UnpicklingError):
        # Cache missing or unreadable: rebuild from the raw CSVs. Only these
        # errors trigger a rebuild, so Ctrl-C and real bugs are not swallowed.
        # `.values` replaces `.as_matrix()`, which pandas removed in 1.0.
        train_features = pd.read_csv('./features_from_model/train/'+feature_file+'.csv').values
        test_features = pd.read_csv('./features_from_model/test/'+feature_file+'.csv').values
        train_features = inf_nan_to_zero(train_features)
        test_features = inf_nan_to_zero(test_features)

        train_features,test_features = standardize(train_features,test_features)
        with open(cache_path, 'wb') as cache_out:
            pickle.dump([train_features,test_features], cache_out)

    if is_all_features_in_single_dense:
        # `is None` rather than `== None`: once `leaks` is an ndarray, `==`
        # compares element-wise and cannot be used as an `if` condition.
        if leaks is None:
            leaks = train_features
            test_leaks = test_features
        else:
            leaks = np.hstack([leaks,train_features])
            test_leaks = np.hstack([test_leaks,test_features])
    else:
        leaks.append(train_features)
        test_leaks.append(test_features)

# need to wrap it up
if is_all_features_in_single_dense:
    leaks = [leaks]
    test_leaks = [test_leaks]


Loading original leaks
Loading Abhishek_features
Loading magic_feature
Loading magic_feature_v1
Loading Howard_feature
Loading HubertLin_features_raw
Loading HubertLin_features_simple_tokenizer
Loading HubertLin_features_word_corrected
Loading pagerank
Loading word_match_share
Loading fasttext_distance
Loading magic_v25_qid
Loading k_scrore
Loading pos_dist
Loading dep_dist
Loading lystdo_leaks


In [14]:
""" Check features name """
# Collect the column headers of every selected training feature file, in
# feature_files order.
feature_names = []
for feature_file in feature_files:
    print('Loading '+feature_file)
    train_csv = './features_from_model/train/'+feature_file+'.csv'
    feature_names.extend(pd.read_csv(train_csv).columns)

Loading Abhishek_features
Loading magic_feature
Loading magic_feature_v1
Loading Howard_feature
Loading HubertLin_features_raw
Loading HubertLin_features_simple_tokenizer
Loading HubertLin_features_word_corrected
Loading pagerank
Loading word_match_share
Loading fasttext_distance
Loading magic_v25_qid
Loading k_scrore
Loading pos_dist
Loading dep_dist
Loading lystdo_leaks


## merged leaks

In [217]:
# is_all_features_in_single_dense = True

# def standardize(train,test):
#     ss = StandardScaler()
#     ss.fit(np.vstack((train, test)))
#     train = ss.transform(train)
#     test = ss.transform(test)
#     return train, test

# leaks  = pd.read_csv('../Howard/features/merged/train_145_handcraft_features.csv').as_matrix()
# test_leaks  = pd.read_csv('../Howard/features/merged/test_145_handcraft_features.csv').as_matrix()

# # leaks, test_leaks = standardize(leaks, test_leaks)

# leaks = [leaks]
# test_leaks = [test_leaks]

# # pickle.dump([leaks, test_leaks], open('merge_cache.pkl','wb'))

In [218]:
# [leaks, test_leaks] = pickle.load(open('merge_cache.pkl','rb'))

## Use previous models' outputs as helper features (does not seem to help)

In [219]:
# When True, predictions from earlier models are fed to the network as one
# extra input matrix (one column per entry in `helpers`).
use_prev_model_as_helper = False

# Base names of the prediction CSVs under ./model_predictions/{train,test}/.
helpers = [
    'lystdo_full_features_0.1744_prediction_max',
    'lystdo_full_features_0.1744_prediction_mean',
    'lystdo_full_features_and_poolings_0.1806_prediction_max',
    'lystdo_onlyAB_0.1704_prediction_max',
    'lystdo_onlyAB_0.1704_prediction_mean',
    'lystdo_original_fullfeature_prev_lstm_0.1739_prediction_max',
    'lystdo_original_fullfeature_prev_lstm_0.1739_prediction_mean',
    'lystdo_text_corrected_0.1719_prediction_max',
    'lystdo_text_corrected_0.1719_prediction_mean',
]

if use_prev_model_as_helper:
    train_tmp = []
    test_tmp = []

    def read_csv(file, is_train):
        """Load one prediction CSV and return its 'is_duplicate' column as a
        one-column DataFrame named '<file>_is_duplicate'."""
        if is_train:
            name = './model_predictions/train/'+file+'.csv'
        else:
            name = './model_predictions/test/'+file+'.csv'
        df = pd.read_csv(name)
        # `.values` replaces `.as_matrix()`, which pandas removed in 1.0.
        df = pd.DataFrame(df['is_duplicate'].values, columns=[file+'_is_duplicate'])
        return df

    for helper in helpers:
        print('Loading :', helper)
        train_tmp.append(read_csv(helper,is_train=True))
        test_tmp.append(read_csv(helper,is_train=False))

    # Single-element lists so they can be spliced into the model input list.
    # val_helpers is derived from train_helpers in the sampling cell.
    train_helpers = [pd.concat(train_tmp, axis=1).values]
    test_helpers = [pd.concat(test_tmp, axis=1).values]

    del train_tmp
    del test_tmp
else:
    # Empty lists concatenate to nothing when model inputs are assembled.
    train_helpers = []
    val_helpers = []
    test_helpers = []

## Prepare validation

In [220]:
########################################
## prepare validation data index
########################################

# Fixed seed so the train/validation split is the same on every run.
np.random.seed(1234)
perm = np.random.permutation(len(data_1))
# The first (1 - VALIDATION_SPLIT) share of the shuffled rows is training,
# the remainder is validation.
split_at = int(len(data_1)*(1-VALIDATION_SPLIT))
idx_train, idx_val = perm[:split_at], perm[split_at:]

# import pickle
# idx_val = np.unique(pickle.load(open('./val/val_idxes.pkl','rb'))).astype('int')
# idx_train = np.delete(np.arange(len(data_1)), idx_val)

In [221]:
########################################
## sample train/validation data
########################################

def select_leaks(leaks, idxes, stack):
    """Pick the rows ``idxes`` out of every matrix in ``leaks``.

    With ``stack`` truthy, the selected rows are repeated twice (stacked
    vertically). Returns a new list with one array per input matrix.
    """
    if stack:
        return [np.vstack([leak[idxes], leak[idxes]]) for leak in leaks]
    return [leak[idxes] for leak in leaks]

# stack=True doubles the training set: every pair appears once as (q1, q2)
# and once as (q2, q1), with leak features and labels repeated to match.
stack = True

if stack:
    data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
    data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
    labels_train = np.concatenate((labels[idx_train], labels[idx_train]))
else:
    data_1_train = data_1[idx_train]
    data_2_train = data_2[idx_train]
    labels_train = labels[idx_train]
leaks_train = select_leaks(leaks, idx_train, stack)

# The validation rows are never doubled, whichever mode is active.
data_1_val = data_1[idx_val]
data_2_val = data_2[idx_val]
leaks_val = select_leaks(leaks, idx_val, stack=False)
labels_val = labels[idx_val]

if use_prev_model_as_helper:
    # Take the validation rows first: train_helpers is overwritten right after.
    val_helpers = [train_helpers[0][idx_val]]
    if stack:
        train_helpers = [np.vstack([train_helpers[0][idx_train]] * 2)]
    else:
        train_helpers = [train_helpers[0][idx_train]]

# Per-sample validation weights: label 0 -> 1.309028344, otherwise 0.472001959.
if re_weight:
    weight_val = np.where(labels_val == 0, 1.309028344, 0.472001959)
else:
    weight_val = np.ones(len(labels_val))

## Create and train model

In [229]:
# Network size and dropout settings. The trailing comments keep the random
# search ranges that were tried before these values were fixed.
num_lstm = 256 # np.random.randint(175, 256)
num_dense = 128 # np.random.randint(100, 128)
rate_drop_lstm = 0.1 # + np.random.rand() * 0.25
rate_drop_dense = 0.1 # + np.random.rand() * 0.25

# Path prefix that encodes the settings above; used for the checkpoint file.
STAMP = './leaks_cache/lstm_%d_%d_%.2f_%.2f' % (
    num_lstm,
    num_dense,
    rate_drop_lstm,
    rate_drop_dense,
)

In [231]:
from keras.layers.pooling import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras import backend as K
from keras.layers.core import Lambda
from keras.layers import Bidirectional
from keras.layers.advanced_activations import LeakyReLU, PReLU
from keras.activations import relu

# Activation handed to the Dense layers defined in this cell. The PReLU
# alternative below is left disabled.
# act = PReLU
act = 'relu'

def dropout_dense(layer, d_rate=0.05, dense_size=num_dense//2):
    """Append BatchNorm -> ReLU -> Dropout(d_rate) -> Dense(dense_size) to
    ``layer`` and return the resulting tensor.

    The default ``dense_size`` is computed from ``num_dense`` once, when this
    function is defined.
    """
    normalized = BatchNormalization()(layer)
    activated = Activation('relu')(normalized)
    dropped = Dropout(d_rate)(activated)
    return Dense(dense_size, activation=act)(dropped)

########################################
## define the model structure
########################################
# Overview: two question inputs share one frozen embedding and one LSTM; every
# leak matrix gets its own small dense branch; the branches are merged and
# combined with the two LSTM encodings in both orders before the final
# sigmoid unit.
# NOTE(review): layers are created in a fixed order here; reordering the
# statements may change Keras' auto-generated layer names -- confirm before
# restructuring if saved weights must stay loadable.

# Embedding initialised from embedding_matrix and not trained.
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)
# A single LSTM instance, applied to both questions below (shared weights).
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

# One Input + Dropout + Dense branch per matrix in `leaks`.
extra_inputs = []
extra_outputs = []

features_len = len(leaks)
for i in range(features_len):
    # Input width is the number of columns of the i-th leak matrix.
    leaks_input = Input(shape=(leaks[i].shape[1],))
    extra_inputs.append(leaks_input)
    
    leaks_dense = Dropout(0.05)(leaks_input)
    leaks_dense = Dense(num_dense//2, activation=act)(leaks_dense)
    extra_outputs.append(leaks_dense)
    # leaks_dense = BatchNormalization()(leaks_dense)
    # leaks_dense = Activation('relu')(leaks_dense)
    # leaks_dense = Dropout(0.25)(leaks_dense)
    # leaks_dense = Dense(num_dense, activation=act)(leaks_dense)
    
# Merge the feature branches: two BN -> ReLU -> Dropout -> Dense stages when
# there are several branches, one stage when there is a single branch.
# NOTE(review): each Dense already applies `act` ('relu') and the next stage
# applies Activation('relu') again after BatchNormalization -- confirm the
# double activation is intended.
if features_len>1:
    extra_merge = concatenate(extra_outputs)
    extra_merge = BatchNormalization()(extra_merge)
    extra_merge = Activation('relu')(extra_merge)
    extra_merge = Dropout(rate_drop_dense)(extra_merge)
    extra_merge = Dense(num_dense*2, activation=act)(extra_merge)

    extra_merge = BatchNormalization()(extra_merge)
    extra_merge = Activation('relu')(extra_merge)
    extra_merge = Dropout(rate_drop_dense)(extra_merge)
    extra_merge = Dense(num_dense*2, activation=act)(extra_merge)
else:
    extra_merge = BatchNormalization()(extra_outputs[0])
    extra_merge = Activation('relu')(extra_merge)
    extra_merge = Dropout(rate_drop_dense)(extra_merge)
    extra_merge = Dense(num_dense*2, activation=act)(extra_merge)

    
# One Dense instance reused for both question orders below (shared weights).
first_dense = Dense(num_dense, activation=act)

# Order (question 1, question 2, features).
merged1 = concatenate([x1, y1, extra_merge])
merged1 = BatchNormalization()(merged1)
merged1 = Activation('relu')(merged1)
merged1 = Dropout(rate_drop_dense)(merged1)
merged1 = first_dense(merged1)

# Order (question 2, question 1, features).
merged2 = concatenate([y1, x1, extra_merge])
merged2 = BatchNormalization()(merged2)
merged2 = Activation('relu')(merged2)
merged2 = Dropout(rate_drop_dense)(merged2)
merged2 = first_dense(merged2)

# Combine both orderings and pass them through two more dense stages.
merged = concatenate([merged1, merged2])
merged = BatchNormalization()(merged)
merged = Activation('relu')(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = Dense(num_dense, activation=act)(merged)

merged = BatchNormalization()(merged)
merged = Activation('relu')(merged)
merged = Dropout(0.05)(merged)
merged = Dense(num_dense, activation=act)(merged)

# merged = Dense(16)(merged)

# merged = concatenate([x1, y1, extra_merge])
# merged = BatchNormalization()(merged)
# merged = Dropout(rate_drop_dense)(merged)

# merged = Dense(num_dense, activation=act)(merged)
# merged = BatchNormalization()(merged)
# merged = Dropout(rate_drop_dense)(merged)

# Optional extra input holding previous models' predictions, joined just
# before the output layer.
if use_prev_model_as_helper:
    helper_input = Input(shape=(train_helpers[0].shape[1],))
    merged = concatenate([merged, helper_input])
    merged = Dropout(0.05)(merged)
    merged = Dense(num_dense, activation=act)(merged)
    merged = Dropout(0.05)(merged)

# Output tensor: a single sigmoid unit.
preds = Dense(1, activation='sigmoid')(merged)

In [232]:
########################################
## add class weight
########################################
# Per-class loss weights, keyed by label; None disables re-weighting.
class_weight = {0: 1.309028344, 1: 0.472001959} if re_weight else None

In [233]:
########################################
## train the model
########################################

# Input order: question 1, question 2, [helper predictions], then one input
# per leak feature matrix. Data lists passed to the model follow this order.
model_inputs = [sequence_1_input, sequence_2_input]
if use_prev_model_as_helper:
    model_inputs.append(helper_input)
model_inputs = model_inputs + extra_inputs

model = Model(inputs=model_inputs, outputs=preds)
model.compile(
    loss='binary_crossentropy',
    optimizer=Nadam(lr=0.002),
    metrics=['acc'],
)
#model.summary()
print(STAMP)

# File the best weights are checkpointed to.
bst_model_path = STAMP + '.h5'

lstm_256_128_0.10_0.10


In [None]:
'''
Learn with fitting all data at once, requires large memory
'''

# early_stopping =EarlyStopping(monitor='val_loss', patience=3)
# model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

# hist = model.fit([data_1_train, data_2_train]+train_helpers+leaks_train, labels_train, \
#         validation_data=([data_1_val, data_2_val]+val_helpers+leaks_val, labels_val, weight_val), \
#         epochs=200, batch_size=1024, shuffle=True, \
#         class_weight=class_weight, callbacks=[early_stopping, model_checkpoint], verbose=2)
# model.load_weights(bst_model_path)
# bst_val_score = min(hist.history['val_loss'])

In [234]:
batch_size = 512

def batch_generator(q1, q2, leaks, labels):
    """Yield random mini-batches forever, for use with ``fit_generator``.

    Each batch draws ``batch_size`` row indices uniformly at random with
    replacement and applies the same indices to every input so rows stay
    aligned.

    Args:
        q1, q2: arrays of question sequences, indexed by row.
        leaks: list of feature matrices with the same number of rows as q1.
        labels: array of targets with the same number of rows as q1.

    Yields:
        ``(inputs, targets)`` where ``inputs`` is
        ``[q1_batch, q2_batch] + helper + leak_batches``. ``helper`` holds the
        matching rows of the module-level ``train_helpers[0]`` when
        ``use_prev_model_as_helper`` is set, and is empty otherwise.
    """
    n_rows = len(q1)
    while True:
        # One vectorized draw instead of batch_size separate scalar calls.
        idxes = np.random.randint(0, n_rows, size=batch_size)
        extra_features = [leak[idxes] for leak in leaks]
        if use_prev_model_as_helper:
            helper = [train_helpers[0][idxes]]
        else:
            helper = []
        yield [q1[idxes],q2[idxes]]+helper+extra_features, labels[idxes]

# Stop after 10 epochs without val_loss improvement; keep only the weights of
# the best epoch on disk.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, mode='min', verbose=1),
    ModelCheckpoint(bst_model_path, monitor='val_loss', verbose=0, save_best_only=True, mode='min', period=1, save_weights_only=True)
]

import time  # only needed to tell a fresh checkpoint from a stale one

train_started_at = time.time()
hist = None
try:
    hist = model.fit_generator(batch_generator(data_1_train, data_2_train, leaks_train, labels_train),
                            steps_per_epoch=128,
                            class_weight=class_weight,
                            epochs=1000,
                            validation_data=([data_1_val, data_2_val]+val_helpers+leaks_val, labels_val, weight_val),
                            verbose=2,
                            callbacks=callbacks)
except KeyboardInterrupt:
    print('\nEarly stopped by user')

# Restore the best checkpoint whether training finished or was interrupted.
# Previously a Ctrl-C skipped this step and left the model holding the weights
# of the last processed batch. The mtime check (with 1s slack for coarse
# filesystem timestamps) avoids loading a file left over from an earlier run.
if os.path.exists(bst_model_path) and os.path.getmtime(bst_model_path) >= train_started_at - 1:
    model.load_weights(bst_model_path)
else:
    print('No checkpoint written during this run; keeping current weights')

# The history object only exists when fit_generator returned normally.
if hist is not None:
    bst_val_score = min(hist.history['val_loss'])

Epoch 1/1000
9s - loss: 0.2408 - acc: 0.8210 - val_loss: 0.2663 - val_acc: 0.8604
Epoch 2/1000
2s - loss: 0.2025 - acc: 0.8497 - val_loss: 0.2299 - val_acc: 0.8652
Epoch 3/1000
2s - loss: 0.1960 - acc: 0.8562 - val_loss: 0.2044 - val_acc: 0.8491
Epoch 4/1000
2s - loss: 0.1966 - acc: 0.8533 - val_loss: 0.1951 - val_acc: 0.8492
Epoch 5/1000
2s - loss: 0.1906 - acc: 0.8610 - val_loss: 0.1855 - val_acc: 0.8587
Epoch 6/1000
2s - loss: 0.1903 - acc: 0.8587 - val_loss: 0.1877 - val_acc: 0.8706
Epoch 7/1000
2s - loss: 0.1867 - acc: 0.8622 - val_loss: 0.1845 - val_acc: 0.8535
Epoch 8/1000
2s - loss: 0.1868 - acc: 0.8636 - val_loss: 0.1858 - val_acc: 0.8732
Epoch 9/1000
2s - loss: 0.1883 - acc: 0.8616 - val_loss: 0.1850 - val_acc: 0.8616
Epoch 10/1000
2s - loss: 0.1874 - acc: 0.8623 - val_loss: 0.1856 - val_acc: 0.8512
Epoch 11/1000
2s - loss: 0.1850 - acc: 0.8619 - val_loss: 0.1886 - val_acc: 0.8423
Epoch 12/1000
2s - loss: 0.1841 - acc: 0.8655 - val_loss: 0.1787 - val_acc: 0.8695
Epoch 13/1000

In [235]:
# model_name = 'lystdo_Fasttext_AllFeatures_WordCorrection_Loss' + '_%.4f'%(bst_val_score)
# model_name

In [236]:
from sklearn.metrics import log_loss

# Which split to score with the class-weighted log loss: 'VAL' or 'TRAIN'.
SET = 'VAL'

if SET == 'VAL':
    pred = model.predict([data_1_val, data_2_val] + leaks_val)
    gts = labels_val
elif SET == 'TRAIN':
    # Score a random subset of the training rows to keep this cell fast.
    sample_size = 30000
    # Permute ALL training indices and keep the first `sample_size`;
    # permuting range(sample_size) would only ever look at the first rows.
    rnd_idxes = np.random.permutation(len(labels_train))[:sample_size]
    # The leak inputs are used as a list of arrays elsewhere in this file
    # (list concatenation with the question inputs, per-array slicing), so
    # each array is indexed separately.
    # NOTE(review): assumes leaks_train has the same list-of-arrays layout
    # as leaks_val -- confirm against where it is built.
    pred = model.predict([data_1_train[rnd_idxes], data_2_train[rnd_idxes]]
                         + [leak[rnd_idxes] for leak in leaks_train])
    gts = labels_train[rnd_idxes]
else:
    # Fail loudly instead of silently scoring a stale `pred` left over from
    # an earlier cell.
    raise ValueError("SET must be 'VAL' or 'TRAIN', got %r" % (SET,))

# Per-sample weights derived from the class re-weighting
# (alternative for the validation split: weights = weight_val).
weights = [class_weight[label] for label in gts]

# Only exact 0/1 predictions are nudged away from the boundary.
pred[pred==0] = 0.000001
pred[pred==1] = 0.999999
log_loss(gts,pred.flatten(), sample_weight=weights)

0.17339470358864317

In [237]:
from sklearn.metrics import log_loss

def abs_dist(a):
    """Return the distance of prediction(s) `a` from the 0.5 decision boundary."""
    return abs(a-0.5)

# Score the validation pairs in both question orders.
pred_a = model.predict([data_1_val, data_2_val]+leaks_val)
pred_b = model.predict([data_2_val, data_1_val]+leaks_val)

# Three candidate ways of combining the two orderings:
preds = [
    # 1) per pair, keep whichever prediction is further from 0.5
    #    (vectorized; ties fall back to pred_b)
    np.where(abs_dist(pred_a) > abs_dist(pred_b), pred_a, pred_b),
    # 2) plain average of the two orderings
    (pred_a + pred_b)/2,
    # 3) baseline: original ordering only
    pred_a,
]

gts = labels_val
weights = [class_weight[label] for label in gts]

for pred in preds:

    # Only exact 0/1 predictions are nudged away from the boundary.
    # This edits the array in place, so pred_a itself is modified too.
    pred[pred==0] = 0.000001
    pred[pred==1] = 0.999999
    loss = log_loss(gts,pred.flatten(), sample_weight=weights)

    print('loss=',loss)

loss= 0.173394703589
loss= 0.173394703589
loss= 0.173394703589


## Create prediction CSV for testing set

In [214]:
########################################
## make the submission
########################################
print('Start making the submission before fine-tuning')

def do_pred(SET, mode):
    """Predict one data split in both question orders and save the result as CSV.

    The model is run on (q1, q2) and on (q2, q1); the two outputs are then
    combined according to `mode` and written under ./model_predictions/.

    Args:
        SET: 'train' selects the training arrays (data_1, data_2, leaks) and
            the train output folder; any other value selects the test arrays
            and the test output folder.
        mode: 'mean' averages the two predictions; any other value keeps, per
            pair, the prediction that is further from 0.5. The value is also
            embedded in the output file name.
    """
    is_train = SET=='train'
    if is_train:
        q1, q2, extra = data_1, data_2, leaks
    else:
        q1, q2, extra = test_data_1, test_data_2, test_leaks

    print('Predicting first part')
    pred_a = model.predict([q1, q2]+extra, batch_size=1024, verbose=0)
    print('Predicting second part')
    pred_b = model.predict([q2, q1]+extra, batch_size=1024, verbose=0)

    if mode=='mean':
        preds = (pred_a + pred_b) / 2
    else:
        # Vectorized "most confident" pick; ties fall back to pred_b.
        preds = np.where(np.abs(pred_a-0.5) > np.abs(pred_b-0.5), pred_a, pred_b)

    if is_train:
        out_dir = './model_predictions/train/'
        submission = pd.DataFrame({'is_duplicate':preds.ravel()})
    else:
        out_dir = './model_predictions/test/'
        submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})

    # File name carries the model name, best validation loss and ensembling mode.
    out_path = out_dir + model_name + '_%.4f'%(bst_val_score) + '_prediction_' + mode + '.csv'
    submission.to_csv(out_path, index=False)
        
# Write mean-ensembled predictions, train split first, then test.
for split_name in ('train', 'test'):
    do_pred(SET=split_name, mode='mean')

# Max-confidence variant, currently disabled:
# do_pred(SET='train', mode='max')
# do_pred(SET='test', mode='max')

print('Complete')

Start making the submission before fine-tuning
Predicting first part
Predicting second part
Predicting first part
Predicting second part
Complete


In [202]:
1

1

## Feature extraction

Get the internal output of the model, which can be used as extracted features.

In [38]:
model.layers

[<keras.engine.topology.InputLayer at 0x7f3947b99828>,
 <keras.engine.topology.InputLayer at 0x7f3947776e80>,
 <keras.engine.topology.InputLayer at 0x7f394764b550>,
 <keras.engine.topology.InputLayer at 0x7f394758e860>,
 <keras.layers.normalization.BatchNormalization at 0x7f3947b999e8>,
 <keras.layers.normalization.BatchNormalization at 0x7f3947739e48>,
 <keras.layers.normalization.BatchNormalization at 0x7f3947677ef0>,
 <keras.layers.normalization.BatchNormalization at 0x7f39475bc860>,
 <keras.layers.core.Activation at 0x7f39477f2da0>,
 <keras.layers.core.Activation at 0x7f39476cf358>,
 <keras.layers.core.Activation at 0x7f394760e198>,
 <keras.layers.core.Activation at 0x7f3947550048>,
 <keras.layers.core.Dropout at 0x7f39477f2d30>,
 <keras.layers.core.Dropout at 0x7f39476cf320>,
 <keras.layers.core.Dropout at 0x7f394760e630>,
 <keras.layers.core.Dropout at 0x7f3947550940>,
 <keras.layers.core.Dense at 0x7f39477f2f60>,
 <keras.layers.core.Dense at 0x7f39476cf438>,
 <keras.layers.core.

In [44]:
import keras.backend as K

# Index (counted from the end of model.layers) of the layer whose activations
# are exported as features.
layer_id = -9
tapped_layer = model.layers[layer_id]

# Backend function mapping the model inputs plus the learning-phase flag to
# the tapped layer's output at its last node.
# Alternative using the layer's single output:
#   K.function(model.input+[K.learning_phase()], [tapped_layer.output])
model_segment = K.function(
    model.input + [K.learning_phase()],
    [tapped_layer.get_output_at(-1)],
)

def get_layer_output(inputs):
    """Return the tapped layer's activations for `inputs` (a list of input arrays).

    The learning-phase flag is passed as 0, which Keras treats as test mode.
    """
    outputs = model_segment(inputs + [0])
    return outputs[0]

In [45]:
model_segment.outputs

[<tf.Tensor 'lstm_3_1/TensorArrayReadV3:0' shape=(?, 199) dtype=float32>]

In [46]:
# CSV header for the exported features: one column name per feature,
# '<model_name>_lstm_<index>', with num_lstm columns in total.
feature_count = num_lstm
feature_names = ['{}_lstm_{}'.format(model_name, col) for col in range(feature_count)]

In [48]:
from math import ceil
import csv

# Split whose intermediate-layer features are dumped to CSV.
SET = 'TRAIN'
# SET = 'TEST'

batch_size = 1024

# NOTE(review): the file name does not include SET, so running this cell for
# TRAIN and then TEST writes to the same path and overwrites the first dump.
f_name = './leaks_cache/'+model_name + '_%.4f'%(bst_val_score) + '_feature.csv'

if SET == 'TRAIN':
    q1 = data_1
    q2 = data_2
    other = leaks
else:
    q1 = test_data_1
    q2 = test_data_2
    other = test_leaks

# Print progress roughly every 100000 rows; never less often than every
# batch, so the modulo below cannot divide by zero for large batch sizes.
progress_every = max(1, 100000//batch_size)

# newline='' is what the csv module expects for files it writes; without it
# extra blank lines can appear on platforms that translate '\n'.
with open(f_name, 'w', newline='') as f:

    writer = csv.writer(f, delimiter=',')
    writer.writerow(feature_names)

    batch_num = int(ceil(q1.shape[0] / batch_size))

    for b_id in range(batch_num):
        low = b_id*batch_size
        high = (b_id+1)*batch_size

        # Slice every extra input array to the same row window as q1/q2.
        extra_inputs = [i[low:high] for i in other]
        extracted_features = get_layer_output([q1[low:high],q2[low:high]]+extra_inputs)

        # One CSV row per sample in the batch.
        writer.writerows(extracted_features)

        if b_id % progress_every == 0:
            print( b_id*batch_size)
    

0
99328
198656
297984
397312
496640
595968
695296
794624
893952
993280
1092608
1191936
1291264
1390592
1489920
1589248
1688576
1787904
1887232
1986560
2085888
2185216
2284544
