In [1]:
# -*- coding: <utf-8> -*-
from __future__ import unicode_literals
from future.utils import iteritems

import csv
from nltk.tokenize import word_tokenize
import numpy as np

from keras import callbacks
from keras.utils.data_utils import get_file
from keras import layers
from keras.layers.embeddings import Embedding
from keras.layers.core import Dropout
from keras.layers import recurrent
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence

Using TensorFlow backend.


In [2]:
import os

# Project directories, all resolved relative to the current working directory.
WD = os.getcwd()
DATA_DIR = os.path.join(WD, 'data')
MODEL_DIR = os.path.join(WD, 'model')
# Folder under the data directory that holds the GloVe text files.
GLOVE_DIR = os.path.join(DATA_DIR, 'glove.6B')
# NOTE(review): this writes an absolute local path into the saved cell output.
print DATA_DIR


/Users/diogomesquita/Documents/Kaggle/impossible_labs_workshop/data


In [3]:
# Global switches for the notebook.
DEBUG=False
USE_PRETRAINED_EMBEDDINGS=True

# Embedding width: 50 when GloVe vectors are loaded, 32 when embeddings are
# learned from scratch.
if USE_PRETRAINED_EMBEDDINGS:
    EMBEDDING_DIM = 50
else:
    EMBEDDING_DIM = 32

# The file name is derived from EMBEDDING_DIM so the two can never drift apart.
# GLOVE_FILE is only read when USE_PRETRAINED_EMBEDDINGS is True.
GLOVE_FILE = os.path.join(GLOVE_DIR, 'glove.6B.{}d.txt'.format(EMBEDDING_DIM))


In [4]:
%%bash -s "$train_file" "$test_file"
# Writes small "<file>.sample" copies of the train and test CSVs.
# The Python variables train_file and test_file must already be defined when
# this cell runs; otherwise head receives the unexpanded names and fails.
# Expansions are quoted so paths containing spaces are handled correctly.
head -100 "$1" > "$1.sample"
head "$2" > "$2.sample"

head: $trainfile: No such file or directory
head: $test_file: No such file or directory


In [5]:
# Number of examples used for the training and validation subsets.
if DEBUG:
    N_TRAIN_SAMPLES=60
    N_VAL_SAMPLES=20
else:
    N_TRAIN_SAMPLES=2000
    N_VAL_SAMPLES=500

# Defined outside the branch so the paths exist in DEBUG mode as well;
# previously DEBUG=True left them unset and the loading cell raised NameError.
train_file = os.path.join(DATA_DIR, 'train.csv')
test_file = os.path.join(DATA_DIR, 'test.csv')

In [6]:
def read_data(file):
    """Lazily yield the rows of a CSV file, one mapping per data row.

    The file is opened in binary mode and is only opened once iteration
    starts; it is closed when the generator is exhausted or discarded.
    """
    with open(file, 'rb') as handle:
        for record in csv.DictReader(handle):
            yield record
            
def preprocess_data(data_gen):
    """Tokenise every question pair and track the longest question seen.

    data_gen: iterable of mappings with 'question1' and 'question2' entries
        and an optional 'is_duplicate' entry.

    Returns (rows, longest): rows is a list of [q1_tokens, q2_tokens, label]
    where label is an int, or None when 'is_duplicate' is absent; longest is
    the largest token count over all questions (0 for empty input).
    """
    rows = []
    longest = 0
    for record in data_gen:
        tokens_1 = preprocess_sentence(record['question1'])
        tokens_2 = preprocess_sentence(record['question2'])

        if 'is_duplicate' in record:
            label = int(record['is_duplicate'])
        else:
            label = None

        rows.append([tokens_1, tokens_2, label])
        longest = max(longest, len(tokens_1), len(tokens_2))

    return rows, longest

def preprocess_sentence(sentence):
    """Split a raw sentence into tokens with Keras' text_to_word_sequence."""
    tokens = text_to_word_sequence(sentence)
    return tokens

def vectorize_data(data, wrd_idx, max_len):
    """Turn tokenised question pairs into padded index arrays plus labels.

    data: iterable of rows laid out as [q1_tokens, q2_tokens, label].
    wrd_idx: mapping from token to integer index.
    max_len: length every sequence is padded to.

    Returns (q1_array, q2_array, labels_array).
    """
    # Encode both questions of a row together, row by row.
    encoded_rows = [
        (get_wrd_index_sentence(row[0], wrd_idx),
         get_wrd_index_sentence(row[1], wrd_idx),
         row[2])
        for row in data
    ]

    first_questions = [encoded[0] for encoded in encoded_rows]
    second_questions = [encoded[1] for encoded in encoded_rows]
    labels = [encoded[2] for encoded in encoded_rows]

    padded_first = pad_sequences(first_questions, maxlen=max_len)
    padded_second = pad_sequences(second_questions, maxlen=max_len)
    return padded_first, padded_second, np.array(labels)

def get_wrd_index_sentence(sent, wrd_idx):
    """Map each token to its index, using the UNK entry for unseen tokens."""
    indices = []
    for token in sent:
        # UNK is only looked up when the token itself is missing.
        key = token if token in wrd_idx else UNK
        indices.append(wrd_idx[key])
    return indices

def vectorize_sentence(sent, wrd_idx, max_len):
    """Encode one tokenised sentence as a padded batch containing one row."""
    indices = get_wrd_index_sentence(sent, wrd_idx)
    batch_of_one = [indices]
    return pad_sequences(batch_of_one, maxlen=max_len)


In [7]:
def get_embeddings(file):
    """Load word vectors from a whitespace-separated text file.

    Each non-blank line holds a token followed by its coefficients.

    Returns a dict mapping token -> float32 numpy array of coefficients.
    """
    embeddings_index = {}
    # `with` closes the handle even when a line fails to parse.
    with open(file) as f:
        for line in f:
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0].
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index


In [8]:
%%time
# Stream the training CSV lazily; nothing is read until preprocess_data iterates.
train_gen = read_data(train_file)
#test_gen = read_data(test_file)

# train is a list of [q1_tokens, q2_tokens, label] rows; max_train_len is the
# token count of the longest question.
train, max_train_len = preprocess_data(train_gen)
#test, max_test_len = preprocess_data(test_gen)

n_train = len(train)
#n_test = len(test)
#max_question_len = max([max_train_len, max_test_len])
# With the test set disabled, the padding length comes from the training data only.
max_question_len = max_train_len

CPU times: user 8.22 s, sys: 348 ms, total: 8.57 s
Wall time: 8.62 s


In [9]:
print "Max question length: ", max_question_len
print "Number of training examples: ", n_train
#print "Number of test examples: ", n_test

have_pos = False
have_neg = False
for question_pairs in train:
    if question_pairs[2] == 1 and not have_pos:
        have_pos = True
        pos = [question_pairs[0], question_pairs[1]]
    elif question_pairs[2] == 0 and not have_neg:
        have_neg = True
        neg = [question_pairs[0], question_pairs[1]]
    if have_pos and have_neg:
        break

print "\nSame question:\n\t- {}\n\t- {}".format(' '.join(pos[0]), ' '.join(pos[1]))
print "\nDifferent question:\n\t- {}\n\t- {}".format(' '.join(neg[0]), ' '.join(neg[1]))

Max question length:  237
Number of training examples:  404290

Same question:
	- astrology i am a capricorn sun cap moon and cap rising what does that say about me
	- i'm a triple capricorn sun moon and ascendant in capricorn what does this say about me

Different question:
	- what is the step by step guide to invest in share market in india
	- what is the step by step guide to invest in share market


In [10]:
%%time
UNK='unk'
vocab = set([UNK])
for q1, q2, y in (train): #+ test):
    vocab |= set(q1 + q2)

# the "+1" is for the padding symbol
vocab_size = len(vocab) + 1

wrd_idx = dict((wrd, i+1) for i, wrd in enumerate(vocab))
wrd_idx[UNK] = 1

print "Vocabulary size: ", vocab_size

Vocabulary size:  95605
CPU times: user 1.74 s, sys: 81.8 ms, total: 1.82 s
Wall time: 1.82 s


In [583]:
%%time
# Encode both question columns as index sequences padded to max_question_len;
# y holds the labels as an array.
q1, q2, y = vectorize_data(train, wrd_idx, max_question_len)

CPU times: user 7.82 s, sys: 2.65 s, total: 10.5 s
Wall time: 11.9 s


In [584]:
# Show the first vectorised question (padded to max_question_len entries).
print "A question vector:\n", q1[0]

A question vector:
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0  

In [585]:
# One row per vocabulary index; row 0 (padding) stays all zeros.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
if USE_PRETRAINED_EMBEDDINGS:
    # get embeddings
    embedding_index = get_embeddings(GLOVE_FILE)

    embedding_size = EMBEDDING_DIM

    # Words without a pretrained vector share the UNK vector. If the file has
    # no UNK entry either, fall back to zeros instead of assigning None.
    unk_vector = embedding_index.get(UNK)
    if unk_vector is None:
        unk_vector = np.zeros(EMBEDDING_DIM)

    # iteritems comes from future.utils (imported at the top of the notebook)
    # and works on both Python 2 and 3, unlike dict.iteritems().
    for wrd, idx in iteritems(wrd_idx):
        embedding_matrix[idx] = embedding_index.get(wrd, unk_vector)

In [586]:
indices = np.arange(n_train)
np.random.shuffle(indices)

n_train_split = N_TRAIN_SAMPLES
n_val = N_VAL_SAMPLES

assert n_train_split + n_val <= n_train 

print "Number of training examples: ", n_train_split
print "Number of valid examples: ", n_val

Number of training examples:  2000
Number of valid examples:  500


In [587]:
q1_train = q1[indices[:n_train_split]]
q2_train = q2[indices[:n_train_split]]
y_train = y[indices[:n_train_split]]

q1_val = q1[indices[n_train_split:n_train_split+n_val]]
q2_val = q2[indices[n_train_split:n_train_split+n_val]]
y_val = y[indices[n_train_split:n_train_split+n_val]]

print "True labels percentage in train: ", sum(y_train)/float(len(y_train))
print "True labels percentage in val: ", sum(y_val)/float(len(y_val))

True labels percentage in train:  0.3715
True labels percentage in val:  0.374


In [592]:
def dot_product_model(embedding_matrix):
    """Build the two-branch question-similarity graph.

    Each question is fed through its own Embedding layer and its own LSTM
    (the branches do not share weights); the two encodings are compared
    with a normalised dot product.

    embedding_matrix: array of shape (vocab_size, EMBEDDING_DIM). It is used
        as the frozen initial embedding weights when
        USE_PRETRAINED_EMBEDDINGS is True. Otherwise it is ignored and the
        trainable embeddings keep the layer's default initialisation, rather
        than starting from an all-zero matrix.

    Returns ([question1, question2], pred): the input tensors and the output
    tensor, ready to be passed to Model.
    """
    # Dropout rates for the currently disabled Dropout layers below.
    embed_dropout = 0.3
    lstm_dropout = 0.3
    # Number of LSTM units in each branch.
    lstm_size = 50

    question1 = layers.Input(shape=(max_question_len,), dtype='int32')
    question2 = layers.Input(shape=(max_question_len,), dtype='int32')

    # Pretrained vectors are frozen; embeddings learned from scratch are trainable.
    embedding_kwargs = {'trainable': not USE_PRETRAINED_EMBEDDINGS}
    if USE_PRETRAINED_EMBEDDINGS:
        embedding_kwargs['weights'] = [embedding_matrix]

    def encode(question):
        # Every call creates fresh Embedding and LSTM layers for that branch.
        encoded = Embedding(vocab_size,
                            EMBEDDING_DIM,
                            **embedding_kwargs)(question)
        #encoded = Dropout(embed_dropout)(encoded)
        encoded = recurrent.LSTM(lstm_size)(encoded)
        #encoded = Dropout(lstm_dropout)(encoded)
        return encoded

    encoded_question_1 = encode(question1)
    encoded_question_2 = encode(question2)

    pred = layers.dot([encoded_question_1, encoded_question_2], axes=-1, normalize=True)

    return [question1, question2], pred




In [593]:
# Wrap the tensors returned by dot_product_model in a Keras Model.
inputs, output = dot_product_model(embedding_matrix)
model = Model(inputs, output)
# NOTE(review): the output is a normalised dot product, which presumably ranges
# over [-1, 1], while binary_crossentropy expects probabilities - confirm this
# pairing is intended.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [594]:
%%time
print "Training"
# Mini-batch size and number of epochs passed to fit.
BATCH_SIZE=32
EPOCHS=10

# hist keeps whatever fit returns; the validation subset is supplied through
# validation_data.
hist = model.fit(
    [q1_train, q2_train],
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=([q1_val, q2_val], y_val)
)

Training
Train on 2000 samples, validate on 500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 14min 53s, sys: 4min 3s, total: 18min 56s
Wall time: 7min 29s


In [606]:
def is_same_question(model, q1, q2, wrd_idx, max_len):
    """Predict whether two raw question strings ask the same thing.

    Both questions are tokenised, index-encoded with wrd_idx and padded to
    max_len before being passed to model.predict. Returns True when the
    first predicted value is greater than 0.5.
    """
    encoded_pair = [
        vectorize_sentence(preprocess_sentence(question), wrd_idx, max_len)
        for question in (q1, q2)
    ]
    similarity = model.predict(encoded_pair)[0][0]
    return similarity > 0.5

def save_weights(model, file):
    """Write the model's weights to `file` via model.save_weights."""
    destination = file
    model.save_weights(destination)

In [607]:
q1 = "Are you a cool person?".encode('utf-8')
print is_same_question(model, q1, q1, wrd_idx, max_question_len)

q2 = "Who will win the championship?".encode('utf-8')
print is_same_question(model, q1, q2, wrd_idx, max_question_len)

True
False


In [482]:
# Create the model directory on first use so saving does not fail when it is missing.
if not os.path.isdir(MODEL_DIR):
    os.makedirs(MODEL_DIR)

# Built with os.path.join, like the other paths in this notebook.
model_file = os.path.join(MODEL_DIR, 'dot_prod.h5')
save_weights(model, model_file)

### Things to try:
    - eliminating words that don't appear often in the corpus
    - adding dropout configuration
    - improving the architecture