In [1]:
import re
import os
import keras.backend as K
import numpy as np
import pandas as pd
from keras import layers, models, utils
import json

Using TensorFlow backend.


In [2]:
def reset_everything():
    import tensorflow as tf
    %reset -f in out dhist
    tf.reset_default_graph()
    K.set_session(tf.InteractiveSession())

In [3]:
# Constants for our networks.  We keep these deliberately small to reduce training time.

VOCAB_SIZE = 250000
EMBEDDING_SIZE = 100
MAX_DOC_LEN = 128
MIN_DOC_LEN = 12

In [4]:
FILE_NAME = 'travel.stackexchange.com.7z'

In [5]:
xml_7z = utils.get_file(
    fname=FILE_NAME,
    origin='https://ia800107.us.archive.org/27/items/stackexchange/' + FILE_NAME
)

In [6]:
print(xml_7z)

C:\Users\GAO\.keras\datasets\travel.stackexchange.com.7z


In [7]:
import subprocess
cmd = ['C:\\Program Files\\7-Zip\\7z.exe', 'x', '-so', xml_7z, 'Posts.xml']
sp = subprocess.Popen(cmd, stderr=subprocess.STDOUT, stdout=subprocess.PIPE, shell=True)

In [8]:
result = sp.communicate()[0].decode('utf-8')

In [9]:
def extract_stackexchange(limit=1000000):
    json_file = 'data/' + FILE_NAME + '-limit=%s.json' % limit

    rows = []
    for i, line in enumerate(result.splitlines()):
        line = str(line)
        #print(line)
        if not line.startswith('  <row'):
            continue
            
        if i % 1000 == 0:
            print('\r%05d/%05d' % (i, limit), end='', flush=True)

        parts = line[6:-5].split('"')
        record = {}
        for i in range(0, len(parts), 2):
            k = parts[i].replace('=', '').strip()
            v = parts[i+1].strip()
            record[k] = v
        rows.append(record)
        
        if len(rows) > limit:
            break
    
    with open(json_file, 'w') as fout:
        json.dump(rows, fout)
    
    return rows

rows = extract_stackexchange()

95000/1000000

# Data Exploration

In [10]:
df = pd.DataFrame.from_records(rows)    
df = df.set_index('Id', drop=False)
df['Title'] = df['Title'].fillna('').astype('str')
df['Tags'] = df['Tags'].fillna('').astype('str')
df['Body'] = df['Body'].fillna('').astype('str')
df['Id'] = df['Id'].astype('int')
df['PostTypeId'] = df['PostTypeId'].astype('int')
df['ViewCount'] = df['ViewCount'].astype('float')

df.head()

Unnamed: 0_level_0,AcceptedAnswerId,AnswerCount,Body,ClosedDate,CommentCount,CommunityOwnedDate,CreationDate,FavoriteCount,Id,LastActivityDate,...,LastEditorDisplayName,LastEditorUserId,OwnerDisplayName,OwnerUserId,ParentId,PostTypeId,Score,Tags,Title,ViewCount
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,393.0,4.0,&lt;p&gt;My fiancée and I are looking for a go...,2013-02-25T23:52:47.95,4.0,,2011-06-21T20:19:34.730,,1,2012-05-24T14:52:14.760,...,,101.0,,9,,1,8,&lt;caribbean&gt;&lt;cruising&gt;&lt;vacations...,What are some Caribbean cruises for October?,462.0
2,,8.0,&lt;p&gt;This was one of our definition questi...,,4.0,,2011-06-21T20:22:33.760,,2,2018-08-26T00:04:13.520,...,,51577.0,,13,,1,37,&lt;guides&gt;&lt;extreme-tourism&gt;&lt;amazo...,How can I find a guide that will take me safel...,2116.0
3,,,&lt;p&gt;One way would be to go through an Adv...,,,,2011-06-21T20:24:28.080,,3,2011-06-21T20:24:28.080,...,,,,9,2.0,2,15,,,
4,,1.0,&lt;p&gt;Singapore Airlines has an all-busines...,,,,2011-06-21T20:24:57.160,,4,2013-01-09T09:55:22.743,...,,693.0,,24,,1,8,&lt;loyalty-programs&gt;&lt;routes&gt;&lt;ewr&...,Does Singapore Airlines offer any reward seats...,256.0
5,770.0,5.0,&lt;p&gt;Another definition question that inte...,,0.0,,2011-06-21T20:25:56.787,,5,2012-10-12T20:49:08.110,...,,101.0,,13,,1,14,&lt;romania&gt;&lt;transportation&gt;,What is the easiest transportation to use thro...,428.0


In [11]:
list(df[df['ViewCount'] > 250000]['Title'])

['Do I need a US visa to transit (or layover) through an American airport?',
 'How much electronics and other valuables can I bring duty-free when going to India?',
 'How to get from Nice to Monaco by public transport?',
 'Should my first trip be to the country which issued my Schengen Visa?',
 "What's the difference between 'Redress Number' and 'Known Traveler Number'? Do I need both for TSA PreCheck?",
 'Can I use Google Maps traffic information to estimate driving time for a specific date/time?',
 'Are aerosol cans allowed and safe, in checked luggage?',
 'How to track my UK Visa Application Status?',
 "When applying for an Indian Passport, how do I know if I'm in the ECR or non-ECR category?",
 'Are battery packs allowed in hand luggage?']

In [12]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(df['Body'] + df['Title'])

In [13]:
# Compute TF/IDF Values

total_count = sum(tokenizer.word_counts.values())
idf = { k: np.log(total_count/v) for (k,v) in tokenizer.word_counts.items() }

In [14]:
# Download pre-trained word2vec embeddings

import gensim

glove_100d = utils.get_file(
    fname='glove.6B.100d.txt',
    origin='https://storage.googleapis.com/deep-learning-cookbook/glove.6B.100d.txt',
)

w2v_100d = glove_100d + '.w2v'
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_100d, w2v_100d)
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_100d)

w2v_weights = np.zeros((VOCAB_SIZE, w2v_model.syn0.shape[1]))
idf_weights = np.zeros((VOCAB_SIZE, 1))

for k, v in tokenizer.word_index.items():
    if v >= VOCAB_SIZE:
        continue
    
    if k in w2v_model:
        w2v_weights[v] = w2v_model[k]
    
    idf_weights[v] = idf[k]
    
del w2v_model

  from ipykernel import kernelapp as app


In [15]:
df['title_tokens'] = tokenizer.texts_to_sequences(df['Title'])
df['body_tokens'] = tokenizer.texts_to_sequences(df['Body'])

In [16]:
import random

# We can create a data generator that will randomly title and body tokens for questions.  We'll use random text
# from other questions as a negative example when necessary.
def data_generator(batch_size, negative_samples=1):
    questions = df[df['PostTypeId'] == 1]
    all_q_ids = list(questions.index)
        
    batch_x_a = []
    batch_x_b = []
    batch_y = []
    
    def _add(x_a, x_b, y):
        batch_x_a.append(x_a[:MAX_DOC_LEN])
        batch_x_b.append(x_b[:MAX_DOC_LEN])
        batch_y.append(y)
    
    while True:
        questions = questions.sample(frac=1.0)
        
        for i, q in questions.iterrows():
            _add(q['title_tokens'], q['body_tokens'], 1)
            
            negative_q = random.sample(all_q_ids, negative_samples)
            for nq_id in negative_q:
                _add(q['title_tokens'], df.at[nq_id, 'body_tokens'], 0)            
            
            if len(batch_y) >= batch_size:
                yield ({
                    'title': pad_sequences(batch_x_a, maxlen=None),
                    'body': pad_sequences(batch_x_b, maxlen=None),
                }, np.asarray(batch_y))
                
                batch_x_a = []
                batch_x_b = []
                batch_y = []

# dg = data_generator(1, 2)
# next(dg)
# next(dg)

# Embedding Lookups

In [17]:
questions = df[df['PostTypeId'] == 1]['Title'].reset_index(drop=True)
question_tokens = pad_sequences(tokenizer.texts_to_sequences(questions))

class EmbeddingWrapper(object):
    def __init__(self, model):
        self._r = questions
        self._i = {i:s for (i, s) in enumerate(questions)}
        self._w = model.predict({'title': question_tokens}, verbose=1, batch_size=1024)
        self._model = model
        self._norm = np.sqrt(np.sum(self._w * self._w + 1e-5, axis=1))

    def nearest(self, sentence, n=10):
        x = tokenizer.texts_to_sequences([sentence])
        if len(x[0]) < MIN_DOC_LEN:
            x[0] += [0] * (MIN_DOC_LEN - len(x))
        e = self._model.predict(np.asarray(x))[0]
        norm_e = np.sqrt(np.dot(e, e))
        dist = np.dot(self._w, e) / (norm_e * self._norm)

        top_idx = np.argsort(dist)[-n:]
        return pd.DataFrame.from_records([
            {'question': self._r[i], 'dist': float(dist[i])}
            for i in top_idx
        ])

In [18]:
# Our first model will just sum up the embeddings of each token.
# The similarity between documents will be the dot product of the final embedding.

import tensorflow as tf

def sum_model(embedding_size, vocab_size, embedding_weights=None, idf_weights=None):
    title = layers.Input(shape=(None,), dtype='int32', name='title')
    body = layers.Input(shape=(None,), dtype='int32', name='body')

    def make_embedding(name):
        if embedding_weights is not None:
            embedding = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=w2v_weights.shape[1], 
                                         weights=[w2v_weights], trainable=False, 
                                         name='%s/embedding' % name)
        else:
            embedding = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=embedding_size,
                                        name='%s/embedding' % name)

        if idf_weights is not None:
            idf = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=1, 
                                   weights=[idf_weights], trainable=False,
                                   name='%s/idf' % name)
        else:
            idf = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=1,
                                   name='%s/idf' % name)
            
        return embedding, idf
    
    embedding_a, idf_a = make_embedding('a')
    embedding_b, idf_b = embedding_a, idf_a
#     embedding_b, idf_b = make_embedding('b')

    mask = layers.Masking(mask_value=0)
    def _combine_and_sum(args):
        [embedding, idf] = args
        return K.sum(embedding * K.abs(idf), axis=1)

    sum_layer = layers.Lambda(_combine_and_sum, name='combine_and_sum')

    sum_a = sum_layer([mask(embedding_a(title)), idf_a(title)])
    sum_b = sum_layer([mask(embedding_b(body)), idf_b(body)])

    sim = layers.dot([sum_a, sum_b], axes=1, normalize=True)
    sim_model = models.Model(
        inputs=[title, body],
        outputs=[sim],
    )
    sim_model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    sim_model.summary()

    embedding_model = models.Model(
        inputs=[title],
        outputs=[sum_a]
    )
    return sim_model, embedding_model

In [19]:
# Try using our model with pretrained weights from word2vec

sum_model_precomputed, sum_embedding_precomputed = sum_model(
    embedding_size=EMBEDDING_SIZE, vocab_size=VOCAB_SIZE,
    embedding_weights=w2v_weights, idf_weights=idf_weights
)

x, y = next(data_generator(batch_size=4096))
sum_model_precomputed.evaluate(x, y)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title (InputLayer)              (None, None)         0                                            
__________________________________________________________________________________________________
body (InputLayer)               (None, None)         0                                            
__________________________________________________________________________________________________
a/embedding (Embedding)         (None, None, 100)    25000000    title[0][0]                      
                                                                 body[0][0]                       
__________________________________________________________________________________________________
masking_1 (Masking)             (None, None, 100)    0           a/embedding[0][0]                
          

[0.959646952804178, 0.5087890625]

In [20]:
SAMPLE_QUESTIONS = [
    'Roundtrip ticket versus one way',
    'Shinkansen from Kyoto to Hiroshima',
    'Bus tour of Germany',
]

def evaluate_sample(lookup):
    pd.set_option('display.max_colwidth', 100)
    results = []
    for q in SAMPLE_QUESTIONS:
        print(q)
        q_res = lookup.nearest(q, n=4)
        q_res['result'] = q_res['question']
        q_res['question'] = q
        results.append(q_res)

    return pd.concat(results)

lookup = EmbeddingWrapper(model=sum_embedding_precomputed)
evaluate_sample(lookup)

Roundtrip ticket versus one way
Shinkansen from Kyoto to Hiroshima
Bus tour of Germany


Unnamed: 0,dist,question,result
0,0.811414,Roundtrip ticket versus one way,"Buy a roundtrip ticket for two people, but second person only travels on return - is that possible"
1,0.813275,Roundtrip ticket versus one way,How to pick the (phony) return destination for a roundtrip ticket intended as a one-way?
2,0.815021,Roundtrip ticket versus one way,What is cheapest way to fly around SE Asia in a circuit - hub with roundtrip tickets or sequence...
3,0.826081,Roundtrip ticket versus one way,The penalty for changing an airline ticket is per leg or per ticket?
0,0.753298,Shinkansen from Kyoto to Hiroshima,Culture Day in Osaka/Kyoto
1,0.75698,Shinkansen from Kyoto to Hiroshima,Where does the Tokaido Shinkansen stop in Tokyo?
2,0.775568,Shinkansen from Kyoto to Hiroshima,Best connection Tokyo - Kyoto
3,0.813006,Shinkansen from Kyoto to Hiroshima,Travel from Tokyo to Sendai with Shinkansen
0,0.891223,Bus tour of Germany,Trip in the south of Germany
1,0.895116,Bus tour of Germany,Travelling outside of Germany on a German Working Holiday visa (Australian)


# Training our own network

In [21]:
sum_model_trained, sum_embedding_trained = sum_model(
    embedding_size=EMBEDDING_SIZE, vocab_size=VOCAB_SIZE, 
    embedding_weights=None,
    idf_weights=None
)
sum_model_trained.fit_generator(
    data_generator(batch_size=128),
    epochs=10,
    steps_per_epoch=1000
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title (InputLayer)              (None, None)         0                                            
__________________________________________________________________________________________________
body (InputLayer)               (None, None)         0                                            
__________________________________________________________________________________________________
a/embedding (Embedding)         (None, None, 100)    25000000    title[0][0]                      
                                                                 body[0][0]                       
__________________________________________________________________________________________________
masking_2 (Masking)             (None, None, 100)    0           a/embedding[0][0]                
          

<keras.callbacks.History at 0x25fa504b630>

In [22]:
lookup = EmbeddingWrapper(model=sum_embedding_trained)
evaluate_sample(lookup)

Roundtrip ticket versus one way
Shinkansen from Kyoto to Hiroshima
Bus tour of Germany


Unnamed: 0,dist,question,result
0,0.756217,Roundtrip ticket versus one way,"Is there any way I can book separate airline's tickets on single ticket, without the help of tra..."
1,0.760429,Roundtrip ticket versus one way,How to pick the (phony) return destination for a roundtrip ticket intended as a one-way?
2,0.769111,Roundtrip ticket versus one way,How to get return prices for a one way ticket?
3,0.837668,Roundtrip ticket versus one way,"Buy a roundtrip ticket for two people, but second person only travels on return - is that possible"
0,0.963801,Shinkansen from Kyoto to Hiroshima,Options for sending a message while on a Shinkansen
1,0.9656,Shinkansen from Kyoto to Hiroshima,How early should I reserve Shinkansen tickets during April?
2,0.968743,Shinkansen from Kyoto to Hiroshima,What does my Shinkansen ticket say?
3,0.975935,Shinkansen from Kyoto to Hiroshima,Can I converse with Japanese travellers on the Shinkansen or is it rude?
0,0.678572,Bus tour of Germany,Where will my bus be at the Port Authority Bus Terminal in New York?
1,0.698401,Bus tour of Germany,How can I get information about bus lines between towns within Poland?


# CNN Model

In [23]:
def cnn_model(embedding_size, vocab_size):
    title = layers.Input(shape=(None,), dtype='int32', name='title')
    body = layers.Input(shape=(None,), dtype='int32', name='body')

    embedding = layers.Embedding(
        mask_zero=False,
        input_dim=vocab_size,
        output_dim=embedding_size,
    )


    def _combine_sum(v):
        return K.sum(v, axis=1)

    cnn_1 = layers.Convolution1D(256, 3)
    cnn_2 = layers.Convolution1D(256, 3)
    cnn_3 = layers.Convolution1D(256, 3)
    
    global_pool = layers.GlobalMaxPooling1D()
    local_pool = layers.MaxPooling1D(strides=2, pool_size=3)

    def forward(input):
        embed = embedding(input)
        return global_pool(
            cnn_2(local_pool(cnn_1(embed))))

    sum_a = forward(title)
    sum_b = forward(body)

    sim = layers.dot([sum_a, sum_b], axes=1, normalize=False)
    sim_model = models.Model(
        inputs=[title, body],
        outputs=[sim],
    )
    sim_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

    embedding_model = models.Model(
        inputs=[title],
        outputs=[sum_a]
    )
    return sim_model, embedding_model

In [24]:
cnn, cnn_embedding = cnn_model(embedding_size=25, vocab_size=VOCAB_SIZE)
cnn.summary()
cnn.fit_generator(
    data_generator(batch_size=128),
    epochs=10,
    steps_per_epoch=1000,
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title (InputLayer)              (None, None)         0                                            
__________________________________________________________________________________________________
body (InputLayer)               (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 25)     6250000     title[0][0]                      
                                                                 body[0][0]                       
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, None, 256)    19456       embedding_1[0][0]                
          

<keras.callbacks.History at 0x25fa297a208>

In [25]:
evaluate_sample(lookup)

Roundtrip ticket versus one way
Shinkansen from Kyoto to Hiroshima
Bus tour of Germany


Unnamed: 0,dist,question,result
0,0.756217,Roundtrip ticket versus one way,"Is there any way I can book separate airline's tickets on single ticket, without the help of tra..."
1,0.760429,Roundtrip ticket versus one way,How to pick the (phony) return destination for a roundtrip ticket intended as a one-way?
2,0.769111,Roundtrip ticket versus one way,How to get return prices for a one way ticket?
3,0.837668,Roundtrip ticket versus one way,"Buy a roundtrip ticket for two people, but second person only travels on return - is that possible"
0,0.963801,Shinkansen from Kyoto to Hiroshima,Options for sending a message while on a Shinkansen
1,0.9656,Shinkansen from Kyoto to Hiroshima,How early should I reserve Shinkansen tickets during April?
2,0.968743,Shinkansen from Kyoto to Hiroshima,What does my Shinkansen ticket say?
3,0.975935,Shinkansen from Kyoto to Hiroshima,Can I converse with Japanese travellers on the Shinkansen or is it rude?
0,0.678572,Bus tour of Germany,Where will my bus be at the Port Authority Bus Terminal in New York?
1,0.698401,Bus tour of Germany,How can I get information about bus lines between towns within Poland?


# LSTM Model

In [26]:
def lstm_model(embedding_size, vocab_size):
    title = layers.Input(shape=(None,), dtype='int32', name='title')
    body = layers.Input(shape=(None,), dtype='int32', name='body')

    embedding = layers.Embedding(
        mask_zero=True,
        input_dim=vocab_size,
        output_dim=embedding_size,
#         weights=[w2v_weights],
#         trainable=False
    )

    lstm_1 = layers.LSTM(units=512, return_sequences=True)
    lstm_2 = layers.LSTM(units=512, return_sequences=False)
    
    sum_a = lstm_2(lstm_1(embedding(title)))
    sum_b = lstm_2(lstm_1(embedding(body)))

    sim = layers.dot([sum_a, sum_b], axes=1, normalize=True)
#     sim = layers.Activation(activation='sigmoid')(sim)
    sim_model = models.Model(
        inputs=[title, body],
        outputs=[sim],
    )
    sim_model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    embedding_model = models.Model(
        inputs=[title],
        outputs=[sum_a]
    )
    return sim_model, embedding_model

In [27]:
lstm, lstm_embedding = lstm_model(embedding_size=EMBEDDING_SIZE, vocab_size=VOCAB_SIZE)
lstm.summary()
lstm.fit_generator(
    data_generator(batch_size=128),
    epochs=10,
    steps_per_epoch=100,
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title (InputLayer)              (None, None)         0                                            
__________________________________________________________________________________________________
body (InputLayer)               (None, None)         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 100)    25000000    title[0][0]                      
                                                                 body[0][0]                       
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, None, 512)    1255424     embedding_2[0][0]                
          

<keras.callbacks.History at 0x25ff2c92e48>

In [28]:
lookup = EmbeddingWrapper(model=lstm_embedding)
evaluate_sample(lookup)

Roundtrip ticket versus one way
Shinkansen from Kyoto to Hiroshima
Bus tour of Germany


Unnamed: 0,dist,question,result
0,0.99988,Roundtrip ticket versus one way,Health Insurance coverage in UK
1,0.999883,Roundtrip ticket versus one way,Cancelling apartment booking in Switzerland
2,0.999932,Roundtrip ticket versus one way,Transatlantic cruises allowing casual dressing?
3,0.999954,Roundtrip ticket versus one way,Bringing work goods across border
0,0.999942,Shinkansen from Kyoto to Hiroshima,Georgia invitation letter for visa
1,0.999943,Shinkansen from Kyoto to Hiroshima,Door stopper for hotel room
2,0.999947,Shinkansen from Kyoto to Hiroshima,Schengen business visa - multiple entry
3,0.999948,Shinkansen from Kyoto to Hiroshima,Schengen visa: appointment not available
0,0.999787,Bus tour of Germany,Visiting Moscow on Stopover
1,0.999791,Bus tour of Germany,US Visa for spouse
