In [1]:
import re
import os
import keras.backend as K
import numpy as np
import pandas as pd
from keras import layers, models

CACHE_DIR = os.path.expanduser('~/.cache/dl-cookbook')

def download(url):
    filename = os.path.join(CACHE_DIR, re.sub('[^a-zA-Z0-9.]+', '_', url))
    if os.path.exists(filename):
        return filename
    else:
        os.system('mkdir -p "%s"' % CACHE_DIR)
        assert os.system('wget -O "%s" "%s"' % (filename, url)) == 0
        return filename

Using TensorFlow backend.


In [2]:
def reset_everything():
    import tensorflow as tf
    %reset -f in out dhist
    tf.reset_default_graph()
    K.set_session(tf.InteractiveSession())

In [3]:
VOCAB_SIZE = 250000
EMBEDDING_SIZE = 100
MAX_DOC_LEN = 128
MIN_DOC_LEN = 8

In [4]:
def download_quora():
    quora_dataset = download("http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv")
    df = pd.read_csv(quora_dataset, sep='\t')
    df['question1'] = df['question1'].astype('str')
    df['question2'] = df['question2'].astype('str')
    return df

In [5]:
def parse_stackexchange(filename, limit):
    rows = []
    for i, line in enumerate(os.popen('7z x -so "%s" Posts.xml' % filename)):
        if not line.startswith('  <row'):
            continue
            
        if i % 1000 == 0:
            print('\r%05d/%05d' % (i, limit), end='', flush=True)

        parts = line[6:-5].split('"')
        record = {}
        for i in range(0, len(parts), 2):
            k = parts[i].replace('=', '').strip()
            v = parts[i+1].strip()
            record[k] = v
        rows.append(record)
        
        if len(rows) > limit:
            break

    df = pd.DataFrame.from_records(rows)    
    df = df.set_index('Id', drop=False)
    return df

def download_stackexchange(limit=1000000):
    xml_7z = download("https://ia600500.us.archive.org/22/items/stackexchange/stackoverflow.com-Posts.7z")
    csv_file = xml_7z + 'limit=%s.csv.gz' % limit
    if not os.path.exists(csv_file):
        df = parse_stackexchange(xml_7z, limit=limit)
        df.to_csv(csv_file, compression='gzip')
    else:
        df = pd.read_csv(csv_file)
        
    df['Title'] = df['Title'].fillna('').astype('str')
    df['Tags'] = df['Tags'].fillna('').astype('str')
    df['Body'] = df['Body'].fillna('').astype('str')
    df['Id'] = df['Id'].astype('int')
    df['PostTypeId'] = df['PostTypeId'].astype('int')
    return df

df = download_stackexchange()
df.head()

Unnamed: 0,Id,AcceptedAnswerId,AnswerCount,Body,ClosedDate,CommentCount,CommunityOwnedDate,CreationDate,FavoriteCount,Id.1,...,LastEditorDisplayName,LastEditorUserId,OwnerDisplayName,OwnerUserId,ParentId,PostTypeId,Score,Tags,Title,ViewCount
0,4,7.0,13.0,&lt;p&gt;I want to use a track-bar to change a...,,4,2012-10-31T16:42:47.213,2008-07-31T21:42:52.667,37.0,4,...,Rich B,126970.0,,8.0,,1,491,&lt;c#&gt;&lt;winforms&gt;&lt;type-conversion&...,While applying opacity to a form should we use...,31416.0
1,6,31.0,5.0,&lt;p&gt;I have an absolutely positioned &lt;c...,,0,,2008-07-31T22:08:08.620,8.0,6,...,Rich B,63550.0,,9.0,,1,217,&lt;html&gt;&lt;css&gt;&lt;css3&gt;&lt;interne...,Percentage width child element in absolutely p...,14712.0
2,7,,,&lt;p&gt;An explicit cast to double isn't nece...,,1,,2008-07-31T22:17:57.883,,7,...,,967315.0,,9.0,4.0,2,349,,,
3,9,1404.0,58.0,&lt;p&gt;Given a &lt;code&gt;DateTime&lt;/code...,,6,2011-08-16T19:40:43.080,2008-07-31T23:40:59.743,341.0,9,...,Rich B,6025198.0,,1.0,,1,1509,&lt;c#&gt;&lt;.net&gt;&lt;datetime&gt;,Calculate age in C#,388201.0
4,11,1248.0,33.0,&lt;p&gt;Given a specific &lt;code&gt;DateTime...,,3,2009-09-04T13:15:59.820,2008-07-31T23:55:37.967,517.0,11,...,user2370523,6479704.0,,1.0,,1,1182,&lt;c#&gt;&lt;datetime&gt;&lt;time&gt;&lt;date...,Calculate relative time in C#,120658.0


In [83]:
list(df[df['ViewCount'] > 2500000]['Title'])

['How to horizontally center a &lt;div&gt; in another &lt;div&gt;?',
 'What is the best comment in source code you have ever encountered?',
 'How do I generate random integers within a specific range in Java?',
 'How to redirect to another webpage in JavaScript/jQuery?',
 'How can I get query string values in JavaScript?',
 'How to check whether a checkbox is checked in jQuery?',
 'How do I undo the last commit(s) in Git?',
 'Iterate through a HashMap',
 'Get selected value in dropdown list using JavaScript?',
 'How do I declare and initialize an array in Java?']

In [88]:
dict(df.iloc[0].drop(['body_tokens', 'title_tokens']))

{'AcceptedAnswerId': 7.0,
 'AnswerCount': 13.0,
 'Body': "&lt;p&gt;I want to use a track-bar to change a form's opacity.&lt;/p&gt;&#xA;&#xA;&lt;p&gt;This is my code:&lt;/p&gt;&#xA;&#xA;&lt;pre&gt;&lt;code&gt;decimal trans = trackBar1.Value / 5000;&#xA;this.Opacity = trans;&#xA;&lt;/code&gt;&lt;/pre&gt;&#xA;&#xA;&lt;p&gt;When I build the application, it gives the following error:&lt;/p&gt;&#xA;&#xA;&lt;blockquote&gt;&#xA;  &lt;p&gt;Cannot implicitly convert type 'decimal' to 'double'.&lt;/p&gt;&#xA;&lt;/blockquote&gt;&#xA;&#xA;&lt;p&gt;I tried using &lt;code&gt;trans&lt;/code&gt; and &lt;code&gt;double&lt;/code&gt; but then the control doesn't work. This code worked fine in a past VB.NET project. &lt;/p&gt;&#xA;",
 'ClosedDate': nan,
 'CommentCount': 4,
 'CommunityOwnedDate': '2012-10-31T16:42:47.213',
 'CreationDate': '2008-07-31T21:42:52.667',
 'FavoriteCount': 37.0,
 'Id': 4,
 'Id.1': 4,
 'LastActivityDate': '2017-03-10T15:18:33.147',
 'LastEditDate': '2017-03-10T15:18:33.147',
 'Las

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(df['Body'] + df['Title'])

In [7]:
total_count = sum(tokenizer.word_counts.values())
idf = { k: np.log(total_count/v) for (k,v) in tokenizer.word_counts.items() }

In [8]:
import gensim

word2vec_gz = download('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz')
#assert os.system('gunzip -d --keep "%s"' % word2vec_gz) == 0
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_gz.replace('.gz', ''), binary=True)

w2v_weights = np.zeros((VOCAB_SIZE, w2v_model.syn0.shape[1]))
idf_weights = np.zeros((VOCAB_SIZE, 1))

for k, v in tokenizer.word_index.items():
    if v >= VOCAB_SIZE:
        continue
    
    if k in w2v_model:
        w2v_weights[v] = w2v_model[k]
    
    idf_weights[v] = idf[k]
    
del w2v_model

In [9]:
df['title_tokens'] = tokenizer.texts_to_sequences(df['Title'])
df['body_tokens'] = tokenizer.texts_to_sequences(df['Body'])

In [17]:
import random

def data_generator(batch_size, negative_samples=1):
    questions = df[df['PostTypeId'] == 1]
    all_q_ids = list(questions.index)
        
    batch_x_a = []
    batch_x_b = []
    batch_y = []
    
    def _add(x_a, x_b, y):
        batch_x_a.append(x_a[:MAX_DOC_LEN])
        batch_x_b.append(x_b[:MAX_DOC_LEN])
        batch_y.append(y)
    
    while True:
        questions = questions.sample(frac=1.0)
        
        for i, q in questions.iterrows():
            _add(q['title_tokens'], q['body_tokens'], 1)
            
            negative_q = random.sample(all_q_ids, negative_samples)
            for nq_id in negative_q:
                _add(q['title_tokens'], df.at[nq_id, 'body_tokens'], 0)            
            
            if len(batch_y) >= batch_size:
                yield ({
                    'title': pad_sequences(batch_x_a, maxlen=None),
                    'body': pad_sequences(batch_x_b, maxlen=None),
                }, np.asarray(batch_y))
                
                batch_x_a = []
                batch_x_b = []
                batch_y = []

# dg = data_generator(1, 2)
# next(dg)
# next(dg)

# Embedding Lookups

Let's define a helper class for looking up our embedding results.  We'll use it
to verify our models.

In [11]:
questions = df[df['PostTypeId'] == 1]['Title'].reset_index(drop=True)
question_tokens = pad_sequences(tokenizer.texts_to_sequences(questions))

class EmbeddingWrapper(object):
    def __init__(self, model):
        self._r = questions
        self._i = {i:s for (i, s) in enumerate(questions)}
        self._w = model.predict({'title': question_tokens}, verbose=1, batch_size=1024)
        self._model = model
        self._norm = np.sqrt(np.sum(self._w * self._w + 1e-5, axis=1))

    def nearest(self, sentence, n=10):
        x = tokenizer.texts_to_sequences([sentence])
        e = self._model.predict(np.asarray(x))[0]
        norm_e = np.sqrt(np.dot(e, e))
        dist = np.dot(self._w, e) / (norm_e * self._norm)

        top_idx = np.argsort(dist)[-n:]
        return pd.DataFrame.from_records([
            {'question': self._r[i], 'dist': float(dist[i])}
            for i in top_idx
        ])

In [36]:
import tensorflow as tf

def sum_model(embedding_size, vocab_size, embedding_weights=None, idf_weights=None):
    title = layers.Input(shape=(None,), dtype='int32', name='title')
    body = layers.Input(shape=(None,), dtype='int32', name='body')

    def make_embedding(name):
        if embedding_weights is not None:
            embedding = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=w2v_weights.shape[1], 
                                         weights=[w2v_weights], trainable=False, 
                                         name='%s/embedding' % name)
        else:
            embedding = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=embedding_size,
                                        name='%s/embedding' % name)

        if idf_weights is not None:
            idf = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=1, 
                                   weights=[idf_weights], trainable=False,
                                   name='%s/idf' % name)
        else:
            idf = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=1,
                                   name='%s/idf' % name)
            
        return embedding, idf
    
    embedding_a, idf_a = make_embedding('a')
    embedding_b, idf_b = embedding_a, idf_a
#     embedding_b, idf_b = make_embedding('b')

    mask = layers.Masking(mask_value=0)
    def _combine_and_sum(args):
        [embedding, idf] = args
        return K.sum(embedding * K.abs(idf), axis=1)

    sum_layer = layers.Lambda(_combine_and_sum, name='combine_and_sum')

    sum_a = sum_layer([mask(embedding_a(title)), idf_a(title)])
    sum_b = sum_layer([mask(embedding_b(body)), idf_b(body)])

    sim = layers.dot([sum_a, sum_b], axes=1, normalize=True)
    sim_model = models.Model(
        inputs=[title, body],
        outputs=[sim],
    )
    sim_model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    sim_model.summary()

    embedding_model = models.Model(
        inputs=[title],
        outputs=[sum_a]
    )
    return sim_model, embedding_model

In [37]:
sum_model_precomputed, sum_embedding_precomputed = sum_model(
    embedding_size=EMBEDDING_SIZE, vocab_size=VOCAB_SIZE,
    embedding_weights=w2v_weights, idf_weights=idf_weights
)

x, y = next(data_generator(batch_size=4096))
sum_model_precomputed.evaluate(x, y)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
words_a (InputLayer)             (None, None)          0                                            
____________________________________________________________________________________________________
words_b (InputLayer)             (None, None)          0                                            
____________________________________________________________________________________________________
a/embedding (Embedding)          (None, None, 300)     75000000    words_a[0][0]                    
                                                                   words_b[0][0]                    
____________________________________________________________________________________________________
masking_1 (Masking)              (None, None, 300)     0           a/embedding[0][0]       

[0.61408924963325262, 0.67041015625]

In [38]:
lookup = EmbeddingWrapper(model=sum_embedding_precomputed)
lookup.nearest('Python Postgres object relational model')



Unnamed: 0,dist,question
0,0.708515,What is relational parametricity?
1,0.721483,What is an Object-Relational Mapping Framework?
2,0.722154,Is LINQ an Object-Relational Mapper?
3,0.724592,&quot;Diffing&quot; objects from a relational ...
4,0.725993,Is Functional to Relational mapping easier tha...
5,0.732545,Object-oriented-like structures in relational ...
6,0.737159,How can I model this in a relational database?
7,0.742205,Object/Relational mapping
8,0.746952,Object Oriented Database Vs object Relational ...
9,0.802204,ElevateDB relational model do’s and dont’s


In [39]:
lookup.nearest('Can I store JSON in Postgres?')

Unnamed: 0,dist,question
0,0.754007,How to store a file in LDAP?
1,0.760092,Mac OS X: Where should I store common applicat...
2,0.761879,"My winform app uses xml files to store data, w..."
3,0.765723,How Does MySQL Store Enums?
4,0.766301,How can I store a string in an sqlite3 Databas...
5,0.769674,how can i store value in an NSArray using Writ...
6,0.774603,How do I store just a date in MS SQL from VB?
7,0.786789,Can I store SQL Server sort order in a variable?
8,0.787879,Where should I store SSIS Data?
9,0.799822,How do I store an plist on iPhone?


# Training our own network

The results are okay but not great... instead of using the word2vec embeddings, what happens if we train our network end-to-end?

In [40]:
reset_everything()

sum_model_trained, sum_embedding_trained = sum_model(
    embedding_size=EMBEDDING_SIZE, vocab_size=VOCAB_SIZE, 
    embedding_weights=None,
    idf_weights=None
)
sum_model_trained.fit_generator(
    data_generator(batch_size=128),
    epochs=10,
    steps_per_epoch=1000
)

Flushing input history
Flushing output cache (7 entries)
Flushing directory history
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
words_a (InputLayer)             (None, None)          0                                            
____________________________________________________________________________________________________
words_b (InputLayer)             (None, None)          0                                            
____________________________________________________________________________________________________
a/embedding (Embedding)          (None, None, 100)     25000000    words_a[0][0]                    
                                                                   words_b[0][0]                    
____________________________________________________________________________________________________
masking

<keras.callbacks.History at 0x7f5158c49dd8>

In [41]:
lookup = EmbeddingWrapper(model=sum_embedding_trained)
lookup.nearest('Python Postgres object relational model')



Unnamed: 0,dist,question
0,0.849232,working with django and sqlalchemy but backend...
1,0.854862,Python ORM that auto-generates/updates tables ...
2,0.864254,Dynamic Table Creation and ORM mapping in SqlA...
3,0.870934,"SQLAlchemy with count, group_by and order_by u..."
4,0.874244,SQLAlchemy: Scan huge tables using ORM?
5,0.87728,Efficiently updating database using SQLAlchemy...
6,0.886905,What are some good Python ORM solutions?
7,0.899351,python orm
8,0.900963,Python libraries to construct classes from a r...
9,0.911261,python ORM allowing for table creation and bul...


In [43]:
lookup.nearest('Can I store JSON in Postgres?')

Unnamed: 0,dist,question
0,0.892392,Databases using JSON as storage/transport format
1,0.893417,JSON encode MySQL results
2,0.893883,Ajax / Json How to run an INSERT/UPDATE into m...
3,0.896096,"Read page content, convert to json, enter to SQL?"
4,0.897706,What is the best way to sync 2 sqlite tables o...
5,0.902693,Concat JSON objects
6,0.911446,import json file to couch db-
7,0.922449,Represent a query string in JSON
8,0.924316,What PL/SQL Libraries For Auto-Generating JSON...
9,0.930865,Is there a query language for JSON?


In [26]:
def cnn_model(embedding_size, vocab_size):
    title = layers.Input(shape=(None,), dtype='int32', name='title')
    body = layers.Input(shape=(None,), dtype='int32', name='body')

    embedding = layers.Embedding(
        mask_zero=False,
        input_dim=vocab_size,
        output_dim=embedding_size,
    )


    def _combine_sum(v):
        return K.sum(v, axis=1)

    cnn_1 = layers.Convolution1D(256, 3)
    cnn_2 = layers.Convolution1D(256, 3)
    cnn_3 = layers.Convolution1D(256, 3)
    
    global_pool = layers.GlobalMaxPooling1D()
    local_pool = layers.MaxPooling1D(strides=2, pool_size=3)

    def forward(input):
        embed = embedding(input)
        return global_pool(
            cnn_2(local_pool(cnn_1(embed))))

    sum_a = forward(title)
    sum_b = forward(body)

    sim = layers.dot([sum_a, sum_b], axes=1, normalize=False)
    sim_model = models.Model(
        inputs=[title, body],
        outputs=[sim],
    )
    sim_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

    embedding_model = models.Model(
        inputs=[title],
        outputs=[sum_a]
    )
    return sim_model, embedding_model

In [27]:
reset_everything()

cnn, cnn_embedding = cnn_model(embedding_size=25, vocab_size=VOCAB_SIZE)
cnn.summary()
cnn.fit_generator(
    data_generator(batch_size=128),
    epochs=10,
    steps_per_epoch=1000,
)

Flushing input history
Flushing output cache (0 entries)
Flushing directory history
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
words_a (InputLayer)             (None, None)          0                                            
____________________________________________________________________________________________________
words_b (InputLayer)             (None, None)          0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, None, 25)      6250000     words_a[0][0]                    
                                                                   words_b[0][0]                    
____________________________________________________________________________________________________
conv1d_

<keras.callbacks.History at 0x7f5231926fd0>

In [30]:
lookup = EmbeddingWrapper(model=cnn_embedding)
lookup.nearest('Does Python have a Postgres object relational model')



Unnamed: 0,dist,question
0,,"In a asp.net webforms UserControl, why do cont..."
1,,How can I force a Perl script to use ActiveSta...
2,,UML sequence diagram call property
3,,How to add &quot;help&quot;-text to a mex-func...
4,,Where can I find the source code for J2ME?
5,,What does &lt;&gt; mean?
6,,"MDX - Sum at lowest time, then Max it out"
7,,How do you use WiX to deploy VSTO 3.0 addins?
8,,Listing files available for download - files a...
9,,"Using nHibernate and the repository pattern, n..."


In [None]:
lookup.nearest('how can i profile python?')

In [33]:
def lstm_model(embedding_size, vocab_size):
    title = layers.Input(shape=(None,), dtype='int32', name='title')
    body = layers.Input(shape=(None,), dtype='int32', name='body')

    embedding = layers.Embedding(
        mask_zero=True,
        input_dim=vocab_size,
        output_dim=embedding_size,
#         weights=[w2v_weights],
#         trainable=False
    )

    lstm_1 = layers.LSTM(units=512, return_sequences=True)
    lstm_2 = layers.LSTM(units=512, return_sequences=False)
    
    sum_a = lstm_2(lstm_1(embedding(title)))
    sum_b = lstm_2(lstm_1(embedding(body)))

    sim = layers.dot([sum_a, sum_b], axes=1, normalize=True)
#     sim = layers.Activation(activation='sigmoid')(sim)
    sim_model = models.Model(
        inputs=[title, body],
        outputs=[sim],
    )
    sim_model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    embedding_model = models.Model(
        inputs=[title],
        outputs=[sum_a]
    )
    return sim_model, embedding_model

In [34]:
lstm, lstm_embedding = lstm_model(embedding_size=EMBEDDING_SIZE, vocab_size=VOCAB_SIZE)
lstm.summary()
lstm.fit_generator(
    data_generator(batch_size=128),
    epochs=10,
    steps_per_epoch=100,
)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
words_a (InputLayer)             (None, None)          0                                            
____________________________________________________________________________________________________
words_b (InputLayer)             (None, None)          0                                            
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, None, 100)     25000000    words_a[0][0]                    
                                                                   words_b[0][0]                    
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, None, 512)     1255424     embedding_2[0][0]       

<keras.callbacks.History at 0x7f515b99fc50>

In [35]:
lookup = EmbeddingWrapper(model=lstm_embedding)
lookup.nearest('Does Python have a Postgres object relational model')



Unnamed: 0,dist,question
0,0.999889,"Same function, different return types for clas..."
1,0.999891,Wordpress: How do I fix my post class?
2,0.999892,How do I move from Java to C#?
3,0.999893,pycurl cancel a transfer and try &amp; except
4,0.999894,Starting service as a user with no password
5,0.999898,HTTP headers &quot;q&quot; factor in firefox?
6,0.999898,Create a file from a large Makefile variable
7,0.999901,which C++ material should i work on next?
8,0.999901,SQL Server Group By Query using multiple values
9,0.999908,Does Parrot have a database interface or API?


In [None]:
lookup.nearest('shinkansen')