In [1]:
import re
import os
import keras.backend as K
import numpy as np
import pandas as pd
from keras import layers, models, utils
import json

Using TensorFlow backend.


In [2]:
def reset_everything():
    """Reset IPython's cached state and start over with a fresh TensorFlow session.

    Runs IPython's ``reset`` magic with the arguments ``-f in out dhist``,
    resets TensorFlow's default graph, and installs a new interactive
    session as the Keras backend session.
    """
    import tensorflow as tf

    # Programmatic spelling of the `%reset -f in out dhist` line magic.
    get_ipython().run_line_magic('reset', '-f in out dhist')

    tf.reset_default_graph()
    fresh_session = tf.InteractiveSession()
    K.set_session(fresh_session)

In [3]:
# Constants for our networks.  We keep these deliberately small to reduce training time.

# Passed to the Keras Tokenizer as `num_words` and used as the row count of the
# embedding / IDF weight matrices built further down.
VOCAB_SIZE = 250000
# NOTE(review): not referenced in the visible cells; presumably the embedding width
# (the GloVe vectors loaded below are 100-dimensional) -- confirm.
EMBEDDING_SIZE = 100
# Token sequences are truncated to this many tokens when data_generator builds a batch.
MAX_DOC_LEN = 128
# NOTE(review): not referenced in the visible cells; presumably a minimum document
# length filter -- confirm where it is used.
MIN_DOC_LEN = 12

In [4]:
# Name of the Stack Exchange dump archive; used as the download file name, as the
# last component of the download URL, and as part of the cached JSON file name.
FILE_NAME = 'travel.stackexchange.com.7z'

In [14]:
# Fetch the archive through Keras' download helper; `xml_7z` is the value it returns
# (the local path of the file, as the next cell prints).
xml_7z = utils.get_file(fname=FILE_NAME,
                        origin='https://ia800107.us.archive.org/27/items/stackexchange/' + FILE_NAME)

In [15]:
# Show the local path of the downloaded archive.
print(xml_7z)

C:\Users\GAO\.keras\datasets\travel.stackexchange.com.7z


In [16]:
from bs4 import BeautifulSoup

In [20]:
import subprocess

# Ask 7-Zip to extract Posts.xml from the archive and write it to stdout ('-so')
# instead of to disk, so the XML can be read straight from the pipe.
# NOTE: the 7-Zip location is machine-specific; adjust it if 7z.exe lives elsewhere.
cmd = ['C:\\Program Files\\7-Zip\\7z.exe', 'x', '-so', xml_7z, 'Posts.xml']

# stderr gets its own pipe so 7-Zip's status/error messages can never be interleaved
# with the XML on stdout.  No shell is involved: the command is already an argument
# list, so a missing executable raises immediately instead of producing shell output.
sp = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

In [21]:
# Wait for the 7-Zip process to finish and decode everything captured on its
# stdout pipe as UTF-8 text.
result = sp.communicate()[0].decode('utf-8')

In [22]:
def extract_stackexchange(limit=1000000, lines=None, json_file=None):
    """Parse ``<row .../>`` lines of a Stack Exchange dump into a list of dicts.

    Every line that starts with ``'  <row'`` is turned into one record mapping
    attribute names to their (whitespace-stripped, still XML-escaped) values.
    The records are also written to a JSON file.

    Args:
        limit: Maximum number of records to collect.
        lines: Iterable of text lines to parse.  Defaults to the lines of the
            module-level ``result`` string.
        json_file: Path of the JSON file to write.  Defaults to
            ``'data/<FILE_NAME>-limit=<limit>.json'``.

    Returns:
        The list of parsed records (at most ``limit`` of them).
    """
    if lines is None:
        lines = result.splitlines()
    if json_file is None:
        json_file = 'data/' + FILE_NAME + '-limit=%s.json' % limit

    # name="value" pairs.  Matching whole pairs (rather than slicing a fixed number
    # of characters off the end of the line) keeps the last attribute's value intact.
    attr_pattern = re.compile(r'([^\s="]+)="([^"]*)"')

    rows = []
    for line_no, line in enumerate(lines):
        # Stop before parsing once exactly `limit` records have been collected.
        if len(rows) >= limit:
            break

        line = str(line)
        if not line.startswith('  <row'):
            continue

        if line_no % 1000 == 0:
            print('\r%05d/%05d' % (line_no, limit), end='', flush=True)

        rows.append({key: value.strip() for key, value in attr_pattern.findall(line)})

    # Create the output directory on demand so a fresh checkout does not fail here.
    out_dir = os.path.dirname(json_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(json_file, 'w') as fout:
        json.dump(rows, fout)

    return rows

# Parse the decompressed dump using the default limit of 1,000,000 rows.
rows = extract_stackexchange()

95000/1000000

In [49]:
# Take the title of the fifth parsed row as a sample to inspect.
content = rows[4]['Title']

In [50]:
# Display the sample title.
content

'What is the easiest transportation to use throughout Romania for a foreigner?'

# Data Exploration

In [13]:
# Build the posts frame.  The index is taken from 'Id' *before* that column is cast
# to int below, and the column itself is kept (drop=False).
df = pd.DataFrame.from_records(rows).set_index('Id', drop=False)

# Text columns: missing values become empty strings.
for text_col in ('Title', 'Tags', 'Body'):
    df[text_col] = df[text_col].fillna('').astype('str')

# Numeric columns.
for num_col, num_type in (('Id', 'int'), ('PostTypeId', 'int'), ('ViewCount', 'float')):
    df[num_col] = df[num_col].astype(num_type)

df.head()

Unnamed: 0_level_0,AcceptedAnswerId,AnswerCount,Body,ClosedDate,CommentCount,CommunityOwnedDate,CreationDate,FavoriteCount,Id,LastActivityDate,...,LastEditorDisplayName,LastEditorUserId,OwnerDisplayName,OwnerUserId,ParentId,PostTypeId,Score,Tags,Title,ViewCount
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,393.0,4.0,&lt;p&gt;My fiancée and I are looking for a go...,2013-02-25T23:52:47.95,4.0,,2011-06-21T20:19:34.730,,1,2012-05-24T14:52:14.760,...,,101.0,,9,,1,8,&lt;caribbean&gt;&lt;cruising&gt;&lt;vacations...,What are some Caribbean cruises for October?,462.0
2,,8.0,&lt;p&gt;This was one of our definition questi...,,4.0,,2011-06-21T20:22:33.760,,2,2018-08-26T00:04:13.520,...,,51577.0,,13,,1,37,&lt;guides&gt;&lt;extreme-tourism&gt;&lt;amazo...,How can I find a guide that will take me safel...,2116.0
3,,,&lt;p&gt;One way would be to go through an Adv...,,,,2011-06-21T20:24:28.080,,3,2011-06-21T20:24:28.080,...,,,,9,2.0,2,15,,,
4,,1.0,&lt;p&gt;Singapore Airlines has an all-busines...,,,,2011-06-21T20:24:57.160,,4,2013-01-09T09:55:22.743,...,,693.0,,24,,1,8,&lt;loyalty-programs&gt;&lt;routes&gt;&lt;ewr&...,Does Singapore Airlines offer any reward seats...,256.0
5,770.0,5.0,&lt;p&gt;Another definition question that inte...,,0.0,,2011-06-21T20:25:56.787,,5,2012-10-12T20:49:08.110,...,,101.0,,13,,1,14,&lt;romania&gt;&lt;transportation&gt;,What is the easiest transportation to use thro...,428.0


In [27]:
# Titles of the posts that were viewed more than 250,000 times.
df.loc[df['ViewCount'] > 250000, 'Title'].tolist()

['Do I need a US visa to transit (or layover) through an American airport?',
 'How much electronics and other valuables can I bring duty-free when going to India?',
 'How to get from Nice to Monaco by public transport?',
 'Should my first trip be to the country which issued my Schengen Visa?',
 "What's the difference between 'Redress Number' and 'Known Traveler Number'? Do I need both for TSA PreCheck?",
 'Can I use Google Maps traffic information to estimate driving time for a specific date/time?',
 'Are aerosol cans allowed and safe, in checked luggage?',
 'How to track my UK Visa Application Status?',
 "When applying for an Indian Passport, how do I know if I'm in the ECR or non-ECR category?",
 'Are battery packs allowed in hand luggage?']

In [51]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Fit the vocabulary on each post's body and title together.  The two fields are
# joined with a space so the last word of a body can never fuse with the first
# word of its title into a single bogus token.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(df['Body'] + ' ' + df['Title'])

In [52]:
# Per-word weight: log(total number of counted tokens / number of occurrences of the word).

total_count = sum(tokenizer.word_counts.values())
idf = {}
for word, count in tokenizer.word_counts.items():
    idf[word] = np.log(total_count / count)

In [53]:
# Download pre-trained GloVe embeddings, convert them to word2vec text format and
# load them with gensim.

import gensim

glove_100d = utils.get_file(
    fname='glove.6B.100d.txt',
    origin='https://storage.googleapis.com/deep-learning-cookbook/glove.6B.100d.txt',
)

w2v_100d = glove_100d + '.w2v'
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_100d, w2v_100d)
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_100d)

# One row per token id.  `vector_size` gives the embedding width without touching
# the deprecated `syn0` attribute.
w2v_weights = np.zeros((VOCAB_SIZE, w2v_model.vector_size))
idf_weights = np.zeros((VOCAB_SIZE, 1))

for word, token_id in tokenizer.word_index.items():
    # Ids at or beyond VOCAB_SIZE have no row in the weight matrices.
    if token_id >= VOCAB_SIZE:
        continue

    # Words missing from the pre-trained vectors keep their all-zero row.
    if word in w2v_model:
        w2v_weights[token_id] = w2v_model[word]

    idf_weights[token_id] = idf[word]

# The full embedding model is no longer needed once the rows have been copied.
del w2v_model



Downloading data from https://storage.googleapis.com/deep-learning-cookbook/glove.6B.100d.txt


  from ipykernel import kernelapp as app


In [54]:
# Convert titles and bodies to lists of token ids, stored next to the raw text.
for text_col, token_col in (('Title', 'title_tokens'), ('Body', 'body_tokens')):
    df[token_col] = tokenizer.texts_to_sequences(df[text_col])

In [55]:
import random

# We can create a data generator that yields title and body tokens for questions in
# random order.  A question's own body is the positive example; the bodies of other,
# randomly chosen questions are used as negative examples.
def data_generator(batch_size, negative_samples=1):
    """Endlessly yield training batches of (title, body) token pairs with labels.

    For every question (rows of ``df`` with ``PostTypeId == 1``) one positive pair
    (its title with its own body, label 1) and up to ``negative_samples`` negative
    pairs (its title with another question's body, label 0) are added.  Token lists
    are truncated to ``MAX_DOC_LEN``.  A batch is emitted as soon as it holds at
    least ``batch_size`` pairs, so batches can be slightly larger than ``batch_size``.

    Args:
        batch_size: Minimum number of pairs per yielded batch.
        negative_samples: Number of negative pairs to add per question.

    Yields:
        ``({'title': padded_titles, 'body': padded_bodies}, labels)`` where the
        padded arrays come from ``pad_sequences`` and ``labels`` is a NumPy array.

    Raises:
        ValueError: On the first ``next()`` call if there are no questions at all
            (the loop below would otherwise spin forever without yielding).
    """
    questions = df[df['PostTypeId'] == 1]
    all_q_ids = list(questions.index)
    if not all_q_ids:
        raise ValueError('data_generator: no rows with PostTypeId == 1 to sample from')

    batch_x_a = []
    batch_x_b = []
    batch_y = []

    def _add(x_a, x_b, y):
        # Truncate both sides so a single long post cannot blow up the padded batch.
        batch_x_a.append(x_a[:MAX_DOC_LEN])
        batch_x_b.append(x_b[:MAX_DOC_LEN])
        batch_y.append(y)

    while True:
        # Reshuffle the questions at the start of every pass.
        questions = questions.sample(frac=1.0)

        for q_id, q in questions.iterrows():
            _add(q['title_tokens'], q['body_tokens'], 1)

            # Draw one spare id so the question itself can be dropped from the
            # negatives; pairing a title with its own body must never be labelled 0.
            n_draw = min(negative_samples + 1, len(all_q_ids))
            candidates = [c for c in random.sample(all_q_ids, n_draw) if c != q_id]
            for nq_id in candidates[:negative_samples]:
                _add(q['title_tokens'], df.at[nq_id, 'body_tokens'], 0)

            if len(batch_y) >= batch_size:
                yield ({
                    'title': pad_sequences(batch_x_a, maxlen=None),
                    'body': pad_sequences(batch_x_b, maxlen=None),
                }, np.asarray(batch_y))

                batch_x_a = []
                batch_x_b = []
                batch_y = []

# dg = data_generator(1, 2)
# next(dg)
# next(dg)

In [56]:
# Generator with batch_size=1 and two negative samples per question.
dg = data_generator(1, 2)

In [57]:
# Pull one batch from the generator to inspect its structure.
next(dg)

({'title': array([[ 525, 1144, 1106,   18,    6,  297,  182,   73,    7, 1027],
         [ 525, 1144, 1106,   18,    6,  297,  182,   73,    7, 1027],
         [ 525, 1144, 1106,   18,    6,  297,  182,   73,    7, 1027]]),
  'body': array([[    0,     0,     2,     4,     1,    12,    80,   415,    18,
            150,     7,   952,    29,   234,   348,   297,   182,    15,
              5,   144,    67,    37,   360,    18,   529,     9,   385,
            358,    20,   292,     5,  1106,  1519,   138,    12,    22,
              6,   278,    31,     5,   525,  1144,  1106,   776,    13,
            721,  3329,   804,   240,  3329,   804,  1057,  3329,    17,
             14,   128,   387,     7,  1341,    37,   997,   141,  1607,
            949,  3329,     9,    12,   349,    22, 47818, 50961,  2784,
           1737,    15,   949,  5470,   997,    16,  1106,  1234,   512,
            804,   721,   109,   804,  1057,   506,  3329,     2,     4,
              1,     3,     3,     2, 