In [2]:
import numpy as np
import pandas as pd
import spacy
import os
from gensim.models import KeyedVectors
from gensim.models.phrases import Phraser
from gensim.models.word2vec import LineSentence
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

data_directory = 'data'
intermediate_directory = 'intermediate'
training_data = os.path.join(data_directory, 'TREC.train')
test_data = os.path.join(data_directory, 'TREC.test')
bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_all')
trigram_model_filepath = os.path.join(intermediate_directory, 'trigram_model_all')
trigram_queries_filepath = os.path.join(intermediate_directory, 'trigram_sentences_all.txt')

with open('data/Trec.train') as f:
    labels, queries = zip(*[(int(x[0])," ".join(x[1:])) for x in (x.split() for x in f)])

bigram_model = Phraser.load(bigram_model_filepath)
trigram_model = Phraser.load(trigram_model_filepath)

nlp = spacy.load('en')

# Process Data

In [3]:
def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    return token.is_punct or token.is_space or str(token) == "'s" or str(token) == "``"

with open(trigram_queries_filepath, 'w', encoding='utf_8') as f:
    for parsed_query in nlp.pipe(queries,
                                  batch_size=int(len(queries)/4), n_threads=4):

        # lemmatize the text, removing punctuation and whitespace
        unigram_review = [token.lemma_ for token in parsed_query
                          if not punct_space(token)]

        # apply the first-order and second-order phrase models
        bigram_review = bigram_model[unigram_review]
        trigram_review = trigram_model[bigram_review]

        # remove any remaining stopwords
        trigram_review = [term for term in trigram_review]
#                           if term not in spacy.lang.en.stop_words.STOP_WORDS]

        # write the transformed review as a line in the new file
        trigram_review = ' '.join(trigram_review)
        f.write(trigram_review + '\n')



# Load word2vec

In [4]:
filename = 'GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)

# Load Queries and Labels into matrix

In [11]:

query_vectors = []

with open(trigram_queries_filepath, 'r', encoding='utf_8') as f:
    trigram_queries = [query.strip() for query in f]
    for query in queries:
        query_vec = np.zeros(300, dtype='float64')
        for word in query:
            try:
    #               summing all wordvecs to get queryvec
                query_vec += model.word_vec(word)
            except KeyError:
                pass
        query_vectors.append(query_vec)

data = pd.DataFrame(list(zip(queries,
                             trigram_queries,
                             query_vectors,
                             labels)),
                    columns=['queries', 'processed', 'query_vectors', 'labels'])


# Split Data into train and test set

In [14]:
(train_features,
test_features,
train_labels,
test_labels) = train_test_split(data['query_vectors'],
                               data['labels'],
                               test_size = 0.10,
                               random_state = 42)
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (4906,)
Training Labels Shape: (4906,)
Testing Features Shape: (546,)
Testing Labels Shape: (546,)


# Random Forest = 23% accuracy

In [7]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 10, random_state = 42)
# Train the model on training data
rf.fit(list(train_features), list(train_labels));
# Use the forest's predict method on the test data
predictions = rf.predict(list(test_features)).round().astype(int)
results = test_labels - predictions
error = results[results != 0].size/results.size
print(error)

0.8424908424908425


# Gradient Boosted Regression Trees = 27%

In [12]:
gbrt = GradientBoostingRegressor(n_estimators=100, max_depth=3, criterion='mse')
gbrt.fit(list(train_features), list(train_labels))
predictions = gbrt.predict(list(test_features)).round().astype(int)
results = test_labels - predictions
error = results[results != 0].size/results.size
print(error)

KeyboardInterrupt: 

# SGD and Logistic Regression 75% accuracy

In [9]:
from sklearn.linear_model import SGDClassifier

In [19]:
%%time

sgd = SGDClassifier(loss='log',
                    penalty='l2',
                    alpha=0.01, l1_ratio=0.15,
                    fit_intercept=True,
                    max_iter=5000,
                    tol=None,
                    shuffle=True,
                    verbose=0,
                    n_jobs=-1,
                    random_state=None,
                    learning_rate='optimal',
                    power_t=0.5,
                    warm_start=False,
                    average=False)
sgd.fit(list(train_features), list(train_labels))
predictions = sgd.predict(list(test_features)).round().astype(int)
# print(test_labels, predictions)
results = test_labels - predictions
error = results[results != 0].size/results.size
print(error)

0.554945054945055
CPU times: user 2min 54s, sys: 350 ms, total: 2min 55s
Wall time: 30.1 s


## Train our own Word2Vec

In [105]:
from gensim.models import Word2Vec

intermediate_directory = 'intermediate'
trigram_sentences = LineSentence(trigram_queries_filepath)
word2vec_filepath = os.path.join(intermediate_directory, 'word2vec_model_all')


<gensim.models.word2vec.LineSentence object at 0x111c55588>


In [125]:


# initiate the model and perform the first epoch of training
query2vec = Word2Vec(trigram_sentences, size=100, window=5,
                    min_count=20, sg=1, workers=4)

query2vec.save(word2vec_filepath)

# perform another 11 epochs of training
for i in range(1,12):

    query2vec.train(trigram_sentences, total_examples=query2vec.corpus_count, epochs=1)
    query2vec.save(word2vec_filepath)
        
# load the finished model from disk
query2vec = Word2Vec.load(word2vec_filepath)
query2vec.init_sims()

print('{} training epochs so far.'.format(query2vec.train_count))

12 training epochs so far.
True


In [126]:
print('{:,} terms in the query2vec vocabulary.'.format(len(query2vec.wv.vocab)))


221 terms in the query2vec vocabulary.


In [150]:
from sklearn.manifold import TSNE
import dill as pickle

In [139]:
# build a list of the terms, integer indices,
# and term counts from the food2vec model vocabulary
ordered_vocab = [(term, voc.index, voc.count)
                 for term, voc in query2vec.wv.vocab.items()]

# sort by the term counts, so the most common terms appear first
ordered_vocab = sorted(ordered_vocab, key=lambda term_index_count: -term_index_count[2])

# unzip the terms, integer indices, and counts into separate lists
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)
word_vectors = pd.DataFrame(query2vec.wv.syn0norm, index=ordered_terms)

word_vectors

  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
the,0.082764,0.056660,0.052354,-0.095011,0.134851,-0.042358,0.013958,0.149973,-0.125316,-0.150634,...,-0.159974,0.095902,-0.146999,0.053865,-0.016220,-0.037080,-0.149845,-0.117190,-0.160773,0.060827
what,0.009948,0.030907,0.054388,-0.071068,0.295818,0.014764,0.014212,0.034104,0.007075,-0.034144,...,-0.154452,-0.030384,-0.167146,0.089585,0.101191,-0.021180,-0.132552,-0.047206,-0.218878,0.049358
be,0.093180,-0.055864,0.039647,-0.019285,0.218552,-0.092317,-0.018565,0.291342,-0.060736,-0.135484,...,0.010811,-0.114026,-0.187286,-0.010613,-0.101650,-0.016158,-0.216220,0.076895,0.017011,0.188910
of,0.023874,-0.063587,-0.012668,-0.010053,0.320194,-0.062640,0.039687,0.082633,-0.034010,0.004291,...,-0.072790,-0.154298,-0.197913,0.022957,0.025115,0.012796,-0.108055,0.039907,0.047662,0.133089
in,0.076350,-0.068459,0.018462,-0.104386,0.332071,-0.066345,0.025666,0.171233,0.003725,-0.045273,...,-0.024371,-0.149536,-0.120422,0.056795,-0.084409,0.025638,-0.150323,0.018411,0.082447,0.089866
a,0.049386,-0.187434,0.050423,-0.114565,0.254863,-0.155648,0.034831,0.214614,0.117975,-0.175877,...,-0.088239,0.015357,-0.236925,0.067868,0.015482,0.085787,-0.163749,-0.093912,-0.084260,-0.037068
do,0.196147,-0.089872,0.133070,0.046193,0.125533,-0.134998,-0.004350,0.283116,-0.005436,-0.167199,...,0.032108,0.064094,-0.166914,0.028681,-0.079148,-0.035876,-0.259225,0.043995,-0.184487,-0.015796
-PRON-,0.056362,-0.139172,0.068423,0.026965,0.161426,-0.098636,-0.036326,0.176486,0.098042,-0.162969,...,-0.090380,0.107647,-0.200873,-0.047997,0.010621,0.031908,-0.081392,-0.151286,0.016879,-0.011577
to,0.019036,-0.114339,0.104938,-0.031143,0.223384,-0.098349,-0.071151,0.119227,0.166705,-0.107440,...,-0.029281,0.012686,-0.162795,-0.001479,0.027710,0.058221,-0.069995,-0.174227,0.071921,-0.005838
who,-0.065144,0.006639,0.031852,-0.067946,0.225597,-0.017287,-0.031805,0.011691,0.031898,-0.064880,...,-0.112378,-0.003752,-0.112660,-0.010772,0.073891,0.035065,0.006612,-0.170352,0.024736,0.046098


In [141]:
tsne_input = word_vectors.drop(spacy.lang.en.stop_words.STOP_WORDS, errors=u'ignore')
tsne_input = tsne_input.head(5000)
tsne_input.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
-PRON-,0.056362,-0.139172,0.068423,0.026965,0.161426,-0.098636,-0.036326,0.176486,0.098042,-0.162969,...,-0.09038,0.107647,-0.200873,-0.047997,0.010621,0.031908,-0.081392,-0.151286,0.016879,-0.011577
how_many,-0.027047,-0.16908,-0.00393,-0.196408,0.303583,-0.038283,0.000599,0.126719,0.204781,-0.042276,...,-0.000655,-0.114003,-0.183158,0.123865,-0.024084,0.127117,-0.165935,-0.023878,-0.027506,-0.021153
how_do,-0.037914,-0.155911,0.098884,0.027427,0.218474,-0.093431,-0.003532,0.106461,0.200336,-0.139042,...,-0.089769,0.049447,-0.166646,0.006076,0.055773,0.047896,-0.070253,-0.141688,0.015812,-0.073702
country,0.134692,0.078992,-0.074053,-0.066917,0.214691,-0.069033,-0.008533,0.107232,-0.12359,-0.001035,...,-0.069649,-0.067564,-0.142729,0.015328,-0.053882,-0.024019,-0.148043,0.029751,0.004366,0.122968
world,0.118411,0.080735,-0.068246,-0.113165,0.186888,-0.03304,-0.004019,0.097219,-0.163202,0.002336,...,-0.06542,-0.092472,-0.09288,0.020854,-0.059259,-0.031965,-0.142264,0.048719,0.02511,0.147182


In [142]:
tsne_filepath = os.path.join(intermediate_directory,
                             u'tsne_model')

tsne_vectors_filepath = os.path.join(intermediate_directory,
                                     u'tsne_vectors.npy')

In [153]:
%%time

tsne = TSNE()
tsne_vectors = tsne.fit_transform(tsne_input.values)

with open(tsne_filepath, 'wb') as f:
    pickle.dump(tsne, f)

pd.np.save(tsne_vectors_filepath, tsne_vectors)
    
with open(tsne_filepath, 'rb') as f:
    tsne = pickle.load(f)
    
tsne_vectors = pd.np.load(tsne_vectors_filepath)

tsne_vectors = pd.DataFrame(tsne_vectors,
                            index=pd.Index(tsne_input.index),
                            columns=[u'x_coord', u'y_coord'])

CPU times: user 1.9 s, sys: 217 ms, total: 2.12 s
Wall time: 2.12 s


In [161]:
tsne_input.index

Index(['-PRON-', 'how_many', 'how_do', 'country', 'world', 'city', 'year',
       'u.s.', 'mean', 'large',
       ...
       'west', 'sell', 'house', 'national', 'internet', '-PRON-_take', 'ball',
       'eat', 'who_play', 'end'],
      dtype='object', length=155)

In [156]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value

output_notebook()

In [163]:
# add our DataFrame as a ColumnDataSource for Bokeh
plot_data = ColumnDataSource(tsne_vectors)

# create the plot and configure the
# title, dimensions, and tools
tsne_plot = figure(title=u't-SNE Word Embeddings',
                   plot_width = 800,
                   plot_height = 800,
                   tools= (u'pan, wheel_zoom, box_zoom,'
                           u'box_select, reset'),
                   active_scroll=u'wheel_zoom')

# add a hover tool to display words on roll-over
tsne_plot.add_tools( HoverTool(tooltips = u'@index') )

# draw the words as circles on the plot
tsne_plot.circle(u'x_coord', u'y_coord', source=plot_data,
                 color=u'blue', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color=u'black')

# configure visual elements of the plot
tsne_plot.title.text_font_size = value(u'16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# engage!
# show(tsne_plot);

In [184]:
query_vectors = []

with open(trigram_queries_filepath, 'r', encoding='utf_8') as f:
    trigram_queries = [query.strip() for query in f]
    for query in trigram_queries:
        query_vec = np.zeros(100, dtype='float64')
        for word in query:
            try:
    #               summing all wordvecs to get queryvec
                query_vec += query2vec.wv.word_vec(word)
            except KeyError:
                pass
        query_vectors.append(query_vec)

data = pd.DataFrame(list(zip(queries,
                             trigram_queries,
                             query_vectors,
                             labels)),
                    columns=['queries', 'processed', 'query_vectors', 'labels'])

Unnamed: 0,queries,processed,query_vectors,labels
0,How did serfdom develop in and then leave Russ...,how_do serfdom develop in and then leave russia,"[0.190615341067, -0.7234454602, 0.194620661438...",0
1,What films featured the character Popeye Doyle ?,what film feature the character popeye doyle,"[0.25415378809, -0.964593946934, 0.25949421525...",1
2,How can I find a list of celebrities ' real na...,how_can_-PRON- find a list of celebrity real_name,"[0.25415378809, -0.964593946934, 0.25949421525...",0
3,What fowl grabs the spotlight after the Chines...,what fowl grab the spotlight after the chinese...,"[0.25415378809, -0.964593946934, 0.25949421525...",1
4,What is the full form of .com ?,what be the full form of .com,"[0.0635384470224, -0.241148486733, 0.064873553...",2
5,What contemptible scoundrel stole the cork fro...,what contemptible scoundrel steal the cork fro...,"[0.127076894045, -0.482296973467, 0.1297471076...",3
6,What team did baseball 's St. Louis Browns bec...,what team do baseball st. louis browns become,"[0.25415378809, -0.964593946934, 0.25949421525...",3
7,What is the oldest profession ?,what be the old profession,"[0.0635384470224, -0.241148486733, 0.064873553...",3
8,What are liver enzymes ?,what be liver enzyme,"[0.0635384470224, -0.241148486733, 0.064873553...",0
9,Name the scar-faced bounty hunter of The Old W...,name the scar face bounty hunter of the old west,"[0.190615341067, -0.7234454602, 0.194620661438...",3
