In [1]:
import pandas
pandas.options.mode.chained_assignment = None
import numpy
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import LabeledSentence

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from pymongo import MongoClient
from config import config

client = MongoClient(config['mongo']['host'], config['mongo']['port'])
db_name = config['mongo']['embedding_data']['db_name']
table_name = config['mongo']['embedding_data']['table_name']

embedding_db = client[db_name]
embedding_table = embedding_db[table_name]

In [3]:
def ingest_embedding():
    data = pandas.DataFrame(embedding_table.find())
    data.drop_duplicates(subset='id', inplace=True)
    data.drop(['_id', 'by', 'id', 'parent', 'date', 'preferred'], axis=1, inplace=True)
    data = data[data['text'].isnull() == False]
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    return data

In [4]:
data = ingest_embedding()
data.head()

Unnamed: 0,text
0,"Creative Commons | <a href=""https:&#x2F;&#x2F;..."
1,Latacora | Security Engineers | Full-time | Ch...
2,"University of Michigan | Ann Arbor, MI | Full-..."
3,Dyalog Ltd. | Programming Language Implementor...
4,"Knotch | New York, NY<p>We’re solving a major ..."


In [5]:
data.loc[1]['text']

'Latacora | Security Engineers | Full-time | Chicago or Remote<p>You could pick one startup to work for. But why choose? You can also work for a bunch of them simultaneously. That&#x27;s what we did. We&#x27;re Latacora, and we&#x27;re building a security team that runs security teams for startups. We&#x27;re a weird firm: we have only one kind of client, and we work full-time with them for 6-18 months, doing everything a security team does, from software security to cryptography design to AWS and container lockdown.<p>We&#x27;ve been working together since 2005 (when we were called Matasano). And for almost as long, we&#x27;ve been hiring people off the HN hiring thread. We don&#x27;t care about resumes or previous work experience. Rather, we&#x27;re interested in aptitude and level of interest in the problems we&#x27;re working on. No phone screens, no whiteboard interviews. More than you could want to know about our hiring process: <a href="https:&#x2F;&#x2F;latacora.com&#x2F;career

In [6]:
import re
link_pattern = re.compile('<a href[^<]+</a>')

In [7]:
def tokenize(job):
    job = job.lower()
    job = link_pattern.sub('', job)
    job = (
        job.replace('<p>', '')
        .replace('&#x27;', "'")
        .replace('&quot;', '"')
        .replace('|', '')
    )
    tokens = tokenizer.tokenize(job)
    tokens = list(filter(lambda t: not t.startswith('@'), tokens))
    tokens = list(filter(lambda t: not t.startswith('#'), tokens))
    return tokens

In [8]:
def postprocess(data):
    data['tokens'] = data['text'].progress_map(tokenize)
    data = data[data.tokens != 'NC']
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    return data

In [9]:
processed_data = postprocess(data)

progress-bar: 100%|██████████| 16221/16221 [00:15<00:00, 1023.22it/s]


In [10]:
processed_data.head()

Unnamed: 0,text,tokens
0,"Creative Commons | <a href=""https:&#x2F;&#x2F;...","[creative, commons, remote, full-time, senior,..."
1,Latacora | Security Engineers | Full-time | Ch...,"[latacora, security, engineers, full-time, chi..."
2,"University of Michigan | Ann Arbor, MI | Full-...","[university, of, michigan, ann, arbor, ,, mi, ..."
3,Dyalog Ltd. | Programming Language Implementor...,"[dyalog, ltd, ., programming, language, implem..."
4,"Knotch | New York, NY<p>We’re solving a major ...","[knotch, new, york, ,, nywe, ’, re, solving, a..."


In [11]:
x_train, x_test = train_test_split(numpy.array(processed_data['tokens']), test_size=0.2)

In [12]:
def labelize_jobs(jobs, label_type):
    labelized = []
    for i, v in tqdm(enumerate(jobs)):
        label = '%s_%s'%(label_type, i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

In [13]:
x_train = labelize_jobs(x_train, 'TRAIN')
x_test = labelize_jobs(x_test, 'TEST')

  """
12976it [00:00, 32241.02it/s]
3245it [00:00, 58071.29it/s]


In [14]:
w2v = Word2Vec(size=200, min_count=10)
w2v.build_vocab([ x.words for x in tqdm(x_train) ])

100%|██████████| 12976/12976 [00:00<00:00, 213821.58it/s]


In [15]:
w2v.train(sentences=[ x.words for x in tqdm(x_train) ], total_words=w2v.corpus_total_words, epochs=w2v.epochs)

100%|██████████| 12976/12976 [00:00<00:00, 265273.11it/s]


(8150490, 11634315)

In [16]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="Map of 10000 word vectors",
    tools="pan,wheel_zoom,box_zoom,reset,hover",
    x_axis_type=None, y_axis_type=None, min_border=1)

word_vectors = [w2v[w] for w in list(w2v.wv.vocab.keys())[:5000]]

  # Remove the CWD from sys.path while we load stuff.


In [17]:
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5000 samples in 0.155s...
[t-SNE] Computed neighbors for 5000 samples in 16.228s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5000
[t-SNE] Computed conditional probabilities for sample 2000 / 5000
[t-SNE] Computed conditional probabilities for sample 3000 / 5000
[t-SNE] Computed conditional probabilities for sample 4000 / 5000
[t-SNE] Computed conditional probabilities for sample 5000 / 5000
[t-SNE] Mean sigma: 0.469199
[t-SNE] KL divergence after 250 iterations with early exaggeration: 83.981796
[t-SNE] KL divergence after 1000 iterations: 2.442823


In [20]:
tsne_df = pandas.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = list(w2v.wv.vocab.keys())[:5000]

plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(plot_tfidf)