In [2]:
import pandas
pandas.options.mode.chained_assignment = None
import numpy
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import LabeledSentence

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from pymongo import MongoClient
from config import config

client = MongoClient(config['mongo']['host'], config['mongo']['port'])
db_name = config['mongo']['embedding_data']['db_name']
table_name = config['mongo']['embedding_data']['table_name']

embedding_db = client[db_name]
embedding_table = embedding_db[table_name]

In [4]:
def ingest_embedding():
    data = pandas.DataFrame(embedding_table.find())
    data.drop_duplicates(subset='id', inplace=True)
    data.drop(['_id', 'by', 'id', 'parent', 'date', 'preferred'], axis=1, inplace=True)
    data = data[data['text'].isnull() == False]
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    return data

In [5]:
data = ingest_embedding()
data.head()

Unnamed: 0,text
0,"Creative Commons | <a href=""https:&#x2F;&#x2F;..."
1,Latacora | Security Engineers | Full-time | Ch...
2,"University of Michigan | Ann Arbor, MI | Full-..."
3,Dyalog Ltd. | Programming Language Implementor...
4,"Knotch | New York, NY<p>We’re solving a major ..."


In [6]:
data.loc[1]['text']

'Latacora | Security Engineers | Full-time | Chicago or Remote<p>You could pick one startup to work for. But why choose? You can also work for a bunch of them simultaneously. That&#x27;s what we did. We&#x27;re Latacora, and we&#x27;re building a security team that runs security teams for startups. We&#x27;re a weird firm: we have only one kind of client, and we work full-time with them for 6-18 months, doing everything a security team does, from software security to cryptography design to AWS and container lockdown.<p>We&#x27;ve been working together since 2005 (when we were called Matasano). And for almost as long, we&#x27;ve been hiring people off the HN hiring thread. We don&#x27;t care about resumes or previous work experience. Rather, we&#x27;re interested in aptitude and level of interest in the problems we&#x27;re working on. No phone screens, no whiteboard interviews. More than you could want to know about our hiring process: <a href="https:&#x2F;&#x2F;latacora.com&#x2F;career

In [7]:
import re
link_pattern = re.compile('<a href[^<]+</a>')

In [8]:
def tokenize(job):
    job = job.lower()
    job = link_pattern.sub('', job)
    job = (
        job.replace('<p>', '')
        .replace('&#x27;', "'")
        .replace('&quot;', '"')
        .replace('|', '')
    )
    tokens = tokenizer.tokenize(job)
    tokens = list(filter(lambda t: not t.startswith('@'), tokens))
    tokens = list(filter(lambda t: not t.startswith('#'), tokens))
    return tokens

In [9]:
def postprocess(data):
    data['tokens'] = data['text'].progress_map(tokenize)
    data = data[data.tokens != 'NC']
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    return data

In [10]:
processed_data = postprocess(data)

progress-bar: 100%|██████████| 16221/16221 [00:18<00:00, 865.89it/s]


In [11]:
processed_data.head()

Unnamed: 0,text,tokens
0,"Creative Commons | <a href=""https:&#x2F;&#x2F;...","[creative, commons, remote, full-time, senior,..."
1,Latacora | Security Engineers | Full-time | Ch...,"[latacora, security, engineers, full-time, chi..."
2,"University of Michigan | Ann Arbor, MI | Full-...","[university, of, michigan, ann, arbor, ,, mi, ..."
3,Dyalog Ltd. | Programming Language Implementor...,"[dyalog, ltd, ., programming, language, implem..."
4,"Knotch | New York, NY<p>We’re solving a major ...","[knotch, new, york, ,, nywe, ’, re, solving, a..."


In [12]:
x_train, x_test = train_test_split(numpy.array(processed_data['tokens']), test_size=0.2)

In [13]:
def labelize_jobs(jobs, label_type):
    labelized = []
    for i, v in tqdm(enumerate(jobs)):
        label = '%s_%s'%(label_type, i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

In [14]:
x_train = labelize_jobs(x_train, 'TRAIN')
x_test = labelize_jobs(x_test, 'TEST')

  """
12976it [00:00, 90926.28it/s]
3245it [00:00, 101312.44it/s]


In [15]:
n_dims = 200

In [16]:
w2v = Word2Vec(size=n_dims, min_count=10)
w2v.build_vocab([ x.words for x in tqdm(x_train) ])

100%|██████████| 12976/12976 [00:00<00:00, 135362.05it/s]


In [17]:
w2v.train(sentences=[ x.words for x in tqdm(x_train) ], total_words=w2v.corpus_total_words, epochs=w2v.epochs)

100%|██████████| 12976/12976 [00:00<00:00, 202381.67it/s]


(8111007, 11579560)

In [18]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="Map of 10000 word vectors",
    tools="pan,wheel_zoom,box_zoom,reset,hover",
    x_axis_type=None, y_axis_type=None, min_border=1)

word_vectors = [w2v[w] for w in list(w2v.wv.vocab.keys())[:5000]]

  # Remove the CWD from sys.path while we load stuff.


In [19]:
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5000 samples in 0.196s...
[t-SNE] Computed neighbors for 5000 samples in 20.060s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5000
[t-SNE] Computed conditional probabilities for sample 2000 / 5000
[t-SNE] Computed conditional probabilities for sample 3000 / 5000
[t-SNE] Computed conditional probabilities for sample 4000 / 5000
[t-SNE] Computed conditional probabilities for sample 5000 / 5000
[t-SNE] Mean sigma: 0.483794
[t-SNE] KL divergence after 250 iterations with early exaggeration: 84.252983
[t-SNE] KL divergence after 1000 iterations: 2.392026


In [20]:
tsne_df = pandas.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = list(w2v.wv.vocab.keys())[:5000]

plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(plot_tfidf)

In [32]:
embedding_kv_file = 'embedding/job_embedding.kv'

In [21]:
w2v.wv.save(embedding_kv_file)

In [22]:
job_db_name = config['mongo']['data']['db_name']
job_table_name = config['mongo']['data']['table_name']
label_key = config['mongo']['data']['label_key']

job_db = client[job_db_name]
job_table = job_db[job_table_name]

In [23]:
def ingest_labeled_data():
    data = pandas.DataFrame(job_table.find())
    data.drop(['_id', 'by', 'id', 'parent', 'date'], axis=1, inplace=True)
    data = data[data['text'].isnull() == False]
    data = data[data['preferred'].isnull() == False]
    data['preferred'] = data['preferred'].map(lambda x: 1 if x else 0)
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    return data

In [24]:
labeled_data = ingest_labeled_data()

In [25]:
labeled_data.head()

Unnamed: 0,text,preferred
0,Fern Creek Software | Louisville KY | 100% Rem...,0
1,"Airtable | San Francisco, CA | Onsite or remot...",0
2,"Slab | Software Engineer | San Francisco, CA o...",0
3,Scite | Senior Full-Stack Developer | Remote&...,0
4,InfluxData: Remote | San Francisco | London | ...,0


In [26]:
processed_labeled = postprocess(labeled_data)

progress-bar: 100%|██████████| 602/602 [00:00<00:00, 634.12it/s]


In [27]:
processed_labeled.head()

Unnamed: 0,text,preferred,tokens
0,Fern Creek Software | Louisville KY | 100% Rem...,0,"[fern, creek, software, louisville, ky, 100, %..."
1,"Airtable | San Francisco, CA | Onsite or remot...",0,"[airtable, san, francisco, ,, ca, onsite, or, ..."
2,"Slab | Software Engineer | San Francisco, CA o...",0,"[slab, software, engineer, san, francisco, ,, ..."
3,Scite | Senior Full-Stack Developer | Remote&...,0,"[scite, senior, full-stack, developer, remote,..."
4,InfluxData: Remote | San Francisco | London | ...,0,"[influxdata, :, remote, san, francisco, london..."


In [28]:
xL_train, xL_test, yL_train, yL_test = train_test_split(numpy.array(processed_labeled['tokens']), numpy.array(processed_labeled['preferred']), test_size=0.2)

In [29]:
xL_train = labelize_jobs(xL_train, 'TRAIN')
xL_test = labelize_jobs(xL_test, 'TEST')

  """
481it [00:00, 68031.03it/s]
121it [00:00, 21879.24it/s]


In [30]:
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([ x.words for x in x_train ]) # Assuming we want the larger vocabulary size, unfortunately won't catch everything new
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print(f'vocab size: {len(tfidf)}')

vocab size: 8470


In [31]:
def buildWordVector(tokens, size, word_vectors):
    vec = numpy.zeros(size).reshape((1, size))
    count = 0
    for word in tokens:
        try:
            vec += word_vectors[word].reshape((1, size)) * tfidf[word]
            count += 1
        except KeyError:
            continue

    if count != 0:
        vec /= count

    return vec

In [33]:
from gensim.models import KeyedVectors
word_vectors = KeyedVectors.load(embedding_kv_file)

In [108]:
from sklearn.preprocessing import scale
train_vecs = numpy.concatenate([ buildWordVector(tokens, n_dims, word_vectors) for tokens in tqdm(map(lambda x: x.words, xL_train)) ])

test_vecs = numpy.concatenate([ buildWordVector(tokens, n_dims, word_vectors) for tokens in tqdm(map(lambda x: x.words, xL_test)) ])

481it [00:01, 323.52it/s]
121it [00:00, 433.60it/s]


In [117]:
scaled_train_vecs = scaler.transform(train_vecs)
scaled_test_vecs = scaler.transform(test_vecs)

In [35]:
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


In [90]:
yL_test

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [118]:
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=n_dims))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(scaled_train_vecs, yL_train, epochs=9, batch_size=32, verbose=2)

Epoch 1/9
 - 0s - loss: 0.4505 - accuracy: 0.8274
Epoch 2/9
 - 0s - loss: 0.2543 - accuracy: 0.9522
Epoch 3/9
 - 0s - loss: 0.1970 - accuracy: 0.9543
Epoch 4/9
 - 0s - loss: 0.1689 - accuracy: 0.9522
Epoch 5/9
 - 0s - loss: 0.1471 - accuracy: 0.9563
Epoch 6/9
 - 0s - loss: 0.1319 - accuracy: 0.9626
Epoch 7/9
 - 0s - loss: 0.1171 - accuracy: 0.9605
Epoch 8/9
 - 0s - loss: 0.1058 - accuracy: 0.9626
Epoch 9/9
 - 0s - loss: 0.0947 - accuracy: 0.9667


<keras.callbacks.callbacks.History at 0x7f3e0ff595f8>

In [119]:
score = model.evaluate(scaled_test_vecs, yL_test, batch_size=30, verbose=2)
print(score[1])

0.9752066135406494


In [48]:
from tensorflow import keras

In [51]:
saved = keras.models.load_model('/mnt/c/Code/job-search/server/personal_models/sentiment_model.tf')

In [53]:
import requests

In [75]:
post = requests.get('https://hacker-news.firebaseio.com/v0/item/22751883.json')
post.json()

{'by': 'bruth',
 'id': 22751883,
 'parent': 22749308,
 'text': 'The Children&#x27;s Hospital of Philadelphia | Bioinformatics software engineer | Philadelphia, PA | Full-time | Onsite (However, Remote now and for the coming months..)<p>The Children’s Hospital of Philadelphia (CHOP) Research Institute and its Dept of Biomedical and Health Informatics (DBHi) are seeking a software engineer to help build an enterprise-level data and informatics platform called “Arcus”. The Arcus team integrates with major scientific initiatives in the Research Institute strategic plan, high-impact research areas such as lifespan, rare diseases, novel devices and therapeutics, and precision health.<p>This role will work with a team of bioinformatics scientists, software engineers and genomics faculty focused on architecting and implementing a cloud-native platform to support storing and analyzing enterprise wide genomic data including data management, harmonized pipelines, and variant warehouse components.

In [76]:
new_job = post.json()

In [77]:
new_tokens = tokenize(new_job['text'])

In [78]:
single_test = numpy.array([new_tokens,])

In [79]:
single_test = labelize_jobs(single_test, 'PREDICT')

  """
1it [00:00, 851.29it/s]


In [80]:
predict_vecs = numpy.concatenate([ buildWordVector(tokens, n_dims, word_vectors) for tokens in tqdm(map(lambda x: x.words, single_test)) ])

1it [00:00, 24.55it/s]


In [124]:
from sklearn.preprocessing import StandardScaler
from pickle import dump

In [112]:
scaler = StandardScaler().fit(train_vecs)

In [114]:
scaler.transform(predict_vecs)

array([[ 1.31252514,  0.48156402,  1.15916264,  0.56565784,  0.57587082,
         0.06923573, -0.5833062 , -1.6959459 ,  0.19064597, -0.69569454,
         0.27237362,  0.27282658,  0.59497158,  0.44978281, -1.23805333,
         0.63953982, -0.80704164,  1.41164511, -0.3419701 ,  0.90978887,
         0.59761763,  1.10278409, -0.19677951,  0.37133202,  0.11944162,
        -0.3438711 , -0.81615127,  0.75340873,  0.05584507,  1.32337547,
         0.0281306 ,  0.45426075,  0.52564594,  0.26666034, -0.37594812,
         0.78885489,  0.42993114, -0.33484473,  1.10306307, -0.38836349,
        -0.2431865 ,  0.69270326,  0.32285316, -0.46231701, -0.50074089,
        -0.79175036, -0.87759125, -1.23988963, -0.0183036 ,  1.06252011,
        -0.02326819, -1.35690064,  1.01371862,  0.41462385, -0.76752816,
        -0.42795738, -0.02156431,  0.4354271 , -0.36503523,  0.39007016,
         0.18291122,  0.47935643,  0.17281484,  1.2254149 ,  0.93716276,
         1.1981988 , -0.86032551,  0.36678593, -1.2

In [116]:
saved.predict_classes(scaler.transform(train_vecs[10:11]))

array([[1]], dtype=int32)

In [125]:
dump(scaler, open('scaler.pkl', 'wb'))

In [126]:
from pickle import load

In [127]:
saved_scaler = load(open('scaler.pkl', 'rb'))

In [128]:
saved_scaler.transform(predict_vecs)

array([[ 1.31252514,  0.48156402,  1.15916264,  0.56565784,  0.57587082,
         0.06923573, -0.5833062 , -1.6959459 ,  0.19064597, -0.69569454,
         0.27237362,  0.27282658,  0.59497158,  0.44978281, -1.23805333,
         0.63953982, -0.80704164,  1.41164511, -0.3419701 ,  0.90978887,
         0.59761763,  1.10278409, -0.19677951,  0.37133202,  0.11944162,
        -0.3438711 , -0.81615127,  0.75340873,  0.05584507,  1.32337547,
         0.0281306 ,  0.45426075,  0.52564594,  0.26666034, -0.37594812,
         0.78885489,  0.42993114, -0.33484473,  1.10306307, -0.38836349,
        -0.2431865 ,  0.69270326,  0.32285316, -0.46231701, -0.50074089,
        -0.79175036, -0.87759125, -1.23988963, -0.0183036 ,  1.06252011,
        -0.02326819, -1.35690064,  1.01371862,  0.41462385, -0.76752816,
        -0.42795738, -0.02156431,  0.4354271 , -0.36503523,  0.39007016,
         0.18291122,  0.47935643,  0.17281484,  1.2254149 ,  0.93716276,
         1.1981988 , -0.86032551,  0.36678593, -1.2