In [1]:
import pandas as pd
import re
import codecs
import multiprocessing
import gensim
from gensim.models import Word2Vec
import nltk
import numpy as np

import os
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

In [2]:
# Load the Monster.com job postings sample into a DataFrame.
jobs = pd.read_csv('../../Data/Jobs/monster_com-job_sample.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
jobs.head()

Unnamed: 0,country,country_code,date_added,has_expired,job_board,job_description,job_title,job_type,location,organization,page_url,salary,sector,uniq_id
0,United States of America,US,,No,jobs.monster.com,TeamSoft is seeing an IT Support Specialist to...,IT Support Technician Job in Madison,Full Time Employee,"Madison, WI 53702",,http://jobview.monster.com/it-support-technici...,,IT/Software Development,11d599f229a80023d2f40e7c52cd941e
1,United States of America,US,,No,jobs.monster.com,The Wisconsin State Journal is seeking a flexi...,Business Reporter/Editor Job in Madison,Full Time,"Madison, WI 53708",Printing and Publishing,http://jobview.monster.com/business-reporter-e...,,,e4cbb126dabf22159aff90223243ff2a
2,United States of America,US,,No,jobs.monster.com,Report this job About the Job DePuy Synthes Co...,Johnson & Johnson Family of Companies Job Appl...,"Full Time, Employee",DePuy Synthes Companies is a member of Johnson...,Personal and Household Services,http://jobview.monster.com/senior-training-lea...,,,839106b353877fa3d896ffb9c1fe01c0
3,United States of America,US,,No,jobs.monster.com,Why Join Altec? If you’re considering a career...,Engineer - Quality Job in Dixon,Full Time,"Dixon, CA",Altec Industries,http://jobview.monster.com/engineer-quality-jo...,,Experienced (Non-Manager),58435fcab804439efdcaa7ecca0fd783
4,United States of America,US,,No,jobs.monster.com,Position ID# 76162 # Positions 1 State CT C...,Shift Supervisor - Part-Time Job in Camphill,Full Time Employee,"Camphill, PA",Retail,http://jobview.monster.com/shift-supervisor-pa...,,Project/Program Management,64d0272dc8496abfd9523a8df63c184c


In [4]:
# Quick clean-up of the job titles and the sector column.

# Lower-case every title, then keep only the text before the first "job in"
# (e.g. "it support technician job in madison" -> "it support technician ").
titles_lower = jobs["job_title"].str.lower()
job_titles = titles_lower.str.split("job in", n=1, expand=True)[0]

# For titles containing "job application for", take the text after that phrase
# and cut it at the first '|' and then at the first '-'.
# Rows without the phrase come out as missing values here.
after_phrase = job_titles.str.split("job application for", n=1, expand=True)[1]
before_pipe = after_phrase.str.split('|', n=1, expand=True)[0]
job_titles2 = before_pipe.str.split('-', n=1, expand=True)[0]

# Prefer the "job application for" variant where it exists, else the plain title.
job_titles = job_titles2.combine_first(job_titles)

# Write back, replacing missing titles / sectors with explicit placeholders.
jobs["job_title"] = job_titles.fillna('no title')
jobs['sector'] = jobs['sector'].fillna('no sector')

In [5]:
# Write the job_description column to a text file (no header, no index column).
jobs.job_description.to_csv('job_descriptions.txt', header=False, index=False, sep=' ')

In [6]:
# Word2Vec process and train functions

def preprocess_text(text):
    """Reduce ``text`` to space-separated alphanumeric tokens.

    Every run of characters other than Latin letters, Cyrillic letters
    (а-я / А-Я) and the digits 0-9 is replaced by a single space, and
    leading/trailing whitespace is removed. Case is left untouched.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string; empty if ``text`` contains no allowed characters.
    """
    # The digit range is 0-9 (not 1-9) so that zeros survive: "2010" must stay
    # "2010" rather than being broken into "2 1".
    # A whole run of disallowed characters collapses to one space, so no
    # separate pass to squeeze repeated spaces is needed.
    text = re.sub('[^a-zA-Zа-яА-Я0-9]+', ' ', text)
    return text.strip()


def prepare_for_w2v(filename_from, filename_to, lang):
    """Write a cleaned, one-sentence-per-line copy of a text file.

    The whole input file is read as UTF-8 and split into sentences with
    ``nltk.sent_tokenize``. Each sentence is lower-cased, passed through
    ``preprocess_text`` and written on its own line of the output file.

    Args:
        filename_from: path of the raw UTF-8 text file to read.
        filename_to: path of the output file; opened in 'w' mode, so any
            existing content is overwritten.
        lang: language name forwarded to ``nltk.sent_tokenize``.
    """
    # Read inside a context manager so the input handle is always closed.
    with codecs.open(filename_from, "r", encoding='utf-8') as source:
        raw_text = source.read()
    with open(filename_to, 'w', encoding='utf-8') as f:
        for sentence in nltk.sent_tokenize(raw_text, lang):
            print(preprocess_text(sentence.lower()), file=f)
            

def train_word2vec(filename):
    """Train a Word2Vec model on a one-sentence-per-line text file.

    Args:
        filename: path of the text file, read through gensim's ``LineSentence``.

    Returns:
        The ``Word2Vec`` model built with 300-dimensional vectors, a window of
        10, ``min_count=1`` and one worker per CPU reported by
        ``multiprocessing.cpu_count()``.
    """
    sentences = gensim.models.word2vec.LineSentence(filename)
    n_workers = multiprocessing.cpu_count()
    return Word2Vec(
        sentences,
        size=300,
        window=10,
        min_count=1,
        workers=n_workers,
    )

In [7]:
# Turn the dumped descriptions into a cleaned file with one sentence per line.
prepare_for_w2v('./job_descriptions.txt', 'job_descriptions_prep.txt', 'english')

In [8]:
# Train Word2Vec embeddings on the preprocessed job-description sentences.
model_jobdesc = train_word2vec('job_descriptions_prep.txt')

In [9]:
# Persist the trained model to disk.
model_jobdesc.save('job_desc_model.model')

In [10]:
# Collect every word in the job-description model's vocabulary together with
# its vector; the two lists are aligned index by index.
# Vectors are looked up through `.wv`: indexing the model object itself
# (`model_jobdesc[word]`) is deprecated in gensim and emits a warning.
words_jd = list(model_jobdesc.wv.vocab)
embeddings_jd = [model_jobdesc.wv[word] for word in words_jd]

  after removing the cwd from sys.path.


In [11]:
# Load pretrained vectors from the GoogleNews word2vec binary file.
model_google = gensim.models.KeyedVectors.load_word2vec_format('../../Data/GoogleNews-vectors-negative300.bin', binary=True)

In [12]:
# NOTE: this binds a second name to the same object rather than copying it,
# so the add() below also modifies `model_google`.
model_goojd = model_google
# Add the job-description words and their vectors to the pretrained vectors.
# NOTE(review): presumably words already present keep their existing vectors
# unless replace=True is passed — confirm against the installed gensim version.
model_goojd.add(words_jd, embeddings_jd)

In [13]:
# Build one cleaned string per job description: each sentence is lower-cased,
# passed through preprocess_text and appended with a separating space.
job_desc = []
for description in jobs.job_description:
    cleaned_sentences = (
        preprocess_text(sentence.lower())
        for sentence in nltk.sent_tokenize(description, 'english')
    )
    job_desc.append(' ' + ''.join(' ' + cleaned for cleaned in cleaned_sentences))

In [14]:
# Split each job description into a list of words (the list is updated in place).
job_desc[:] = [description.split() for description in job_desc]

In [15]:
# Score job descriptions: each description is represented by the mean of the
# vectors of its words, looked up in the combined model.
job_scored = [model_goojd[tokens].mean(axis=0) for tokens in job_desc]

In [28]:
# Metadata: title and sector of every job, written as a tab-separated file
# with a header row and no index column.
meta_columns = ['job_title', 'sector']
jobs[meta_columns].to_csv(
    './project-tensorboard/log_desc/job_desc_meta.tsv',
    header=True,
    index=False,
    sep='\t',
)

In [29]:
## Get working directory
PATH = os.getcwd()

## Path to save the embedding and checkpoints generated.
## Built with os.path.join: plain concatenation of PATH and a './...' suffix
## glues the '.' onto the last component of the working directory
## (e.g. '/home/me/nb./project-tensorboard/...'), which is not the
## './project-tensorboard/log_desc/' directory the metadata file is written to.
LOG_DIR = os.path.join(PATH, 'project-tensorboard', 'log_desc')

## Labels file for the projector, located inside LOG_DIR
metadata = os.path.join(LOG_DIR, 'job_desc_meta.tsv')

## TensorFlow Variable from data (one row per scored job description)
tf_data = tf.Variable(np.asarray(job_scored))

In [30]:
## Running TensorFlow Session
with tf.Session() as sess:
    # Initialise the embedding variable and checkpoint it (and only it) in LOG_DIR.
    saver = tf.train.Saver([tf_data])
    sess.run(tf_data.initializer)
    saver.save(sess, os.path.join(LOG_DIR, 'tf_data.ckpt'))

    config = projector.ProjectorConfig()
    # One can add multiple embeddings.
    embedding = config.embeddings.add()
    embedding.tensor_name = tf_data.name
    # Link this tensor to its metadata(Labels) file
    embedding.metadata_path = metadata

    # Saves a config file that TensorBoard will read during startup.
    # The writer is closed explicitly so its underlying event file is flushed
    # and released even if writing the config fails.
    writer = tf.summary.FileWriter(LOG_DIR)
    try:
        projector.visualize_embeddings(writer, config)
    finally:
        writer.close()

In [22]:
# Sanity check: shape of the array built from the per-job vectors.
np.asarray(job_scored).shape

(22000, 300)

In [21]:
# Name of the TensorFlow variable; this is the value assigned to the projector's tensor_name.
tf_data.name

'Variable:0'