In [1]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec, FastText
from gensim.models.phrases import Phrases, Phraser
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import sys
sys.path.insert(0, "../src/")
import util as util

%autosave 5

Autosaving every 5 seconds


In [2]:
queries = ["full-stack software engineer", "engineering manager","aspiring human resources"]
maxVisibleCandidates = 15

In [3]:
originalCandidates = pd.read_csv('../data/raw/potential-talents-AspiringHumanResources-SeekingHumanResources.csv')
originalCandidates.loc[originalCandidates['connection'] == '500+ ','connection'] = '500'
originalCandidates['connection'] = originalCandidates['connection'].astype(int)

# Spacy

In [4]:
"""
for query in queries:
    print(query)
    candidates = originalCandidates.copy()
    candidates['fit'] = candidates['job_title'].apply(lambda x: util.scoreSpacySimilary(query,x))
    candidates = candidates.sort_values(by=['fit','connection'],ascending=False)
    candidates = candidates.reset_index(drop=True)
    for i in range(maxVisibleCandidates):
        print(f"{i}: {candidates.loc[i,'job_title']} {candidates.loc[i,'fit']}")
    print('---')
"""

'\nfor query in queries:\n    print(query)\n    candidates = originalCandidates.copy()\n    candidates[\'fit\'] = candidates[\'job_title\'].apply(lambda x: util.scoreSpacySimilary(query,x))\n    candidates = candidates.sort_values(by=[\'fit\',\'connection\'],ascending=False)\n    candidates = candidates.reset_index(drop=True)\n    for i in range(maxVisibleCandidates):\n        print(f"{i}: {candidates.loc[i,\'job_title\']} {candidates.loc[i,\'fit\']}")\n    print(\'---\')\n'

# Bag Of Words

In [5]:
"""
vectorObject = CountVectorizer()

util.scoreViaVectorMethod(originalCandidates,vectorObject,queries,maxVisibleCandidates)
"""

'\nvectorObject = CountVectorizer()\n\nutil.scoreViaVectorMethod(originalCandidates,vectorObject,queries,maxVisibleCandidates)\n'

# TF-IDF

In [6]:
"""
util.scoreViaVectorMethod(originalCandidates,TfidfVectorizer(),queries,maxVisibleCandidates)
"""

'\nutil.scoreViaVectorMethod(originalCandidates,TfidfVectorizer(),queries,maxVisibleCandidates)\n'

# Word2Vec

In [7]:
"""
candidates = originalCandidates.copy()
job_titles = candidates[['job_title']].values.ravel()
jobSentences = [title.split() for title in job_titles]
querySentences = [title.split() for title in queries]
allSentences = jobSentences + querySentences
model = Word2Vec(sentences=allSentences, window=5, min_count=1, workers=4)

jobVectors = np.array([(model.wv[jobSentence]).mean(axis=0) for jobSentence in jobSentences])

for query in queries:
    print(query)
    query_vector = (model.wv[query.split()]).mean(axis=0).reshape(1,-1)
    
    candidates['fit'] = cosine_similarity(jobVectors,query_vector)
    util.sortAndDisplay(candidates,maxVisibleCandidates)
"""

"\ncandidates = originalCandidates.copy()\njob_titles = candidates[['job_title']].values.ravel()\njobSentences = [title.split() for title in job_titles]\nquerySentences = [title.split() for title in queries]\nallSentences = jobSentences + querySentences\nmodel = Word2Vec(sentences=allSentences, window=5, min_count=1, workers=4)\n\njobVectors = np.array([(model.wv[jobSentence]).mean(axis=0) for jobSentence in jobSentences])\n\nfor query in queries:\n    print(query)\n    query_vector = (model.wv[query.split()]).mean(axis=0).reshape(1,-1)\n    \n    candidates['fit'] = cosine_similarity(jobVectors,query_vector)\n    util.sortAndDisplay(candidates,maxVisibleCandidates)\n"

# Glove

In [8]:
"""
candidates = originalCandidates.copy()
job_titles = candidates[['job_title']].values.ravel()
allPhrases = list(job_titles) + queries
allWords = set([word.lower() for word in (" ".join(allPhrases)).split()])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(allWords)
embedding_dim = 50
embedding_matrix_vocab = util.embedding_for_vocab(
    '../glove.6B.50d.txt', tokenizer.word_index,
    embedding_dim)
for query in queries:
    print(query)
    candidates = originalCandidates.copy()
    candidates['fit'] = candidates['job_title'].apply(lambda x: util.retrieveGloveSimilarityScore(x,query,embedding_matrix_vocab,tokenizer))
    util.sortAndDisplay(candidates,maxVisibleCandidates)
"""

'\ncandidates = originalCandidates.copy()\njob_titles = candidates[[\'job_title\']].values.ravel()\nallPhrases = list(job_titles) + queries\nallWords = set([word.lower() for word in (" ".join(allPhrases)).split()])\ntokenizer = Tokenizer()\ntokenizer.fit_on_texts(allWords)\nembedding_dim = 50\nembedding_matrix_vocab = util.embedding_for_vocab(\n    \'../glove.6B.50d.txt\', tokenizer.word_index,\n    embedding_dim)\nfor query in queries:\n    print(query)\n    candidates = originalCandidates.copy()\n    candidates[\'fit\'] = candidates[\'job_title\'].apply(lambda x: util.retrieveGloveSimilarityScore(x,query,embedding_matrix_vocab,tokenizer))\n    util.sortAndDisplay(candidates,maxVisibleCandidates)\n'

# FastText

In [9]:
candidates = originalCandidates.copy()
job_titles = candidates[['job_title']].values.ravel()
allPhrases = list(job_titles) + queries
jobSentences = [title.split() for title in list(job_titles)]
sentences = [title.split() for title in allPhrases]
phrases = Phrases(sentences, min_count = 30, progress_per = 10000)
sentences = phrases[sentences]
model = FastText(window = 5, min_count = 5, workers = 4, min_n = 1, max_n = 4)
model.build_vocab(sentences)
jobVectors = np.array([(model.wv[jobSentence]).mean(axis=0) for jobSentence in jobSentences])

for query in queries:
    print(query)
    query_vector = (model.wv[query.split()]).mean(axis=0).reshape(1,-1)
    
    candidates['fit'] = cosine_similarity(jobVectors,query_vector)
    util.sortAndDisplay(candidates,maxVisibleCandidates)


full-stack software engineer
0: Seeking employment opportunities within Customer Service or Patient Care 0.4767184853553772
1: Aspiring Human Resources Management student seeking an internship 0.4364079236984253
2: Aspiring Human Resources Management student seeking an internship 0.4364079236984253
3: Retired Army National Guard Recruiter, office manager,  seeking a position in Human Resources. 0.42836588621139526
4: Human Resources professional for the world leader in GIS software 0.4236786961555481
5: Director of Human Resources North America, Groupe Beneteau 0.4156807065010071
6: Admissions Representative at Community medical center long beach 0.41557878255844116
7: Seeking Human Resources Opportunities 0.39169004559516907
8: Seeking Human Resources Opportunities 0.39169004559516907
9: Bachelor of Science in Biology from Victoria University of Wellington 0.39092615246772766
10: Experienced Retail Manager and aspiring Human Resources Professional 0.3867005705833435
11: Seeking Human 

# Bert

# Sbert

# Conclusion