In [1]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec, FastText
from gensim.models.phrases import Phrases, Phraser
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from sentence_transformers import SentenceTransformer

import sys
sys.path.insert(0, "../src/")
import util as util

%autosave 5

  from .autonotebook import tqdm as notebook_tqdm


Autosaving every 5 seconds


In [2]:
queries = ["full-stack software engineer", "engineering manager","aspiring human resources"]
maxVisibleCandidates = 15

In [3]:
originalCandidates = pd.read_csv('../data/raw/potential-talents-AspiringHumanResources-SeekingHumanResources.csv')
originalCandidates = originalCandidates.drop_duplicates(subset=['job_title'])
originalCandidates.loc[originalCandidates['connection'] == '500+ ','connection'] = '500'
originalCandidates['connection'] = originalCandidates['connection'].astype(int)

# Bag Of Words

In [5]:
vectorObject = CountVectorizer()

util.scoreViaVectorMethod(originalCandidates,vectorObject,queries,maxVisibleCandidates)

full-stack software engineer
1: Junior MES Engineer| Information Systems 0.22360679774997896
2: Human Resources professional for the world leader in GIS software 0.15811388300841897
3: Native English Teacher at EPIK (English Program in Korea) 0.0
4: People Development Coordinator at Ryan 0.0
5: Advisory Board Member at Celal Bayar University 0.0
6: HR Senior Specialist 0.0
7: Seeking Human Resources HRIS and Generalist Positions 0.0
8: SVP, CHRO, Marketing & Communications, CSR Officer | ENGIE | Houston | The Woodlands | Energy | GPHR | SPHR 0.0
9: Human Resources Coordinator at InterContinental Buckhead Atlanta 0.0
10: Aspiring Human Resources Management student seeking an internship 0.0
11: Human Resources, Staffing and Recruiting Professional 0.0
12: Human Resources Specialist at Luxottica 0.0
13: Director of Human Resources North America, Groupe Beneteau 0.0
14: Human Resources Generalist at ScottMadden, Inc. 0.0
15: Nortia Staffing is seeking Human Resources, Payroll & Administrat

# TF-IDF

In [6]:
util.scoreViaVectorMethod(originalCandidates,TfidfVectorizer(),queries,maxVisibleCandidates)

full-stack software engineer
1: Junior MES Engineer| Information Systems 0.2037092997193693
2: Human Resources professional for the world leader in GIS software 0.17080174729822886
3: Native English Teacher at EPIK (English Program in Korea) 0.0
4: People Development Coordinator at Ryan 0.0
5: Advisory Board Member at Celal Bayar University 0.0
6: HR Senior Specialist 0.0
7: Seeking Human Resources HRIS and Generalist Positions 0.0
8: SVP, CHRO, Marketing & Communications, CSR Officer | ENGIE | Houston | The Woodlands | Energy | GPHR | SPHR 0.0
9: Human Resources Coordinator at InterContinental Buckhead Atlanta 0.0
10: Aspiring Human Resources Management student seeking an internship 0.0
11: Human Resources, Staffing and Recruiting Professional 0.0
12: Human Resources Specialist at Luxottica 0.0
13: Director of Human Resources North America, Groupe Beneteau 0.0
14: Human Resources Generalist at ScottMadden, Inc. 0.0
15: Nortia Staffing is seeking Human Resources, Payroll & Administrati

# Word2Vec

In [7]:
candidates = originalCandidates.copy()
job_titles = candidates[['job_title']].values.ravel()
jobSentences = [title.split() for title in job_titles]
querySentences = [title.split() for title in queries]
allSentences = jobSentences + querySentences
model = Word2Vec(sentences=allSentences, window=5, min_count=1, workers=4)

jobVectors = np.array([(model.wv[jobSentence]).mean(axis=0) for jobSentence in jobSentences])

for query in queries:
    print(query)
    query_vector = (model.wv[query.split()]).mean(axis=0).reshape(1,-1)
    
    candidates['fit'] = cosine_similarity(jobVectors,query_vector)
    util.sortAndDisplay(candidates,maxVisibleCandidates)

full-stack software engineer
1: Human Resources|
Conflict Management|
Policies & Procedures|Talent Management|Benefits & Compensation 0.20969724655151367
2: People Development Coordinator at Ryan 0.20388048887252808
3: Human Resources professional for the world leader in GIS software 0.14653201401233673
4: SVP, CHRO, Marketing & Communications, CSR Officer | ENGIE | Houston | The Woodlands | Energy | GPHR | SPHR 0.13757851719856262
5: Aspiring Human Resources Manager | Graduating May 2020 | Seeking an Entry-Level Human Resources Position in St. Louis 0.10032840073108673
6: Undergraduate Research Assistant at Styczynski Lab 0.09760184586048126
7: Always set them up for Success 0.09000013768672943
8: Seeking employment opportunities within Customer Service or Patient Care 0.07851476222276688
9: Native English Teacher at EPIK (English Program in Korea) 0.07720156013965607
10: Director Of Administration at Excellence Logging 0.0608261302113533
11: Bachelor of Science in Biology from Victor

# Glove

In [8]:
candidates = originalCandidates.copy()
job_titles = candidates[['job_title']].values.ravel()
allPhrases = list(job_titles) + queries
allWords = set([word.lower() for word in (" ".join(allPhrases)).split()])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(allWords)
embedding_dim = 50
embedding_matrix_vocab = util.embedding_for_vocab(
    '../data/glove.6B.50d.txt', tokenizer.word_index,
    embedding_dim)
for query in queries:
    print(query)
    candidates = originalCandidates.copy()
    candidates['fit'] = candidates['job_title'].apply(lambda x: util.retrieveGloveSimilarityScore(x,query,embedding_matrix_vocab,tokenizer))
    util.sortAndDisplay(candidates,maxVisibleCandidates)

full-stack software engineer
1: Junior MES Engineer| Information Systems 0.7622295725044975
2: Information Systems Specialist and Programmer with a love for data and organization. 0.7619851493150511
3: Student at Indiana University Kokomo - Business Management - 
Retail Manager at Delphi Hardware and Paint 0.7088287689594017
4: Human Resources professional for the world leader in GIS software 0.7065329628656409
5: Business Management Major and Aspiring Human Resources Manager 0.6926389851724996
6: Experienced Retail Manager and aspiring Human Resources Professional 0.6811633980833228
7: SVP, CHRO, Marketing & Communications, CSR Officer | ENGIE | Houston | The Woodlands | Energy | GPHR | SPHR 0.666725839343354
8: Business Intelligence and Analytics at Travelers 0.6637405483853753
9: Aspiring Human Resources Management student seeking an internship 0.6500709587289313
10: 2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional 0.6386441347

# FastText

In [9]:
candidates = originalCandidates.copy()
job_titles = candidates[['job_title']].values.ravel()
allPhrases = list(job_titles) + queries
jobSentences = [title.split() for title in list(job_titles)]
sentences = [title.split() for title in allPhrases]
phrases = Phrases(sentences, min_count = 30, progress_per = 10000)
sentences = phrases[sentences]
model = FastText(window = 5, min_count = 5, workers = 4, min_n = 1, max_n = 4)
model.build_vocab(sentences)
jobVectors = np.array([(model.wv[jobSentence]).mean(axis=0) for jobSentence in jobSentences])

for query in queries:
    print(query)
    query_vector = (model.wv[query.split()]).mean(axis=0).reshape(1,-1)
    
    candidates['fit'] = cosine_similarity(jobVectors,query_vector)
    util.sortAndDisplay(candidates,maxVisibleCandidates)


full-stack software engineer
1: Seeking employment opportunities within Customer Service or Patient Care 0.47579312324523926
2: Aspiring Human Resources Management student seeking an internship 0.43933990597724915
3: Director of Human Resources North America, Groupe Beneteau 0.4246407151222229
4: Admissions Representative at Community medical center long beach 0.41557878255844116
5: Retired Army National Guard Recruiter, office manager,  seeking a position in Human Resources. 0.4057430028915405
6: Human Resources professional for the world leader in GIS software 0.3997538089752197
7: Seeking Human Resources Opportunities 0.383891761302948
8: Seeking Human  Resources Opportunities. Open to travel and relocation. 0.3805246949195862
9: Student at Humber College and Aspiring Human Resources Generalist 0.37552499771118164
10: 2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional 0.3710193634033203
11: Bachelor of Science in Biology from Vic

# Bert

In [10]:
candidates = originalCandidates.copy()
job_titles = candidates[['job_title']].values.ravel()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

job_title_tokens = [tokenizer.encode(title, add_special_tokens=True, max_length=128, truncation=True) for title in job_titles]
job_title_input_ids = [torch.tensor(tokens).unsqueeze(0) for tokens in job_title_tokens]
with torch.no_grad():
    job_title_embeddings = [model(title)[0] for title in job_title_input_ids]

for query in queries:
    print(query)
    query_tokens = tokenizer.encode(query, add_special_tokens=True, max_length=128, truncation=True)
    query_input_ids = torch.tensor(query_tokens).unsqueeze(0)
    with torch.no_grad():
        query_embedding = model(query_input_ids)[0]
    
    scores = [cosine_similarity(query_embedding,job_title_embedding) for job_title_embedding in job_title_embeddings]
    candidates['fit'] = np.array(scores).reshape(-1,1)
    util.sortAndDisplay(candidates,maxVisibleCandidates)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


full-stack software engineer
1: SVP, CHRO, Marketing & Communications, CSR Officer | ENGIE | Houston | The Woodlands | Energy | GPHR | SPHR 0.9999988079071045
2: Student at Chapman University 0.9999772906303406
3: Nortia Staffing is seeking Human Resources, Payroll & Administrative Professionals!!  (408) 709-2621 0.9999467730522156
4: Human Resources|
Conflict Management|
Policies & Procedures|Talent Management|Benefits & Compensation 0.999929666519165
5: HR Manager at Endemol Shine North America 0.9999216794967651
6: Student at Humber College and Aspiring Human Resources Generalist 0.9999035000801086
7: Seeking Human  Resources Opportunities. Open to travel and relocation. 0.9998388290405273
8: Experienced Retail Manager and aspiring Human Resources Professional 0.9997940063476562
9: Business Intelligence and Analytics at Travelers 0.9997907876968384
10: Human Resources Coordinator at InterContinental Buckhead Atlanta 0.9996907114982605
11: Human Resources Management Major 0.999654412

# Sbert

In [11]:
candidates = originalCandidates.copy()
job_titles = candidates[['job_title']].values.ravel()

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
job_title_embeddings = model.encode(job_titles)
for query in queries:
    print(query)
    
    query_embedding = model.encode(query)
    queryArray = query_embedding.reshape(1,-1)
    similarities = cosine_similarity(queryArray, job_title_embeddings)
    candidates['fit'] = np.array(similarities).reshape(-1,1)
    util.sortAndDisplay(candidates,maxVisibleCandidates)

full-stack software engineer
1: Human Resources professional for the world leader in GIS software 0.45468375086784363
2: Information Systems Specialist and Programmer with a love for data and organization. 0.45314186811447144
3: Junior MES Engineer| Information Systems 0.4411553144454956
4: Director Of Administration at Excellence Logging 0.4336070120334625
5: Director Human Resources  at EY 0.3725767731666565
6: Senior Human Resources Business Partner at Heil Environmental 0.3614225387573242
7: Business Management Major and Aspiring Human Resources Manager 0.3590739071369171
8: Aspiring Human Resources Specialist 0.3582056164741516
9: Aspiring Human Resources Professional 0.3471512794494629
10: Human Resources, Staffing and Recruiting Professional 0.34141218662261963
11: Human Resources Management Major 0.33707278966903687
12: Human Resources Professional 0.3347281813621521
13: Aspiring Human Resources Professional | An energetic and Team-Focused Leader 0.33343350887298584
14: Aspirin

# Conclusion

# Bag Of Words
    - For software engineer, it grabbed 2 out the 3 propper matches. engineering manager, 1st 6 work, rest are zero. HR is fine
# TF-IDF
    - Software engineer, it grabbed 2 out the 3 propper matches. 0.0 score matches made the top 15. Mix of managers for engineering manager. HR search is good.
# word2Vec
    - For software developer, best matches were number 3 & 13. A lot of HR managers were pulled for engineering manager. HR search is good.
# Glove
    - HR is still good. Manager search is ok. For software engineer search, top 3 searches are in positions 1,2, and 4
# FastText
    - HR good. Manager is bringing up non-technical managers first, but manager is still being brought up. Software engineer search is a mess.
# Bert
    - Very off. Not exactly following what others are doing
# Sbert
    - HR and Manager searches are good. Software Engineer search grabbed 3 out of the 3 propper matches

We will go with SBERT based on how well it was able to grab the 3 most software like positions in the software engineer search

# Packaging SBERT

In [None]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
util.saveModel(model,'SBERT')