In [None]:
!pip install PyPDF2

In [2]:
import pandas as pd
import numpy as np
import enchant
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.cluster import KMeans 
import PyPDF2
import sys
import time
import spacy
import warnings
warnings.filterwarnings('ignore')
sys.path.append("..")
from modules import helper_functions as hf
from modules import similarity_functions as sf
# Import the stop-word set by its full module path. Reaching it through the
# attribute chain `spacy.lang.en.stop_words` only works when some other import
# has already loaded that submodule, which a bare `import spacy` does not
# guarantee.
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

# Load the en_core_web_sm model
nlp = spacy.load('en_core_web_sm')

In [3]:
# Load data
df_jobs = pd.read_csv("../data/df_job_final.csv", usecols=['title', 'department', 'description_combined'])
df_resume = pd.read_csv("../data/data_resume_cc.csv", usecols=['Category', 'Resume_c'])

# department mapping between job and resume labels
THRESHOLD = 50
df_jobs = hf.get_map_category(df_jobs, 'department', THRESHOLD)
df_resume = hf.get_map_category(df_resume, 'Category', THRESHOLD)

# Split the jobs on whether a department is present after the mapping.
# The mask is computed once so both halves are guaranteed to be complementary.
has_department = df_jobs['department'].notna()
df_jobs_nan = df_jobs[~has_department]
# .copy() gives df_jobs its own data, so the `cluster` column added below is
# written to a real frame rather than to a slice of the unfiltered one.
df_jobs = df_jobs[has_department].copy()

# load models
# hf.load_tfidf is used as the loader for all three artefacts (vectoriser,
# classifier and clustering model).
# NOTE(review): the .pkl extension suggests these are pickles - only load
# files from a trusted source.
vec = hf.load_tfidf('./pretrained/tfidf_job.pkl')
svm_clf = hf.load_tfidf('./pretrained/tfidf_clf.pkl')
kmeans = hf.load_tfidf('./pretrained/tfidf_cluster.pkl')

# Attach the fitted cluster label of every job.
# NOTE(review): assumes the clustering model was fitted on exactly these rows,
# in this order - TODO confirm.
df_jobs['cluster'] = kmeans.labels_

In [4]:
# start processing resume
start = time.time()

# Read the PDF inside a context manager so the file handle is closed as soon
# as the text has been extracted (the handle was previously left open).
# NOTE: PdfFileReader / numPages / getPage / extractText are the legacy PyPDF2
# names; they require a PyPDF2 release older than 3.0.
with open('../data/resume_puck.pdf', 'rb') as pdffileobj:
    pdfreader = PyPDF2.PdfFileReader(pdffileobj)
    pages = pdfreader.numPages
    # Concatenate the text of every page, each followed by a single space.
    resume = ''.join(
        pdfreader.getPage(i).extractText() + ' '
        for i in range(pages)
    )

In [5]:
## Lemmatize the resume
# A lemma is kept when it is purely alphabetic and not a stop word, or when it
# is the literal '-PRON-' lemma (presumably spaCy's pronoun placeholder -
# confirm against the spaCy version in use). The stop-word test runs on the
# lemma as produced, before it is lower-cased.
kept_lemmas = []
for token in nlp(resume):
    lemma = token.lemma_
    is_content_word = lemma.isalpha() and lemma not in stopwords
    if is_content_word or lemma == '-PRON-':
        kept_lemmas.append(lemma.lower())

# Re-assemble the cleaned resume as one space-separated string.
resume = ' '.join(kept_lemmas)

In [6]:
# Predict the department label directly from the cleaned resume text.
pred_department = svm_clf.predict([resume])[0]

# The clustering model accepts fewer columns than the vectoriser emits, so the
# TF-IDF row is truncated to the model's own feature count. Reading that count
# from the fitted centroids replaces the hard-coded 15240 and stays correct if
# the model is retrained.
# NOTE(review): assumes `kmeans` is a fitted scikit-learn KMeans-style
# estimator exposing `cluster_centers_` - confirm.
n_cluster_features = kmeans.cluster_centers_.shape[1]
resume_matrix = vec.transform([resume])[:, :n_cluster_features]
pred_cluster = kmeans.predict(resume_matrix)[0]

In [7]:
# Recommendations driven by the predicted department: the helper receives the
# job table, the predicted label, the cleaned resume, the vectoriser and the
# similarity function.
jobs_top_n = hf.get_top_n_jobs_from_clf(
    df_jobs=df_jobs,
    pred_department=pred_department,
    resume=resume,
    vec=vec,
    sim_func=sf.cal_cosine_similarity,
)

# The helper's result is used as positional row indices into df_jobs.
rec_clf = df_jobs.iloc[jobs_top_n]

# Keep only the rows whose department equals the predicted one.
in_pred_department = rec_clf['department'] == pred_department
rec_clf_filtered = rec_clf.loc[in_pred_department]
rec_clf_filtered

Unnamed: 0,title,department,description_combined,cluster
591,sr ui/ux designer/developer,it,"['company', 'bcg', 'digital', 'ventures', 'bos...",0
4583,architect node js,it,"['hey', 'hope', '-pron-', 'great', 'send', '-p...",0
247,visual designer,it,"['closely', 'work', 'member', 'product', 'team...",0


In [8]:
# Recommendations driven by the predicted cluster; `jobs_top_n` is rebound here
# and again used as positional row indices into df_jobs.
jobs_top_n = hf.get_top_n_jobs_from_cluster(
    df_jobs,
    pred_cluster,
    resume,
    vec,
    sf.cal_cosine_similarity,
)
rec_cluster = df_jobs.iloc[jobs_top_n]

# Restrict the cluster-based hits to the predicted department.
same_department = rec_cluster['department'] == pred_department
rec_cluster_filtered = rec_cluster.loc[same_department]
rec_cluster_filtered

Unnamed: 0,title,department,description_combined,cluster
2752,hardware expert,it,"['hardware', 'expert', 'm', 'mission', 'enable...",0
2776,sr android developer,it,"['sr', 'android', 'developer', 'job', 'new', '...",0
3911,web developer,it,"['en', 'gag', 'or', 'young', 'company', 'build...",0


In [9]:
# Assume the FIRST 50% of the resumes got offers: the slice below takes the
# head of df_resume. (The earlier note said "last 50%" - confirm which half
# was intended.)
applicant_pool_with_offer = df_resume[:int(len(df_resume) * 0.5)]
application_pool = applicant_pool_with_offer[applicant_pool_with_offer['Category'] == pred_department]

# build resume similarity matrix
# The given resume is appended as the LAST document, so the last row of the
# similarity matrix holds its similarity to every pooled resume.
# pd.concat is used because Series.append was removed in pandas 2.0.
resume_corpus = pd.concat([application_pool['Resume_c'], pd.Series(resume)])
matrix = vec.transform(resume_corpus)
term_matrix = matrix.todense()
cossim = sf.cal_cosine_similarity(term_matrix)

# get similar resumes based on given resume
N_SIMILAR_APPLICANTS = 10
# Flatten the last row to 1-D (it may come back as a 1 x n matrix) and drop
# its final entry, which is the resume compared with itself.
similarity_to_pool = np.asarray(cossim[-1]).ravel()[:-1]
# As before, pooled resumes with similarity >= 1 are skipped; the ranking is
# then mapped back through `candidate_positions` so every returned index is a
# genuine row position inside application_pool.
candidate_positions = np.where(similarity_to_pool < 1)[0]
ranked = similarity_to_pool[candidate_positions].argsort()[::-1][:N_SIMILAR_APPLICANTS]
index_similar_applicant = candidate_positions[ranked]

# NOTE(review): these indices are positions within application_pool, while the
# helper is handed the full df_resume - confirm the helper resolves them
# against the intended frame.
cf_jobs = hf.get_top_n_jobs_from_cf(df_jobs, df_resume, index_similar_applicant, svm_clf, vec, sf.cal_cosine_similarity, 5)

# recommend jobs based on similar resumes
rec_from_cf = df_jobs.iloc[cf_jobs]
rec_from_cf_filtered = rec_from_cf[rec_from_cf['department'] == pred_department]

In [10]:
# Stack the three department-filtered recommendation tables and discard rows
# that repeat across them.
list_filtered_rec = [
    rec_clf_filtered,
    rec_cluster_filtered,
    rec_from_cf_filtered,
]
final_rec = pd.concat(list_filtered_rec).drop_duplicates()

# Stop the timer that was started before the resume PDF was read.
end = time.time()
final_rec

Unnamed: 0,title,department,description_combined,cluster
591,sr ui/ux designer/developer,it,"['company', 'bcg', 'digital', 'ventures', 'bos...",0
4583,architect node js,it,"['hey', 'hope', '-pron-', 'great', 'send', '-p...",0
247,visual designer,it,"['closely', 'work', 'member', 'product', 'team...",0
2752,hardware expert,it,"['hardware', 'expert', 'm', 'mission', 'enable...",0
2776,sr android developer,it,"['sr', 'android', 'developer', 'job', 'new', '...",0
3911,web developer,it,"['en', 'gag', 'or', 'young', 'company', 'build...",0
995,database administration engineer dba,it,"['dba', 'role', 'largely', 'responsible', 'pro...",0
143,product designer,it,"['closely', 'work', 'member', 'product', 'team...",0
1595,lead developer - mobile apps,it,"['engage', 'mobile', 'native', 'application', ...",0
488,senior systems administrator - linux,it,"['apply', '-pron-', 'previous', 'experience', ...",0


In [11]:
# Seconds elapsed between the `start` and `end` timestamps.
elapsed = end - start
print(f'Time spent: {elapsed}')

Time spent: 3.5111353397369385
