In [1]:
import pandas as pd
import os
import spacy
from spacy.matcher import Matcher
import PyPDF2
import csv
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from ftfy import fix_text
from sklearn.neighbors import NearestNeighbors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize



In [2]:
# Load dataset
file_path = os.path.join('..', 'Data', 'jd_structured_data.csv')
jd_df = pd.read_csv(file_path)
jd_df.tail()

Unnamed: 0,Job Title,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Competitors,Average Salary,Average Revenue,Processed_JD
952,"Project Scientist - Auton Lab, Robotics Institute",2.6,Software Engineering Institute,"Pittsburgh, PA","Pittsburgh, PA",750.5,1984,College / University,Colleges & Universities,Education,-1,73.5,24319.000761,The Auton Lab Carnegie Mellon University large...
953,Data Science Manager,3.2,"Numeric, LLC","Allentown, PA","Chadds Ford, PA",25.5,-1,Company - Private,Staffing & Outsourcing,Business Services,-1,127.5,7.5,Data Science ManagerResponsibilities: Oversee ...
954,Data Engineer,4.8,IGNW,"Austin, TX","Portland, OR",350.5,2015,Company - Private,IT Services,Information Technology,Slalom,103.1539,37.5,Loading... Title: Data Engineer Location: Aust...
955,Research Scientist – Security and Privacy,3.6,Riverside Research Institute,"Beavercreek, OH","Arlington, VA",750.5,1967,Nonprofit Organization,Federal Agencies,Government,-1,93.5,75.0,Returning Candidate? Log back Career Portal cl...
956,Lawyer,2.0,-,-,-,-,1990,M,M,M,M,90.2,22.0,RWE Windpower NL BV To start as soon as possib...


In [26]:
#Uses PDF reader to get all the text from CV
def extract_text_from_pdf(file_path:str):
    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        text = ''
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text


file_path = os.path.join('..', 'Data', 'skills.csv')
skills_data = pd.read_csv(file_path)
skills_data = [row for row in skills_data]


file_path = os.path.join('..', 'Data/CVs', 'CV_Ayse.pdf')

pdf_as_text = extract_text_from_pdf(file_path)

# Create a regular expression pattern to match skills
pattern = re.compile(r'\b(' + '|'.join(re.escape(skill) for skill in skills_data) + r')\b', re.IGNORECASE)

# Find skills in the text using the regular expression pattern
skills = set(match.group(0) for match in pattern.finditer(pdf_as_text))

print(skills)


{'Mining', 'Computer Science', 'Video', 'c', 'statsmodels', 'reports', 'Data Analysis', 'MATLAB', 'statistics', 'English', 'PySpark', 'Python', 'Spanish', 'Big Data', 'French', 'Benchmarking', 'Teaching', 'pandas', 'numpy', 'social media', 'matplotlib', 'AI', 'Java', 'Machine Learning', 'Programming', 'Social Media', 'German'}


In [27]:

def ngrams(string, n=3):
    string = fix_text(string) # fix text
    string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars
    string = string.lower()
    chars_to_remove = [")","(",".","|","[","]","{","}","'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string)
    string = string.replace('&', 'and')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    string = string.title() # normalise case - capital at start of each word
    string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single
    string = ' '+ string +' ' # pad names for ngrams...
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [28]:
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)
tfidf = vectorizer.fit_transform(skills)

In [29]:
#Ali's Code 
#In this code, I commented the part where we extract skills from the job description and train the model only on them becuase I assume we could lose information


#Convert all skills into one string
s = " ".join(skills)

#Store Job descriptions in a dataframe
jd_test = (jd_df['Processed_JD'].values.astype('U'))


#Find skills in the job descriptions using the regular expression pattern
#queries = [] 
#for jd in jd_test: 
 #   skills_ds = " ".join(set(match.group(0) for match in pattern.finditer(jd)))
  #  queries.append(skills_ds)


#Convert job descriptions into Doc2Vec training instances
tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()),
                             tags=[str(i)]) for i,
               doc in enumerate(jd_test)]
 
# train the Doc2vec model
model = Doc2Vec(vector_size=10,
                min_count=2, epochs=100)
model.build_vocab(tagged_data)
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)


In [30]:
#Convert skills from CV to a vector
inferred_vector_dm = model.infer_vector(word_tokenize(s.lower()))  
#Find most similar job descriptions to our CV
sim = model.docvecs.most_similar([inferred_vector_dm],topn=len(jd_test))
sorted_list = sorted(sim, key=lambda x: x[0]) 
dist = [t[1] for t in sorted_list]
sim = np.reshape(dist,[-1,1])    
matches = pd.DataFrame(sim, columns=['Match confidence'])

# Following recommends Top 5 Jobs based on candidate resume:
jd_df['match']=matches['Match confidence']
jd_df.sort_values('match').tail(5)

  sim = model.docvecs.most_similar([inferred_vector_dm],topn=len(jd_test))


Unnamed: 0,Job Title,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Competitors,Average Salary,Average Revenue,Processed_JD,match
525,RESEARCH COMPUTER SCIENTIST - RESEARCH ENGINEE...,3.9,Southwest Research Institute,"San Antonio, TX","San Antonio, TX",3000.5,1947,Nonprofit Organization,Research & Development,Business Services,"Los Alamos National Laboratory, Battelle, SRI ...",71.5,250500.0,Serve software developer researcher team devel...,0.712511
833,Data Scientist,-1.0,WeRide.ai,"San Jose, CA",-1,-1.0,-1,-1,-1,-1,-1,103.1539,1.0,This role requires work closely variety teams ...,0.726647
548,Principal Machine Learning Scientist,4.7,Sage Intacct,"San Francisco, CA","San Jose, CA",750.5,1999,Subsidiary or Business Segment,Computer Hardware & Software,Information Technology,-1,232.5,24319.000761,"Today, nearly every business world bookkeeping...",0.740848
65,Senior Data Scientist,4.4,The David J. Joseph Company,"Cincinnati, OH","Cincinnati, OH",350.5,1885,Subsidiary or Business Segment,Metals Brokers,Mining & Metals,-1,107.0,24319.000761,Overview Everyone wants work people respect. E...,0.741015
40,Data Engineer,3.5,Lancer Insurance,"Long Beach, NY","Long Beach, NY",350.5,1985,Company - Private,Insurance Carriers,Insurance,-1,106.0,300.0,Lancer Insurance Company looking Data Engineer...,0.787413
