In [217]:
import pandas as pd
import os
import spacy
from spacy.matcher import Matcher
import PyPDF2
import csv
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from ftfy import fix_text
from sklearn.neighbors import NearestNeighbors

In [4]:
# Load dataset
file_path = os.path.join('..', 'Data', 'jd_structured_data.csv')
jd_df = pd.read_csv(file_path)
jd_df.head()

Unnamed: 0,Job Title,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Competitors,Average Salary,Average Revenue,Processed_JD
0,Data Scientist,3.8,Tecolote Research,"Albuquerque, NM","Goleta, CA",750.5,1973,Company - Private,Aerospace & Defense,Aerospace & Defense,-1,72.0,75.0,"Data Scientist Location: Albuquerque, Educatio..."
1,Healthcare Data Scientist,3.4,University of Maryland Medical System,"Linthicum, MD","Baltimore, MD",10000.0,1984,Other Organization,Health Care Services & Hospitals,Health Care,-1,87.5,3500.0,What You Will Do: General Summary The Healthca...
2,Data Scientist,4.8,KnowBe4,"Clearwater, FL","Clearwater, FL",750.5,2010,Company - Private,Security Services,Business Services,-1,85.0,300.0,"KnowBe4, Inc. high growth information security..."
3,Data Scientist,3.8,PNNL,"Richland, WA","Richland, WA",3000.5,1965,Government,Energy,"Oil, Gas, Energy & Utilities","Oak Ridge National Laboratory, National Renewa...",76.5,250500.0,*Organization Job ID** Job ID: 310709 Director...
4,Data Scientist,2.9,Affinity Solutions,"New York, NY","New York, NY",125.5,1998,Company - Private,Advertising & Marketing,Business Services,"Commerce Signals, Cardlytics, Yodlee",114.5,24319.000761,Data Scientist Affinity Solutions Marketing Cl...


In [227]:
def extract_text_from_pdf(file_path:str):
    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        text = ''
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

file_path = os.path.join('..', 'Data', 'skills.csv')
skills_data = pd.read_csv(file_path)
skills_data = [row for row in skills_data]
skills_data.extend(['data mining', ' text mining'])

file_path = os.path.join('..', 'Data\\CVs', 'CV_Law.pdf')
pdf_as_text = extract_text_from_pdf(file_path)

# Create a regular expression pattern to match skills
pattern = re.compile(r'\b(' + '|'.join(re.escape(skill) for skill in skills_data) + r')\b', re.IGNORECASE)

# Find skills in the text using the regular expression pattern
skills = set(match.group(0) for match in pattern.finditer(pdf_as_text))

print(skills)

{'regulations', 'email', 'due diligence', 'Legal', 'English', 'writing', 'drafting', 'French', 'international', 'contracts', 'legal', 'supervising', 'presentation', 'analysis', 'German', 'compliance', 'C', 'Drafting', 'communication'}


In [228]:
def ngrams(string, n=3):
    string = fix_text(string) # fix text
    string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars
    string = string.lower()
    chars_to_remove = [")","(",".","|","[","]","{","}","'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string)
    string = string.replace('&', 'and')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    string = string.title() # normalise case - capital at start of each word
    string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single
    string = ' '+ string +' ' # pad names for ngrams...
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [229]:
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)
tfidf = vectorizer.fit_transform(skills)

In [230]:
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)
jd_test = (jd_df['Processed_JD'].values.astype('U'))

def getNearestN(query):
  queryTFIDF_ = vectorizer.transform(query)
  distances, indices = nbrs.kneighbors(queryTFIDF_)
  return distances, indices

distances, indices = getNearestN(jd_test)
test = list(jd_test) 
matches = []

for i,j in enumerate(indices):
    dist=round(distances[i][0],2)
  
    temp = [dist]
    matches.append(temp)
    
matches = pd.DataFrame(matches, columns=['Match confidence'])

# Following recommends Top 5 Jobs based on candidate resume:
jd_df['match']=matches['Match confidence']
jd_df.head(5).sort_values('match')

Unnamed: 0,Job Title,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Competitors,Average Salary,Average Revenue,Processed_JD,match
1,Healthcare Data Scientist,3.4,University of Maryland Medical System,"Linthicum, MD","Baltimore, MD",10000.0,1984,Other Organization,Health Care Services & Hospitals,Health Care,-1,87.5,3500.0,What You Will Do: General Summary The Healthca...,0.95
4,Data Scientist,2.9,Affinity Solutions,"New York, NY","New York, NY",125.5,1998,Company - Private,Advertising & Marketing,Business Services,"Commerce Signals, Cardlytics, Yodlee",114.5,24319.000761,Data Scientist Affinity Solutions Marketing Cl...,1.02
3,Data Scientist,3.8,PNNL,"Richland, WA","Richland, WA",3000.5,1965,Government,Energy,"Oil, Gas, Energy & Utilities","Oak Ridge National Laboratory, National Renewa...",76.5,250500.0,*Organization Job ID** Job ID: 310709 Director...,1.06
2,Data Scientist,4.8,KnowBe4,"Clearwater, FL","Clearwater, FL",750.5,2010,Company - Private,Security Services,Business Services,-1,85.0,300.0,"KnowBe4, Inc. high growth information security...",1.1
0,Data Scientist,3.8,Tecolote Research,"Albuquerque, NM","Goleta, CA",750.5,1973,Company - Private,Aerospace & Defense,Aerospace & Defense,-1,72.0,75.0,"Data Scientist Location: Albuquerque, Educatio...",1.12
