In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import re

import string
import nltk
nltk.download('stopwords')

#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\m84246307\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Load data
df = pd.read_csv("../data/jobpostings.csv")

# Remove all duplicate rows 
df = df.drop_duplicates(keep='last')

In [9]:
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stopwords])

def preprocessing(description):
    
    # remove punctuation
    description = remove_punctuation(str(description))
                                                            
    # lowering the text
    description = description.lower()
    
    # remove stopwords
    description = remove_stopwords(description)
    
    return description

In [10]:
df_tech = pd.read_excel("../data/Technology Skills.xlsx")
tech_skills = list(df_tech["Example"].unique())

df_skill = pd.read_excel("../data/Skills.xlsx")
regular_skills = list(df_skill["Element Name"].unique())

In [11]:
train, test = train_test_split(df, test_size=0.2, random_state=14, shuffle=True)
train = train.reset_index()
test = test.reset_index()

In [12]:
# Preprocessing...
for idx, row in tqdm(test.iterrows()):
    test['Job Description'][idx] = preprocessing(test['Job Description'][idx])

37726it [01:18, 482.15it/s]


In [13]:
import spacy
from spacy.tokens import DocBin

# model_test = test["Job Description"][10]

# load the trained model
nlp_output = spacy.load("../output/model-best")

category_list = []

# Preprocessing...
for idx, row in tqdm(test.iterrows()):
    # pass our test instance into the trained pipeline
    doc = nlp_output(test["Job Description"][idx])
    
    obj = {"Job Id": test["Job Id"][idx],"Entity Values": list(doc.ents)}
    category_list.append(obj)

37726it [26:48, 23.45it/s]


In [None]:
cols = ['JOB ID', 'Entity Values']
lst = []
for idx in tqdm(range(len(category_list))):
    entities = [str(s) for s in category_list[idx]["Entity Values"]]
    lst.append([category_list[idx]["Job Id"], 
                entities])
df_categories = pd.DataFrame(lst, columns=cols)

In [None]:
df_categories

# Create Similarity Score

In [73]:
# !python -m spacy download en_core_web_md

In [15]:
nlp = spacy.load('en_core_web_md')

In [45]:
x_embed = [nlp(skill).vector for skill in x]
y_embed = [nlp(skill).vector for skill in y]

In [42]:
def cos_similarity(x,y):
    """ return cosine similarity between two lists """
    x_embed = nlp(str(x)).vector
    y_embed = nlp(str(y)).vector
    
    numerator = sum(a*b for a,b in zip(x_embed,y_embed))
    squared_sum_x = sum(i*i for i in x_embed)
    squared_sum_y = sum(i*i for i in y_embed)
    
    denominator = squared_sum_x*squared_sum_y
    return round(numerator/float(denominator),3)

In [43]:
def find_similarity(first_set, sec_set):
    sums = 0.0
    for fs in first_set:
        for ss in sec_set:
            sums += cos_similarity(fs, ss)
            
            # If they both in tech skills, reward
            if (str(fs) in tech_skills) and (str(ss) in tech_skills):
                print("tech")
                sums += 0.01
                
            # If they both in regular skills, reward   
            if (str(fs) in regular_skills) and (str(ss) in regular_skills):
                print("regular")
                sums += 0.01
    return sums

In [46]:
lst = []
for idx in range(len(category_list)):
    for i in range(idx+1, len(category_list)):
        if (len(category_list[idx]["Entity Values"]) == 0) or (len(category_list[i]["Entity Values"]) == 0):
            continue
        score = find_similarity(category_list[idx]["Entity Values"], category_list[i]["Entity Values"])
        
        lst.append([category_list[idx]["Job Id"], 
                    category_list[idx]["Job Id"], 
                    score])
        if score > 0.004:
            print(score)

0.012000000000000004
0.006
0.005
0.007
0.005
0.006
0.005
0.005
0.015
0.005
0.005
0.006
0.005


KeyboardInterrupt: 

In [110]:
category_list[9]["Entity Values"]

[monitoring, impact, asset protection techniques, writing, speaking]

In [24]:
cols = ['JOB ID A', 'JOB ID B', 'Similarity Score']
lst = []
for idx in range(len(category_list)):
    for i in range(idx+1, len(category_list)):
        if (len(category_list[idx]["Entity Values"]) == 0) or (len(category_list[i]["Entity Values"]) == 0):
            continue
        score = find_similarity(category_list[idx]["Entity Values"], category_list[i]["Entity Values"])
        
        lst.append([category_list[idx]["Job Id"], 
                    category_list[idx]["Job Id"], 
                    score])

In [28]:
df

Unnamed: 0,Job Id,Job Title,SOC Code,Job Description,Company Name,Skills,Qualification,City,State,Zipcode,Job Opening Date,Job Closing Date,Status,Website Url
0,89c41c519c3c491929e3082f0ee1d557,"Editor, Celebrations",27-3041.00,"<br/><br/>Gannett Co., Inc. (NYSE: GCI) is a s...",Gannett,"[Local Media, Editing, Journalism]","[Bachelor of Journalism (B.J.), Master of Jour...",Boston,Massachusetts,2108.0,2021-09-23,2021-09-30,CLOSED,https://www.gannett.com
1,ac0c91f394fa77a00ad72ee3440cb4b7,Software Engineer II,51-8021.00,Overview </b> <br/><br/>Reporting to the...,ERT,"[Java, Application Architecture, CI, Data Stru...",[Bachelor of Computer Science (B.C.S.)],Medford,Massachusetts,2155.0,2021-04-29,2021-09-30,CLOSED,http://www.ert.com
2,4b5748411c4496f56ef33645a27840e0,Principal Software Architect,17-1011.00,<br/> <br/>Digital technology has forever chan...,"Sovos Compliance, LLC.","[Java, CSS, Government Compliance, Global Comp...",,Wilmington,Massachusetts,1887.0,2021-01-29,2021-09-30,CLOSED,https://sovos.com
3,7a7dac1bc98365216833008c0fbd063d,Strategy Program Manager,27-2012.03,<br/> <br/> <b>Build your future with Sovos</b...,"Sovos Compliance, LLC.","[Government Compliance, Global Compliance, Com...",,Wilmington,Massachusetts,1887.0,2021-08-27,2021-09-30,CLOSED,https://sovos.com
4,304aff90fd39fead183f48206f4070c7,Senior Contract Manager,11-9199.00,Description<br/> <br/>LaBella Associates was e...,LaBella Associates,"[Legal, Disciplinary]","[Bachelor of Engineering (B.E./B.Eng.), Any Ba...",Boston,Massachusetts,2108.0,2021-03-11,2021-09-30,CLOSED,http://www.labellapc.com
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188622,459849bc27bbcb163fb705af2f01e678,Research Assistant I Lab,25-1191.00,</title></head><body><b>Auto req ID</b><br/> <...,Harvard University,"[In Vivo, Molecular Biology, Cloning, Gel Elec...",,Boston,Massachusetts,2108.0,2021-07-10,,OPEN,https://harvard.edu
188623,217b96a9bc7f005fbb75ee4c1cc40932,Finance and Admin Coor,43-1011.00,</title></head><body><b>Auto req ID</b><br/> <...,Harvard University,"[Accounting, General Administration]",,Cambridge,Massachusetts,2138.0,2021-06-25,,OPEN,https://harvard.edu
188624,f8fd3beb5dc778800272cf58aa9e8bb4,Systems Engineer,15-1199.02,</title></head><body><b>Auto req ID</b><br/> <...,Harvard University,"[HTTP, ITIL, Software Development, HTTPS, Ansi...",,Cambridge,Massachusetts,2138.0,2021-07-03,,OPEN,https://harvard.edu
188625,a1e3777919425e3b6c1a8217fbefa7af,Property Operations Assistant,13-1199.00,</title></head><body><b>Auto req ID</b><br/> <...,Harvard University,,,Cambridge,Massachusetts,2138.0,2021-05-15,,OPEN,https://harvard.edu


In [34]:
# define columns to normalize
x = df.iloc[:,9:10]

#normalize values in first two columns only 
df.iloc[:,9:10] = (x-x.min())/ (x.max() - x.min())
df

Unnamed: 0,Job Id,Job Title,SOC Code,Job Description,Company Name,Skills,Qualification,City,State,Zipcode,Job Opening Date,Job Closing Date,Status,Website Url
0,89c41c519c3c491929e3082f0ee1d557,"Editor, Celebrations",27-3041.00,"<br/><br/>Gannett Co., Inc. (NYSE: GCI) is a s...",Gannett,"[Local Media, Editing, Journalism]","[Bachelor of Journalism (B.J.), Master of Jour...",Boston,Massachusetts,0.618781,2021-09-23,2021-09-30,CLOSED,https://www.gannett.com
1,ac0c91f394fa77a00ad72ee3440cb4b7,Software Engineer II,51-8021.00,Overview </b> <br/><br/>Reporting to the...,ERT,"[Java, Application Architecture, CI, Data Stru...",[Bachelor of Computer Science (B.C.S.)],Medford,Massachusetts,0.645053,2021-04-29,2021-09-30,CLOSED,http://www.ert.com
2,4b5748411c4496f56ef33645a27840e0,Principal Software Architect,17-1011.00,<br/> <br/>Digital technology has forever chan...,"Sovos Compliance, LLC.","[Java, CSS, Government Compliance, Global Comp...",,Wilmington,Massachusetts,0.495249,2021-01-29,2021-09-30,CLOSED,https://sovos.com
3,7a7dac1bc98365216833008c0fbd063d,Strategy Program Manager,27-2012.03,<br/> <br/> <b>Build your future with Sovos</b...,"Sovos Compliance, LLC.","[Government Compliance, Global Compliance, Com...",,Wilmington,Massachusetts,0.495249,2021-08-27,2021-09-30,CLOSED,https://sovos.com
4,304aff90fd39fead183f48206f4070c7,Senior Contract Manager,11-9199.00,Description<br/> <br/>LaBella Associates was e...,LaBella Associates,"[Legal, Disciplinary]","[Bachelor of Engineering (B.E./B.Eng.), Any Ba...",Boston,Massachusetts,0.618781,2021-03-11,2021-09-30,CLOSED,http://www.labellapc.com
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188622,459849bc27bbcb163fb705af2f01e678,Research Assistant I Lab,25-1191.00,</title></head><body><b>Auto req ID</b><br/> <...,Harvard University,"[In Vivo, Molecular Biology, Cloning, Gel Elec...",,Boston,Massachusetts,0.618781,2021-07-10,,OPEN,https://harvard.edu
188623,217b96a9bc7f005fbb75ee4c1cc40932,Finance and Admin Coor,43-1011.00,</title></head><body><b>Auto req ID</b><br/> <...,Harvard University,"[Accounting, General Administration]",,Cambridge,Massachusetts,0.635551,2021-06-25,,OPEN,https://harvard.edu
188624,f8fd3beb5dc778800272cf58aa9e8bb4,Systems Engineer,15-1199.02,</title></head><body><b>Auto req ID</b><br/> <...,Harvard University,"[HTTP, ITIL, Software Development, HTTPS, Ansi...",,Cambridge,Massachusetts,0.635551,2021-07-03,,OPEN,https://harvard.edu
188625,a1e3777919425e3b6c1a8217fbefa7af,Property Operations Assistant,13-1199.00,</title></head><body><b>Auto req ID</b><br/> <...,Harvard University,,,Cambridge,Massachusetts,0.635551,2021-05-15,,OPEN,https://harvard.edu


In [None]:
df_similarities = pd.DataFrame(lst, columns=cols)