In [1]:
import rltk
import csv
from datetime import datetime
tokenizer = rltk.tokenizer.crf_tokenizer.crf_tokenizer.CrfTokenizer()

In [2]:
import re
def cleantitle(title):
    title = title.lower()
    title = re.sub(r"\(.*\)", "", title)
    title = " ".join([x for x in title.split() if not any(c.isdigit() for c in x)])
    
    title = title.replace("full time", "").replace("part time", "").replace("full-time", "").replace("part-time", "")
    title = title.replace("|", "-")
    titlelist = title.split("-")
    '''longest = 0
    title = ""
    for token in titlelist:
        if len(token) > longest:
            longest = len(token)
            title = token'''
    possibleword = ["service", "associate", "representative", "clerk"]
    char3 = ["ent", "ant"]
    char2 = ["or", "er", "st"]
    if len(titlelist) == 0:
        return titlelist[0]
    for token in titlelist:
        s = token.strip()
        if s.split(" ")[-1] in possibleword or s.split(" ")[-1][-2:] in char2 or s.split(" ")[-1][-3:] in char3:
            title = token
    return title

In [4]:
a = "Legal Assistant/Scheduler (Orlando)"
print(cleantitle(a))

legal assistant/scheduler


In [6]:
class glassdoorRecord(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    @rltk.cached_property
    def id(self):
        return self.raw_object['ID']
    
    # attribute 1: job title
    @rltk.cached_property
    def title_string(self):
        title = cleantitle(self.raw_object['job_title'])
        return title
    
    # attribute 2: title tokens
    @rltk.cached_property
    def title_tokens(self):
        return set(tokenizer.tokenize(self.title_string))
    
    # attribute 3: company name
    @rltk.cached_property
    def company_string(self):
        return self.raw_object['company_name']
    
    # attribute 4: location (city)
    @rltk.cached_property
    def city_string(self):
        return self.raw_object['city']
    
    # attribute 5: last three characters in job title
    @rltk.cached_property
    def last3title(self):
        chars = self.title_string[-3:]
        return chars

class linkedinRecord(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''

    @rltk.cached_property
    def id(self):
        return self.raw_object['ID']
    
    # attribute 1: job title
    @rltk.cached_property
    def title_string(self):
        title = cleantitle(self.raw_object['job_title'])
        return title
    
    # attribute 2: title tokens
    @rltk.cached_property
    def title_tokens(self):
        return set(tokenizer.tokenize(self.title_string))
    
    # attribute 3: company name
    @rltk.cached_property
    def company_string(self):
        return self.raw_object['company_name']
    
    # attribute 4: location (city)
    @rltk.cached_property
    def city_string(self):
        return self.raw_object['city']
    
    # attribute 5: last three characters in job title
    @rltk.cached_property
    def last3title(self):
        chars = self.title_string[-3:]
        return chars

In [7]:
dir_ = '../csvfile_category/'
glassdoor_file = dir_ + 'glassdoor_req.csv'
linkedin_file = dir_ + 'linkedin_req.csv'

ds_glassdoor = rltk.Dataset(rltk.CSVReader(glassdoor_file),record_class=glassdoorRecord)
ds_linkedin = rltk.Dataset(rltk.CSVReader(linkedin_file),record_class=linkedinRecord)

In [8]:
df_glassdoor = ds_glassdoor.generate_dataframe()
df_linkedin = ds_linkedin.generate_dataframe()

### blocking

In [9]:
bg = rltk.HashBlockGenerator()

block = bg.generate(
    bg.block(ds_glassdoor, property_="city_string"),
    bg.block(ds_linkedin, property_="city_string")
)

### entity linking

In [10]:
def title_string_similarity(s1, s2):
    return rltk.jaro_winkler_similarity(s1, s2)
    
def company_string_similarity(s1, s2):
    return 1-rltk.levenshtein_distance(s1, s2)

def city_string_similarity(s1, s2):
    if s1 == s2:
        return 1
    return 0

In [15]:
# threshold value to determine if we are confident the record match
MY_TRESH = 0.93

# entity linkage scoring function
def rule_based_method(r1, r2):
    score_1 = title_string_similarity(r1.title_string, r2.title_string)
    score_2 = company_string_similarity(r1.company_string, r2.company_string)
    #score_3 = city_string_similarity(r1.city_string, r2.city_string)
    total = 0.65 * score_1 + 0.35 * score_2
    
    # return two values: boolean if they match or not, float to determine confidence
    return total > MY_TRESH, total

In [16]:
matchPairs = []
for r_glassdoor, r_linkedin in rltk.get_record_pairs(ds_glassdoor, ds_linkedin, block=block):
    result, confidence = rule_based_method(r_glassdoor, r_linkedin)
    if result == True:
        matchPairs.append([r_glassdoor.id, r_linkedin.id, confidence])

In [17]:
len(matchPairs)

29

In [18]:
for ids in matchPairs:
    print(ids)
    r_glassdoor = ds_glassdoor.get_record(ids[0])
    r_linkedin = ds_linkedin.get_record(ids[1])
    print(r_glassdoor.title_string, ", ", r_glassdoor.company_string, ", ", r_glassdoor.city_string)
    print(r_linkedin.title_string, ", ", r_linkedin.company_string, ", ", r_linkedin.city_string, "\n")

['256', '766', 1.0]
counsel, transactions ,  Cepheid ,  New York
counsel, transactions ,  Cepheid ,  New York 

['464', '1326', 1.0]
senior web designer ,  The New York Times ,  New York
senior web designer ,  The New York Times ,  New York 

['85', '71', 1.0]
data scientist ,  Matlen Silver ,  New York
data scientist ,  Matlen Silver ,  New York 

['467', '1478', 1.0]
junior graphic designer ,  Cartier ,  New York
junior graphic designer ,  Cartier ,  New York 

['59', '3', 1.0]
comcast cybersecurity: data scientist ,  Comcast ,  Washington
comcast cybersecurity: data scientist ,  Comcast ,  Washington 

['1515', '262', 1.0]
software test engineer ,  SpaceX ,  Redmond
software test engineer ,  SpaceX ,  Redmond 

['1166', '3522', 0.9923529411764707]
oracle qa tester ,  Gleeds ,  
oracle qa tester  ,  Gleeds ,   

['1580', '4919', 1.0]
product manager ,  Act! LLC ,  
product manager ,  Act! LLC ,   

['978', '2895', 1.0]
business analyst ll ,  Liberty Mutual Insurance ,  
business anal

In [99]:
with open(dir_ + '../linkage_category/job_linkage.csv', mode='w') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["glassdoor.id", "linkedin.id"])
    for row in matchPairs:
        writer.writerow((row[0], row[1]))