In [77]:
import rltk
import csv
from datetime import datetime
# Module-level CRF tokenizer instance from RLTK.
# NOTE(review): no visible cell references `tokenizer`; confirm it is still needed.
tokenizer = rltk.tokenizer.crf_tokenizer.crf_tokenizer.CrfTokenizer()

In [78]:
def processskill(string):
    """Normalize a ``;``-separated skill list.

    Each skill is lower-cased and every character that is not an ASCII
    letter is removed. The skills are returned re-joined with ``;`` in their
    original order.

    The previous implementation passed the pattern to ``str.replace``, which
    only matches the literal text ``[^a-zA-Z]+`` and therefore never removed
    anything.
    """
    import re  # local import so the cell does not depend on another cell's imports

    return ";".join(
        re.sub(r"[^a-zA-Z]+", "", skill.lower())
        for skill in string.split(";")
    )

In [79]:
class glassdoorRecord(rltk.Record):
    """RLTK record over one row of the Glassdoor job-postings CSV.

    Every accessor returns a single column of ``raw_object`` and is wrapped
    in ``rltk.cached_property``.
    """

    def __init__(self, raw_object):
        super().__init__(raw_object)
        # NOTE(review): no visible cell reads ``name``; confirm it is needed.
        self.name = ''

    @rltk.cached_property
    def id(self):
        """Record identifier (``ID`` column)."""
        return self.raw_object['ID']

    @rltk.cached_property
    def title_string(self):
        """Job title (``job_title`` column)."""
        return self.raw_object['job_title']

    @rltk.cached_property
    def company_string(self):
        """Company name (``company_name`` column)."""
        return self.raw_object['company_name']

    @rltk.cached_property
    def city_string(self):
        """City part of the location (``city`` column)."""
        return self.raw_object['city']

    @rltk.cached_property
    def state_string(self):
        """State or area part of the location (``state/area`` column)."""
        return self.raw_object['state/area']

    @rltk.cached_property
    def desc_string(self):
        """Job description (``job_details`` column)."""
        return self.raw_object['job_details']

    @rltk.cached_property
    def salary_string(self):
        """Salary (``salary`` column)."""
        return self.raw_object['salary']

    @rltk.cached_property
    def url_string(self):
        """Posting URL (``url`` column)."""
        return self.raw_object['url']

    @rltk.cached_property
    def diploma_string(self):
        """Diploma (``DIPLOMA`` column)."""
        return self.raw_object['DIPLOMA']

    @rltk.cached_property
    def skill_list(self):
        """``SKILLS`` column after being passed through ``processskill``."""
        return processskill(self.raw_object['SKILLS'])

    @rltk.cached_property
    def diploma_major_string(self):
        """Diploma major (``DIPLOMA_MAJOR`` column)."""
        return self.raw_object['DIPLOMA_MAJOR']

    @rltk.cached_property
    def experience_string(self):
        """Experience (``EXPERIENCE`` column)."""
        return self.raw_object['EXPERIENCE']

    @rltk.cached_property
    def source_string(self):
        """Source of the posting (``source`` column)."""
        return self.raw_object['source']

    @rltk.cached_property
    def worktype_string(self):
        """Work type (``work_type`` column)."""
        return self.raw_object['work_type']

    @rltk.cached_property
    def category_string(self):
        """Job category (``job_category`` column)."""
        return self.raw_object['job_category']

    @rltk.cached_property
    def location_string(self):
        """Full location (``location`` column)."""
        return self.raw_object['location']
    
class linkedinRecord(rltk.Record):
    """RLTK record over one row of the LinkedIn job-postings CSV.

    Every accessor returns a single column of ``raw_object`` and is wrapped
    in ``rltk.cached_property``.
    """

    def __init__(self, raw_object):
        super().__init__(raw_object)
        # NOTE(review): no visible cell reads ``name``; confirm it is needed.
        self.name = ''

    @rltk.cached_property
    def id(self):
        """Record identifier (``ID`` column)."""
        return self.raw_object['ID']

    @rltk.cached_property
    def title_string(self):
        """Job title (``job_title`` column)."""
        return self.raw_object['job_title']

    @rltk.cached_property
    def company_string(self):
        """Company name (``company_name`` column)."""
        return self.raw_object['company_name']

    @rltk.cached_property
    def city_string(self):
        """City part of the location (``city`` column)."""
        return self.raw_object['city']

    @rltk.cached_property
    def state_string(self):
        """State or area part of the location (``state/area`` column)."""
        return self.raw_object['state/area']

    @rltk.cached_property
    def desc_string(self):
        """Job description (``job_details`` column)."""
        return self.raw_object['job_details']

    @rltk.cached_property
    def salary_string(self):
        """Salary (``salary`` column)."""
        return self.raw_object['salary']

    @rltk.cached_property
    def url_string(self):
        """Posting URL (``url`` column)."""
        return self.raw_object['url']

    @rltk.cached_property
    def diploma_string(self):
        """Diploma (``DIPLOMA`` column)."""
        return self.raw_object['DIPLOMA']

    @rltk.cached_property
    def skill_list(self):
        """``SKILLS`` column after being passed through ``processskill``."""
        return processskill(self.raw_object['SKILLS'])

    @rltk.cached_property
    def diploma_major_string(self):
        """Diploma major (``DIPLOMA_MAJOR`` column)."""
        return self.raw_object['DIPLOMA_MAJOR']

    @rltk.cached_property
    def experience_string(self):
        """Experience (``EXPERIENCE`` column)."""
        return self.raw_object['EXPERIENCE']

    @rltk.cached_property
    def source_string(self):
        """Source of the posting (``source`` column)."""
        return self.raw_object['source']

    @rltk.cached_property
    def worktype_string(self):
        """Work type (``work_type`` column)."""
        return self.raw_object['work_type']

    @rltk.cached_property
    def category_string(self):
        """Job category (``job_category`` column)."""
        return self.raw_object['job_category']

    @rltk.cached_property
    def location_string(self):
        """Full location (``location`` column)."""
        return self.raw_object['location']

In [80]:
class gCompanyRecord(rltk.Record):
    """RLTK record over one row of the Glassdoor company CSV.

    Every accessor returns a single column of ``raw_object`` and is wrapped
    in ``rltk.cached_property``; only ``website_string`` post-processes its
    value.
    """

    def __init__(self, raw_object):
        super().__init__(raw_object)
        # NOTE(review): no visible cell reads ``name``; confirm it is needed.
        self.name = ''

    @rltk.cached_property
    def id(self):
        """Record identifier (``ID`` column)."""
        return self.raw_object['ID']

    @rltk.cached_property
    def name_string(self):
        """Company name (``name`` column)."""
        return self.raw_object['name']

    @rltk.cached_property
    def city_string(self):
        """Headquarters city (``city`` column)."""
        return self.raw_object['city']

    @rltk.cached_property
    def state_string(self):
        """Headquarters state or area (``state/area`` column)."""
        return self.raw_object['state/area']

    @rltk.cached_property
    def size_string(self):
        """Company size (``size`` column)."""
        return self.raw_object['size']

    @rltk.cached_property
    def website_string(self):
        """``company_url`` column, with a trailing ``/`` appended when it is
        non-empty and does not already end with one."""
        url = self.raw_object['company_url']
        if url != "" and url[-1] != "/":
            url = url + "/"
        return url

    @rltk.cached_property
    def industry_string(self):
        """Industry (``industry`` column)."""
        return self.raw_object['industry']

    @rltk.cached_property
    def desc_string(self):
        """Company description (``description`` column)."""
        return self.raw_object['description']

    @rltk.cached_property
    def revenue_string(self):
        """Revenue (``revenue`` column)."""
        return self.raw_object['revenue']

    @rltk.cached_property
    def type_string(self):
        """Company type (``company_type`` column)."""
        return self.raw_object['company_type']

    @rltk.cached_property
    def founded_string(self):
        """Founding year (``founded`` column)."""
        return self.raw_object['founded']

    @rltk.cached_property
    def img_string(self):
        """Logo image URL (``company_logo`` column)."""
        return self.raw_object['company_logo']

    @rltk.cached_property
    def location_string(self):
        """Headquarters location (``headquarter`` column)."""
        return self.raw_object['headquarter']
    
class lCompanyRecord(rltk.Record):
    """RLTK record over one row of the LinkedIn company CSV.

    Every accessor returns a single column of ``raw_object`` and is wrapped
    in ``rltk.cached_property``; only ``website_string`` post-processes its
    value.
    """

    def __init__(self, raw_object):
        super().__init__(raw_object)
        # NOTE(review): no visible cell reads ``name``; confirm it is needed.
        self.name = ''

    @rltk.cached_property
    def id(self):
        """Record identifier (``ID`` column)."""
        return self.raw_object['ID']

    @rltk.cached_property
    def name_string(self):
        """Company name (``company_name`` column)."""
        return self.raw_object['company_name']

    @rltk.cached_property
    def city_string(self):
        """Headquarters city (``city`` column)."""
        return self.raw_object['city']

    @rltk.cached_property
    def state_string(self):
        """Headquarters state or area (``state/area`` column)."""
        return self.raw_object['state/area']

    @rltk.cached_property
    def size_string(self):
        """Company size (``size`` column)."""
        return self.raw_object['size']

    @rltk.cached_property
    def website_string(self):
        """``website`` column, with a trailing ``/`` appended when it is
        non-empty and does not already end with one."""
        url = self.raw_object['website']
        if url != "" and url[-1] != "/":
            url = url + "/"
        return url

    @rltk.cached_property
    def industry_string(self):
        """Industry (``industry`` column)."""
        return self.raw_object['industry']

    @rltk.cached_property
    def desc_string(self):
        """Company description (``description`` column)."""
        return self.raw_object['description']

    @rltk.cached_property
    def founded_string(self):
        """Founding year (``founded`` column)."""
        return self.raw_object['founded']

    @rltk.cached_property
    def img_string(self):
        """Logo image URL (``img_url`` column)."""
        return self.raw_object['img_url']

    @rltk.cached_property
    def location_string(self):
        """Headquarters location (``headquarters`` column)."""
        return self.raw_object['headquarters']
    

class wCompanyRecord(rltk.Record):
    """RLTK record over one row of the Wikidata company CSV.

    Accessors are wrapped in ``rltk.cached_property`` and read the ``item``,
    ``itemLabel``, ``locationLabel``, ``website`` and ``founded`` columns of
    ``raw_object``.
    """

    def __init__(self, raw_object):
        super().__init__(raw_object)
        # NOTE(review): no visible cell reads ``name``; confirm it is needed.
        self.name = ''

    @rltk.cached_property
    def id(self):
        """Record identifier (``item`` column)."""
        return self.raw_object['item']

    @rltk.cached_property
    def name_string(self):
        """Company name (``itemLabel`` column)."""
        return self.raw_object['itemLabel']

    @rltk.cached_property
    def headquarter_string(self):
        """Headquarters (``locationLabel`` column)."""
        return self.raw_object['locationLabel']

    @rltk.cached_property
    def website_string(self):
        """``website`` column, with a trailing ``/`` appended when it is
        non-empty and does not already end with one."""
        url = self.raw_object['website']
        if url != "" and url[-1] != "/":
            url = url + "/"
        return url

    @rltk.cached_property
    def founded_string(self):
        """Portion of the ``founded`` column before its first ``-``."""
        leading_part, _, _ = self.raw_object['founded'].partition("-")
        return leading_part

In [81]:
# Directory holding every input CSV (and the linkage/ sub-directory).
dir_ = '../csvfile_category/'

# Job-posting sources.
glassdoor_file = f'{dir_}glassdoor_req.csv'
linkedin_file = f'{dir_}linkedin_req.csv'

ds_glassdoor = rltk.Dataset(rltk.CSVReader(glassdoor_file), record_class=glassdoorRecord)
ds_linkedin = rltk.Dataset(rltk.CSVReader(linkedin_file), record_class=linkedinRecord)

# Company sources: Glassdoor, LinkedIn and Wikidata.
gc_file = f'{dir_}glassdoor_company_withid.csv'
lc_file = f'{dir_}linkedin_company_withid.csv'
wc_file = f'{dir_}wikidata_company.csv'

ds_linkedin_c = rltk.Dataset(rltk.CSVReader(lc_file), record_class=lCompanyRecord)
ds_glassdoor_c = rltk.Dataset(rltk.CSVReader(gc_file), record_class=gCompanyRecord)
ds_wikidata_c = rltk.Dataset(rltk.CSVReader(wc_file), record_class=wCompanyRecord)


### read matched pairs

In [82]:
# Load the matched job pairs. Rows with at most one column (blank or
# malformed lines) are ignored; the first remaining row is the header and
# every later row is a matched pair.
# newline='' is what the csv module expects so that quoted fields containing
# line breaks are parsed correctly.
with open(dir_+'linkage/job_linkage.csv', encoding='utf-8', errors="replace", newline='') as csv_file:
    _linkage_rows = [row for row in csv.reader(csv_file, delimiter=',') if len(row) > 1]

columns = _linkage_rows[0] if _linkage_rows else []
job_link = _linkage_rows[1:]


In [83]:
def _read_linkage(path):
    """Read a linkage CSV and return ``(header, rows)``.

    Rows with at most one column (blank or malformed lines) are ignored. The
    first remaining row is treated as the header; all later rows are returned
    as data. An input with no usable rows yields ``([], [])``.
    """
    # newline='' is what the csv module expects so that quoted fields
    # containing line breaks are parsed correctly.
    with open(path, encoding='utf-8', errors="replace", newline='') as csv_file:
        rows = [row for row in csv.reader(csv_file, delimiter=',') if len(row) > 1]
    if not rows:
        return [], []
    return rows[0], rows[1:]


# `columns` is rebound by each call, as in the original per-file loops, so it
# ends up holding the header of the last file read.
columns, gl_link = _read_linkage(dir_+'linkage/g_l_linkage.csv')

columns, wl_link = _read_linkage(dir_+'linkage/w_l_linkage.csv')

columns, wg_link = _read_linkage(dir_+'linkage/w_g_linkage.csv')

columns, glw_link = _read_linkage(dir_+'linkage/g_l_w_linkage.csv')


In [84]:
len(gl_link)

64

In [70]:
# Eyeball the Glassdoor-LinkedIn company matches: print both ids next to the
# company name each source has for them.
for id1, id2 in gl_link:
    rg, rl = ds_glassdoor_c.get_record(id1), ds_linkedin_c.get_record(id2)
    glassdoor_name = rg.name_string
    linkedin_name = rl.name_string
    print(id1, glassdoor_name, id2, linkedin_name)

5 The Home Depot 2434 The Home Depot
6 USAA 2633 USAA
18 BP 1168 bp
592 Citi 1756 Citi
739 CapstoneONE Search 1944 CapstoneONE Search
765 Expedition Technology Inc 2229 Expedition Technology Inc
25 Liberty Mutual Insurance 2314 Liberty Mutual Insurance
29 Ursus, Inc. 1926 Ursus, Inc.
30 Harnham 69 Harnham
31 Apex Systems 2501 Apex Systems
35 Matlen Silver 2086 Matlen Silver
143 confidential 2134 Confidential
117 Confidential 2134 Confidential
52 Analog Devices 1858 Analog Devices
324 Paciolan 703 Paciolan
60 Peraton 2388 Peraton
146 DISYS 1769 DISYS
90 Air Products 2396 Air Products
95 CyberCoders 1107 CyberCoders
102 Navy Federal Credit Union 1673 Navy Federal Credit Union
921 DICK'S Sporting Goods 1022 DICK'S Sporting Goods
392 SpaceX 1560 SpaceX
152 Boutique Recruiting 179 Boutique Recruiting
155 GPac 2182 gpac
160 CBRE 2427 CBRE
198 US Foods 2343 US Foods
175 CPS Energy 1696 CPS Energy
936 Act! LLC 2680 Act! LLC
202 Halff Associates 1235 Halff Associates
203 IFG Companies 1352 IFG 

In [85]:
import pandas as pd
from rdflib import Graph, URIRef, Literal, XSD, Namespace, RDF

In [88]:
def checkexist(str2, list1):
    """Return True if ``str2`` closely matches any string in ``list1``.

    Two strings match when ``rltk.jaro_winkler_similarity`` is strictly
    greater than 0.85. The scan stops at the first match instead of always
    walking the whole list.
    """
    return any(
        rltk.jaro_winkler_similarity(str1, str2) > 0.85
        for str1 in list1
    )

def combine_list(str1, str2):
    """Merge two ``;``-separated lists, skipping near-duplicates.

    All non-empty entries of ``str1`` are kept in order. Each non-empty entry
    of ``str2`` is appended unless ``checkexist`` reports that it closely
    matches an entry already in the merged list, which includes entries
    appended earlier from ``str2``.

    Empty entries are dropped. Previously an empty ``str1`` or ``str2``
    contributed an empty token and produced results such as ``";python"`` or
    ``"python;"``.
    """
    merged = [item for item in str1.split(";") if item]
    for skill in str2.split(";"):
        if skill and not checkexist(skill, merged):
            merged.append(skill)
    return ";".join(merged)

def combine_str(str1, str2):
    """Return ``str1`` unless it is the empty string, in which case return ``str2``."""
    return str2 if str1 == "" else str1

In [89]:
def combine_job(r1, r2):
    """Build one output row from a matched pair of job records.

    In this file ``r1`` is the Glassdoor record and ``r2`` the LinkedIn
    record. Title, description, salary, url, source and company come from
    ``r1``; work type, city, state, category and location come from ``r2``.
    Skills are merged with ``combine_list``; diploma, major and experience
    take ``r1``'s value unless it is empty (``combine_str``).
    """
    return {
        "job_title": r1.title_string,
        'job_description': r1.desc_string,
        'salary': r1.salary_string,
        'work_type': r2.worktype_string,
        'url': r1.url_string,
        'skills': combine_list(r1.skill_list, r2.skill_list),
        'diploma': combine_str(r1.diploma_string, r2.diploma_string),
        'major': combine_str(r1.diploma_major_string, r2.diploma_major_string),
        'experience': combine_str(r1.experience_string, r2.experience_string),
        'source': r1.source_string,
        'company': r1.company_string,
        'city': r2.city_string,
        'state': r2.state_string,
        'category': r2.category_string,
        'location': r2.location_string,
    }

In [90]:
def _job_info(job):
    """Return the output row for a job record that has no match in the other source."""
    return {
        "job_title": job.title_string,
        'job_description': job.desc_string,
        'salary': job.salary_string,
        'work_type': job.worktype_string,
        'url': job.url_string,
        'skills': job.skill_list,
        'diploma': job.diploma_string,
        'major': job.diploma_major_string,
        'experience': job.experience_string,
        'source': job.source_string,
        'company': job.company_string,
        'city': job.city_string,
        'state': job.state_string,
        'category': job.category_string,
        'location': job.location_string,
    }


job_info = []

# Each job_link row is (glassdoor id, linkedin id). The lists keep their
# original names; the sets make the per-record membership test O(1) instead
# of scanning the list for every job.
link_l = [i[1] for i in job_link]
link_g = [i[0] for i in job_link]
_linked_linkedin_ids = set(link_l)
_linked_glassdoor_ids = set(link_g)

# LinkedIn jobs without a Glassdoor match.
for job in ds_linkedin:
    if job.id not in _linked_linkedin_ids:
        job_info.append(_job_info(job))

# Glassdoor jobs without a LinkedIn match.
for job in ds_glassdoor:
    if job.id not in _linked_glassdoor_ids:
        job_info.append(_job_info(job))

# Matched pairs are merged into a single row.
for id1, id2 in job_link:
    rg = ds_glassdoor.get_record(id1)
    rl = ds_linkedin.get_record(id2)
    job_info.append(combine_job(rg, rl))

In [91]:
len(job_info)

6631

In [92]:
def combine_founded(str1, str2):
    """Return ``str1`` unless it is the empty string, in which case return ``str2``.

    The callers in this file pass the Wikidata founding year as ``str1`` and
    the Glassdoor value as ``str2``.
    """
    return str2 if str1 == "" else str1

In [93]:
# Build the merged company table. Row layouts of the linkage lists, as used
# by the loops below:
#   gl_link : (glassdoor id, linkedin id)
#   wl_link : (wikidata id, linkedin id)
#   wg_link : (wikidata id, glassdoor id)
#   glw_link: (glassdoor id, linkedin id, wikidata id)
# A company in the three-way linkage takes precedence over any pairwise
# linkage that mentions the same id.
company_info = []

# LinkedIn ids, partitioned by which linkage will emit them.
link_l_c3 = [i[1] for i in glw_link] # in the three-way linkage
link_l_c2 = [i[1] for i in wl_link if i[1] not in link_l_c3] # wikidata - linkedin only
link_l_c = [i[1] for i in gl_link if i[1] not in link_l_c3 and i[1] not in link_l_c2] # glassdoor - linkedin only

# LinkedIn companies that appear in no linkage: copied through as-is.
# These rows have no 'revenue' or 'company_type' key.
for company in ds_linkedin_c:
    if company.id not in link_l_c and company.id not in link_l_c2 and company.id not in link_l_c3:
        info = {}
        info["company_name"] = company.name_string
        info['city'] = company.city_string
        info['state'] = company.state_string
        info['size'] = company.size_string
        info['website'] = company.website_string
        info['industry'] = company.industry_string
        info['description'] = company.desc_string
        info['founded'] = company.founded_string
        info['logo_url'] = company.img_string
        info['location'] = company.location_string
        company_info.append(info)

# Glassdoor ids, partitioned the same way as the LinkedIn ids above.
link_g_c3 = [i[0] for i in glw_link] # in the three-way linkage
link_g_c2 = [i[1] for i in wg_link if i[1] not in link_g_c3] # wikidata - glassdoor only
link_g_c = [i[0] for i in gl_link if i[0] not in link_g_c3 and i[0] not in link_g_c2] # glassdoor - linkedin only

# Glassdoor companies that appear in no linkage: copied through as-is.
for company in ds_glassdoor_c:
    if company.id not in link_g_c and company.id not in link_g_c2 and company.id not in link_g_c3:
        info = {}
        info["company_name"] = company.name_string
        info['city'] = company.city_string
        info['state'] = company.state_string
        info['size'] = company.size_string
        info['website'] = company.website_string
        info['industry'] = company.industry_string
        info['description'] = company.desc_string
        info['revenue'] = company.revenue_string
        info['founded'] = company.founded_string
        info['company_type'] = company.type_string
        info['logo_url'] = company.img_string
        info['location'] = company.location_string
        company_info.append(info)

# glassdoor - linkedin pairs: Glassdoor supplies every field except size and
# logo_url, which come from LinkedIn.
# NOTE(review): the guard only checks the Glassdoor id. A pair whose Glassdoor
# id is also in wg_link is skipped here, and its LinkedIn partner may then not
# be emitted by any loop - confirm this cannot happen in the linkage files.
for id1, id2 in gl_link:
    if id1 not in link_g_c3 and id1 not in link_g_c2:
        rg = ds_glassdoor_c.get_record(id1)
        rl = ds_linkedin_c.get_record(id2)
        # Debug trace for one specific company.
        if rg.name_string == "Ursus, Inc.":
            print("add as glassdoor-linkedin")
        info = {}
        info["company_name"] = rg.name_string
        info['city'] = rg.city_string
        info['state'] = rg.state_string
        info['size'] = rl.size_string
        info['website'] = rg.website_string
        info['industry'] = rg.industry_string
        info['description'] = rg.desc_string
        info['revenue'] = rg.revenue_string
        info['founded'] = rg.founded_string
        info['company_type'] = rg.type_string
        info['logo_url'] = rl.img_string
        info['location'] = rg.location_string
        company_info.append(info)

# glassdoor - wikidata pairs (id1 is the Wikidata id, id2 the Glassdoor id):
# all fields from Glassdoor, except that the Wikidata founding year is
# preferred when it is non-empty.
for id1, id2 in wg_link:
    if id2 not in link_g_c3 and id2 not in link_g_c:
        rg = ds_glassdoor_c.get_record(id2)
        rw = ds_wikidata_c.get_record(id1)
        info = {}
        info["company_name"] = rg.name_string
        info['city'] = rg.city_string
        info['state'] = rg.state_string
        info['size'] = rg.size_string
        info['website'] = rg.website_string
        info['industry'] = rg.industry_string
        info['description'] = rg.desc_string
        info['revenue'] = rg.revenue_string
        info['founded'] = combine_founded(rw.founded_string, rg.founded_string)
        info['company_type'] = rg.type_string
        info['logo_url'] = rg.img_string
        info['location'] = rg.location_string
        company_info.append(info)
        
# linkedin - wikidata pairs (id1 is the Wikidata id, id2 the LinkedIn id):
# all fields from LinkedIn except founded, which is always the Wikidata value
# (no fallback to LinkedIn when it is empty).
for id1, id2 in wl_link:
    if id2 not in link_l_c3 and id2 not in link_l_c:
        rl = ds_linkedin_c.get_record(id2)
        rw = ds_wikidata_c.get_record(id1)
        info = {}
        info["company_name"] = rl.name_string
        info['city'] = rl.city_string
        info['state'] = rl.state_string
        info['size'] = rl.size_string
        info['website'] = rl.website_string
        info['industry'] = rl.industry_string
        info['description'] = rl.desc_string
        info['founded'] = rw.founded_string
        info['logo_url'] = rl.img_string
        info['location'] = rl.location_string
        company_info.append(info)

# Three-way matches (id1 Glassdoor, id2 LinkedIn, id3 Wikidata): Glassdoor is
# the base record, LinkedIn supplies size and logo_url, and the Wikidata
# founding year is preferred when it is non-empty.
for id1, id2, id3 in glw_link:
    rg = ds_glassdoor_c.get_record(id1)
    rw = ds_wikidata_c.get_record(id3)
    rl = ds_linkedin_c.get_record(id2)
    
    info = {}
    info["company_name"] = rg.name_string
    info['city'] = rg.city_string
    info['state'] = rg.state_string
    info['size'] = rl.size_string
    info['website'] = rg.website_string
    info['industry'] = rg.industry_string
    info['description'] = rg.desc_string
    info['revenue'] = rg.revenue_string
    info['founded'] = combine_founded(rw.founded_string, rg.founded_string)
    info['company_type'] = rg.type_string
    info['logo_url'] = rl.img_string
    info['location'] = rg.location_string
    company_info.append(info)

add as glassdoor-linkedin


In [94]:
len(company_info)

3555

In [95]:
import csv

In [97]:
# Column order of the merged job-positions CSV.
header = ["job_title", 'job_description', 'salary','work_type','url','skills','diploma','major','experience','source','company','city','state', 'location', 'category']
# newline='' lets the csv module control line endings (otherwise blank rows
# appear between records on Windows). utf-8 matches the encoding used to read
# the linkage files and avoids depending on the platform default.
with open('../csvfile_category/job_positions_full.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames = header)
    writer.writeheader()
    writer.writerows(job_info)

In [98]:
# Column order of the merged company CSV. Rows that lack a key (for example
# LinkedIn-only companies have no 'revenue' or 'company_type') are written
# with an empty cell for it, which is DictWriter's default.
header_c = ["company_name",'city','state','location', 'size','website','industry','description','revenue','founded','company_type', 'logo_url']
# newline='' lets the csv module control line endings (otherwise blank rows
# appear between records on Windows). utf-8 matches the encoding used to read
# the linkage files and avoids depending on the platform default.
with open('../csvfile_category/company_full.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames = header_c)
    writer.writeheader()
    writer.writerows(company_info)