In [4]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/')
import numpy as np
import pandas as pd
import csv
import nltk
from nltk.probability import *

Mounted at /content/drive


In [5]:
def preprocess(skillsfile,job_info_file): # read skills and make them into a dictionary with links as keys
  i = 0
  fails = ["sorry","unable","apologize","cannot", "frameworks languages softwares","does not contain","no technical","do not have access","does not mention anything","no data","no relevant"]
  with open(skillsfile) as f:
    skills_dict = dict()
    reader = csv.reader(f)
    header = next(reader, None)  # skip table head
    all_skills = set() # we store all possible skills here
    for skills in reader:
      #if i > 10000: break
      if i%100000==0:
        print(i,"rows processed in skills file")
      i+=1
      skills = [x.strip().replace("\"","").lower() for x in skills]
      link = skills[0]
      skills = skills[1].split(",")
      skills = set([s.strip() for s in skills if not any(f in s for f in fails)])
      skills_dict[link] = skills # keys are links to jobs and values are the list of skills required for job
      for skill in skills:
        all_skills.add(skill)

  with open(job_info_file) as f: # to join the two files by links
    i = 0
    reader = csv.reader(f)
    header = next(reader, None)  # skip table head
    jobs_dict = dict()
    for row in reader:
      if i%100000 == 0:
        print(i," rows processed in job posting file")
      i += 1
      if len(row) < 6:
        continue
      job_link = row[0].strip().lower()
      last_time = row[1].strip() # last time the link was visited
      title = row[5].strip()
      if len(row) > 6:
        company = row[6].strip()
      else:
        company = ""
      if len(row) > 7:
        location = row[7].strip()
      else:
        location = ""
      if job_link not in skills_dict: # some job postings don't have skill requirements available. If we took them as having no requirements, we would get 50 000 extra jobs that anyone is qualified for. Hence we skip them completely.
        continue
      else:
        skills_req = skills_dict[job_link]
      jobs_dict[job_link] = [last_time,title,company,location,skills_req]
  return jobs_dict, all_skills

In [6]:
def get_jobs_info(jobs_dict, user_skills):
  input = set([x.strip().lower() for x in user_skills])
  fitting_jobs = dict()
  skills_missing_1 = dict()
  skills_missing_2 = dict()
  skills_missing_3 = dict()
  input_len = len(input)
  for key in jobs_dict:
    skills = jobs_dict[key][4] # the list of skills required for the job
    if len(skills) - input_len <= 3: # the difference in required skills is not too big
      diff = set(skills) - input
      value = jobs_dict[key][:4] + list(diff) # job link, last_time,title,company,location and missing skills. If all skills are met, the list "diff" will be empty.
      if len(diff) == 0 or diff == {''} or diff == {'n/a'} or diff == {'no requirements'} or diff == {'none'} or diff == {'no specific skills'}:
        fitting_jobs[key] = value # all requirements met for the job
      elif len(diff) == 1:
        skills_missing_1[key] = value #the same as previous but we add the missing skills as well
      elif len(diff) == 2:
        skills_missing_2[key] = value
      elif len(diff) == 3:
        skills_missing_3[key] = value
  return fitting_jobs, skills_missing_1, skills_missing_2, skills_missing_3

In [7]:
skills_file = "job_skills.csv"
job_postings_file = "linkedin_job_postings.csv"
jobs_dict = dict()
jobs_dict, all_skills = preprocess(skills_file,job_postings_file) # all_skills is a set of all possible skills, could add to web app. Takes ~2min to run.

0 rows processed in skills file
100000 rows processed in skills file
200000 rows processed in skills file
300000 rows processed in skills file
400000 rows processed in skills file
500000 rows processed in skills file
600000 rows processed in skills file
700000 rows processed in skills file
800000 rows processed in skills file
900000 rows processed in skills file
1000000 rows processed in skills file
1100000 rows processed in skills file
1200000 rows processed in skills file
0  rows processed in job posting file
100000  rows processed in job posting file
200000  rows processed in job posting file
300000  rows processed in job posting file
400000  rows processed in job posting file
500000  rows processed in job posting file
600000  rows processed in job posting file
700000  rows processed in job posting file
800000  rows processed in job posting file
900000  rows processed in job posting file
1000000  rows processed in job posting file
1100000  rows processed in job posting file
1200000 

In [8]:
input = ["example skill","microsoft","computer","money laundering","python","java"]
fitting_jobs, skills_missing_1, skills_missing_2, skills_missing_3 = get_jobs_info(jobs_dict, input)

In [9]:
fitting_jobs # all jobs for which the user is qualified. An entry consists of: the LinkedIn link, date it was checked, the job title, the company and the location

{'https://www.linkedin.com/jobs/view/behavioral-health-counselor-travel-position-at-sandstone-care-3803493931': ['2024-01-21 07:40:11.491706+00',
  'Behavioral Health Counselor (Travel Position)',
  'Sandstone Care',
  'Chantilly, VA',
  ''],
 'https://www.linkedin.com/jobs/view/volunteer-bereavement-volunteer-at-cardinal-hospice-care-3803993564': ['2024-01-21 07:44:16.834727+00',
  'Volunteer: Bereavement Volunteer',
  'Cardinal Hospice Care',
  'Jacksonville, NC',
  ''],
 'https://www.linkedin.com/jobs/view/rn-per-diem-at-people-inc-3736174816': ['2024-01-21 07:13:13.433253+00',
  'RN PER DIEM',
  'People Inc.',
  'Rochester, NY',
  ''],
 'https://www.linkedin.com/jobs/view/administrative-assistant-dcn-temp-position-at-foundation-partners-group-3727249313': ['2024-01-20 13:47:25.4771+00',
  'Administrative Assistant DCN - Temp Position',
  'Foundation Partners Group',
  'Santa Ana, CA',
  ''],
 'https://www.linkedin.com/jobs/view/radiologic-technologist-f-t-day-evening-at-yale-new-ha

In [10]:
len(fitting_jobs) # the amount of all qualified jobs

2632

In [11]:
len(all_skills) # the amount of skills mentioned in files

2772502

In [12]:
missing_1 = FreqDist() # find out the most common skills the user is missing
for key in skills_missing_1:
  for skill in skills_missing_1[key][4:]:
    if skill not in missing_1:
      missing_1[skill] = 1
    else:
      missing_1[skill] += 1
missing_skills_all = FreqDist()
for ms in [skills_missing_1,skills_missing_2,skills_missing_3]:
  for key in ms:
    for skill in ms[key][4:]:
      if skill not in missing_skills_all:
        missing_skills_all[skill] = 1
      else:
        missing_skills_all[skill] += 1

In [13]:
missing_1.most_common(10) #10 most common skills that are the only reasons of not meeting job requirements

# Problem: this suffers from the unequal distribution of jobs in the dataset. For example, there seems to be many entries for medical personnel and it shows up no matter which skills the user has entered.

[('nursing', 38),
 ('equal opportunity employer', 23),
 ('appleone', 21),
 ('covid19 vaccination', 19),
 ('physician', 19),
 ('registered nurse', 19),
 ('communication', 17),
 ('volunteer', 17),
 ('everify', 14),
 ('volunteering', 14)]

In [14]:
missing_skills_all.most_common(10) #10 most common skills that the user is missing (from jobs that require 1-3 more skills, so that we don't suggest the user totally random skills)

[('physician', 483),
 ('registered nurse', 416),
 ('nursing', 360),
 ('customer service', 336),
 ('healthcare', 272),
 ('hospitality', 163),
 ('radiology', 154),
 ('communication', 151),
 ('communication skills', 132),
 ('travel nursing', 128)]

In [29]:
def get_jobs_info_dict(jobs_dict, user_skills):
  input = set([x.strip().lower() for x in user_skills])
  fitting_jobs = dict()
  skills_missing_1 = dict()
  skills_missing_2 = dict()
  skills_missing_3 = dict()
  input_len = len(input)
  for key in jobs_dict:
    skills = jobs_dict[key][4] # the list of skills required for the job
    if len(skills) - input_len <= 3: # the difference in required skills is not too big
      diff = set(skills) - input
      if len(skills) == 0:
        skills = {''}
      value = jobs_dict[key][:4] + list(diff) # job link, last_time,title,company,location and missing skills. If all skills are met, the list "diff" will be empty.
      if len(diff) == 0:
        fitting_jobs[key] = {'job_name':jobs_dict[key][2],'required':skills} # all requirements met for the job
      elif diff == {''} or diff == {'n/a'} or diff == {'no requirements'} or diff == {'none'} or diff == {'no specific skills'}:
        fitting_jobs[key] = {'job_name':jobs_dict[key][2],'required':{''}}
      elif len(diff) == 1:
        skills_missing_1[key] = {'job_name':jobs_dict[key][2],'required':skills,'missing':diff} #the same as previous but we add the missing skills as well
      elif len(diff) == 2:
        skills_missing_2[key] = {'job_name':jobs_dict[key][2],'required':skills,'missing':diff}
      elif len(diff) == 3:
        skills_missing_3[key] = {'job_name':jobs_dict[key][2],'required':skills,'missing':diff}
  result = dict()
  result['fitting_jobs'] = fitting_jobs
  result['skills_missing_1'] = skills_missing_1
  result['skills_missing_2'] = skills_missing_2
  result['skills_missing_3'] = skills_missing_3
  return result

In [30]:
input = ["example skill","microsoft","computer","money laundering","python","java"]
result = get_jobs_info_dict(jobs_dict, input)

In [32]:
result['skills_missing_1']

{'https://www.linkedin.com/jobs/view/excellence-and-equity-lead-at-clickjobs-io-3805103412': {'job_name': 'ClickJobs.io',
  'required': {'microsoft teams'},
  'missing': {'microsoft teams'}},
 'https://www.linkedin.com/jobs/view/lead-painter-c-at-crash-champions-3800985769': {'job_name': 'Crash Champions',
  'required': {'benefits'},
  'missing': {'benefits'}},
 'https://www.linkedin.com/jobs/view/force-intelligence-officer-3pi-knowledge-hub-at-clickjobs-io-3805334549': {'job_name': 'ClickJobs.io',
  'required': {'staffordshire police'},
  'missing': {'staffordshire police'}},
 'https://www.linkedin.com/jobs/view/classroom-assistant-k-12-ese-at-broward-county-public-schools-3774032931': {'job_name': 'Broward County Public Schools',
  'required': {'computer skills'},
  'missing': {'computer skills'}},
 'https://www.linkedin.com/jobs/view/automotive-sales-consultant-mills-ford-lincoln-hourly-or-commission-pay-structures-available%21-at-mills-automotive-group-3804700245': {'job_name': 'Mi