In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to local modules such as `helpers` take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import ast
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import helpers
import ollama

#### Get ESCO embeddings

In [None]:
# Read the raw ESCO skills table from its CSV export.
esco = pd.read_csv(
    filepath_or_buffer='datasets/skills_en.csv',
)

In [None]:
# Pull the preferred label of every ESCO skill out as a plain Python list.
esco_skills = esco.loc[:, 'preferredLabel'].to_list()

In [None]:
# Request embeddings for all ESCO labels in a single ollama call.
response = ollama.embed(
    input=esco_skills,
    model="nomic-embed-text",
)

In [None]:
# Store the embeddings returned by ollama.embed as a new column on the ESCO frame.
# assumes response['embeddings'] holds one vector per row of `esco`, in the
# same order as `esco_skills` — TODO confirm
esco['embeddings'] = response['embeddings']

In [None]:
# Cache the ESCO frame (including embeddings) so the embedding call can be skipped later.
esco.to_parquet(
    path='processed/esco_skills.parquet',
    index=False,
)

In [3]:
# Load the cached ESCO frame from disk.
esco = pd.read_parquet(
    path='processed/esco_skills.parquet',
)

#### Preprocess Dice job postings

In [None]:
# Read the Dice job-postings sample from CSV.
dice = pd.read_csv(
    filepath_or_buffer='datasets/dice_com-job_us_sample.csv',
)

In [None]:
# Display the freshly loaded frame for a quick visual check.
dice

In [None]:
# Replace missing `skills` entries with an empty string so the .str accessors
# used afterwards never see NaN.
dice['skills'] = dice['skills'].fillna(value='')

In [None]:
# Normalise the skills text: lower-case first, then trim surrounding whitespace.
dice['skills'] = (
    dice['skills']
    .str.lower()
    .str.strip()
)

In [None]:
# Show the rows whose skills text contains "see" (e.g. "see below").
dice.loc[dice['skills'].str.contains('see')]

In [None]:
# Show the skills text of the remaining rows, i.e. those that do not contain "see".
dice.loc[~dice['skills'].str.contains('see'), 'skills']

In [None]:
# Run the project helper over the job descriptions with the mistral model.
# NOTE(review): presumably this extracts a skill list per posting — the helper
# body is not visible from this notebook.
dice_w_skills = helpers.get_list(
    data=dice,
    col='jobdescription',
    doc_type='job',
    max_workers=10,
    model="mistral:instruct",
)

In [None]:
# Display the result returned by helpers.get_list.
dice_w_skills

In [None]:
# Cache the helper output so the model-backed step need not be repeated.
dice_w_skills.to_parquet(
    path='processed/dice_job_descriptions_with_skills.parquet',
    index=False,
)

In [None]:
# Reload the cached job frame with skills from disk.
dice_w_skills = pd.read_parquet(
    path='processed/dice_job_descriptions_with_skills.parquet',
)

In [None]:
# Second pass through the project helper, using fewer workers (3) than the
# first extraction pass.
# NOTE(review): presumably this retries rows that came back without skills —
# confirm against the helper's implementation.
dice_w_skills = helpers.fill_missing_skills(
    data=dice_w_skills,
    skills_col='skills',
    doc_type='job',
    max_workers=3,
)

In [None]:
# Display the frame again after the fill_missing_skills pass.
dice_w_skills

In [None]:
# Match the job skills against the ESCO table with a 0.8 threshold.
matched = helpers.match_all_skills_con(
    dice_w_skills,
    esco,
    threshold=0.8,
)

In [None]:
# Cache the matched job frame on disk.
matched.to_parquet(
    path='processed/dice_job_descriptions_matched.parquet',
    index=False,
)

In [None]:
# Load the cached matched job frame under the name used by the following cells.
jobs_matched = pd.read_parquet(
    path='processed/dice_job_descriptions_matched.parquet',
)

In [None]:
# Build a canonical variant of each posting's matched skills:
# duplicates removed, then sorted.
jobs_matched['matched_skills_ordered'] = (
    jobs_matched['matched_skills']
    .apply(lambda skill_names: sorted(set(skill_names)))
)

In [None]:
# Embed the matched skill lists in their original order.
skill_embeddings = helpers.embed_skills_list(
    jobs_matched['matched_skills'].to_list()
)

In [None]:
# Embed the de-duplicated, sorted skill lists as well.
skill_embeddings_ordered = helpers.embed_skills_list(
    jobs_matched['matched_skills_ordered'].to_list()
)

In [None]:
# Attach the embeddings of the original-order skill lists as a column.
# assumes skill_embeddings['embeddings'] has one entry per row of
# `jobs_matched`, in row order — TODO confirm
jobs_matched['skill_embeddings'] = skill_embeddings['embeddings']

In [None]:
# Attach the embeddings of the sorted, de-duplicated skill lists as a column.
# assumes skill_embeddings_ordered['embeddings'] has one entry per row of
# `jobs_matched`, in row order — TODO confirm
jobs_matched['skill_embeddings_ordered'] = skill_embeddings_ordered['embeddings']

In [None]:
# Inspect the job frame that now carries both embedding columns before it is
# written to disk. `jobs_matched` is the frame the preceding cells extended;
# `matched` is only the pre-embedding result and is not even defined when the
# cached parquet was loaded instead of re-running the matching step.
jobs_matched

In [None]:
# Persist the job frame together with both embedding columns.
jobs_matched.to_parquet(
    path='processed/dice_job_descriptions_embeddings.parquet',
    index=False,
)

In [5]:
# Load the cached job frame with embeddings.
matched = pd.read_parquet(
    path='processed/dice_job_descriptions_embeddings.parquet',
)

In [6]:
# Display the job frame loaded from the embeddings parquet.
matched

Unnamed: 0,advertiserurl,company,employmenttype_jobstatus,jobdescription,jobid,joblocation_address,jobtitle,postdate,shift,site_name,skills,uniq_id,extracted_skills,matched_skills,matched_skills_ordered,skill_embeddings,skill_embeddings_ordered
0,https://www.dice.com/jobs/detail/AUTOMATION-TE...,"Digital Intelligence Systems, LLC","C2H Corp-To-Corp, C2H Independent, C2H W2, 3 M...",Looking for Selenium engineers...must have sol...,Dice Id : 10110693,"Atlanta, GA",AUTOMATION TEST ENGINEER,1 hour ago,Telecommuting not available|Travel not required,,see below,418ff92580b270ef4e7c14f0ddfc36b4,"[Selenium, Java, Data Structures, Object Orien...","[Java (computer programming), information stru...","[Groovy, Java (computer programming), Oracle R...","[0.00816865, 0.04228201, -0.12763296, -0.06408...","[0.0012203407, 0.048476767, -0.1365127, -0.069..."
1,https://www.dice.com/jobs/detail/Information-S...,University of Chicago/IT Services,Full Time,The University of Chicago has a rapidly growin...,Dice Id : 10114469,"Chicago, IL",Information Security Engineer,1 week ago,Telecommuting not available|Travel not required,,"linux/unix, network monitoring, incident respo...",8aec88cba08d53da65ab99cf20f6f9d9,"[Incident Response, Information Security Asses...","[investigate security issues, documentation ty...","[documentation types, investigate security iss...","[0.018865215, 0.036568727, -0.16978557, 0.0041...","[0.018206175, 0.013558769, -0.1604094, 0.00365..."
2,https://www.dice.com/jobs/detail/Business-Solu...,"Galaxy Systems, Inc.",Full Time,"GalaxE.SolutionsEvery day, our solutions affec...",Dice Id : CXGALXYS,"Schaumburg, IL",Business Solutions Architect,2 weeks ago,Telecommuting not available|Travel not required,,"enterprise solutions architecture, business in...",46baa1f69ac07779274bcd90b85d9a72,"[Business Intelligence, Data Analysis, Data Wa...","[business intelligence, perform data analysis,...","[apply change management, business intelligenc...","[0.011157077, -0.003909247, -0.17394324, 0.010...","[0.011065557, -0.012342164, -0.18372022, 0.000..."
3,https://www.dice.com/jobs/detail/Java-Develope...,TransTech LLC,Full Time,Java DeveloperFull-time/direct-hireBolingbrook...,Dice Id : 10113627,"Bolingbrook, IL","Java Developer (mid level)- FT- GREAT culture,...",2 weeks ago,Telecommuting not available|Travel not required,,please see job description,3941b2f206ae0f900c4fba4ac0b18719,"[Java, JDBC, Multithreading, Linux/AIX/Unix, S...","[Java (computer programming), SQL, information...","[Java (computer programming), SQL, communicati...","[0.000571077, 0.026021862, -0.13816436, -0.033...","[0.00030429946, 0.02412445, -0.13305221, -0.03..."
4,https://www.dice.com/jobs/detail/DevOps-Engine...,Matrix Resources,Full Time,Midtown based high tech firm has an immediate ...,Dice Id : matrixga,"Atlanta, GA",DevOps Engineer,48 minutes ago,Telecommuting not available|Travel not required,,"configuration management, developer, linux, ma...",45efa1f6bc65acc32bbbb953a1ed13b7,"[DevOps, Project Management, Scripting, Config...","[DevOps, project management, project configura...","[Ansible, DevOps, manipulate puppets, project ...","[-0.017652616, 0.057342496, -0.13930643, -0.05...","[-0.010731407, 0.060096607, -0.1377153, -0.060..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21995,https://www.dice.com/jobs/detail/Web-Designer-...,IAC Publishing,Full Time,Company Description We are searching for a ta...,Dice Id : 10112803,"Oakland, CA",Web Designer,3 weeks ago,Telecommuting not available|Travel not required,,"ui/ux mobile apps, interaction design, digital...",86e27ce6b7e631e55d69d142c7d43df2,"[Python, Project Management, Data Analysis, UI...","[Python (computer programming), project manage...","[Agile development, Python (computer programmi...","[-0.036120027, 0.06199489, -0.13738641, -0.028...","[-0.011015506, 0.052001737, -0.14096637, 0.005..."
21996,https://www.dice.com/jobs/detail/Senior-Front-...,Omega Solutions Inc,Full Time,CONTACT - priya@omegasolutioninc.com / 408-45...,Dice Id : 10289500,"San Francisco, CA",Senior Front End Web Developer - Full Time at ...,3 weeks ago,Telecommuting not available|Travel not required,,"javascript, html5, css3, bootstrap, ajax, reac...",4287c7ee3317ccf1edd76e238cf8e584,"[JavaScript, HTML5, CSS3, Bootstrap, AJAX, Rea...","[JavaScript, AJAX, Angular, SQL, NoSQL, Postgr...","[AJAX, Agile development, Android (mobile oper...","[0.0070498004, 0.031095242, -0.17534529, -0.02...","[0.010783807, 0.024714513, -0.16904819, -0.010..."
21997,https://www.dice.com/jobs/detail/QA-Analyst-Sa...,San Francisco Health Plan,Full Time,Do you take pride in your work knowing that th...,Dice Id : 10115761,"San Francisco, CA",QA Analyst,2 weeks ago,Telecommuting not available|Travel not required,,"sdlc, alm, sql, t-sql, redgate, team foundatio...",d7512f0181d69f83f96db38cd77a4d08,"[Python, Project Management, Data Analysis, SD...","[Python (computer programming), project manage...","[Python (computer programming), SQL, SQL Serve...","[-0.016395764, 0.056502767, -0.1392101, -0.032...","[-0.010273166, 0.05747749, -0.14182803, -0.035..."
21998,https://www.dice.com/jobs/detail/Tech-Lead%252...,IAC Publishing,Full Time,Company Description What We Can Offer YouAs th...,Dice Id : 10112803,"Oakland, CA",Tech Lead-Full Stack,2 weeks ago,Telecommuting not available|Travel not required,,"python, ruby, go, clojure, java, nosql-databas...",ec375268b494b3bcbed1635d64226112,"[JavaScript, MySQL, Node.js, React, Angular.js...","[JavaScript, MySQL, Angular, Python (computer ...","[Angular, Java (computer programming), JavaScr...","[-0.02242549, 0.021542227, -0.15049833, -0.073...","[-0.013301806, 0.019487908, -0.15060024, -0.06..."


In [17]:
# Run the helper's skill-averaging step over the job frame; the displayed
# result gains an `avg_skill_embedding` column.
# NOTE(review): how `esco` is used inside the helper is not visible here.
matched_avg = helpers.average_skills(matched, esco)

100%|██████████| 22000/22000 [00:01<00:00, 16887.10it/s]


In [18]:
# Display the job frame including the averaged-embedding column.
matched_avg

Unnamed: 0,advertiserurl,company,employmenttype_jobstatus,jobdescription,jobid,joblocation_address,jobtitle,postdate,shift,site_name,skills,uniq_id,extracted_skills,matched_skills,matched_skills_ordered,skill_embeddings,skill_embeddings_ordered,avg_skill_embedding
0,https://www.dice.com/jobs/detail/AUTOMATION-TE...,"Digital Intelligence Systems, LLC","C2H Corp-To-Corp, C2H Independent, C2H W2, 3 M...",Looking for Selenium engineers...must have sol...,Dice Id : 10110693,"Atlanta, GA",AUTOMATION TEST ENGINEER,1 hour ago,Telecommuting not available|Travel not required,,see below,418ff92580b270ef4e7c14f0ddfc36b4,"[Selenium, Java, Data Structures, Object Orien...","[Java (computer programming), information stru...","[Groovy, Java (computer programming), Oracle R...","[0.00816865, 0.04228201, -0.12763296, -0.06408...","[0.0012203407, 0.048476767, -0.1365127, -0.069...","[0.012229671666666667, 0.02882406688888889, -0..."
1,https://www.dice.com/jobs/detail/Information-S...,University of Chicago/IT Services,Full Time,The University of Chicago has a rapidly growin...,Dice Id : 10114469,"Chicago, IL",Information Security Engineer,1 week ago,Telecommuting not available|Travel not required,,"linux/unix, network monitoring, incident respo...",8aec88cba08d53da65ab99cf20f6f9d9,"[Incident Response, Information Security Asses...","[investigate security issues, documentation ty...","[documentation types, investigate security iss...","[0.018865215, 0.036568727, -0.16978557, 0.0041...","[0.018206175, 0.013558769, -0.1604094, 0.00365...","[0.02598377266666667, 0.018216321833333337, -0..."
2,https://www.dice.com/jobs/detail/Business-Solu...,"Galaxy Systems, Inc.",Full Time,"GalaxE.SolutionsEvery day, our solutions affec...",Dice Id : CXGALXYS,"Schaumburg, IL",Business Solutions Architect,2 weeks ago,Telecommuting not available|Travel not required,,"enterprise solutions architecture, business in...",46baa1f69ac07779274bcd90b85d9a72,"[Business Intelligence, Data Analysis, Data Wa...","[business intelligence, perform data analysis,...","[apply change management, business intelligenc...","[0.011157077, -0.003909247, -0.17394324, 0.010...","[0.011065557, -0.012342164, -0.18372022, 0.000...","[0.007513185861666667, -0.007068083333333332, ..."
3,https://www.dice.com/jobs/detail/Java-Develope...,TransTech LLC,Full Time,Java DeveloperFull-time/direct-hireBolingbrook...,Dice Id : 10113627,"Bolingbrook, IL","Java Developer (mid level)- FT- GREAT culture,...",2 weeks ago,Telecommuting not available|Travel not required,,please see job description,3941b2f206ae0f900c4fba4ac0b18719,"[Java, JDBC, Multithreading, Linux/AIX/Unix, S...","[Java (computer programming), SQL, information...","[Java (computer programming), SQL, communicati...","[0.000571077, 0.026021862, -0.13816436, -0.033...","[0.00030429946, 0.02412445, -0.13305221, -0.03...","[0.010603966424999998, 0.014678564500000001, -..."
4,https://www.dice.com/jobs/detail/DevOps-Engine...,Matrix Resources,Full Time,Midtown based high tech firm has an immediate ...,Dice Id : matrixga,"Atlanta, GA",DevOps Engineer,48 minutes ago,Telecommuting not available|Travel not required,,"configuration management, developer, linux, ma...",45efa1f6bc65acc32bbbb953a1ed13b7,"[DevOps, Project Management, Scripting, Config...","[DevOps, project management, project configura...","[Ansible, DevOps, manipulate puppets, project ...","[-0.017652616, 0.057342496, -0.13930643, -0.05...","[-0.010731407, 0.060096607, -0.1377153, -0.060...","[0.0009501145119999992, 0.033562203, -0.132539..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21995,https://www.dice.com/jobs/detail/Web-Designer-...,IAC Publishing,Full Time,Company Description We are searching for a ta...,Dice Id : 10112803,"Oakland, CA",Web Designer,3 weeks ago,Telecommuting not available|Travel not required,,"ui/ux mobile apps, interaction design, digital...",86e27ce6b7e631e55d69d142c7d43df2,"[Python, Project Management, Data Analysis, UI...","[Python (computer programming), project manage...","[Agile development, Python (computer programmi...","[-0.036120027, 0.06199489, -0.13738641, -0.028...","[-0.011015506, 0.052001737, -0.14096637, 0.005...","[-0.006177530739999999, 0.03827761362, -0.1344..."
21996,https://www.dice.com/jobs/detail/Senior-Front-...,Omega Solutions Inc,Full Time,CONTACT - priya@omegasolutioninc.com / 408-45...,Dice Id : 10289500,"San Francisco, CA",Senior Front End Web Developer - Full Time at ...,3 weeks ago,Telecommuting not available|Travel not required,,"javascript, html5, css3, bootstrap, ajax, reac...",4287c7ee3317ccf1edd76e238cf8e584,"[JavaScript, HTML5, CSS3, Bootstrap, AJAX, Rea...","[JavaScript, AJAX, Angular, SQL, NoSQL, Postgr...","[AJAX, Agile development, Android (mobile oper...","[0.0070498004, 0.031095242, -0.17534529, -0.02...","[0.010783807, 0.024714513, -0.16904819, -0.010...","[0.011379253499999999, 0.02097830681818182, -0..."
21997,https://www.dice.com/jobs/detail/QA-Analyst-Sa...,San Francisco Health Plan,Full Time,Do you take pride in your work knowing that th...,Dice Id : 10115761,"San Francisco, CA",QA Analyst,2 weeks ago,Telecommuting not available|Travel not required,,"sdlc, alm, sql, t-sql, redgate, team foundatio...",d7512f0181d69f83f96db38cd77a4d08,"[Python, Project Management, Data Analysis, SD...","[Python (computer programming), project manage...","[Python (computer programming), SQL, SQL Serve...","[-0.016395764, 0.056502767, -0.1392101, -0.032...","[-0.010273166, 0.05747749, -0.14182803, -0.035...","[0.0116049258625, 0.023297636, -0.143752904375..."
21998,https://www.dice.com/jobs/detail/Tech-Lead%252...,IAC Publishing,Full Time,Company Description What We Can Offer YouAs th...,Dice Id : 10112803,"Oakland, CA",Tech Lead-Full Stack,2 weeks ago,Telecommuting not available|Travel not required,,"python, ruby, go, clojure, java, nosql-databas...",ec375268b494b3bcbed1635d64226112,"[JavaScript, MySQL, Node.js, React, Angular.js...","[JavaScript, MySQL, Angular, Python (computer ...","[Angular, Java (computer programming), JavaScr...","[-0.02242549, 0.021542227, -0.15049833, -0.073...","[-0.013301806, 0.019487908, -0.15060024, -0.06...","[0.0022520325000000003, 0.018204138399999997, ..."


In [19]:
# Persist the frame with the averaged embedding. This writes to the same
# parquet path `matched` was loaded from, so the file on disk is replaced.
matched_avg.to_parquet(
    path='processed/dice_job_descriptions_embeddings.parquet',
    index=False,
)

#### Preprocess resumes

In [None]:
# Read the raw resume dataset from CSV.
resume = pd.read_csv(
    filepath_or_buffer='datasets/Resume.csv',
)

In [None]:
# Run the project helper over the resume text with the mistral model.
# NOTE(review): presumably this extracts a skill list per resume — the helper
# body is not visible from this notebook.
resume_w_skills = helpers.get_list(
    data=resume,
    col='Resume_str',
    doc_type='resume',
    max_workers=10,
    model="mistral:instruct",
)

In [None]:
# Cache the helper output so the model-backed step need not be repeated.
resume_w_skills.to_parquet(
    path='processed/resume_w_skills.parquet',
    index=False,
)

In [None]:
# Reload the cached resume frame with skills, rebinding `resume`.
resume = pd.read_parquet(
    path='processed/resume_w_skills.parquet',
)

In [None]:
# Second pass through the project helper for the resume frame.
# NOTE(review): the Dice section calls this helper with keywords
# (data=, skills_col=, doc_type=, max_workers=); here `esco` is passed as the
# second positional argument — confirm this matches the helper's signature.
resume = helpers.fill_missing_skills(resume, esco)

In [None]:
# Display the resume frame after the fill_missing_skills pass.
resume

In [None]:
# Record the character count of every resume text.
resume['length'] = resume.loc[:, 'Resume_str'].str.len()

In [None]:
# Show text, extracted skills and text length side by side.
resume.loc[:, ['Resume_str', 'extracted_skills', 'length']]

In [None]:
# Print the full text of the resume at position 94 (print avoids the truncated repr).
print(resume.loc[:, 'Resume_str'].iloc[94])

In [None]:
# Take the extracted skills of the first resume as a sample for the matching check.
skills = resume.loc[:, 'extracted_skills'].iloc[0]

In [None]:
# Match the sample resume's extracted skills against the ESCO table.
# NOTE(review): the keyword is spelled `threhold` here, whereas the
# match_all_skills_con calls in this notebook pass `threshold` — confirm which
# spelling helpers.match_closest_skills actually accepts.
# This also rebinds `matched`, which earlier held the job DataFrame.
matched = helpers.match_closest_skills(skills, esco, threhold=0.8)

In [None]:
# Unpack the two-element result of match_closest_skills into separate names.
matched_list, matched_scores = matched

In [None]:
# Display the first element of the match result.
matched_list

In [None]:
# Display the second element of the match result.
matched_scores

In [None]:
# Line up each extracted skill with its match and score, one row per skill.
comp_df = pd.DataFrame(
    data=list(zip(skills, matched_list, matched_scores)),
    columns=['skill', 'matched_skill', 'score'],
)

In [None]:
# Display the skill / matched skill / score comparison table.
comp_df

In [None]:
# Match the resume skills against the ESCO table with a 0.8 threshold.
resume_matched = helpers.match_all_skills_con(
    resume,
    esco,
    threshold=0.8,
)

In [None]:
# Display the result of the ESCO matching step for the resumes.
resume_matched

In [None]:
# Cache the matched resume frame on disk.
resume_matched.to_parquet(
    path='processed/resume_matched.parquet',
    index=False,
)

In [None]:
# Reload the cached matched resume frame.
resume_matched = pd.read_parquet(
    path='processed/resume_matched.parquet',
)

In [None]:
# Build a canonical variant of each resume's matched skills:
# duplicates removed, then sorted.
resume_matched['matched_skills_ordered'] = (
    resume_matched['matched_skills']
    .apply(lambda skill_names: sorted(set(skill_names)))
)

In [None]:
# Embed the matched resume skill lists in their original order.
skill_embeddings = helpers.embed_skills_list(
    resume_matched['matched_skills'].to_list()
)

In [None]:
# Embed the de-duplicated, sorted resume skill lists as well.
skill_embeddings_ordered = helpers.embed_skills_list(
    resume_matched['matched_skills_ordered'].to_list()
)

In [None]:
# Attach the embeddings of the original-order skill lists as a column.
# assumes skill_embeddings['embeddings'] has one entry per row of
# `resume_matched`, in row order — TODO confirm
resume_matched['skill_embeddings'] = skill_embeddings['embeddings']

In [None]:
# Attach the embeddings of the sorted, de-duplicated skill lists as a column.
# assumes skill_embeddings_ordered['embeddings'] has one entry per row of
# `resume_matched`, in row order — TODO confirm
resume_matched['skill_embeddings_ordered'] = skill_embeddings_ordered['embeddings']

In [None]:
# Display the resume frame with both embedding columns before saving it.
resume_matched

In [None]:
# Persist the resume frame together with both embedding columns.
resume_matched.to_parquet(
    path='processed/resume_embeddings.parquet',
    index=False,
)

In [20]:
# Load the cached resume frame with embeddings.
resume_matched = pd.read_parquet(
    path='processed/resume_embeddings.parquet',
)

In [22]:
# Run the helper's skill-averaging step over the resume frame.
resume_matched_avg = helpers.average_skills(
    resume_matched,
    esco,
)

100%|██████████| 2484/2484 [00:00<00:00, 12114.34it/s]


In [23]:
# Persist the averaged result. This writes to the same parquet path
# `resume_matched` was loaded from, so the file on disk is replaced.
resume_matched_avg.to_parquet(
    path='processed/resume_embeddings.parquet',
    index=False,
)