In [3]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (such as the local `helpers` module) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import ast
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import helpers
import ollama

#### Get ESCO embeddings

In [None]:
# Load the ESCO skills table from CSV.
esco = pd.read_csv('datasets/skills_en.csv')

In [None]:
# Collect the 'preferredLabel' column as a plain Python list (input for the embedding request).
esco_skills = esco['preferredLabel'].tolist()

In [None]:
# Embed every ESCO label in a single ollama.embed request using the nomic-embed-text model.
response = ollama.embed(model="nomic-embed-text", input=esco_skills)

In [None]:
# Attach the returned vectors as an 'embeddings' column.
# Assumes the response holds one vector per label, in input order — TODO confirm.
esco['embeddings'] = response['embeddings']

In [None]:
# Write the ESCO frame (now including 'embeddings') to parquet without the index.
esco.to_parquet('processed/esco_skills.parquet', index=False)

In [5]:
# Reload the ESCO frame with embeddings from the parquet file written above.
esco = pd.read_parquet('processed/esco_skills.parquet')

#### Preprocess Dice job postings

In [None]:
# Load the Dice job-postings sample from CSV.
dice = pd.read_csv('datasets/dice_com-job_us_sample.csv')

In [None]:
# Display the raw Dice frame for inspection.
dice

In [None]:
# Replace missing values in 'skills' with empty strings before the string operations that follow.
# Note: this overwrites the column in place on `dice`.
dice['skills'] = dice['skills'].fillna('')

In [None]:
# Normalise the 'skills' text: lower-case it, then strip leading/trailing whitespace.
dice['skills'] = dice['skills'].str.lower().str.strip()

In [None]:
# Rows whose 'skills' text contains the substring 'see' (e.g. "see below").
dice.loc[dice['skills'].str.contains('see')]

In [None]:
# The 'skills' values of rows whose text does NOT contain the substring 'see'.
dice.loc[~dice['skills'].str.contains('see'), 'skills']

In [None]:
# Run helpers.get_list over the 'jobdescription' column of the Dice frame,
# using 10 workers and the mistral:instruct model.
# NOTE(review): presumably this produces the 'extracted_skills' column seen in
# later outputs — confirm in helpers.
dice_w_skills = helpers.get_list(
    data=dice,
    col='jobdescription',
    doc_type='job',
    max_workers=10,
    model="mistral:instruct",
)

In [None]:
# Display the result of the helpers.get_list call.
dice_w_skills

In [None]:
# Save the frame returned by helpers.get_list to parquet without the index.
dice_w_skills.to_parquet('processed/dice_job_descriptions_with_skills.parquet', index=False)

In [5]:
# Reload the saved Dice frame with skills from parquet.
dice_w_skills = pd.read_parquet('processed/dice_job_descriptions_with_skills.parquet')

In [None]:
# Pass the Dice frame through helpers.fill_missing_skills, pointing it at the
# 'skills' column with document type 'job' and 3 workers.
# Note: this rebinds dice_w_skills to the helper's return value.
dice_w_skills = helpers.fill_missing_skills(
    data=dice_w_skills,
    skills_col='skills',
    doc_type='job',
    max_workers=3,
)

In [None]:
# Display the frame after helpers.fill_missing_skills.
dice_w_skills

In [6]:
# Run helpers.match_all_skills_con on the Dice frame against the ESCO frame with threshold=0.8.
# The recorded progress bar shows 22000 iterations taking about 3h19m, so this cell is
# expensive to re-run; its result is saved to parquet in the next cell.
# NOTE(review): presumably this adds the 'matched_skills' column seen in the output further down — confirm in helpers.
matched = helpers.match_all_skills_con(dice_w_skills, esco, threshold=0.8)

100%|██████████| 22000/22000 [3:18:39<00:00,  1.85it/s]  


In [7]:
# Save the matched Dice frame to parquet without the index.
matched.to_parquet('processed/dice_job_descriptions_matched.parquet', index=False)

In [3]:
# Reload the matched Dice frame from parquet.
# The execution count drops from 7 to 3 at this cell, so it was apparently run in a
# later kernel session than the cells above it.
matched = pd.read_parquet('processed/dice_job_descriptions_matched.parquet')

In [4]:
# Display the matched Dice frame (includes 'extracted_skills' and 'matched_skills').
matched

Unnamed: 0,advertiserurl,company,employmenttype_jobstatus,jobdescription,jobid,joblocation_address,jobtitle,postdate,shift,site_name,skills,uniq_id,extracted_skills,matched_skills
0,https://www.dice.com/jobs/detail/AUTOMATION-TE...,"Digital Intelligence Systems, LLC","C2H Corp-To-Corp, C2H Independent, C2H W2, 3 M...",Looking for Selenium engineers...must have sol...,Dice Id : 10110693,"Atlanta, GA",AUTOMATION TEST ENGINEER,1 hour ago,Telecommuting not available|Travel not required,,see below,418ff92580b270ef4e7c14f0ddfc36b4,"[Selenium, Java, Data Structures, Object Orien...","[Java (computer programming), information stru..."
1,https://www.dice.com/jobs/detail/Information-S...,University of Chicago/IT Services,Full Time,The University of Chicago has a rapidly growin...,Dice Id : 10114469,"Chicago, IL",Information Security Engineer,1 week ago,Telecommuting not available|Travel not required,,"linux/unix, network monitoring, incident respo...",8aec88cba08d53da65ab99cf20f6f9d9,"[Incident Response, Information Security Asses...","[investigate security issues, documentation ty..."
2,https://www.dice.com/jobs/detail/Business-Solu...,"Galaxy Systems, Inc.",Full Time,"GalaxE.SolutionsEvery day, our solutions affec...",Dice Id : CXGALXYS,"Schaumburg, IL",Business Solutions Architect,2 weeks ago,Telecommuting not available|Travel not required,,"enterprise solutions architecture, business in...",46baa1f69ac07779274bcd90b85d9a72,"[Business Intelligence, Data Analysis, Data Wa...","[business intelligence, perform data analysis,..."
3,https://www.dice.com/jobs/detail/Java-Develope...,TransTech LLC,Full Time,Java DeveloperFull-time/direct-hireBolingbrook...,Dice Id : 10113627,"Bolingbrook, IL","Java Developer (mid level)- FT- GREAT culture,...",2 weeks ago,Telecommuting not available|Travel not required,,please see job description,3941b2f206ae0f900c4fba4ac0b18719,"[Java, JDBC, Multithreading, Linux/AIX/Unix, S...","[Java (computer programming), SQL, information..."
4,https://www.dice.com/jobs/detail/DevOps-Engine...,Matrix Resources,Full Time,Midtown based high tech firm has an immediate ...,Dice Id : matrixga,"Atlanta, GA",DevOps Engineer,48 minutes ago,Telecommuting not available|Travel not required,,"configuration management, developer, linux, ma...",45efa1f6bc65acc32bbbb953a1ed13b7,"[DevOps, Project Management, Scripting, Config...","[DevOps, project management, project configura..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21995,https://www.dice.com/jobs/detail/Web-Designer-...,IAC Publishing,Full Time,Company Description We are searching for a ta...,Dice Id : 10112803,"Oakland, CA",Web Designer,3 weeks ago,Telecommuting not available|Travel not required,,"ui/ux mobile apps, interaction design, digital...",86e27ce6b7e631e55d69d142c7d43df2,"[Python, Project Management, Data Analysis, UI...","[Python (computer programming), project manage..."
21996,https://www.dice.com/jobs/detail/Senior-Front-...,Omega Solutions Inc,Full Time,CONTACT - priya@omegasolutioninc.com / 408-45...,Dice Id : 10289500,"San Francisco, CA",Senior Front End Web Developer - Full Time at ...,3 weeks ago,Telecommuting not available|Travel not required,,"javascript, html5, css3, bootstrap, ajax, reac...",4287c7ee3317ccf1edd76e238cf8e584,"[JavaScript, HTML5, CSS3, Bootstrap, AJAX, Rea...","[JavaScript, AJAX, Angular, SQL, NoSQL, Postgr..."
21997,https://www.dice.com/jobs/detail/QA-Analyst-Sa...,San Francisco Health Plan,Full Time,Do you take pride in your work knowing that th...,Dice Id : 10115761,"San Francisco, CA",QA Analyst,2 weeks ago,Telecommuting not available|Travel not required,,"sdlc, alm, sql, t-sql, redgate, team foundatio...",d7512f0181d69f83f96db38cd77a4d08,"[Python, Project Management, Data Analysis, SD...","[Python (computer programming), project manage..."
21998,https://www.dice.com/jobs/detail/Tech-Lead%252...,IAC Publishing,Full Time,Company Description What We Can Offer YouAs th...,Dice Id : 10112803,"Oakland, CA",Tech Lead-Full Stack,2 weeks ago,Telecommuting not available|Travel not required,,"python, ruby, go, clojure, java, nosql-databas...",ec375268b494b3bcbed1635d64226112,"[JavaScript, MySQL, Node.js, React, Angular.js...","[JavaScript, MySQL, Angular, Python (computer ..."


In [5]:
# Pull the 'matched_skills' column out as a Python list with one entry per posting.
dice_skills = matched['matched_skills'].tolist()

In [6]:
# Collapse each posting's matched-skill collection into one comma-separated string.
joined_dice_skills = list(map(','.join, dice_skills))

In [7]:
# Embed every joined skill string in a single ollama.embed request using nomic-embed-text.
# Note: this rebinds `response`, which held the ESCO embedding response earlier in the notebook.
response = ollama.embed(model="nomic-embed-text", input=joined_dice_skills)

In [9]:
# Length of the first returned embedding vector (recorded output: 768).
len(response['embeddings'][0])

768

In [13]:
# Store the vectors as an 'embeddings' column on the matched Dice frame.
# Assumes one vector per input string, in input order — TODO confirm.
matched['embeddings'] = response['embeddings']

In [14]:
# Display the matched Dice frame, now including the 'embeddings' column.
matched

Unnamed: 0,advertiserurl,company,employmenttype_jobstatus,jobdescription,jobid,joblocation_address,jobtitle,postdate,shift,site_name,skills,uniq_id,extracted_skills,matched_skills,embeddings
0,https://www.dice.com/jobs/detail/AUTOMATION-TE...,"Digital Intelligence Systems, LLC","C2H Corp-To-Corp, C2H Independent, C2H W2, 3 M...",Looking for Selenium engineers...must have sol...,Dice Id : 10110693,"Atlanta, GA",AUTOMATION TEST ENGINEER,1 hour ago,Telecommuting not available|Travel not required,,see below,418ff92580b270ef4e7c14f0ddfc36b4,"[Selenium, Java, Data Structures, Object Orien...","[Java (computer programming), information stru...","[0.00816865, 0.04228201, -0.12763296, -0.06408..."
1,https://www.dice.com/jobs/detail/Information-S...,University of Chicago/IT Services,Full Time,The University of Chicago has a rapidly growin...,Dice Id : 10114469,"Chicago, IL",Information Security Engineer,1 week ago,Telecommuting not available|Travel not required,,"linux/unix, network monitoring, incident respo...",8aec88cba08d53da65ab99cf20f6f9d9,"[Incident Response, Information Security Asses...","[investigate security issues, documentation ty...","[0.018865215, 0.036568727, -0.16978557, 0.0041..."
2,https://www.dice.com/jobs/detail/Business-Solu...,"Galaxy Systems, Inc.",Full Time,"GalaxE.SolutionsEvery day, our solutions affec...",Dice Id : CXGALXYS,"Schaumburg, IL",Business Solutions Architect,2 weeks ago,Telecommuting not available|Travel not required,,"enterprise solutions architecture, business in...",46baa1f69ac07779274bcd90b85d9a72,"[Business Intelligence, Data Analysis, Data Wa...","[business intelligence, perform data analysis,...","[0.011157077, -0.003909247, -0.17394324, 0.010..."
3,https://www.dice.com/jobs/detail/Java-Develope...,TransTech LLC,Full Time,Java DeveloperFull-time/direct-hireBolingbrook...,Dice Id : 10113627,"Bolingbrook, IL","Java Developer (mid level)- FT- GREAT culture,...",2 weeks ago,Telecommuting not available|Travel not required,,please see job description,3941b2f206ae0f900c4fba4ac0b18719,"[Java, JDBC, Multithreading, Linux/AIX/Unix, S...","[Java (computer programming), SQL, information...","[0.000571077, 0.026021862, -0.13816436, -0.033..."
4,https://www.dice.com/jobs/detail/DevOps-Engine...,Matrix Resources,Full Time,Midtown based high tech firm has an immediate ...,Dice Id : matrixga,"Atlanta, GA",DevOps Engineer,48 minutes ago,Telecommuting not available|Travel not required,,"configuration management, developer, linux, ma...",45efa1f6bc65acc32bbbb953a1ed13b7,"[DevOps, Project Management, Scripting, Config...","[DevOps, project management, project configura...","[-0.017652616, 0.057342496, -0.13930643, -0.05..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21995,https://www.dice.com/jobs/detail/Web-Designer-...,IAC Publishing,Full Time,Company Description We are searching for a ta...,Dice Id : 10112803,"Oakland, CA",Web Designer,3 weeks ago,Telecommuting not available|Travel not required,,"ui/ux mobile apps, interaction design, digital...",86e27ce6b7e631e55d69d142c7d43df2,"[Python, Project Management, Data Analysis, UI...","[Python (computer programming), project manage...","[-0.036120027, 0.06199489, -0.13738641, -0.028..."
21996,https://www.dice.com/jobs/detail/Senior-Front-...,Omega Solutions Inc,Full Time,CONTACT - priya@omegasolutioninc.com / 408-45...,Dice Id : 10289500,"San Francisco, CA",Senior Front End Web Developer - Full Time at ...,3 weeks ago,Telecommuting not available|Travel not required,,"javascript, html5, css3, bootstrap, ajax, reac...",4287c7ee3317ccf1edd76e238cf8e584,"[JavaScript, HTML5, CSS3, Bootstrap, AJAX, Rea...","[JavaScript, AJAX, Angular, SQL, NoSQL, Postgr...","[0.0070498004, 0.031095242, -0.17534529, -0.02..."
21997,https://www.dice.com/jobs/detail/QA-Analyst-Sa...,San Francisco Health Plan,Full Time,Do you take pride in your work knowing that th...,Dice Id : 10115761,"San Francisco, CA",QA Analyst,2 weeks ago,Telecommuting not available|Travel not required,,"sdlc, alm, sql, t-sql, redgate, team foundatio...",d7512f0181d69f83f96db38cd77a4d08,"[Python, Project Management, Data Analysis, SD...","[Python (computer programming), project manage...","[-0.016395764, 0.056502767, -0.1392101, -0.032..."
21998,https://www.dice.com/jobs/detail/Tech-Lead%252...,IAC Publishing,Full Time,Company Description What We Can Offer YouAs th...,Dice Id : 10112803,"Oakland, CA",Tech Lead-Full Stack,2 weeks ago,Telecommuting not available|Travel not required,,"python, ruby, go, clojure, java, nosql-databas...",ec375268b494b3bcbed1635d64226112,"[JavaScript, MySQL, Node.js, React, Angular.js...","[JavaScript, MySQL, Angular, Python (computer ...","[-0.02242549, 0.021542227, -0.15049833, -0.073..."


In [15]:
# Save the Dice frame with embeddings to a separate parquet file, without the index.
matched.to_parquet('processed/dice_job_descriptions_embeddings.parquet', index=False)

#### Preprocessing Resume

In [None]:
# Load the resume dataset from CSV.
resume = pd.read_csv('datasets/Resume.csv')

In [None]:
# Take the first job description by position (.iloc) rather than by index label.
# This matches the positional access used for resume rows elsewhere in this notebook
# and stays correct even if `dice` no longer carries a default 0..n-1 index
# (for example after filtering rows).
job = dice['jobdescription'].iloc[0]

In [None]:
# Build a prompt for the sample job description via helpers.get_prompt.
# NOTE(review): the second argument ('job description') presumably names the document type — confirm in helpers.
prompt = helpers.get_prompt(job, 'job description')

In [None]:
# Print the prompt text as plain text for inspection.
print(prompt)

In [None]:
# Send the prompt through helpers.get_response using the mistral:instruct model.
resp = helpers.get_response(prompt, model="mistral:instruct")

In [None]:
# Parse the stripped response text as a Python literal.
# ast.literal_eval only accepts literal syntax (no arbitrary code execution) and raises
# if the model output is not a valid literal.
ast.literal_eval(resp['response'].strip())

In [None]:
# Run helpers.get_list over the 'Resume_str' column of the resume frame,
# using 10 workers and the mistral:instruct model.
resume_w_skills = helpers.get_list(
    data=resume,
    col='Resume_str',
    doc_type='resume',
    max_workers=10,
    model="mistral:instruct",
)

In [None]:
# Save the resume frame returned by helpers.get_list to parquet without the index.
resume_w_skills.to_parquet('processed/resume_w_skills.parquet', index=False)

#### Debugging

In [None]:
# Reload the resume frame with skills from parquet.
# Note: this rebinds `resume`, which previously held the raw CSV data.
resume = pd.read_parquet('processed/resume_w_skills.parquet')

In [None]:
# Pass the resume frame through helpers.fill_missing_skills.
# NOTE(review): the Dice call earlier in this notebook uses keyword arguments
# data=, skills_col=, doc_type= and max_workers=; here `esco` is passed as the
# second positional argument. Confirm this matches the helper's signature.
resume = helpers.fill_missing_skills(resume, esco)

In [None]:
# Display the resume frame after helpers.fill_missing_skills.
resume

In [None]:
# Add a 'length' column holding the character count of each resume text.
resume['length'] = resume['Resume_str'].str.len()

In [None]:
# Show only the resume text, its extracted skills and its character length.
resume.loc[:, ['Resume_str', 'extracted_skills', 'length']]

In [None]:
# Print the full text of the resume at position 94 (print avoids the truncated repr).
print(resume['Resume_str'].iloc[94])

In [None]:
# Build a prompt for the resume at position 45 via helpers.get_prompt.
# Note: this rebinds `prompt`, which held the job-description prompt earlier.
prompt = helpers.get_prompt(resume['Resume_str'].iloc[45], 'resume')

In [None]:
# Send the resume prompt through helpers.get_response.
# Unlike the job-description call earlier, no model= is passed here, so whatever
# default helpers.get_response defines applies — TODO confirm it is the intended model.
response = helpers.get_response(prompt)

In [None]:
# Show the response text with surrounding whitespace removed.
response['response'].strip()

In [None]:
# Take the 'extracted_skills' value of the first resume (by position).
skills = resume['extracted_skills'].iloc[0]

In [None]:
# Select the ESCO 'embeddings' column.
embeddings = esco['embeddings']

In [None]:
# Convert the column to a Python list with one entry per ESCO row.
# Not idempotent: re-running this cell alone fails because `embeddings` is already a list.
embeddings = embeddings.tolist()

In [None]:
# Get an embedding for the first extracted skill via helpers.get_embedding.
embedding = helpers.get_embedding(skills[0])

In [None]:
# Cosine similarity between the single skill embedding (wrapped in a list so it is
# treated as one sample) and every ESCO embedding.
cosine_similarity([embedding], embeddings)

In [None]:
# Match this resume's extracted skills against the ESCO frame with a 0.8 cutoff.
# The keyword was previously spelled `threhold`; helpers.match_all_skills_con is
# called with `threshold=0.8` elsewhere in this notebook, so the spelling is aligned here.
# NOTE(review): confirm the parameter name in helpers.match_closest_skills.
# Note: this rebinds `matched`, which held the matched Dice frame earlier in the notebook.
matched = helpers.match_closest_skills(skills, esco, threshold=0.8)

In [None]:
# Unpack the two-element result of helpers.match_closest_skills.
matched_list, matched_scores = matched

In [None]:
# Display the first element of the match result.
matched_list

In [None]:
# Display the second element of the match result.
matched_scores

In [None]:
# Build a comparison table with one row per (skill, matched_skill, score) triple.
# The three inputs are zipped, so the shortest one determines the number of rows.
comp_df = pd.DataFrame(
    list(zip(skills, matched_list, matched_scores)),
    columns=['skill', 'matched_skill', 'score'],
)

In [None]:
# Display the comparison table.
comp_df

In [None]:
# Run helpers.match_all_skills_con on the resume frame against ESCO with threshold=0.8,
# the same call shape used for the Dice frame earlier in this notebook.
resume_matched = helpers.match_all_skills_con(resume, esco, threshold=0.8)

In [None]:
# Display the matched resume frame.
resume_matched

In [None]:
# Save the matched resume frame to parquet without the index.
resume_matched.to_parquet('processed/resume_matched.parquet', index=False)

In [16]:
# Reload the matched resume frame from parquet.
resume_matched = pd.read_parquet('processed/resume_matched.parquet')

In [17]:
# Display the reloaded matched resume frame.
resume_matched 

Unnamed: 0,ID,Resume_str,Resume_html,Category,extracted_skills,length,matched_skills
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR,"[Accounting, Ads, Advertising, Analytical skil...",5442,"[accounting, create advertisements, outdoor ad..."
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR,"[Adobe Photoshop, ADP, Asset Management, brand...",5572,"[Adobe Photoshop, asset management, maintain c..."
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR,"[Recruiting, FMLA/EEO/FLSA, HRIS Development, ...",7720,"[recruit members, foreign affairs policy devel..."
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR,"[Type 50 wpm and 10-Key by touch, Microsoft pr...",2855,"[personnel management, customer service, resol..."
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR,"[ADA, ADP, art, agency, benefits, Benefits Adm...",9172,"[create artwork, coaching techniques, coaching..."
...,...,...,...,...,...,...,...
2479,99416532,RANK: SGT/E-5 NON- COMMISSIONED OFFIC...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,"[Secret Clearance, Stock Control, Management, ...",5533,"[follow stock control instructions, operations..."
2480,24589765,"GOVERNMENT RELATIONS, COMMUNICATIONS ...","<div class=""fontsize fontface vmargins hmargin...",AVIATION,"[arbitration, agency, budgets, Budget, continu...",7108,"[examine budgets, update budget, continuous im..."
2481,31605080,GEEK SQUAD AGENT Professional...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,"[Active Directory, Hardware, Customer Service,...",2020,"[hardware materials, customer service, manage ..."
2482,21190805,PROGRAM DIRECTOR / OFFICE MANAGER ...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,"[Adobe, CPR, Customer Service, Customer Care, ...",5074,"[customer service, maintain customer service, ..."


In [18]:
# Pull the 'matched_skills' column out as a Python list with one entry per resume.
matched_skills = resume_matched['matched_skills'].tolist()

In [20]:
# Collapse each resume's matched-skill collection into one comma-separated string.
joined_matched_skills = list(map(','.join, matched_skills))

In [21]:
# Embed every joined resume skill string in a single ollama.embed request using nomic-embed-text.
response = ollama.embed(model="nomic-embed-text", input=joined_matched_skills)

In [22]:
# Store the vectors as an 'embeddings' column on the matched resume frame.
# Assumes one vector per input string, in input order — TODO confirm.
resume_matched['embeddings'] = response['embeddings']

In [23]:
# Display the matched resume frame, now including the 'embeddings' column.
resume_matched

Unnamed: 0,ID,Resume_str,Resume_html,Category,extracted_skills,length,matched_skills,embeddings
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR,"[Accounting, Ads, Advertising, Analytical skil...",5442,"[accounting, create advertisements, outdoor ad...","[-0.051119804, 0.051380936, -0.1956489, -0.000..."
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR,"[Adobe Photoshop, ADP, Asset Management, brand...",5572,"[Adobe Photoshop, asset management, maintain c...","[-0.013715399, 0.01982399, -0.15967667, 0.0198..."
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR,"[Recruiting, FMLA/EEO/FLSA, HRIS Development, ...",7720,"[recruit members, foreign affairs policy devel...","[-0.037161566, 0.0036097213, -0.19848743, -0.0..."
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR,"[Type 50 wpm and 10-Key by touch, Microsoft pr...",2855,"[personnel management, customer service, resol...","[-0.004071229, -0.0062129777, -0.16848274, -0...."
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR,"[ADA, ADP, art, agency, benefits, Benefits Adm...",9172,"[create artwork, coaching techniques, coaching...","[0.008319367, 0.02057786, -0.17201392, 0.01393..."
...,...,...,...,...,...,...,...,...
2479,99416532,RANK: SGT/E-5 NON- COMMISSIONED OFFIC...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,"[Secret Clearance, Stock Control, Management, ...",5533,"[follow stock control instructions, operations...","[0.0049826736, 0.0045687603, -0.19545835, -0.0..."
2480,24589765,"GOVERNMENT RELATIONS, COMMUNICATIONS ...","<div class=""fontsize fontface vmargins hmargin...",AVIATION,"[arbitration, agency, budgets, Budget, continu...",7108,"[examine budgets, update budget, continuous im...","[0.04153785, 0.038161203, -0.16648234, -0.0115..."
2481,31605080,GEEK SQUAD AGENT Professional...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,"[Active Directory, Hardware, Customer Service,...",2020,"[hardware materials, customer service, manage ...","[0.010476048, 0.028934365, -0.16972545, 0.0036..."
2482,21190805,PROGRAM DIRECTOR / OFFICE MANAGER ...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,"[Adobe, CPR, Customer Service, Customer Care, ...",5074,"[customer service, maintain customer service, ...","[-0.013092378, 0.055586472, -0.17801957, -0.01..."


In [24]:
# Save the resume frame with embeddings to a separate parquet file, without the index.
resume_matched.to_parquet('processed/resume_embeddings.parquet', index=False)

#### Tech Resume Data

In [8]:
# Load the tech resume dataset from CSV.
# NOTE(review): the recorded outputs for this frame contain mis-encoded characters
# (e.g. 'â¢', 'NaÃ¯ve'), so the text likely needs encoding repair before any
# skill extraction — confirm whether the garbling is in the file itself.
tech = pd.read_csv('datasets/UpdatedResumeDataSet.csv')

In [9]:
# Display the tech resume frame (columns: Category, Resume).
tech

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."
...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...
958,Testing,â Willingness to accept the challenges. â ...
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne..."
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...


In [10]:
# Print the full text of the first tech resume (print avoids the truncated repr).
print(tech['Resume'].iloc[0])

Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, JavaScript/JQuery. * Machine learning: Regression, SVM, NaÃ¯ve Bayes, KNN, Random Forest, Decision Trees, Boosting techniques, Cluster Analysis, Word Embedding, Sentiment Analysis, Natural Language processing, Dimensionality reduction, Topic Modelling (LDA, NMF), PCA & Neural Nets. * Database Visualizations: Mysql, SqlServer, Cassandra, Hbase, ElasticSearch D3.js, DC.js, Plotly, kibana, matplotlib, ggplot, Tableau. * Others: Regular Expression, HTML, CSS, Angular 6, Logstash, Kafka, Python Flask, Git, Docker, computer vision - Open CV and understanding of Deep learning.Education Details 

Data Science Assurance Associate 

Data Science Assurance Associate - Ernst & Young LLP
Skill Details 
JAVASCRIPT- Exprience - 24 months
jQuery- Exprience - 24 months
Python- Exprience - 24 monthsCompany Details 
company - Ernst & Young LLP
description - Fraud Investigations and Dispute Services   Assur