In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to local modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import ast
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import helpers
import ollama

#### Get ESCO embeddings

In [None]:
# Load the ESCO skills CSV.
esco = pd.read_csv('datasets/skills_en.csv')

In [None]:
# Collect the `preferredLabel` column into a plain Python list.
esco_skills = esco['preferredLabel'].tolist()

In [None]:
# Request embeddings for all ESCO labels from the "nomic-embed-text" model in one call.
# NOTE(review): the whole label list is sent at once — confirm the request size is acceptable.
response = ollama.embed(model="nomic-embed-text", input=esco_skills)

In [None]:
# Store the returned vectors as a new column.
# NOTE(review): assumes the response holds one embedding per label, in input order — confirm.
esco['embeddings'] = response['embeddings']

In [None]:
# Write the ESCO frame to parquet without the row index.
esco.to_parquet('processed/esco_skills.parquet', index=False)

In [None]:
# Rebind `esco` to the copy read back from the parquet file.
esco = pd.read_parquet('processed/esco_skills.parquet')

#### Preprocess Dice job postings

In [None]:
# Load the Dice job sample CSV.
dice = pd.read_csv('datasets/dice_com-job_us_sample.csv')

In [None]:
# Display the loaded frame.
dice

In [None]:
# Replace missing skills with '' so the string operations in later cells
# return real values instead of NaN.
dice['skills'] = dice['skills'].fillna('')

In [None]:
# Normalise the skills text: lower-case it and strip surrounding whitespace.
dice['skills'] = dice['skills'].str.lower().str.strip()

In [None]:
# Rows whose skills text matches the pattern 'see'.
dice.loc[dice['skills'].str.contains('see')]

In [None]:
# Skills entries that do NOT match the pattern 'see'.
dice.loc[~dice['skills'].str.contains('see'), 'skills']

In [None]:
# Run helpers.get_list over the `jobdescription` column.
# NOTE(review): presumably extracts skills with the "mistral:instruct" model using up to
# 10 parallel workers and returns the frame with the result added — confirm in helpers.
dice_w_skills = helpers.get_list(data=dice, 
                                 col='jobdescription', 
                                 doc_type='job', 
                                 max_workers=10,
                                 model="mistral:instruct")

In [None]:
# Display the helper's output.
dice_w_skills

In [None]:
# Write the helper's output to parquet without the row index.
dice_w_skills.to_parquet('processed/dice_job_descriptions_with_skills.parquet', index=False)

In [None]:
# Rebind `dice_w_skills` to the copy read back from parquet.
dice_w_skills = pd.read_parquet('processed/dice_job_descriptions_with_skills.parquet')

In [None]:
# Pass the job frame through helpers.fill_missing_skills, pointing it at the `skills` column.
# NOTE(review): presumably fills in rows that have no skills yet — confirm in helpers.
dice_w_skills = helpers.fill_missing_skills(data=dice_w_skills,
                                           skills_col='skills',
                                           doc_type='job',
                                           max_workers=3)

In [None]:
# Display the frame after the fill step.
dice_w_skills

In [None]:
# Call the ESCO matching helper on the job frame with threshold=0.8.
# NOTE(review): presumably 0.8 is a minimum similarity score — confirm in helpers.
matched = helpers.match_all_skills_con(dice_w_skills, esco, threshold=0.8)

In [None]:
# Write the matching result to parquet without the row index.
matched.to_parquet('processed/dice_job_descriptions_matched.parquet', index=False)

In [None]:
# Read the saved matching result back under the name `jobs_matched`.
jobs_matched = pd.read_parquet('processed/dice_job_descriptions_matched.parquet')

In [None]:
# De-duplicate each row's matched skills and put them in sorted order.
jobs_matched['matched_skills_ordered'] = jobs_matched['matched_skills'].apply(
    lambda skill_names: sorted(set(skill_names))
)

In [None]:
# Pass the unmodified matched-skill lists to the embedding helper.
skill_embeddings = helpers.embed_skills_list(jobs_matched['matched_skills'].tolist())

In [None]:
# Same helper call, this time on the de-duplicated, sorted lists.
skill_embeddings_ordered = helpers.embed_skills_list(jobs_matched['matched_skills_ordered'].tolist())

In [None]:
# Copy the helper's 'embeddings' entry into the job frame.
# NOTE(review): assumes one entry per row, in row order — confirm.
jobs_matched['skill_embeddings'] = skill_embeddings['embeddings']

In [None]:
# Copy the 'embeddings' entry computed from the sorted lists into the job frame.
# NOTE(review): assumes one entry per row, in row order — confirm.
jobs_matched['skill_embeddings_ordered'] = skill_embeddings_ordered['embeddings']

In [None]:
# Inspect the job frame that was just given its embedding columns and is saved next.
# (Previously this displayed `matched`, the frame from before the parquet reload,
# which lacks those columns and is undefined when the kernel starts from the cached file.)
jobs_matched

In [None]:
# Write the job frame to parquet without the row index.
jobs_matched.to_parquet('processed/dice_job_descriptions_embeddings.parquet', index=False)

#### Preprocess resumes

In [None]:
# Load the resume CSV.
resume = pd.read_csv('datasets/Resume.csv')

In [None]:
# Run helpers.get_list over the `Resume_str` column.
# NOTE(review): presumably extracts skills with the "mistral:instruct" model using up to
# 10 parallel workers and returns the frame with the result added — confirm in helpers.
resume_w_skills = helpers.get_list(data=resume, 
                                   col='Resume_str',
                                   doc_type='resume', 
                                   max_workers=10,
                                   model="mistral:instruct")

In [None]:
# Write the helper's output to parquet without the row index.
resume_w_skills.to_parquet('processed/resume_w_skills.parquet', index=False)

In [None]:
# Rebind `resume` to the saved helper output; the raw CSV frame is no longer referenced
# under this name from here on.
resume = pd.read_parquet('processed/resume_w_skills.parquet')

In [None]:
# NOTE(review): this call passes `esco` as the second positional argument, while the
# job-side call to the same helper uses the keywords data/skills_col/doc_type/max_workers.
# Confirm the helper's signature; this may need skills_col=... and doc_type='resume'.
resume = helpers.fill_missing_skills(resume, esco)

In [None]:
# Display the frame after the fill step.
resume

In [None]:
# Character count of each resume's text.
resume['length'] = resume['Resume_str'].str.len()

In [None]:
# Show the resume text next to its extracted skills and character count.
resume.loc[:, ['Resume_str', 'extracted_skills', 'length']]

In [None]:
# Print the full text of the resume at position 94 for manual inspection.
print(resume['Resume_str'].iloc[94])

In [None]:
# Take the first resume's extracted skills as a sample for the matching check that follows.
skills = resume['extracted_skills'].iloc[0]

In [None]:
# Run the single-list matching helper on the sample skills against the ESCO frame.
# NOTE(review): the keyword is spelled `threhold` here but `threshold` in the
# match_all_skills_con calls — confirm which spelling this helper expects.
# This also rebinds `matched`, which earlier held the matched job DataFrame.
matched = helpers.match_closest_skills(skills, esco, threhold=0.8)

In [None]:
# Unpack the helper's two-part result into the matched names and their scores.
matched_list, matched_scores = matched

In [None]:
# Display the first part of the result.
matched_list

In [None]:
# Display the second part of the result.
matched_scores

In [None]:
# Line up each sample skill with its match and score for side-by-side inspection.
comp_df = pd.DataFrame(
    list(zip(skills, matched_list, matched_scores)),
    columns=['skill', 'matched_skill', 'score'],
)

In [None]:
# Display the comparison table.
comp_df

In [None]:
# Call the ESCO matching helper on the resume frame with threshold=0.8.
# NOTE(review): presumably 0.8 is a minimum similarity score — confirm in helpers.
resume_matched = helpers.match_all_skills_con(resume, esco, threshold=0.8)

In [None]:
# Display the matching result.
resume_matched

In [None]:
# Write the matching result to parquet without the row index.
resume_matched.to_parquet('processed/resume_matched.parquet', index=False)

In [None]:
# Rebind `resume_matched` to the copy read back from parquet.
resume_matched = pd.read_parquet('processed/resume_matched.parquet')

In [None]:
# De-duplicate each row's matched skills and put them in sorted order.
resume_matched['matched_skills_ordered'] = resume_matched['matched_skills'].apply(
    lambda skill_names: sorted(set(skill_names))
)

In [None]:
# Pass the unmodified matched-skill lists to the embedding helper.
# Rebinds `skill_embeddings`, replacing the job-side result held under the same name.
skill_embeddings = helpers.embed_skills_list(resume_matched['matched_skills'].tolist())

In [None]:
# Same helper call, this time on the de-duplicated, sorted lists.
skill_embeddings_ordered = helpers.embed_skills_list(resume_matched['matched_skills_ordered'].tolist())

In [None]:
# Copy the helper's 'embeddings' entry into the resume frame.
# NOTE(review): assumes one entry per row, in row order — confirm.
resume_matched['skill_embeddings'] = skill_embeddings['embeddings']

In [None]:
# Copy the 'embeddings' entry computed from the sorted lists into the resume frame.
# NOTE(review): assumes one entry per row, in row order — confirm.
resume_matched['skill_embeddings_ordered'] = skill_embeddings_ordered['embeddings']

In [None]:
# Display the resume frame with its embedding columns before saving.
resume_matched

In [None]:
# Write the resume frame to parquet without the row index.
resume_matched.to_parquet('processed/resume_embeddings.parquet', index=False)