In [1]:
import pandas as pd, skill_cat, multiprocessing.dummy
from jobs_skills_weights import *
import synonyms

In [2]:
# Call the private _refresh() hook on the project-local skill_cat and synonyms modules.
# NOTE(review): presumably this reloads each module's underlying data so that edits made
# outside the notebook are picked up — confirm against the modules themselves.
skill_cat._refresh()
synonyms._refresh()

# What is the goal here?
For each job, build a mapping from each skill name the job asks for to a float indicating how important that skill is to the job.

In [3]:
# Fraction of a skill's weight that comes from the skill being listed in the job description.
# The remainder, 1 - list_weight, is the weight contributed by the skill's category.
list_weight = 0.8

In [4]:
# Fetch the jobs to be weighted, then the raw details for those jobs.
# raw_job_details is consumed further down through .items(), so it is a mapping; its keys
# become the "id" index level of the per-job frames.
# NOTE(review): get_jobs / get_raw_job_details presumably come from the
# `from jobs_skills_weights import *` star import, and the exact effect of
# bookmarked=False is not visible here — confirm both.
jobs = get_jobs(bookmarked=False)
raw_job_details = get_raw_job_details(jobs)

In [5]:
# Second call to the same _refresh() hooks that were already invoked near the top of the
# notebook, this time after the job data has been loaded.
# NOTE(review): looks redundant unless loading the jobs changes module state or the
# modules were edited in between — confirm before removing.
skill_cat._refresh()
synonyms._refresh()

In [6]:
# Extract one skills frame per job and stack them under an outer index level named "id".
skills_frames_by_job = {
    job_id: extract_skills_data(details)
    for job_id, details in raw_job_details.items()
}
job_skill_text_data = pd.concat(skills_frames_by_job, names=["id"])

# Collapse index level 1 by summing over levels 0, 2 and 3, then keep only the skills
# that skill_cat knows a category for.
job_skill_text_data = job_skill_text_data.groupby(level=[0, 2, 3]).sum()
job_skill_text_data = job_skill_text_data.query(
    "`skill`.isin(@skill_cat.skill_to_categories.keys())"
).sort_index()

# Replace the raw "count" column with each row's fraction of its job's total count.
jst_counts = job_skill_text_data.pop("count")
job_count_totals = jst_counts.groupby(level=0).sum()
job_skill_text_data["share of job"] = jst_counts / job_count_totals

In [7]:
# Display the per-job skill shares, indexed by (id, skill, skill text).
job_skill_text_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,share of job
id,skill,skill text,Unnamed: 3_level_1
0124e218-2d33-41e8-907b-5228ea386455,automation,Automate,0.033333
0124e218-2d33-41e8-907b-5228ea386455,communication,Communication Skills,0.033333
0124e218-2d33-41e8-907b-5228ea386455,communication,Communications,0.033333
0124e218-2d33-41e8-907b-5228ea386455,computer science,Computer Science,0.033333
0124e218-2d33-41e8-907b-5228ea386455,containerization,DevOps,0.066667
...,...,...,...
fdd15faa-0804-4b07-9f55-715436213e75,software engineering,Software Engineering,0.036364
fdd15faa-0804-4b07-9f55-715436213e75,strategy,Strategy,0.018182
fdd15faa-0804-4b07-9f55-715436213e75,teamwork,Teamwork,0.018182
fdd15faa-0804-4b07-9f55-715436213e75,tensorflow,TensorFlow,0.018182


In [8]:
# One row per (skill, category) pairing known to skill_cat. The "share of job" value is
# the number of categories the skill belongs to: a skill that appears in `n` categories
# gets divided by `n`, so its total share is conserved when spread across them.
skill_category_rows = [
    {"category": category, "skill": skill_name, "share of job": len(member_categories)}
    for skill_name, member_categories in skill_cat.skill_to_categories.items()
    for category in member_categories
]

# Repeat those rows for every job id and index the result by (id, category, skill).
jobs_categories_skills_data = pd.DataFrame(
    [
        {"id": job_id, **row}
        for job_id in raw_job_details.keys()
        for row in skill_category_rows
    ]
).set_index(["id", "category", "skill"])

In [9]:
# jcst_data => jobs_categories_skills_text_data
#
# Divide each job's per-skill share by the number of categories the skill belongs to.
# The division aligns the two frames on their index, so (job, category, skill)
# combinations that the job never mentions come out as NaN; those get a share of 0 and
# fall back to the skill's canonical title for their "skill text".
#
# The "skill text" fallback is written as an explicit column assignment rather than
# `jcst_data["skill text"].fillna(..., inplace=True)`: an inplace fillna on a column
# selection only updates a temporary object under pandas Copy-on-Write, which would
# silently leave NaN skill texts in the index.
jcst_data = (
    (job_skill_text_data / jobs_categories_skills_data)
    .reset_index()
    .fillna({"share of job": 0})
    .assign(**{
        "skill text": lambda frame: frame["skill text"].fillna(
            frame["skill"].apply(skill_cat.skill_to_skill_title.get)
        )
    })
    .set_index(["id", "category", "skill", "skill text"])
    .sort_index()
)

In [10]:
# Sanity check: reports True for a column only if *every* value in that column is NaN.
# NOTE(review): NaNs in "share of job" were already filled with 0 when jcst_data was
# built, so this looks unable to flag partial problems — `.isna().any()` may be the
# intended check; confirm.
jcst_data.isna().all()

share of job    False
dtype: bool

In [11]:
from skill_weights import category_biases

# Scale the observed shares down by the total bias mass, then hand each category's bias
# out evenly among the rows of that (id, category) group.
# NOTE: this overwrites "share of job" using its own previous value, so re-running this
# cell without rebuilding jcst_data first applies the bias a second time.
scaled_share = jcst_data["share of job"] * (1 - category_biases.sum())
rows_per_job_category = jcst_data.groupby(level=["id", "category"]).size()
bias_per_row = category_biases / rows_per_job_category
jcst_data["share of job"] = scaled_share + bias_per_row

In [12]:
# Blend two signals per row: list_weight of the row's own share, plus (1 - list_weight)
# of the mean share within the row's (id, category) group (index levels 0 and 1).
listed_component = jcst_data * list_weight
category_component = (jcst_data * (1 - list_weight)).groupby(level=[0, 1]).mean()
jcst_wc = listed_component + category_component

In [13]:
# Display the blended weights, indexed by (id, category, skill, skill text).
jcst_wc

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,share of job
id,category,skill,skill text,Unnamed: 4_level_1
0124e218-2d33-41e8-907b-5228ea386455,Academic,computer science,Computer Science,0.011556
0124e218-2d33-41e8-907b-5228ea386455,Academic,experimentation,Experimentation,0.000889
0124e218-2d33-41e8-907b-5228ea386455,Academic,research,Research,0.000889
0124e218-2d33-41e8-907b-5228ea386455,Admin,automation,Automate,0.023018
0124e218-2d33-41e8-907b-5228ea386455,Admin,aws,AWS,0.001684
...,...,...,...,...
fdd15faa-0804-4b07-9f55-715436213e75,Unix,unix,Unix,0.000436
fdd15faa-0804-4b07-9f55-715436213e75,Visual,graphic design,Graphic Design,0.000000
fdd15faa-0804-4b07-9f55-715436213e75,Visual,usability,Usability,0.000000
fdd15faa-0804-4b07-9f55-715436213e75,Webdev,javascript,Javascript,0.000000


In [14]:
# Sum the category level away: one total weight per (id, skill, skill text).
jst_wc = jcst_wc.groupby(level=["id", "skill", "skill text"]).sum()

In [15]:
# Spot-check one job (hard-coded id): its 20 highest-weighted skills, largest first.
jst_wc.loc["02cd1859-6731-4c7e-a1b5-3c0d22d2f483"].sort_values("share of job", ascending=False).head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,share of job
skill,skill text,Unnamed: 2_level_1
data,Data,0.188714
machine learning,Machine Learning,0.107761
end to end,End To End,0.039642
aws,AWS,0.03333
data,Data Science,0.026326
infrastructure,Infrastructure,0.021373
automation,Automate,0.020537
python,Python,0.02007
managing requirements,Business Problems,0.019741
remote work,Remote Work,0.016866
