In [1]:
import pandas as pd, skill_cat, multiprocessing.dummy
from jobs_skills_weights import *
import synonyms

In [2]:
skill_cat._refresh()
synonyms._refresh()

# What is the goal here?
For each job, create a dict from (a skill name that job wants) to (a float indicating the importance of that skill)

In [3]:
# What fraction of skill weight comes from being listed in a job description?
# 1-list_weight gives the weight of being in a category
list_weight = 4/5.0 

In [4]:
jobs = get_jobs(bookmarked=False)
raw_job_details = get_raw_job_details(jobs)

In [5]:
skill_cat._refresh()
synonyms._refresh()

In [14]:
# Construct dataframe from raw job details, 
#   summing between skills listed under multiple teal categories, 
#   and filtered by skills I have categories for (a loose standin for skills present on the resume).
job_skill_text_counts = pd.concat(
    {key: extract_skills_data(value) for key, value in raw_job_details.items()},
    names=["id"]
).groupby(level=[0,2,3]).sum().query("`skill`.isin(@skill_cat.skill_to_categories.keys())").sort_index()

In [62]:
job_skill_text_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
id,skill,skill text,Unnamed: 3_level_1
0124e218-2d33-41e8-907b-5228ea386455,automation,automating,1
0124e218-2d33-41e8-907b-5228ea386455,communication,communication,1
0124e218-2d33-41e8-907b-5228ea386455,communication,communication skills,1
0124e218-2d33-41e8-907b-5228ea386455,computer science,computer science,1
0124e218-2d33-41e8-907b-5228ea386455,containerization,devops,2
...,...,...,...
fdd15faa-0804-4b07-9f55-715436213e75,software engineering,software engineering,2
fdd15faa-0804-4b07-9f55-715436213e75,strategy,strategy,1
fdd15faa-0804-4b07-9f55-715436213e75,teamwork,teamwork,1
fdd15faa-0804-4b07-9f55-715436213e75,tensorflow,tensorflow,1


## Investingating the distribution of synonyms

In [7]:
job_skill_text_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
id,skill,skill text,Unnamed: 3_level_1
0124e218-2d33-41e8-907b-5228ea386455,automation,automating,1
0124e218-2d33-41e8-907b-5228ea386455,communication,communication,1
0124e218-2d33-41e8-907b-5228ea386455,communication,communication skills,1
0124e218-2d33-41e8-907b-5228ea386455,computer science,computer science,1
0124e218-2d33-41e8-907b-5228ea386455,containerization,devops,2
...,...,...,...
fdd15faa-0804-4b07-9f55-715436213e75,software engineering,software engineering,2
fdd15faa-0804-4b07-9f55-715436213e75,strategy,strategy,1
fdd15faa-0804-4b07-9f55-715436213e75,teamwork,teamwork,1
fdd15faa-0804-4b07-9f55-715436213e75,tensorflow,tensorflow,1


In [9]:
num_synonyms = job_skill_text_counts.reset_index().set_index(["id", "skill"])["skill text"].groupby(level=[0,1]).nunique()

In [10]:
num_synonyms

id                                    skill               
0124e218-2d33-41e8-907b-5228ea386455  automation              1
                                      communication           2
                                      computer science        1
                                      containerization        3
                                      data                    1
                                                             ..
fdd15faa-0804-4b07-9f55-715436213e75  software engineering    1
                                      strategy                1
                                      teamwork                1
                                      tensorflow              1
                                      testing                 1
Name: skill text, Length: 1947, dtype: int64

In [11]:
# What skills have the highest mean number of synonyms within a job listing?
num_synonyms.groupby(level=1).mean().sort_values(ascending=False).head(20)

skill
containerization         2.194030
algebra                  2.000000
communication            1.959184
machine learning         1.760417
scale                    1.659091
network architectures    1.500000
analysis                 1.454545
collaboration            1.333333
optimization             1.277778
web development          1.250000
implementation           1.226415
agile                    1.222222
data pipelines           1.210526
azure databricks         1.200000
project management       1.185185
managing requirements    1.125000
research                 1.120000
prototyping              1.111111
remote work              1.111111
automation               1.052632
Name: skill text, dtype: float64

In [12]:
# What is the average number of synonyms for a skill in a job listing?
num_synonyms.mean()

1.1715459681561375

## Computing `skill weights`

In [65]:
job_skills_data = job_skill_text_counts.groupby(level=[0,1]).agg(**{
    "share of skill":("count",lambda jstc: {key[2]:count/jstc.sum() for key, count in jstc.items()}),
    #"share of skill":("count",lambda jstc: [(key[2],count/jstc.sum()) for key, count in jstc.items()]),
    "count":("count","sum")
})

In [66]:
job_skills_data

Unnamed: 0_level_0,Unnamed: 1_level_0,share of skill,count
id,skill,Unnamed: 2_level_1,Unnamed: 3_level_1
0124e218-2d33-41e8-907b-5228ea386455,automation,{'automating': 1.0},1
0124e218-2d33-41e8-907b-5228ea386455,communication,"{'communication': 0.5, 'communication skills':...",2
0124e218-2d33-41e8-907b-5228ea386455,computer science,{'computer science': 1.0},1
0124e218-2d33-41e8-907b-5228ea386455,containerization,"{'devops': 0.5, 'docker': 0.25, 'kubernetes': ...",4
0124e218-2d33-41e8-907b-5228ea386455,data,{'data': 1.0},3
...,...,...,...
fdd15faa-0804-4b07-9f55-715436213e75,software engineering,{'software engineering': 1.0},2
fdd15faa-0804-4b07-9f55-715436213e75,strategy,{'strategy': 1.0},1
fdd15faa-0804-4b07-9f55-715436213e75,teamwork,{'teamwork': 1.0},1
fdd15faa-0804-4b07-9f55-715436213e75,tensorflow,{'tensorflow': 1.0},1


In [70]:
jobs_categories_skills_data = pd.DataFrame([{
        "id":job_id,
        "category":category,
        "skill":skill_name,
        "count":job_skills_data["count"].get((job_id, skill_name), 0)/float(len(categories_skill_is_in)),
        "share of skill":job_skills_data["share of skill"].get((job_id, skill_name), {skill_name:1.0})
        #"share of skill":job_skills_data["share of skill"].get((job_id, skill_name), [(skill_name,1.0)])
    } for job_id in raw_job_details.keys()
        for skill_name, categories_skill_is_in in skill_cat.skill_to_categories.items()
            for category in categories_skill_is_in
]).set_index(["id", "category", "skill"]).sort_index() # Sort the index so it's easy to read. Not a requirement.

In [71]:
jobs_categories_skills_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,share of skill
id,category,skill,Unnamed: 3_level_1,Unnamed: 4_level_1
0124e218-2d33-41e8-907b-5228ea386455,Academic,computer science,0.5,{'computer science': 1.0}
0124e218-2d33-41e8-907b-5228ea386455,Academic,experimentation,0.0,{'experimentation': 1.0}
0124e218-2d33-41e8-907b-5228ea386455,Academic,research,0.0,{'research': 1.0}
0124e218-2d33-41e8-907b-5228ea386455,Admin,automation,1.0,{'automating': 1.0}
0124e218-2d33-41e8-907b-5228ea386455,Admin,aws,0.0,{'aws': 1.0}
...,...,...,...,...
fdd15faa-0804-4b07-9f55-715436213e75,Unix,unix,0.0,{'unix': 1.0}
fdd15faa-0804-4b07-9f55-715436213e75,Visual,graphic design,0.0,{'graphic design': 1.0}
fdd15faa-0804-4b07-9f55-715436213e75,Visual,usability,0.0,{'usability': 1.0}
fdd15faa-0804-4b07-9f55-715436213e75,WebDev,javascript,0.0,{'javascript': 1.0}


In [72]:
counts = jobs_categories_skills_data.pop("count")
jobs_categories_skills_data["count"] = counts*list_weight + (counts*(1-list_weight)).groupby(level=[0,1]).mean()

In [73]:
jobs_categories_skills_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,share of skill,count
id,category,skill,Unnamed: 3_level_1,Unnamed: 4_level_1
0124e218-2d33-41e8-907b-5228ea386455,Academic,computer science,{'computer science': 1.0},0.433333
0124e218-2d33-41e8-907b-5228ea386455,Academic,experimentation,{'experimentation': 1.0},0.033333
0124e218-2d33-41e8-907b-5228ea386455,Academic,research,{'research': 1.0},0.033333
0124e218-2d33-41e8-907b-5228ea386455,Admin,automation,{'automating': 1.0},0.870588
0124e218-2d33-41e8-907b-5228ea386455,Admin,aws,{'aws': 1.0},0.070588
...,...,...,...,...
fdd15faa-0804-4b07-9f55-715436213e75,Unix,unix,{'unix': 1.0},0.033333
fdd15faa-0804-4b07-9f55-715436213e75,Visual,graphic design,{'graphic design': 1.0},0.000000
fdd15faa-0804-4b07-9f55-715436213e75,Visual,usability,{'usability': 1.0},0.000000
fdd15faa-0804-4b07-9f55-715436213e75,WebDev,javascript,{'javascript': 1.0},0.000000


In [74]:
jobs_categories_skills_data.explode("share of skill")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,share of skill,count
id,category,skill,Unnamed: 3_level_1,Unnamed: 4_level_1
0124e218-2d33-41e8-907b-5228ea386455,Academic,computer science,computer science,0.433333
0124e218-2d33-41e8-907b-5228ea386455,Academic,experimentation,experimentation,0.033333
0124e218-2d33-41e8-907b-5228ea386455,Academic,research,research,0.033333
0124e218-2d33-41e8-907b-5228ea386455,Admin,automation,automating,0.870588
0124e218-2d33-41e8-907b-5228ea386455,Admin,aws,aws,0.070588
...,...,...,...,...
fdd15faa-0804-4b07-9f55-715436213e75,Unix,unix,unix,0.033333
fdd15faa-0804-4b07-9f55-715436213e75,Visual,graphic design,graphic design,0.000000
fdd15faa-0804-4b07-9f55-715436213e75,Visual,usability,usability,0.000000
fdd15faa-0804-4b07-9f55-715436213e75,WebDev,javascript,javascript,0.000000
