In [1]:
import pandas as pd, skill_cat, multiprocessing.dummy
import jobs_skills_weights as jsw
from skill_weights import category_biases

# What is the goal here?
For each job, build a dict that maps each skill name the job wants to a float indicating how important that skill is to the job.

In [2]:
# Fraction of a skill's weight that comes from being named directly in the job description.
# The remaining (1 - list_weight) is the weight a skill gets from the categories it belongs to.
list_weight = 0.8

In [3]:
# Load the jobs to score (passing bookmarked=False to jsw.get_jobs), then fetch their raw details.
# raw_job_details is iterated with .items()/.keys() below, with its keys used as the job "id".
jobs = jsw.get_jobs(bookmarked=False)
raw_job_details = jsw.get_raw_job_details(jobs)

In [4]:
# Pipeline switches.
# collapse_categories is not referenced by any cell visible in this part of the notebook.
collapse_categories = False
# When True, the category-bias step further down re-weights skills using `category_biases`.
use_category_bias = True

In [5]:
# Build one frame of skill counts per job from the raw job details, stacked under an "id" level,
#   then sum entries that repeat the same (id, skill, skill text), e.g. a skill listed under
#   several teal categories.
# NOTE: filtering down to skills that have categories (a loose stand-in for skills present on
#   the resume) is currently disabled; the query that did it is kept below for reference.
#   .query("`skill`.isin(@skill_cat.skill_to_categories.keys())").sort_index()
skill_counts_by_job = {
    job_id: jsw.extract_skill_counts(details)
    for job_id, details in raw_job_details.items()
}
job_skill_text_data = (
    pd.concat(skill_counts_by_job, names=["id"])
    .groupby(level=["id", "skill", "skill text"])
    .sum()
)

# Replace the raw "count" column with each entry's share of its own job's total count.
jst_counts = job_skill_text_data.pop("count")
count_total_per_job = jst_counts.groupby(level="id").sum()
job_skill_text_data["share of job"] = jst_counts / count_total_per_job

In [6]:
# Sanity check: "share of job" is normalized within each job above, so every job with a
#   nonzero count contributes 1 and this total should equal the number of such jobs.
job_skill_text_data.sum()

share of job    101.0
dtype: float64

In [7]:
# Enumerate every (job, category, skill-in-category) combination, so each job gets a row for
#   every categorized skill whether or not its listing mentions that skill.
# Each row carries the skill's default display text and the number of categories the skill is in;
#   that count is used later to split a skill's share evenly among its categories.
#
# #OPTIMIZE_IF_NEEDED: this could probably be built with a vectorized product, which could be a lot faster.

# The per-skill rows are the same for every job, so describe them once...
categorized_skill_rows = [
    {
        "category": category,
        "skill": skill_name,
        "skill text default": skill_cat.skill_to_skill_title[skill_name],
        # This skill appears in `n` categories, so it is divided by `n` later to keep it conserved.
        "num categories": len(categories_skill_is_in),
    }
    for skill_name, categories_skill_is_in in skill_cat.skill_to_categories.items()
    for category in categories_skill_is_in
]

# ...then stamp a copy of them onto each job id.
jobs_categories_skills_data = pd.DataFrame(
    [
        {"id": job_id, **row}
        for job_id in raw_job_details.keys()
        for row in categorized_skill_rows
    ]
).set_index(["id", "category", "skill"])

In [8]:
# Outer-join (how popular a skill was in the job listing) with
#     (which categories that skill is in, and how many).
# After the join:
#   - skills that are in a listing but have no category get NaN for `category` and `num categories`;
#   - skills that are categorized but not in the listing get NaN for `share of job` and `skill text`,
#     with a replacement waiting in their `skill text default` column.
# The defaults for those gaps (1 category, 0 popularity, default text) are filled in by a later cell.

# Move "skill text" out of the index so the join keys on the shared index levels.
listing_shares = job_skill_text_data.reset_index("skill text")
jcst_data = listing_shares.join(jobs_categories_skills_data, how="outer")

In [9]:
# Total share after the join. A listed skill gets one row per category it is in, each carrying the
#   skill's full share, so this total can exceed the pre-join total until the shares are divided
#   by "num categories" further down.
jcst_data[["share of job"]].sum()

share of job    137.866026
dtype: float64

In [10]:
# Fill the gaps left by the outer join:
#   - uncategorized skills count as being in exactly 1 category,
#   - skills absent from the listing get a 0 share,
#   - missing display text falls back to the skill's default title.
# The fallback column is removed from the frame as it is consumed, so this cell cannot be
#   re-run without first re-running the join.
skill_text_fallback = jcst_data.pop("skill text default")
fill_values = {
    "num categories": 1,
    "share of job": 0,
    "skill text": skill_text_fallback,
}
jcst_data.fillna(fill_values, inplace=True)

In [11]:
# Missing shares were filled with 0, so the total should be unchanged by the fill.
jcst_data[["share of job"]].sum()

share of job    137.866026
dtype: float64

In [12]:
# Inspect the joined and filled frame: skill text, share of job and category count per row.
jcst_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,skill text,share of job,num categories
id,skill,category,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0124e218-2d33-41e8-907b-5228ea386455,agile,Project,Agile,0.000000,1.0
0124e218-2d33-41e8-907b-5228ea386455,algebra,Ml,Algebra,0.000000,1.0
0124e218-2d33-41e8-907b-5228ea386455,algorithms,Ml,Algorithms,0.000000,2.0
0124e218-2d33-41e8-907b-5228ea386455,algorithms,Software Planning,Algorithms,0.000000,2.0
0124e218-2d33-41e8-907b-5228ea386455,analysis,Ml,Analysis,0.000000,2.0
...,...,...,...,...,...
fdd15faa-0804-4b07-9f55-715436213e75,trusted,,Trusted,0.007874,1.0
fdd15faa-0804-4b07-9f55-715436213e75,usability,Software Planning,Usability,0.000000,2.0
fdd15faa-0804-4b07-9f55-715436213e75,usability,Software Best Practices,Usability,0.000000,2.0
fdd15faa-0804-4b07-9f55-715436213e75,web development,Webdev,Web Development,0.000000,1.0


In [13]:
# A skill in `n` categories has `n` rows that each carry its full share; divide by `n` so the
#   skill is not double-counted. The count column is removed from the frame as it is consumed.
category_count = jcst_data.pop("num categories")
jcst_data["share of job"] = jcst_data["share of job"] / category_count

# Rebuild the index as (id, category, skill, skill text) and sort it.
jcst_data = (
    jcst_data
    .reset_index()
    .set_index(["id", "category", "skill", "skill text"])
    .sort_index()
)

In [14]:
# After dividing by the category count, the total should be back to the pre-join total.
jcst_data[["share of job"]].sum()

share of job    101.0
dtype: float64

In [15]:
# Are there rows with no category? Skills that appear in a listing but not in
#   skill_cat.skill_to_categories come out of the outer join with a missing category.
jcst_data.index.get_level_values("category").isna().any()

True

In [16]:
# Inspect the frame now that it is indexed by (id, category, skill, skill text).
jcst_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,share of job
id,category,skill,skill text,Unnamed: 4_level_1
0124e218-2d33-41e8-907b-5228ea386455,AWS Tools,aws,AWS,0.000000
0124e218-2d33-41e8-907b-5228ea386455,AWS Tools,ec2,EC2,0.000000
0124e218-2d33-41e8-907b-5228ea386455,AWS Tools,ecs,ECS,0.000000
0124e218-2d33-41e8-907b-5228ea386455,Academic,computer science,Computer Science,0.006173
0124e218-2d33-41e8-907b-5228ea386455,Academic,research,Research,0.000000
...,...,...,...,...
fdd15faa-0804-4b07-9f55-715436213e75,,strong relationships,Strong Relationships,0.007874
fdd15faa-0804-4b07-9f55-715436213e75,,successful,Successful,0.007874
fdd15faa-0804-4b07-9f55-715436213e75,,team performance,Team Performance,0.007874
fdd15faa-0804-4b07-9f55-715436213e75,,track record,Track Record,0.007874


In [17]:
# Cross-section on the missing category label: the uncategorized skills only.
jcst_data.xs(None, level="category")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,share of job
id,skill,skill text,Unnamed: 3_level_1
0124e218-2d33-41e8-907b-5228ea386455,autonomous systems,Autonomous System,0.018519
0124e218-2d33-41e8-907b-5228ea386455,azure devops,Azure DevOps,0.018519
0124e218-2d33-41e8-907b-5228ea386455,django,Django,0.018519
0124e218-2d33-41e8-907b-5228ea386455,electronics,Electronics,0.018519
0124e218-2d33-41e8-907b-5228ea386455,excellent,Excellent,0.018519
...,...,...,...
fdd15faa-0804-4b07-9f55-715436213e75,strong relationships,Strong Relationships,0.007874
fdd15faa-0804-4b07-9f55-715436213e75,successful,Successful,0.007874
fdd15faa-0804-4b07-9f55-715436213e75,team performance,Team Performance,0.007874
fdd15faa-0804-4b07-9f55-715436213e75,track record,Track Record,0.007874


In [18]:
# Some skills need to be displayed even though the listing never names them, because they belong
#     to a category of skills the listing mentions often. To allow for that, every skill also
#     contributes indirectly to the other skills it shares a category with.
#
# Direct part:  each skill keeps `list_weight` of its `share of job` for itself.
# Implied part: the remaining `1-list_weight` of each skill's `share of job` is divided equally
#     between the categories that skill is in, then sub-divided equally between all the skills
#     in each of those categories.
#
# skill -(divides evenly between)-> categories skill is in -(divides evenly between)-> skills in that category
# --NOT--
# skill -(divides evenly between)-> every skill in every category skill is in
#
# --Example--
# Say skill S is in exactly two categories: 'A' with 10 skills (A1-A9 and S) and 'B' with 3 (B1, B2 and S).
#     A1 receives   1/10 * 1/2 * (1-list_weight)   of S's share of the job listing,
#     B1 receives   1/3  * 1/2 * (1-list_weight)   of S's share of the job listing,
#     S  receives   1/10 * 1/2 * (1-list_weight)   of its own share via category A,
#     S  receives   1/3  * 1/2 * (1-list_weight)   of its own share via category B.
# S likewise receives from A1-A9 and from B1-B2 in the same way.
#
# In practice the implied part is the mean, within each job & category, of the `1-list_weight`
#     portions. `share of job` has already been divided by the number of categories per skill,
#     which supplies the 1/2 in the example. Adding that per-(id, category) mean to the
#     per-skill direct part broadcasts it to every skill in the category, so the total weight
#     is conserved and each job's weights sum to 1.
#
# The result goes into a new 'skill weight' column so each skill's original share of the base
#     listing is preserved. That is not *strictly* necessary for the final product, but it is
#     excellent for development and debugging.

share_of_job = jcst_data["share of job"]

# What each skill keeps for itself.
direct_weight = share_of_job * list_weight

# What each (id, category) group hands to every one of its members; rows with a missing
#     category are kept as their own group.
implied_weight = (
    (share_of_job * (1 - list_weight))
    .reset_index()
    .groupby(["id", "category"], dropna=False)["share of job"]
    .mean()
)

jcst_data["skill weight"] = direct_weight + implied_weight

In [19]:
# Conservation check: the redistribution only moves weight around within each (id, category)
#   group, so "skill weight" should total the same as "share of job".
jcst_data.sum()

share of job    101.0
skill weight    101.0
dtype: float64

In [20]:
# Uncategorized skills again, now with "skill weight" next to the original share.
jcst_data.xs(None, level="category")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,share of job,skill weight
id,skill,skill text,Unnamed: 3_level_1,Unnamed: 4_level_1
0124e218-2d33-41e8-907b-5228ea386455,autonomous systems,Autonomous System,0.018519,0.018827
0124e218-2d33-41e8-907b-5228ea386455,azure devops,Azure DevOps,0.018519,0.018827
0124e218-2d33-41e8-907b-5228ea386455,django,Django,0.018519,0.018827
0124e218-2d33-41e8-907b-5228ea386455,electronics,Electronics,0.018519,0.018827
0124e218-2d33-41e8-907b-5228ea386455,excellent,Excellent,0.018519,0.018827
...,...,...,...,...
fdd15faa-0804-4b07-9f55-715436213e75,strong relationships,Strong Relationships,0.007874,0.008245
fdd15faa-0804-4b07-9f55-715436213e75,successful,Successful,0.007874,0.008245
fdd15faa-0804-4b07-9f55-715436213e75,team performance,Team Performance,0.007874,0.008245
fdd15faa-0804-4b07-9f55-715436213e75,track record,Track Record,0.007874,0.008245


In [21]:
# Some categories are foundational, even when a listing doesn't mention them at all.
# Skills in those categories should be biased towards getting displayed.
# Currently this is done in a way intended to conserve total weight per job: existing weights are
#   scaled down by the total bias, and each category's bias is split equally among that
#   category's rows within each job.
# `category_biases` is imported from skill_weights; adjust the biases there.
# NOTE: this cell rescales "skill weight" in place, so running it twice applies the bias twice.
if use_category_bias:
    # Number of rows in each (id, category) group, keeping the missing-category group.
    rows_per_job_category = (
        jcst_data.reset_index().groupby(["id", "category"], dropna=False).size()
    )
    # Groups that do not line up with an entry in category_biases get no bias.
    bias_per_row = (category_biases / rows_per_job_category).fillna(0)
    scaled_weight = jcst_data["skill weight"] * (1 - category_biases.sum())
    jcst_data["skill weight"] = scaled_weight + bias_per_row

In [22]:
# Uncategorized skills once more, after the optional category bias step.
jcst_data.xs(None, level="category")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,share of job,skill weight
id,skill,skill text,Unnamed: 3_level_1,Unnamed: 4_level_1
0124e218-2d33-41e8-907b-5228ea386455,autonomous systems,Autonomous System,0.018519,0.018262
0124e218-2d33-41e8-907b-5228ea386455,azure devops,Azure DevOps,0.018519,0.018262
0124e218-2d33-41e8-907b-5228ea386455,django,Django,0.018519,0.018262
0124e218-2d33-41e8-907b-5228ea386455,electronics,Electronics,0.018519,0.018262
0124e218-2d33-41e8-907b-5228ea386455,excellent,Excellent,0.018519,0.018262
...,...,...,...,...
fdd15faa-0804-4b07-9f55-715436213e75,strong relationships,Strong Relationships,0.007874,0.007997
fdd15faa-0804-4b07-9f55-715436213e75,successful,Successful,0.007874,0.007997
fdd15faa-0804-4b07-9f55-715436213e75,team performance,Team Performance,0.007874,0.007997
fdd15faa-0804-4b07-9f55-715436213e75,track record,Track Record,0.007874,0.007997


In [23]:
# Final conservation check: both column totals should still match after the bias step.
jcst_data.sum()

share of job    101.0
skill weight    101.0
dtype: float64