In [1]:
import pandas as pd, skill_cat, multiprocessing.dummy
from jobs_skills_weights import *
import homonyms

In [2]:
# Call the private _refresh() hook on both helper modules up front.
# NOTE(review): presumably this reloads the skill-category and homonym
# configuration from disk — confirm in skill_cat / homonyms.
skill_cat._refresh()
homonyms._refresh()

In [3]:
# Load the jobs table (indexed by job id, see the preview below).
# NOTE(review): the exact meaning of bookmarked=False is defined by get_jobs — confirm.
jobs = get_jobs(bookmarked=False)

In [4]:
# Number of rows in the jobs table.
len(jobs)

100

In [5]:
# Preview the first three jobs.
jobs.head(3)

Unnamed: 0_level_0,company_name,location,role,url,excitement
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bd59469c-29f4-41b9-9001-5b7902d766e1,Ansys,"Madrid, Community of Madrid, Spain",R&D Engineer II - Python f/m,https://www.linkedin.com/jobs/search/?currentJ...,4
0124e218-2d33-41e8-907b-5228ea386455,Treibacher Industrie AG,"Althofen, Carinthia, Austria",AI/ML Specialist – Software Engineer (m/f/d),https://www.linkedin.com/jobs/search/?geoId=91...,4
35b60a7d-f9a3-4f63-b8ab-4db8db49fca8,Helsing,"Paris, Île-de-France, France",Deployed AI Engineer,https://www.linkedin.com/jobs/search/?currentJ...,5


# What is the goal here?
For each job, build a mapping from each skill the job asks for to a float weight indicating how important that skill is to the job.

In [6]:
# Fraction of a skill's weight that comes from the skill being explicitly listed
# in a job description. The remainder, (1 - list_weight), is the weight a skill
# gets from belonging to a category.
list_weight = 0.8

In [7]:
def pooldict(func, keys, pool):
    """Apply ``func`` to every key using ``pool`` and return ``{key: func(key)}``.

    Args:
        func: Callable taking a single key.
        keys: Re-iterable collection of keys (it is iterated twice: once by
            ``pool.map`` and once to pair the keys with the results).
        pool: Object exposing an order-preserving ``map(func, iterable)``.

    Returns:
        dict mapping each key to the corresponding result of ``func``.
    """
    return dict(zip(keys, pool.map(func, keys)))


# The thread pool is only needed while the details are being fetched. The
# context manager shuts the worker threads down once the (blocking) map has
# returned, instead of leaving an unclosed pool alive in the kernel.
with multiprocessing.dummy.Pool() as pool:
    raw_job_details = pooldict(get_job_details, jobs.index, pool)

In [8]:
# Build a one-column frame of job description texts, indexed by job id.
description_records = []
for job_id, details in raw_job_details.items():
    description_records.append({
        "id": job_id,
        "job description": details["attributes"]["job_description"],
    })

job_descriptions = pd.DataFrame(description_records).set_index("id")
job_descriptions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, bd59469c-29f4-41b9-9001-5b7902d766e1 to 7afa68f6-4c79-4981-a5f7-751e59da1f64
Data columns (total 1 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   job description  100 non-null    object
dtypes: object(1)
memory usage: 1.6+ KB


In [9]:
# Refresh the category and homonym configuration first so that the parse below
# of the already-downloaded job details uses the current settings.
skill_cat._refresh()
homonyms._refresh()

# One skills table per job, keyed by job id ...
skills_by_job = {
    job_id: extract_skills_data(details)
    for job_id, details in raw_job_details.items()
}
# ... stacked into a single frame whose outermost index level is named "id".
jobs_skills_data = pd.concat(skills_by_job, names=["id"])

In [10]:
# Sum over index level 1, keeping only index levels 0 and 2 as grouping keys.
# NOTE(review): level 1 is presumably the category assigned by Teal, and levels
# 0 / 2 the job id and skill name — confirm against extract_skills_data.
# NOTE: this overwrites jobs_skills_data with a two-level result, so the cell
# cannot be re-run on its own (there is no level 2 left); re-run the cell that
# builds jobs_skills_data first.
jobs_skills_data = jobs_skills_data.groupby(level=[0,2]).sum()

In [11]:
# Keep only the rows whose skill has at least one known category.
known_skills = list(skill_cat.skill_to_categories.keys())
is_categorizable = jobs_skills_data.index.get_level_values("skill").isin(known_skills)
jobs_skills_data = jobs_skills_data.loc[is_categorizable]

In [12]:
# Share of each job's total skill count that each individual skill accounts for.
# transform("sum") broadcasts every job's total back onto that job's own rows,
# so the division is between two identically indexed Series. assign() returns a
# new frame rather than writing a column into the filtered frame produced by
# the previous cell, which can trigger pandas' SettingWithCopyWarning.
count_per_job = jobs_skills_data["count"].groupby("id").transform("sum")
jobs_skills_data = jobs_skills_data.assign(
    **{"share of job": jobs_skills_data["count"] / count_per_job}
)

In [13]:
# Summarise the index, columns and dtypes of the per-job skill table.
jobs_skills_data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1939 entries, ('bd59469c-29f4-41b9-9001-5b7902d766e1', 'c') to ('7afa68f6-4c79-4981-a5f7-751e59da1f64', 'research')
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   count         1939 non-null   int64  
 1   share of job  1939 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 83.3+ KB


In [14]:
# Look the column up once instead of once per generated row.
share_by_job_skill = jobs_skills_data["share of job"]

# One row per (job, category, skill) for every known skill/category pair.
# A skill's share of the job is split evenly across all categories the skill
# belongs to; skills the job does not mention get a share of 0.
soj = pd.DataFrame([{
        "id":job_id,
        "category":category,
        "skill name":skill_name,
        "share of job":share_by_job_skill.get((job_id, skill_name), 0)/len(categories_skill_is_in),
    } for job_id in jobs.index
        for skill_name, categories_skill_is_in in skill_cat.skill_to_categories.items()
            for category in categories_skill_is_in
]).set_index(["id", "category", "skill name"]).sort_index()["share of job"]

# "listed contribution": the list_weight fraction of the share stays on the
# skill that was actually listed.
# "implied contribution": the remaining (1 - list_weight) fraction is averaged
# within each (job, category) and given to every skill of that category.
# transform("mean") returns the group mean on the full three-level index, so
# the broadcast is explicit rather than relying on index alignment when a
# two-level result is assigned into a three-level frame.
jobs_categories_skills_data = (soj*list_weight).to_frame("listed contribution").assign(**{
    "implied contribution": (soj*(1-list_weight)).groupby(level=[0,1]).transform("mean"),
})

jobs_categories_skills_data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 13700 entries, ('0124e218-2d33-41e8-907b-5228ea386455', 'Academic', 'computer science') to ('fdd15faa-0804-4b07-9f55-715436213e75', 'WebDev', 'web development')
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   listed contribution   13700 non-null  float64
 1   implied contribution  13700 non-null  float64
dtypes: float64(2)
memory usage: 264.9+ KB


In [15]:
# Per-job totals of the listed and implied contribution columns.
jobs_categories_skills_data.groupby(level="id").sum()

Unnamed: 0_level_0,listed contribution,implied contribution
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0124e218-2d33-41e8-907b-5228ea386455,0.8,0.2
02cd1859-6731-4c7e-a1b5-3c0d22d2f483,0.8,0.2
057e7f52-416e-4339-90cb-08b1d9bd5247,0.8,0.2
077a5779-7d63-468e-b61f-29b0b9828e95,0.8,0.2
08d3836e-f7b3-45af-8db5-ebbceb456013,0.8,0.2
...,...,...
ee6ac56f-c8cd-4000-bf43-477ff5464c5f,0.8,0.2
f8c70e51-922f-450d-8a2a-d36d1b88593f,0.8,0.2
f9454138-5a6a-4682-ad21-fb67d7572ab6,0.8,0.2
fca8e347-ac43-41e3-9e31-d3c04e084045,0.8,0.2


In [16]:
# Display the per-(job, category, skill) contributions (pandas truncates the rendering).
jobs_categories_skills_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,listed contribution,implied contribution
id,category,skill name,Unnamed: 3_level_1,Unnamed: 4_level_1
0124e218-2d33-41e8-907b-5228ea386455,Academic,computer science,0.013793,0.001149
0124e218-2d33-41e8-907b-5228ea386455,Academic,experimentation,0.000000,0.001149
0124e218-2d33-41e8-907b-5228ea386455,Academic,research,0.000000,0.001149
0124e218-2d33-41e8-907b-5228ea386455,Admin,automation,0.027586,0.002194
0124e218-2d33-41e8-907b-5228ea386455,Admin,aws,0.000000,0.002194
...,...,...,...,...
fdd15faa-0804-4b07-9f55-715436213e75,WebDev,html,0.000000,0.000000
fdd15faa-0804-4b07-9f55-715436213e75,WebDev,javascript,0.000000,0.000000
fdd15faa-0804-4b07-9f55-715436213e75,WebDev,react,0.000000,0.000000
fdd15faa-0804-4b07-9f55-715436213e75,WebDev,web applications,0.000000,0.000000


In [17]:
# Sanity check that totals are conserved: for each job the two columns are
# expected to sum to list_weight and (1 - list_weight) respectively.
jobs_categories_skills_data.groupby(level="id").sum()

Unnamed: 0_level_0,listed contribution,implied contribution
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0124e218-2d33-41e8-907b-5228ea386455,0.8,0.2
02cd1859-6731-4c7e-a1b5-3c0d22d2f483,0.8,0.2
057e7f52-416e-4339-90cb-08b1d9bd5247,0.8,0.2
077a5779-7d63-468e-b61f-29b0b9828e95,0.8,0.2
08d3836e-f7b3-45af-8db5-ebbceb456013,0.8,0.2
...,...,...
ee6ac56f-c8cd-4000-bf43-477ff5464c5f,0.8,0.2
f8c70e51-922f-450d-8a2a-d36d1b88593f,0.8,0.2
f9454138-5a6a-4682-ad21-fb67d7572ab6,0.8,0.2
fca8e347-ac43-41e3-9e31-d3c04e084045,0.8,0.2


In [18]:
# Final weight of a skill within a (job, category): listed part plus implied part.
listed_contribution = jobs_categories_skills_data["listed contribution"]
implied_contribution = jobs_categories_skills_data["implied contribution"]
job_skill_weights = listed_contribution.add(implied_contribution)

In [19]:
# Summary of the combined weight Series (entry count, dtype, memory usage).
job_skill_weights.info()

<class 'pandas.core.series.Series'>
MultiIndex: 13700 entries, ('0124e218-2d33-41e8-907b-5228ea386455', 'Academic', 'computer science') to ('fdd15faa-0804-4b07-9f55-715436213e75', 'WebDev', 'web development')
Series name: None
Non-Null Count  Dtype  
--------------  -----  
13700 non-null  float64
dtypes: float64(1)
memory usage: 157.8+ KB


In [20]:
# (job, category) pairs whose skills carry no weight at all for that job.
category_totals = job_skill_weights.groupby(level=["id", "category"]).sum()
category_totals[category_totals == 0]

id                                    category
0124e218-2d33-41e8-907b-5228ea386455  Bio         0.0
                                      Embedded    0.0
                                      Project     0.0
                                      Visual      0.0
                                      WebDev      0.0
                                                 ... 
fdd15faa-0804-4b07-9f55-715436213e75  Embedded    0.0
                                      Java        0.0
                                      Unix        0.0
                                      Visual      0.0
                                      WebDev      0.0
Length: 507, dtype: float64

In [21]:
# Total weight per job, summed over all of its categories and skills.
job_skill_weights.groupby(level="id").sum()

id
0124e218-2d33-41e8-907b-5228ea386455    1.0
02cd1859-6731-4c7e-a1b5-3c0d22d2f483    1.0
057e7f52-416e-4339-90cb-08b1d9bd5247    1.0
077a5779-7d63-468e-b61f-29b0b9828e95    1.0
08d3836e-f7b3-45af-8db5-ebbceb456013    1.0
                                       ... 
ee6ac56f-c8cd-4000-bf43-477ff5464c5f    1.0
f8c70e51-922f-450d-8a2a-d36d1b88593f    1.0
f9454138-5a6a-4682-ad21-fb67d7572ab6    1.0
fca8e347-ac43-41e3-9e31-d3c04e084045    1.0
fdd15faa-0804-4b07-9f55-715436213e75    1.0
Length: 100, dtype: float64

In [22]:
# Weight of "computer science" for the first job, broken down by category.
first_job_id = jobs.index[0]
job_skill_weights.loc[pd.IndexSlice[first_job_id, :, "computer science"]]

category
Academic    0.014444
Code        0.016818
dtype: float64