In [1]:
import pandas as pd, skill_cat, multiprocessing.dummy
from jobs_skills_weights import *

In [2]:
# Refresh the skill-category and stemming modules before any parsing happens.
# NOTE(review): `_refresh` is defined in those modules, not here — presumably it
# reloads their configuration; confirm. `stemming` is not imported by name in
# this notebook, so it presumably arrives via `from jobs_skills_weights import *`.
skill_cat._refresh()
stemming._refresh()

In [3]:
# Load the job listings; the displayed output below shows a table indexed by job id.
# NOTE(review): the exact meaning of `bookmarked=False` is defined by get_jobs,
# which is not visible here — confirm.
jobs = get_jobs(bookmarked=False)

In [4]:
# How many jobs were loaded.
len(jobs)

100

In [5]:
# Peek at the first three jobs to see the available columns.
jobs.head(3)

Unnamed: 0_level_0,company_name,location,role,url,excitement
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bd59469c-29f4-41b9-9001-5b7902d766e1,Ansys,"Madrid, Community of Madrid, Spain",R&D Engineer II - Python f/m,https://www.linkedin.com/jobs/search/?currentJ...,4
0124e218-2d33-41e8-907b-5228ea386455,Treibacher Industrie AG,"Althofen, Carinthia, Austria",AI/ML Specialist – Software Engineer (m/f/d),https://www.linkedin.com/jobs/search/?geoId=91...,4
35b60a7d-f9a3-4f63-b8ab-4db8db49fca8,Helsing,"Paris, Île-de-France, France",Deployed AI Engineer,https://www.linkedin.com/jobs/search/?currentJ...,5


# What is the goal here?
For each job, build a dict that maps each skill the job asks for to a float indicating how important that skill is to the job.

In [6]:
def pooldict(func, keys, pool):
    """Map ``func`` over ``keys`` using ``pool`` and return ``{key: func(key)}``.

    Args:
        func: Callable applied to each key.
        keys: Re-iterable collection of hashable keys (it is walked once by
            ``pool.map`` and once by ``zip``).
        pool: Any object exposing ``map(func, iterable)``, e.g. a thread pool.

    Returns:
        dict pairing each key with its result, in the order of ``keys``.
    """
    return dict(zip(keys, pool.map(func, keys)))


# Fetch the details of every job concurrently. multiprocessing.dummy.Pool is the
# thread-backed pool; the `with` block shuts its worker threads down once the
# (blocking) map has returned, instead of leaking a pool on every run of this cell.
with multiprocessing.dummy.Pool() as job_details_pool:
    raw_job_details = pooldict(get_job_details, jobs.index, job_details_pool)

In [7]:
# Build a one-column table of job description texts, indexed by job id.
description_rows = [
    {"id": job_id, "job description": details["attributes"]["job_description"]}
    for job_id, details in raw_job_details.items()
]
job_descriptions = pd.DataFrame(description_rows).set_index("id")
job_descriptions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, bd59469c-29f4-41b9-9001-5b7902d766e1 to 7afa68f6-4c79-4981-a5f7-751e59da1f64
Data columns (total 1 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   job description  100 non-null    object
dtypes: object(1)
memory usage: 1.6+ KB


In [8]:
# Refresh again so that edits to the categories / stemming configuration are
# picked up when re-parsing the job details that were already downloaded.
skill_cat._refresh()
stemming._refresh()

# Parse each job's raw details, then stack the per-job results under an
# outer "id" index level.
skills_by_job = {
    job_id: extract_skills_data(details)
    for job_id, details in raw_job_details.items()
}
jobs_skills_data = pd.concat(
    skills_by_job,
    names=["id", "teal category", "skill name"],
)

In [9]:
# Collapse the "teal category" level: total each job's numbers per skill name.
# The levels are selected by name rather than by position (0 and 2) because this
# cell overwrites its own input: after the first run only two index levels
# remain, so a positional `level=[0, 2]` raises on re-execution, whereas the
# named form keeps working and yields the same totals.
jobs_skills_data = jobs_skills_data.groupby(level=["id", "skill name"]).sum()

In [10]:
# Keep only the rows whose skill name has an entry in skill_cat.skill_to_categories;
# anything else cannot be assigned to a category.
jobs_skills_data = jobs_skills_data[
    jobs_skills_data.index
    .get_level_values("skill name")
    .isin(list(skill_cat.skill_to_categories.keys()))
]

In [11]:
# Express each skill's count as a fraction of the total count of its job,
# so the shares of one job add up to 1.
jobs_skills_data["share of job"] = jobs_skills_data["count"].div(
    jobs_skills_data["count"].groupby(level="id").transform("sum")
)

In [12]:
# Inspect the per-(job, skill) table after filtering and adding "share of job".
jobs_skills_data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1906 entries, ('bd59469c-29f4-41b9-9001-5b7902d766e1', 'c') to ('7afa68f6-4c79-4981-a5f7-751e59da1f64', 'research')
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   count         1906 non-null   int64  
 1   share of job  1906 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 82.8+ KB


In [13]:
# Expand to one row per (job, category, skill) for every skill listed in
# skill_cat.skill_to_categories. A skill that belongs to several categories has
# its count and share split evenly between them, and a (job, skill) pair that is
# absent from jobs_skills_data contributes 0.
# The two columns are pulled out once here instead of being re-extracted from
# the frame for every generated row.
skill_counts = jobs_skills_data["count"]
skill_shares = jobs_skills_data["share of job"]
jobs_categories_skills_data = pd.DataFrame([
    {
        "id": job_id,
        "category": category,
        "skill name": skill_name,
        "count": skill_counts.get((job_id, skill_name), 0) / len(categories),
        "share of job": skill_shares.get((job_id, skill_name), 0) / len(categories),
    }
    for job_id in jobs.index
    for skill_name, categories in skill_cat.skill_to_categories.items()
    for category in categories
]).set_index(["id", "category", "skill name"])
jobs_categories_skills_data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10900 entries, ('bd59469c-29f4-41b9-9001-5b7902d766e1', 'ML', 'statistics') to ('7afa68f6-4c79-4981-a5f7-751e59da1f64', 'WebDev', 'web development')
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   count         10900 non-null  float64
 1   share of job  10900 non-null  float64
dtypes: float64(2)
memory usage: 210.4+ KB


In [14]:
# Roll the per-skill rows up to one row per (job, category) by summing.
jobs_categories_data = (
    jobs_categories_skills_data
    .groupby(level=["id", "category"])
    .sum()
)
jobs_categories_data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1200 entries, ('0124e218-2d33-41e8-907b-5228ea386455', 'Academic') to ('fdd15faa-0804-4b07-9f55-715436213e75', 'Writing')
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   count         1200 non-null   float64
 1   share of job  1200 non-null   float64
dtypes: float64(2)
memory usage: 26.4+ KB


In [15]:
# Per-(job, category) ratio of summed "share of job" to summed "count", assigned
# onto the per-skill frame (which carries the extra "skill name" index level).
# Where a job's category "count" total is 0 the ratio is 0/0, i.e. NaN.
# NOTE(review): "share of job" was computed as count / (job total count), and both
# columns were split by the same len(categories), so this ratio looks like it
# reduces to 1 / (job total count) for every category with a non-zero count,
# i.e. a constant per job. If a skill's share *within its category* was
# intended, the numerator should probably be the per-skill "count" — TODO confirm.
jobs_categories_skills_data["category contribution"] = (jobs_categories_data["share of job"] / jobs_categories_data["count"])

In [16]:
# Check how many rows received a "category contribution"; the non-null count
# is lower than the row count, presumably because of 0/0 category totals.
jobs_categories_skills_data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10900 entries, ('bd59469c-29f4-41b9-9001-5b7902d766e1', 'ML', 'statistics') to ('7afa68f6-4c79-4981-a5f7-751e59da1f64', 'WebDev', 'web development')
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   count                  10900 non-null  float64
 1   share of job           10900 non-null  float64
 2   category contribution  9483 non-null   float64
dtypes: float64(3)
memory usage: 295.6+ KB


In [17]:
# Blend the two signals into a single weight per (job, category, skill):
# one third category contribution, two thirds share of job.
CATEGORY_CONTRIBUTION_WEIGHT = 1/3.0
SHARE_OF_JOB_WEIGHT = 2/3.0
jobs_categories_skills_data["skill weight"] = (
    jobs_categories_skills_data["category contribution"] * CATEGORY_CONTRIBUTION_WEIGHT
    + jobs_categories_skills_data["share of job"] * SHARE_OF_JOB_WEIGHT
)

In [18]:
# Inspect the table after adding "skill weight"; it is NaN wherever
# "category contribution" is NaN, since NaN propagates through the sum.
jobs_categories_skills_data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10900 entries, ('bd59469c-29f4-41b9-9001-5b7902d766e1', 'ML', 'statistics') to ('7afa68f6-4c79-4981-a5f7-751e59da1f64', 'WebDev', 'web development')
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   count                  10900 non-null  float64
 1   share of job           10900 non-null  float64
 2   category contribution  9483 non-null   float64
 3   skill weight           9483 non-null   float64
dtypes: float64(4)
memory usage: 380.8+ KB


In [19]:
# Sanity check that the description text attaches to every row when joined on
# the job id. The joined frame is only inspected here, not stored.
jobs_categories_skills_data.join(job_descriptions).info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10900 entries, ('bd59469c-29f4-41b9-9001-5b7902d766e1', 'ML', 'statistics') to ('7afa68f6-4c79-4981-a5f7-751e59da1f64', 'WebDev', 'web development')
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   count                  10900 non-null  float64
 1   share of job           10900 non-null  float64
 2   category contribution  9483 non-null   float64
 3   skill weight           9483 non-null   float64
 4   job description        10900 non-null  object 
dtypes: float64(4), object(1)
memory usage: 465.9+ KB
