# About

This notebook uses an LLM to predict the automation-risk category of each job from its skill-importance vector (with combined skills).


In [1]:
from enum import Enum

import instructor
import pandas as pd
from openai import OpenAI
from pydantic import BaseModel, Field
from tqdm import tqdm


In [37]:
# CSV of per-occupation skill-importance ratings; rows are looked up by 'O*NET-SOC Code'.
SKILL_IMPORTANCE_PATH = 'data/skills/skills_importance.csv'
# CSV of skill-based automation-risk scores per occupation; the LLM label column is added to this table.
SKILL_BASED_RISK_PATH = 'data/skills/skills_based_risk.csv'

In [3]:
# System prompt for the automation-risk classifier.
# The heuristics below rely on a "management" skill average, so the TASK section
# must ask for that cluster too (seven clusters, matching the skill-group columns
# of the skill-based risk table).
system_prompt = """
SYSTEM (role description)
You are an expert labour economist bot trained on the World Economic Forum's *Future of Jobs Report 2025*.  
Your task is to decide whether a given occupation is at **high risk of automation** in the next 5-10 years.

JOB INPUT
Job title and Skill-importance ratings (1=not important; 5=extremely important):
Reading Comprehension ,
Writing ,
Speaking ,
Active Listening ,
Critical Thinking ,
Complex Problem Solving ,
Judgment & Decision Making ,
Active Learning ,
Learning Strategies ,
Monitoring ,
Operations Analysis ,
Social Perceptiveness ,
Coordination ,
Persuasion ,
Negotiation ,
Instructing ,
Service Orientation ,
Operation and Control ,
Operations Monitoring ,
Quality Control Analysis ,
Troubleshooting ,
Equipment Selection ,
Equipment Maintenance ,
Installation ,
Repairing ,
Technology Design ,
Programming ,
Mathematics ,
Science ,
Systems Analysis ,
Systems Evaluation ,
Time Management ,
Management of Financial Resources ,
Management of Material Resources ,
Management of Personnel Resources

REFERENCE HEURISTICS  
(derived from WEF report tables & figures)

• Jobs dominated by **routine technical / operations-control / physical-maintenance** skills  
  and scoring *low* on **social or management** skills → HIGH automation risk.  
• Jobs with **strong social, basic-literacy or management** skill averages (≥ 3.5)  
  and lower technical/physical reliance → LOW automation risk.  
• Care, education, health-services and green-transition trades are flagged by WEF as *growing*;  
  treat them as LOW risk if their social or basic skill average ≥ 3.  
• Clerical / data-entry / simple sales support roles are flagged as *declining*;  
  treat them as HIGH risk if their basic skill average ≤ 3 and social average ≤ 3.  
• Otherwise classify as MEDIUM risk (but you must still output **no_automation_risk**).

TASK
1. Group the 35 skills into the seven clusters:  
   basic, cognitive, social, operations_control, physical/maintenance, technical, management.  
2. Compute each cluster's average importance.  
3. Apply the heuristics above.  
4. Output **exactly one** of the following labels in lowercase:  
   • automation_risk   (if high risk)  
   • no_automation_risk (if medium or low risk)  
5. Also return a one-sentence justification citing the dominant skill pattern (optional for the calling code).

"""

In [26]:
# Wrap the OpenAI client with instructor; the wrapped client is later called with a `response_model`.
client = instructor.from_openai(OpenAI())

# The two labels the classifier may return; the values match the labels listed in the system prompt.
class AutomationRisk(str, Enum):
    AUTOMATION_RISK = "automation_risk"  # prompt: "if high risk"
    NO_AUTOMATION_RISK = "no_automation_risk"  # prompt: "if medium or low risk"

# Structured response schema passed as `response_model` in get_automation_risk.
# NOTE(review): documented with comments rather than a class docstring, since a
# docstring may be surfaced in the schema sent to the model — confirm before adding one.
class AutomationRiskOutput(BaseModel):
    # Free-text justification; declared before the label (presumably so the
    # reasoning is produced first — TODO confirm this ordering is intentional).
    explanation: str = Field(description="Brief explanation (< 200 tokens) of why the job has high risk of automation or low risk of automation")
    # The binary verdict, restricted to the AutomationRisk enum values.
    automation_risk: AutomationRisk = Field(
        description="""does this job have high risk of automation?"""
    )


def get_automation_risk(job_title, skill_importance_ratings) -> AutomationRiskOutput:
    """Ask the LLM for a structured automation-risk verdict on one job.

    Args:
        job_title: Occupation title inserted into the user message.
        skill_importance_ratings: Skill ratings for the job. A dict (or dict
            subclass) is converted with ``str()``; any other value is
            interpolated into the message as given.

    Returns:
        The ``AutomationRiskOutput`` produced by the instructor client for the
        system prompt plus the job-specific user message.
    """
    # isinstance instead of an exact type comparison, so dict subclasses are handled too.
    if isinstance(skill_importance_ratings, dict):
        skill_importance_ratings = str(skill_importance_ratings)

    query = f"""
    Job title: {job_title}
    Skill-importance ratings: {skill_importance_ratings}
    """
    return client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": system_prompt,
            },
            {"role": "user", "content": query},
        ],
        model="gpt-4.1",
        response_model=AutomationRiskOutput,
    )

def is_risked_by_automation(job_title, skill_importance_ratings) -> bool:
    """Return True when the LLM labels the job as ``automation_risk``.

    Delegates to ``get_automation_risk`` and reduces its structured output to
    a single boolean.
    """
    verdict = get_automation_risk(job_title, skill_importance_ratings).automation_risk
    return verdict == AutomationRisk.AUTOMATION_RISK

In [38]:
# Per-occupation skill-importance ratings (source of the skill vectors sent to the LLM).
skills_importance_df = pd.read_csv(SKILL_IMPORTANCE_PATH)
# Skill-based risk table; one LLM call is made per row of this frame.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which suggests the CSV
# was written with its index — confirm whether it should be read with index_col=0.
skills_based_risk_df = pd.read_csv(SKILL_BASED_RISK_PATH)

In [28]:
def get_job_title_and_skill_importance(onetsoc_code, skills_df=None) -> tuple:
    """Look up one occupation's title and skill-importance ratings.

    Args:
        onetsoc_code: Value to match against the 'O*NET-SOC Code' column.
        skills_df: Optional DataFrame to search. Defaults to the notebook-level
            ``skills_importance_df`` so existing single-argument calls keep working.

    Returns:
        ``(job_title, skill_vector)`` where ``skill_vector`` is a dict of every
        column of the first matching row except the identifier/metadata columns.

    Raises:
        IndexError: If no row matches ``onetsoc_code`` (same exception type as
            before, but with a message naming the missing code).
        KeyError: If the frame has no 'O*NET-SOC Code' or 'Title' column.
    """
    if skills_df is None:
        # Read-only access to the notebook-level frame; no `global` statement is needed.
        skills_df = skills_importance_df

    matches = skills_df[skills_df['O*NET-SOC Code'] == onetsoc_code]
    if matches.empty:
        raise IndexError(
            f"No skill-importance row found for O*NET-SOC code {onetsoc_code!r}"
        )

    row = matches.iloc[0].to_dict()
    job_title = row['Title']

    # Columns that identify the occupation rather than rate a skill.
    non_skill_columns = (
        'O*NET-SOC Code',
        'Title',
        'normalized_major_code',
        'normalized_minor_code',
    )
    skill_vector = {
        column: value for column, value in row.items() if column not in non_skill_columns
    }
    return job_title, skill_vector




In [58]:
# Try the classifier on a single occupation code before running it over the whole table.
job_title, skill_vector = get_job_title_and_skill_importance('11-1011.00')
is_risked_by_automation(job_title, skill_vector)

False

In [72]:
# Classify every occupation in the skill-based risk table: one LLM call per row,
# in table order, so `results[k]` lines up with the k-th row of the frame.
# `tqdm` is imported in the notebook's import cell.
row_processed = 0
results = []
onetsoc_codes = skills_based_risk_df['O*NET-SOC Code']
# Iterate the code column directly; only this one field of each row is needed.
for onetsoc_code in tqdm(onetsoc_codes, total=len(onetsoc_codes)):
    job_title, skill_vector = get_job_title_and_skill_importance(onetsoc_code)
    results.append(is_risked_by_automation(job_title, skill_vector))
    row_processed += 1

100%|██████████| 879/879 [39:23<00:00,  2.69s/it] 


In [73]:
# Number of occupations the LLM flagged as at risk (each True counts as 1).
sum(results)

348

In [74]:
# Attach the LLM labels as a new column; `results` was built in the frame's row order.
skills_based_risk_df['llm_based_automation_risk'] = results

In [75]:
# NOTE(review): persistence is commented out, so the LLM labels computed above are
# not written back to SKILL_BASED_RISK_PATH by this notebook as it stands.
# skills_based_risk_df.to_csv(SKILL_BASED_RISK_PATH, index=False)

In [125]:
# Preview the table with the new `llm_based_automation_risk` column.
skills_based_risk_df.head()

Unnamed: 0,O*NET-SOC Code,Title,basic_skills,cognitive_skills,social_skills,operations_skills,maintenance_skills,technical_skills,management_skills,automation_risk_score,automation_risk,llm_based_automation_risk
0,11-1011.00,Chief Executives,4.1225,3.928571,3.831667,1.815,1.03,2.0925,4.145,36.5,26.2,False
1,11-1011.03,Chief Sustainability Officers,4.03,3.68,3.521667,1.72,1.03,2.19,3.231667,31.2,20.3,False
2,11-1021.00,General and Operations Managers,3.875,3.462857,3.5,2.065,1.0,1.78,3.288333,33.1,22.4,False
3,11-2011.00,Advertising and Promotions Managers,3.905,3.357143,3.333333,1.31,1.03,2.03,3.038333,38.2,28.0,False
4,11-2021.00,Marketing Managers,3.7225,3.625714,3.478333,1.4075,1.0,2.0325,3.188333,37.6,27.3,False


In [127]:
# Keep only the occupations the LLM flagged as at risk of automation.
llm_based_risk = skills_based_risk_df.loc[
    skills_based_risk_df['llm_based_automation_risk'].eq(True)
]

In [132]:
# LLM-flagged occupations, ordered from highest to lowest skill-based risk score.
llm_based_risk.sort_values(by='automation_risk_score', ascending=False)

Unnamed: 0,O*NET-SOC Code,Title,basic_skills,cognitive_skills,social_skills,operations_skills,maintenance_skills,technical_skills,management_skills,automation_risk_score,automation_risk,llm_based_automation_risk
722,49-9097.00,Signal and Track Switch Repairers,2.9100,2.804286,2.435000,3.7175,3.6250,1.9375,2.188333,76.0,68.9,True
680,49-2092.00,"Electric Motor, Power Tool, and Related Repairers",3.0000,2.945714,2.605000,3.5950,3.6875,2.0600,2.501667,74.5,66.8,True
706,49-9043.00,"Maintenance Workers, Machinery",2.8150,2.625714,2.371667,3.5600,3.2825,1.8450,2.145000,74.1,66.2,True
693,49-3042.00,"Mobile Heavy Equipment Mechanics, Except Engines",3.0300,2.855714,2.668333,3.7175,3.4700,2.0650,2.541667,73.9,66.0,True
687,49-3011.00,Aircraft Mechanics and Service Technicians,3.3150,3.161429,2.876667,3.8750,3.5325,2.0625,2.560000,73.9,66.1,True
...,...,...,...,...,...,...,...,...,...,...,...,...
601,43-9111.00,Statistical Assistants,3.5025,3.142857,2.793333,1.4075,1.0000,2.8450,2.271667,41.7,31.6,True
540,41-3041.00,Travel Agents,3.8100,2.838571,3.391667,1.4050,1.0300,1.9075,2.230000,41.6,31.5,True
86,13-2053.00,Insurance Underwriters,3.7175,3.017143,2.896667,1.2500,1.0000,1.8450,2.460000,41.1,31.0,True
112,15-1299.03,Document Management Specialists,3.6250,3.230000,2.938333,1.4375,1.0000,2.0300,2.790000,41.0,30.9,True


In [110]:
# Rolling count of LLM "at risk" labels over occupations ranked by skill-based risk score.
b_size = 20  # window length, in rows
x = skills_based_risk_df.sort_values(by='automation_risk_score', ascending=False)

# Cast the boolean labels to float so the rolling sum is numeric.
window_sums = (
    x['llm_based_automation_risk']
    .astype(float)
    .rolling(window=b_size)
    .sum()
)

# Default alignment: each value is stored at the right edge (last row) of its window.
x = x.assign(window_sum=window_sums)

# Variant kept for reference: `min_periods=1` allows partial windows at the start
# (use `center=True` to change the alignment instead).
# window_sums = x['llm_based_automation_risk'].astype(int).rolling(window=b_size, min_periods=1).sum()

In [None]:
# NOTE(review): re-creates the instructor-wrapped client already built in an earlier cell.
client = instructor.from_openai(OpenAI())


# Draft guidance text for the producer/servicer explanation.
# NOTE(review): the text stops mid-sentence ("...producer based job") and the variable
# is not referenced anywhere in the visible notebook — finish or remove it.
logic_prompt = """
Brief explanation (< 200 tokens) of why the job is a producer or servicer in the context of local region economy. Think of the fact that producer based job 
"""
# Two-way job classification used by JobTypeOutput.
# The Servicer value is "servicer" so that it agrees with the member name and with
# the field descriptions, which all say "producer or servicer".
class JobType(str, Enum):
    Producer = "producer"
    Servicer = "servicer"

# Structured response schema for the producer-vs-servicer classification.
# Documented with comments rather than a class docstring so that no extra text
# is added to the generated schema.
class JobTypeOutput(BaseModel):
    # Previously an empty description, which gave the model no guidance for this field.
    explanation: str = Field(
        description="Brief explanation (< 200 tokens) of why the job is a producer or servicer in the context of the local region economy"
    )
    # The 0-100 range was only stated in the description; ge/le make validation enforce it.
    producer_score: float = Field(
        ge=0,
        le=100,
        description="score between 0 and 100 for how producer-like the job is",
    )
    servicer_score: float = Field(
        ge=0,
        le=100,
        description="score between 0 and 100 for how servicer-like the job is",
    )
    job_type: JobType = Field(
        description="""is this job a producer or servicer?"""
    )


# NOTE(review): this repeats the `get_automation_risk` defined in an earlier cell and
# shadows it when this cell runs; it looks like a copy awaiting adaptation to
# JobTypeOutput — confirm and rename or remove.
def get_automation_risk(job_title, skill_importance_ratings) -> AutomationRiskOutput:
    """Ask the LLM for a structured automation-risk verdict on one job.

    Args:
        job_title: Occupation title inserted into the user message.
        skill_importance_ratings: Skill ratings for the job. A dict (or dict
            subclass) is converted with ``str()``; any other value is
            interpolated into the message as given.

    Returns:
        The ``AutomationRiskOutput`` produced by the instructor client for the
        system prompt plus the job-specific user message.
    """
    # isinstance instead of an exact type comparison, so dict subclasses are handled too.
    if isinstance(skill_importance_ratings, dict):
        skill_importance_ratings = str(skill_importance_ratings)

    query = f"""
    Job title: {job_title}
    Skill-importance ratings: {skill_importance_ratings}
    """
    return client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": system_prompt,
            },
            {"role": "user", "content": query},
        ],
        model="gpt-4.1",
        response_model=AutomationRiskOutput,
    )

# NOTE(review): repeats the `is_risked_by_automation` defined in an earlier cell.
def is_risked_by_automation(job_title, skill_importance_ratings) -> bool:
    """Return True when the LLM labels the job as ``automation_risk``.

    Delegates to ``get_automation_risk`` and reduces its structured output to
    a single boolean.
    """
    verdict = get_automation_risk(job_title, skill_importance_ratings).automation_risk
    return verdict == AutomationRisk.AUTOMATION_RISK