# About

This notebook uses an LLM to predict the automation-risk category of each job from its skill-importance vector (with combined skills).


In [1]:
from enum import Enum
import instructor
import pandas as pd
from openai import OpenAI
from pydantic import BaseModel, Field


In [37]:
# CSV of per-occupation skill-importance ratings; loaded into skills_importance_df.
SKILL_IMPORTANCE_PATH = 'data/skills/skills_importance.csv'
# CSV of occupations to classify; loaded into skills_based_risk_df and written back
# to this same path by the final cell of the notebook.
SKILL_BASED_RISK_PATH = 'data/skills/skills_based_risk.csv'

In [3]:
# System message sent with every classification request (see get_automation_risk).
# It sets the role, lists the 35 skill names the user message is expected to rate,
# gives the decision heuristics, and restricts the answer to two lowercase labels:
# "automation_risk" (high risk) or "no_automation_risk" (medium or low risk).
# This text is part of the model input: any edit changes the predictions.
system_prompt = """
SYSTEM (role description)
You are an expert labour economist bot trained on the World Economic Forum's *Future of Jobs Report 2025*.  
Your task is to decide whether a given occupation is at **high risk of automation** in the next 5-10 years.

JOB INPUT
Job title and Skill-importance ratings (1=not important; 5=extremely important):
Reading Comprehension ,
Writing ,
Speaking ,
Active Listening ,
Critical Thinking ,
Complex Problem Solving ,
Judgment & Decision Making ,
Active Learning ,
Learning Strategies ,
Monitoring ,
Operations Analysis ,
Social Perceptiveness ,
Coordination ,
Persuasion ,
Negotiation ,
Instructing ,
Service Orientation ,
Operation and Control ,
Operations Monitoring ,
Quality Control Analysis ,
Troubleshooting ,
Equipment Selection ,
Equipment Maintenance ,
Installation ,
Repairing ,
Technology Design ,
Programming ,
Mathematics ,
Science ,
Systems Analysis ,
Systems Evaluation ,
Time Management ,
Management of Financial Resources ,
Management of Material Resources ,
Management of Personnel Resources

REFERENCE HEURISTICS  
(derived from WEF report tables & figures)

• Jobs dominated by **routine technical / operations-control / physical-maintenance** skills  
  and scoring *low* on **social or management** skills → HIGH automation risk.  
• Jobs with **strong social, basic-literacy or management** skill averages (≥ 3.5)  
  and lower technical/physical reliance → LOW automation risk.  
• Care, education, health-services and green-transition trades are flagged by WEF as *growing*;  
  treat them as LOW risk if their social or basic skill average ≥ 3.  
• Clerical / data-entry / simple sales support roles are flagged as *declining*;  
  treat them as HIGH risk if their basic skill average ≤ 3 and social average ≤ 3.  
• Otherwise classify as MEDIUM risk (but you must still output **no_automation_risk**).

TASK
1. Group the 35 skills into the six clusters:  
   basic, cognitive, social, operations_control, physical/maintenance, technical.  
2. Compute each cluster's average importance.  
3. Apply the heuristics above.  
4. Output **exactly one** of the following labels in lowercase:  
   • automation_risk   (if high risk)  
   • no_automation_risk (if medium or low risk)  
5. Also return a one-sentence justification citing the dominant skill pattern (optional for the calling code).

"""

In [26]:
# OpenAI client wrapped by instructor; get_automation_risk passes `response_model`
# to its chat-completions call to get a parsed AutomationRiskOutput back.
# NOTE(review): no API key is passed explicitly — presumably it is picked up from
# the environment; confirm before running on a fresh machine.
client = instructor.from_openai(OpenAI())

# The two labels the model may return; the values match the label names spelled out
# in system_prompt. Subclassing str makes members compare equal to their raw values.
# NOTE(review): documented with comments on purpose — a class docstring may be
# surfaced in the JSON schema sent to the model; confirm before adding one.
class AutomationRisk(str, Enum):
    # High risk of automation.
    AUTOMATION_RISK = "automation_risk"
    # Medium or low risk of automation.
    NO_AUTOMATION_RISK = "no_automation_risk"

# Structured response requested from the model (used as `response_model` in
# get_automation_risk). The Field descriptions are runtime text: they are part of
# the schema and should be edited with the same care as the prompt.
# NOTE(review): documented with comments on purpose — a class docstring may be
# surfaced in the schema sent to the model; confirm before adding one.
class AutomationRiskOutput(BaseModel):
    # Short free-text rationale. Declared before the label — presumably so the model
    # writes its reasoning first; confirm.
    explanation: str = Field(description="Brief explanation (< 200 tokens) of why the job has high risk of automation or low risk of automation")
    # The predicted label, restricted to the AutomationRisk members.
    automation_risk: AutomationRisk = Field(
        description="""does this job have high risk of automation?"""
    )


def get_automation_risk(job_title, skill_importance_ratings, model="gpt-4.1") -> "AutomationRiskOutput":
    """Ask the LLM to classify one occupation's automation risk.

    Args:
        job_title: Occupation title inserted into the user message.
        skill_importance_ratings: Skill ratings for the occupation. A dict (or any
            dict subclass) is rendered with ``str``; any other value is interpolated
            into the message as-is.
        model: Name of the chat model to query. Defaults to ``"gpt-4.1"``, the
            model this notebook was originally run with.

    Returns:
        The parsed ``AutomationRiskOutput`` (explanation plus label) produced by the
        instructor-wrapped client.
    """
    # isinstance (rather than an exact type comparison) also covers dict subclasses.
    if isinstance(skill_importance_ratings, dict):
        skill_importance_ratings = str(skill_importance_ratings)

    # The whitespace inside this literal is part of the prompt; keep it unchanged.
    query = f"""
    Job title: {job_title}
    Skill-importance ratings: {skill_importance_ratings}
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query},
    ]
    return client.chat.completions.create(
        messages=messages,
        model=model,
        response_model=AutomationRiskOutput,
    )

def is_risked_by_automation(job_title, skill_importance_ratings) -> bool:
    """Return True when the LLM labels the occupation as ``automation_risk``.

    Thin boolean wrapper around ``get_automation_risk``: the explanation is
    discarded and only the predicted label is compared.
    """
    predicted_label = get_automation_risk(job_title, skill_importance_ratings).automation_risk
    return predicted_label == AutomationRisk.AUTOMATION_RISK

In [38]:
# Skill-importance table; get_job_title_and_skill_importance looks rows up in it by 'O*NET-SOC Code'.
skills_importance_df = pd.read_csv(SKILL_IMPORTANCE_PATH)
# Occupations to classify; the batch loop iterates over it and adds the
# 'llm_based_automation_risk' column.
skills_based_risk_df = pd.read_csv(SKILL_BASED_RISK_PATH)

In [28]:
def get_job_title_and_skill_importance(onetsoc_code, skills_df=None) -> tuple:
    """Look up one occupation's title and skill-importance ratings.

    Args:
        onetsoc_code: Value to match against the 'O*NET-SOC Code' column.
        skills_df: Table to search. Defaults to the notebook-level
            ``skills_importance_df`` when omitted.

    Returns:
        ``(job_title, skill_vector)`` where ``job_title`` is the 'Title' of the
        first matching row and ``skill_vector`` is that row as a dict with the
        identifier columns ('O*NET-SOC Code', 'Title', 'normalized_major_code',
        'normalized_minor_code') removed.

    Raises:
        IndexError: If no row matches ``onetsoc_code``.
        KeyError: If an expected identifier column is missing from the table.
    """
    # Only read access is needed, so no `global` declaration is required.
    if skills_df is None:
        skills_df = skills_importance_df

    matches = skills_df[skills_df['O*NET-SOC Code'] == onetsoc_code]
    if matches.empty:
        # Same exception type that .iloc[0] raised before, but naming the code.
        raise IndexError(f"No skill-importance row found for O*NET-SOC code {onetsoc_code!r}")

    skill_vector = matches.iloc[0].to_dict()
    job_title = skill_vector.pop('Title')
    # Strip the remaining identifier columns so only skill ratings are left.
    for identifier_column in ('O*NET-SOC Code', 'normalized_major_code', 'normalized_minor_code'):
        del skill_vector[identifier_column]
    return job_title, skill_vector




In [34]:
# Spot-check the pipeline on a single occupation code before the batch run.
job_title, skill_vector = get_job_title_and_skill_importance('49-9097.00')
# Last expression of the cell, so the boolean prediction is displayed as its output.
is_risked_by_automation(job_title, skill_vector)

True

In [51]:
from tqdm import tqdm

# Cap on how many occupations are sent to the LLM (each one is a paid API call).
# 3 reproduces the previous smoke-test behaviour; set to None to classify every row.
MAX_ROWS = 3

# Predictions keyed by row index, so each result lands on the row it belongs to.
llm_risk_by_index = {}
row_processed = 0
for i, row in tqdm(skills_based_risk_df.iterrows(), total=len(skills_based_risk_df)):
    onetsoc_code = row['O*NET-SOC Code']
    job_title, skill_vector = get_job_title_and_skill_importance(onetsoc_code)
    llm_risk_by_index[i] = is_risked_by_automation(job_title, skill_vector)
    row_processed += 1
    if MAX_ROWS is not None and row_processed >= MAX_ROWS:
        break

# Assign once, aligned on the index. The old per-iteration
# `df[col] = is_risked` overwrote the whole column, so every row ended up with
# the last prediction. Rows that were not processed stay <NA> instead of
# inheriting another occupation's label.
skills_based_risk_df['llm_based_automation_risk'] = pd.Series(llm_risk_by_index, dtype='boolean')

  0%|          | 2/879 [00:07<52:24,  3.59s/it]


In [None]:
# Persist the table, including the 'llm_based_automation_risk' column, without the index.
# NOTE(review): this writes to the same path the table was loaded from, so the
# input file is overwritten in place — confirm that is intended, especially after
# a partial (capped) run.
skills_based_risk_df.to_csv(SKILL_BASED_RISK_PATH, index=False)