# About

This notebook uses an LLM to predict the automation-risk category of jobs when provided with the skill vector (with combined skills).


In [13]:
from enum import Enum
import instructor
import pandas as pd
from openai import OpenAI
from pydantic import BaseModel, Field
import re

import instructor
from anthropic import Anthropic
import numpy as np

In [2]:
# Directory that create_complete_job_dataframe scans for per-occupation *.json files.
JOB_DESCRIPTION_PATH = '../onetonline_job_details'
# CSV loaded into industry_df and merged onto the job table on 'O*NET-SOC Code'.
INDUSTRY_DF_PATH = '../v2_assets/modified_bls_super_sector_df.csv'

# Target CSV for the LLM risk results; the to_csv call in the last cell is currently commented out.
LLM_JOBS_ANALYSIS_PATH = '../v2_assets/llm_risk.csv'

In [3]:
# Industry lookup table; later left-merged onto the job table on 'O*NET-SOC Code'.
# NOTE(review): the saved notebook output suggests it carries 'Title' and
# 'Modified BLS Super Sector' columns — confirm against the CSV.
industry_df = pd.read_csv(INDUSTRY_DF_PATH)

In [4]:
import os
import glob
import json

def get_job_summary(file_path):
    """
    Load a job-description JSON file and hand back whatever is stored under 'summary'.

    This is a best-effort reader: every failure is reported on stdout and
    turned into a ``None`` return instead of an exception.

    Args:
        file_path (str): Location of the JSON file to read

    Returns:
        The value of the top-level 'summary' key, or None when the key is
        absent or the file could not be read/parsed
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        job_summary = payload.get('summary', None)
        if job_summary is None:
            print(f"Warning: 'summary' key not found in {file_path}")
        return job_summary

    except Exception as exc:
        # Choose the message by failure type; every failure yields None.
        if isinstance(exc, FileNotFoundError):
            print(f"Error: File not found - {file_path}")
        elif isinstance(exc, json.JSONDecodeError):
            print(f"Error: Invalid JSON in {file_path} - {exc}")
        else:
            print(f"Error reading {file_path}: {exc}")
        return None

def create_complete_job_dataframe(job_description_path, include_summaries=True, sample_size=None):
    """
    Build a DataFrame with one row per job-description JSON file.

    The O*NET-SOC code is taken from the file name (the text before the first
    underscore). When summaries are requested, each file is read with
    ``get_job_summary`` and the job title is derived from the summary's
    'name' entry. A file whose summary cannot be read still produces a row,
    with ``None`` for both 'Title' and 'summary'.

    Args:
        job_description_path (str): Path to the directory containing JSON files
        include_summaries (bool): Whether to extract and include job summaries
        sample_size (int, optional): Number of files to process, taken from the
            start of the sorted file list. If None (or 0), processes all.

    Returns:
        pd.DataFrame: Sorted by 'O*NET-SOC Code', with columns
        'O*NET-SOC Code' and 'file_path', plus 'Title' and 'summary' when
        include_summaries is True. An empty directory yields an empty frame
        with those columns.
    """
    def extract_job_title_regex(job_string):
        """
        Extract job title from O*NET-SOC formatted string using regex.

        Args:
            job_string (str): String in format "code - job title"

        Returns:
            str: The job title part, or the input unchanged if it is not a
            string or the pattern doesn't match
        """
        # A missing 'name' arrives here as None; pass it through instead of
        # failing on .strip().
        if not isinstance(job_string, str):
            return job_string

        # Pattern explanation:
        # ^[\d\-\.]+  - Start of string, one or more digits, hyphens, or dots
        # \s*-\s*     - Optional whitespace, hyphen, optional whitespace
        # (.+)$       - Capture group for everything else until end of string
        pattern = r'^[\d\-\.]+\s*-\s*(.+)$'

        match = re.match(pattern, job_string.strip())
        if match:
            return match.group(1).strip()
        return job_string

    # glob returns files in arbitrary order; sorting makes the sample_size
    # subset the same on every run.
    json_files = sorted(glob.glob(os.path.join(job_description_path, "*.json")))

    if sample_size:
        json_files = json_files[:sample_size]

    # Fixing the column list up front keeps the schema stable even when no
    # files are found (an empty frame would otherwise have no columns to sort by).
    columns = ['O*NET-SOC Code', 'file_path']
    if include_summaries:
        columns += ['Title', 'summary']

    data = []
    total_files = len(json_files)

    print(f"Processing {total_files} job description files...")

    for i, file_path in enumerate(json_files):
        # Get the filename without the path
        filename = os.path.basename(file_path)

        # Extract O*NET-SOC Code by splitting by '_' and getting the 0th element
        onet_soc_code = filename.split('_')[0]

        row_data = {
            'O*NET-SOC Code': onet_soc_code,
            'file_path': file_path,
        }

        if include_summaries:
            summary = get_job_summary(file_path)
            # get_job_summary returns None on any read/parse failure (it has
            # already printed the reason), so only query dict-shaped summaries.
            raw_title = summary.get('name', None) if isinstance(summary, dict) else None
            row_data['Title'] = extract_job_title_regex(raw_title)
            row_data['summary'] = summary

        data.append(row_data)

        # Progress indicator
        if (i + 1) % 50 == 0 or (i + 1) == total_files:
            print(f"Processed {i + 1}/{total_files} files")

    df = pd.DataFrame(data, columns=columns)

    # Sort by O*NET-SOC Code for better organization
    df = df.sort_values('O*NET-SOC Code').reset_index(drop=True)

    return df

# Build the job table from every JSON file under JOB_DESCRIPTION_PATH, summaries included.
print("Creating complete job descriptions DataFrame with summaries...")
job_descriptions_df = create_complete_job_dataframe(
    JOB_DESCRIPTION_PATH,
    include_summaries=True,
)

# Short report on what was loaded: shape and column names.
print(
    "\nDataFrame created successfully!",
    f"Shape: {job_descriptions_df.shape}",
    f"Columns: {list(job_descriptions_df.columns)}",
    sep="\n",
)

Creating complete job descriptions DataFrame with summaries...
Processing 923 job description files...
Processed 50/923 files
Processed 100/923 files
Processed 150/923 files
Processed 200/923 files
Processed 250/923 files
Processed 300/923 files
Processed 350/923 files
Processed 400/923 files
Processed 450/923 files
Processed 500/923 files
Processed 550/923 files
Processed 600/923 files
Processed 650/923 files
Processed 700/923 files
Processed 750/923 files
Processed 800/923 files
Processed 850/923 files
Processed 900/923 files
Processed 923/923 files

DataFrame created successfully!
Shape: (923, 3)
Columns: ['O*NET-SOC Code', 'file_path', 'summary']


In [5]:
# Attach industry information to each job via a left merge on the O*NET-SOC code.
# Columns the job table already has (other than the key) are removed from the
# right-hand side first, which:
#   * avoids 'Title_x' / 'Title_y' suffixes — the saved output of this notebook
#     suggests industry_df carries its own 'Title', and later cells read
#     row['Title']; the title extracted from the JSON files is the one kept;
#   * makes the cell safe to re-run, since a second run has nothing new to add.
shared_columns = [
    column
    for column in industry_df.columns
    if column in job_descriptions_df.columns and column != 'O*NET-SOC Code'
]
job_descriptions_df = job_descriptions_df.merge(
    industry_df.drop(columns=shared_columns),
    on='O*NET-SOC Code',
    how='left',
)

In [6]:
job_descriptions_df

Unnamed: 0,O*NET-SOC Code,file_path,summary,Title,Modified BLS Super Sector
0,11-1011.00,../onetonline_job_details/11-1011.00_20250611_...,"{'soc_code': '11-1011.00', 'name': '11-1011.00...",Chief Executives,MANAGEMENT
1,11-1011.03,../onetonline_job_details/11-1011.03_20250611_...,"{'soc_code': '11-1011.03', 'name': '11-1011.03...",Chief Sustainability Officers,MANAGEMENT
2,11-1021.00,../onetonline_job_details/11-1021.00_20250611_...,"{'soc_code': '11-1021.00', 'name': '11-1021.00...",General and Operations Managers,MANAGEMENT
3,11-1031.00,../onetonline_job_details/11-1031.00_20250611_...,"{'soc_code': '11-1031.00', 'name': '11-1031.00...",Legislators,GOVERNMENT
4,11-2011.00,../onetonline_job_details/11-2011.00_20250611_...,"{'soc_code': '11-2011.00', 'name': '11-2011.00...",Advertising and Promotions Managers,MEDIA_AND_COMMUNICATIONS_SERVICES
...,...,...,...,...,...
918,53-7071.00,../onetonline_job_details/53-7071.00_20250611_...,"{'soc_code': '53-7071.00', 'name': '53-7071.00...",Gas Compressor and Gas Pumping Station Operators,ENERGY
919,53-7072.00,../onetonline_job_details/53-7072.00_20250611_...,"{'soc_code': '53-7072.00', 'name': '53-7072.00...","Pump Operators, Except Wellhead Pumpers",MANUFACTURING
920,53-7073.00,../onetonline_job_details/53-7073.00_20250611_...,"{'soc_code': '53-7073.00', 'name': '53-7073.00...",Wellhead Pumpers,MINING_OIL_GAS_EXTRACTION
921,53-7081.00,../onetonline_job_details/53-7081.00_20250611_...,"{'soc_code': '53-7081.00', 'name': '53-7081.00...",Refuse and Recyclable Material Collectors,SPECIAL_INDUSTRIES


In [7]:
system_prompt = """
You are an expert labour economist bot that reasons and thinks about the future prospects of a job role.

You have access to some premises extracted from a well established WEF's Future of Jobs Report 2025 to
predict the risk of automation for a given occupation. Utilize the premises to predict the risk of automation for a given occupation.

Following are the premises that you will be using to predict the risk of automation for a given occupation:

Short Term / Software only Automation:

| Vulnerable Duties                                                                                                                                           | Safe Duties                                                                                                                                       |
| ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Templated data entry**<br>\[“Data Entry Clerks –24 net growth”; “Bank Tellers and Related Clerks –20 net growth”]                                         | **Framing strategy & designing novel solutions**<br>\[“Analytical thinking 88 %”; “Creative thinking 62 %”]                                       |
| **Rule‑based admin workflows**<br>\[“Administrative Assistants and Executive Secretaries –18 net growth”; “Legal Secretaries –?” (among fastest‑declining)] | **Tackling ambiguous problems without clear precedents**<br>\[“Resilience, flexibility and agility 75 %”; “Curiosity and lifelong learning 81 %”] |
| **Metric‑driven, routine reporting**<br>\[“Accountants and Auditors –8 net growth”; “Claims Adjusters, Examiners, and Investigators” among declines]        | **Deep domain & experiential expertise**<br>\[“Leadership and social influence 88 %”; “Motivation and self‑awareness 58 %”]                       |
| **Standardized correspondence generation**<br>\[“Legal Secretaries” in top‑declining list; “Postal Service Clerks”]                                         | **High‑stakes ethical/legal judgment**<br>\[“Empathy and active listening 81 %”; “Service orientation and customer service” stable core skill]    |
| **Meeting narrow accuracy targets**<br>\[“Claims Adjusters, Examiners, and Investigators”]; \[“Cashiers and Ticket Clerks”]                                 | **Building relationships & coaching**<br>\[“Teaching and mentoring”; “Leadership and social influence 88 %”]                                      |


Long Term / Software+Robotics Automation:

| Vulnerable Duties                                                                                                                | Safe Duties                                                                                                                        |
| -------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| **Repetitive assembly tasks**<br>\[“Assembly and Factory Workers –4 net growth”; “Printing and Related Trades Workers”]          | **Fine‑motor craftsmanship**<br>\[“Manual dexterity, endurance and precision”; “Drafters, Engineering and Mapping Technicians”]    |
| **Bulk handling & fixed workflows**<br>\[“Material‑Recording and Stock‑Keeping Clerks”; “Door‑To‑Door Sales Workers”]            | **Adaptability in variable environments**<br>\[“Resilience, flexibility and agility 75 %”; “Curiosity and lifelong learning 81 %”] |
| **Operating standardized machinery**<br>\[“Printing and Related Trades Workers”; “Transportation Attendants and Conductors”]     | **Human‑centric care & education**<br>\[“Empathy and active listening 81 %”; “Service orientation and customer service”]           |
| **Uniform load‑lifting**<br>\[“Construction Labourers” in declining roles; “Cashiers and Ticket Clerks” handle repetitive loads] | **On‑the‑fly physical problem solving**<br>\[“Curiosity and lifelong learning 81 %”; “Systems thinking”]                           |
| **Exact blueprint adherence**<br>\[“Building Framers, Finishers, and Related Trades Workers”]                                    | **Creative physical expression**<br>\[“Creative thinking 69 %”; “Motivation and self‑awareness 58 %”]                              |


You will be given job title and some job attributes that show the work activies that are done by the employees in that job.
You will also be given the largest industry that the job belongs to.

With all the job context and the premises, you will be able to predict the risk of automation for a given occupation for both
1. Short Term / Software Automation scenario
2. Long Term / Robotics Automation scenario
"""

In [14]:
from tenacity import Retrying, stop_after_attempt, wait_fixed

# Anthropic SDK client wrapped by instructor; get_automation_risk calls
# client.chat.completions.create(...) on it with a response_model.
# NOTE(review): Anthropic() is built with no arguments, so credentials
# presumably come from the environment (e.g. ANTHROPIC_API_KEY) — confirm.
client = instructor.from_anthropic(Anthropic())

# Risk assessment for the short-term, software-only automation scenario.
# The 0-10 range promised in the description is enforced with ge/le, so an
# out-of-range score fails validation instead of flowing into the results.
# Documented with comments rather than a class docstring because a docstring
# would become part of the schema sent to the model.
class SoftwareAutomationRisk(BaseModel):
    risk_score: float = Field(ge=0, le=10, description="The risk score for the job between 0 and 10; score <= 2 is low risk, score >= 8 is high risk; score 3-7 is medium risk")
    risk_explanation: str = Field(description="Brief explanation (< 150 tokens) of the automation risks score considering when AI capabilities ONLY in the software are reliably available in the industry")

# Risk assessment for the long-term, software + robotics automation scenario.
# The 0-10 range promised in the description is enforced with ge/le, so an
# out-of-range score fails validation instead of flowing into the results.
# Documented with comments rather than a class docstring because a docstring
# would become part of the schema sent to the model.
class SoftwareAndRoboticsAutomationRisk(BaseModel):
    risk_score: float = Field(ge=0, le=10, description="The risk score for the job between 0 and 10; score <= 2 is low risk, score >= 8 is high risk; score 3-7 is medium risk")
    risk_explanation: str = Field(description="Brief explanation (< 150 tokens) of the automation risks score considering when AI capabilities in the software and robotics are reliably available in the industry.")

# Adoption-speed estimates for the job's largest industry, one per scenario,
# plus a single shared explanation.
# Both rates are constrained to the 0-10 range stated in their descriptions,
# so an out-of-range value fails validation.
# Documented with comments rather than a class docstring because a docstring
# would become part of the schema sent to the model.
class AutomationAdoptionRate(BaseModel):
    software_only_adoption_rate: float = Field(ge=0, le=10, description="Considering the job and the largest industry it belongs to, output the software only automation adoption rate of automation in the industry - between 0 and 10; rate <= 2 is slow adoption, score >= 8 is quick adoption; score 3-7 is average speed adoption")
    software_and_robotics_adoption_rate: float = Field(ge=0, le=10, description="Considering the job and the largest industry it belongs to, output the software + robotics automation adoption rate of automation in the industry - between 0 and 10; rate <= 2 is slow adoption, score >= 8 is quick adoption; score 3-7 is average speed adoption")
    adoption_rate_explanation: str = Field(description="Brief explanation (< 150 tokens) of you gave the adoption rate for software only and software + robotics automation, reason using how fast the industry is to accept the changes in general. For example, training individuals or setting up systems at scale could be costly causing the adoption to be slow etc.")

# Top-level structured response: passed as response_model in get_automation_risk
# and unpacked field by field in process_job_row.
# Kept as comments (no class docstring) so the schema sent to the model is unchanged.
class AutomationRiskOutput(BaseModel):
    software_automation_risk: SoftwareAutomationRisk  # software-only scenario
    software_and_robotics_automation_risk: SoftwareAndRoboticsAutomationRisk  # software + robotics scenario
    automation_adoption_rate: AutomationAdoptionRate  # adoption-speed estimates for both scenarios

def get_automation_risk(job_title, job_description, job_industry) -> AutomationRiskOutput:
    """
    Ask the LLM for a structured automation-risk assessment of one job.

    The three arguments are interpolated as-is into the user message; the
    module-level ``system_prompt`` supplies the premises. The reply is parsed
    into an ``AutomationRiskOutput`` via the instructor-wrapped ``client``.

    Args:
        job_title: Title of the occupation.
        job_description: Description of the job (its string form is embedded in the prompt).
        job_industry: Industry the job is assigned to.

    Returns:
        AutomationRiskOutput: Risk scores for both scenarios plus adoption rates.
    """
    # Retry policy handed to instructor: stop after 2 attempts, fixed wait of 30 between them.
    retry_policy = Retrying(
        stop=stop_after_attempt(2),
        wait=wait_fixed(30),
    )

    user_message = f"""
    Job title: {job_title}
    Job description: {job_description}
    Job industry: {job_industry}
    """

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
    ]

    return client.chat.completions.create(
        model="claude-sonnet-4-20250514",
        messages=conversation,
        response_model=AutomationRiskOutput,
        max_tokens=2048,
        temperature=0.0,
        max_retries=retry_policy,
    )

In [12]:
from concurrent.futures import ThreadPoolExecutor

def process_job_row(row_data):
    """
    Score one job with the LLM and flatten the answer into a plain dict.

    Designed as a worker for ``executor.map``: it never raises. Any exception
    raised while querying or unpacking the result is printed and replaced by a
    placeholder record (NaN scores, empty explanations) so that one failing
    job cannot abort the whole batch.

    Args:
        row_data: An ``(index, row)`` pair as produced by ``DataFrame.iterrows()``.
            ``row`` is read for 'Title', 'summary' and 'Modified BLS Super Sector'.

    Returns:
        dict: 'index' plus the seven result fields
        (software_automation_risk, software_automation_reason,
        software_and_robotics_automation_risk, software_and_robotics_automation_reason,
        software_only_adoption_rate, software_and_robotics_adoption_rate,
        automation_adoption_rate_reason).
    """
    index, row = row_data

    try:
        automation_risk = get_automation_risk(row['Title'], row['summary'], row['Modified BLS Super Sector'])

        software = automation_risk.software_automation_risk
        robotics = automation_risk.software_and_robotics_automation_risk
        adoption = automation_risk.automation_adoption_rate

        return {
            'index': index,
            'software_automation_risk': software.risk_score,
            'software_automation_reason': software.risk_explanation,
            'software_and_robotics_automation_risk': robotics.risk_score,
            'software_and_robotics_automation_reason': robotics.risk_explanation,
            'software_only_adoption_rate': adoption.software_only_adoption_rate,
            'software_and_robotics_adoption_rate': adoption.software_and_robotics_adoption_rate,
            'automation_adoption_rate_reason': adoption.adoption_rate_explanation
        }
    except Exception as e:
        # Use .get here: if the failure was a missing 'Title' column, indexing
        # row['Title'] again would raise from inside the handler and take down
        # the whole executor.map run.
        print(f"Error processing job {row.get('Title', '<unknown title>')}: {e}")
        return {
            'index': index,
            'software_automation_risk': np.nan,
            'software_automation_reason': '',
            'software_and_robotics_automation_risk': np.nan,
            'software_and_robotics_automation_reason': '',
            'software_only_adoption_rate': np.nan,
            'software_and_robotics_adoption_rate': np.nan,
            'automation_adoption_rate_reason': ''
        }

In [22]:
# Result columns, in output order, with the placeholder each holds until a
# result is written: NaN for numeric scores, '' for explanations.
result_defaults = {
    'software_automation_risk': np.nan,
    'software_automation_reason': '',
    'software_and_robotics_automation_risk': np.nan,
    'software_and_robotics_automation_reason': '',
    'software_only_adoption_rate': np.nan,
    'software_and_robotics_adoption_rate': np.nan,
    'automation_adoption_rate_reason': '',
}
for column_name, placeholder in result_defaults.items():
    job_descriptions_df[column_name] = placeholder

# Score every job. For a quick trial on a handful of rows, use instead:
# sample_df = job_descriptions_df.sample(n=9)
sample_df = job_descriptions_df

# Materialise the (index, row) pairs so they can be counted and mapped over.
row_data = list(sample_df.iterrows())

print(f"Processing {len(row_data)} jobs with 5 concurrent threads...")

# Fan the LLM calls out over 5 worker threads; map() returns results in input order.
with ThreadPoolExecutor(max_workers=5) as executor:
    results = list(executor.map(process_job_row, row_data))

print("Updating DataFrame with results...")

# Write each result back into its row, one column at a time.
for result in results:
    if not result:  # skip empty/None results
        continue
    index = result['index']
    for column_name in result_defaults:
        sample_df.at[index, column_name] = result[column_name]

print("Processing completed!")

Processing 923 jobs with 5 concurrent threads...
Updating DataFrame with results...
Processing completed!


In [23]:
# Remove the bulky helper columns before the results are inspected/exported.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
sample_df = sample_df.drop(columns=['file_path', 'summary'], errors='ignore')

In [31]:
# sample_df.to_csv(LLM_JOBS_ANALYSIS_PATH, index=False)