# About

This notebook 
- converts Skills.xlsx into Skills.csv
- outputs soc_mapping.csv
- outputs skills_importance.csv
- outputs skill_based_risk.csv


In [1]:
import pandas as pd
import numpy as np

In [2]:
TECH_INTENSITY_PATH = '../data/tech_intensity_simple.csv'

In [3]:
pd.set_option('display.max_columns', None)

# soc_mapping.csv

In [4]:
import json

# Read the SOC mapping JSON file
with open('../data/soc_mapping.json', 'r') as f:
    soc_mapping = json.load(f)

df = pd.DataFrame.from_dict(soc_mapping, orient='index')

# Reset index to make the SOC codes a column
df = df.reset_index()
df = df.rename(columns={'index': 'SOC Code'})
# First remove the hyphen, then take the appropriate number of digits
df['normalized_SOC_Code'] = df['SOC Code'].str.replace('-', '')
df['normalized_major_code'] = df['SOC Code'].str.replace('-', '').str[:2]
df['normalized_minor_code'] = df['SOC Code'].str.replace('-', '').str[:3]
df['normalized_broad_code'] = df['SOC Code'].str.replace('-', '').str[:4]

print("\nColumns in the DataFrame:")
print(df.columns.tolist())


Columns in the DataFrame:
['SOC Code', 'detailed_title', 'major_code', 'major_title', 'minor_code', 'minor_title', 'broad_code', 'broad_title', 'normalized_SOC_Code', 'normalized_major_code', 'normalized_minor_code', 'normalized_broad_code']


In [5]:
# df.to_csv('../data/soc_mapping.csv', index=False)

# skills.csv

In [6]:
skills_df = pd.read_excel('../data/ONET/Skills.xlsx')

In [7]:
skills_df.head()

Unnamed: 0,O*NET-SOC Code,Title,Element ID,Element Name,Scale ID,Scale Name,Data Value,N,Standard Error,Lower CI Bound,Upper CI Bound,Recommend Suppress,Not Relevant,Date,Domain Source
0,11-1011.00,Chief Executives,2.A.1.a,Reading Comprehension,IM,Importance,4.12,8,0.125,3.88,4.37,N,,08/2023,Analyst
1,11-1011.00,Chief Executives,2.A.1.a,Reading Comprehension,LV,Level,4.62,8,0.183,4.2664,4.9836,N,N,08/2023,Analyst
2,11-1011.00,Chief Executives,2.A.1.b,Active Listening,IM,Importance,4.0,8,0.0,4.0,4.0,N,,08/2023,Analyst
3,11-1011.00,Chief Executives,2.A.1.b,Active Listening,LV,Level,4.75,8,0.1637,4.4292,5.0708,N,N,08/2023,Analyst
4,11-1011.00,Chief Executives,2.A.1.c,Writing,IM,Importance,4.12,8,0.125,3.88,4.37,N,,08/2023,Analyst


In [8]:
# print unique values of Element Name
print(f"unique count of Element Name: {len(skills_df['Element Name'].unique())}")
print(f"unique values of Element Name: {skills_df['Element Name'].unique()}")

print(f"unique count of O*NET-SOC Code: {len(skills_df['O*NET-SOC Code'].unique())}")
print(f"unique count of SOC code without .specification: {len(skills_df['O*NET-SOC Code'].unique())}")

unique count of Element Name: 35
unique values of Element Name: ['Reading Comprehension' 'Active Listening' 'Writing' 'Speaking'
 'Mathematics' 'Science' 'Critical Thinking' 'Active Learning'
 'Learning Strategies' 'Monitoring' 'Social Perceptiveness' 'Coordination'
 'Persuasion' 'Negotiation' 'Instructing' 'Service Orientation'
 'Complex Problem Solving' 'Operations Analysis' 'Technology Design'
 'Equipment Selection' 'Installation' 'Programming'
 'Operations Monitoring' 'Operation and Control' 'Equipment Maintenance'
 'Troubleshooting' 'Repairing' 'Quality Control Analysis'
 'Judgment and Decision Making' 'Systems Analysis' 'Systems Evaluation'
 'Time Management' 'Management of Financial Resources'
 'Management of Material Resources' 'Management of Personnel Resources']
unique count of O*NET-SOC Code: 879
unique count of SOC code without .specification: 879


In [9]:
# Update SOC_Code column based on the condition
skills_df['SOC_Code'] = skills_df['O*NET-SOC Code'].apply(lambda x: x.split('.')[0] if '.' in x else x)

In [10]:
# skills_df.to_csv('Skills.csv', index=False)

# skills_importance.csv

In [11]:
skills_importance_df = skills_df[skills_df['Scale Name'] == 'Importance']

In [12]:
skills_importance_df  = skills_importance_df.pivot(
    index=['O*NET-SOC Code', 'Title'],
    columns='Element Name',
    values='Data Value'
).reset_index()

In [13]:
skills_importance_df['normalized_major_code'] = skills_importance_df['O*NET-SOC Code'].str.replace('-', '').str[:2]
skills_importance_df['normalized_minor_code'] = skills_importance_df['O*NET-SOC Code'].str.replace('-', '').str[:3]


In [14]:
skills_importance_df

Element Name,O*NET-SOC Code,Title,Active Learning,Active Listening,Complex Problem Solving,Coordination,Critical Thinking,Equipment Maintenance,Equipment Selection,Installation,Instructing,Judgment and Decision Making,Learning Strategies,Management of Financial Resources,Management of Material Resources,Management of Personnel Resources,Mathematics,Monitoring,Negotiation,Operation and Control,Operations Analysis,Operations Monitoring,Persuasion,Programming,Quality Control Analysis,Reading Comprehension,Repairing,Science,Service Orientation,Social Perceptiveness,Speaking,Systems Analysis,Systems Evaluation,Technology Design,Time Management,Troubleshooting,Writing,normalized_major_code,normalized_minor_code
0,11-1011.00,Chief Executives,3.75,4.00,4.38,4.25,4.38,1.00,1.12,1.00,3.38,4.75,3.12,4.25,4.00,4.25,3.25,4.00,4.12,1.88,3.12,2.00,4.00,1.75,1.88,4.12,1.00,1.62,3.12,4.12,4.25,4.12,4.25,1.75,4.00,1.50,4.12,11,111
1,11-1011.03,Chief Sustainability Officers,3.75,4.00,4.00,3.75,4.12,1.00,1.12,1.00,3.25,3.88,3.38,2.88,2.25,3.12,2.88,3.75,3.12,2.00,2.88,2.00,3.88,1.88,1.88,4.00,1.00,2.12,3.25,3.88,4.00,3.88,3.88,1.88,3.38,1.00,4.12,11,111
2,11-1021.00,General and Operations Managers,3.62,4.00,3.62,3.88,3.88,1.00,1.00,1.00,3.00,3.62,3.00,3.00,3.12,3.75,2.62,4.00,3.50,1.88,2.50,2.25,3.62,1.50,2.38,4.00,1.00,1.50,3.25,3.75,4.00,3.12,3.12,1.50,3.62,1.75,3.50,11,111
3,11-2011.00,Advertising and Promotions Managers,3.25,4.12,3.50,3.50,4.00,1.00,1.12,1.00,2.88,3.75,3.00,2.75,2.62,3.12,3.00,3.25,3.12,1.00,2.75,1.62,3.38,1.75,1.62,3.75,1.00,1.62,3.12,4.00,4.00,3.12,3.12,1.75,3.50,1.00,3.75,11,112
4,11-2021.00,Marketing Managers,3.88,3.88,3.62,3.50,3.88,1.00,1.00,1.00,3.00,3.75,3.12,2.88,2.62,3.38,2.75,3.75,3.62,1.00,3.38,1.75,3.75,1.88,1.88,3.88,1.00,1.75,3.12,3.88,3.88,3.25,3.50,1.75,3.50,1.00,3.25,11,112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
874,53-7071.00,Gas Compressor and Gas Pumping Station Operators,2.88,3.12,3.00,3.00,3.62,3.12,2.12,1.00,2.50,3.00,2.25,1.62,2.00,2.25,2.25,3.12,2.25,3.88,1.00,4.00,2.25,1.38,3.00,3.12,3.00,1.88,2.25,2.75,3.00,2.62,2.00,1.88,3.00,3.12,3.00,53,537
875,53-7072.00,"Pump Operators, Except Wellhead Pumpers",2.88,3.12,2.88,2.88,3.12,2.88,2.25,1.12,2.88,3.00,2.12,2.00,2.25,2.62,2.38,3.50,2.25,3.50,1.88,3.88,2.38,1.75,2.88,3.12,2.75,2.00,2.38,2.75,3.12,2.38,2.12,1.88,3.00,3.00,2.88,53,537
876,53-7073.00,Wellhead Pumpers,2.38,2.88,3.00,2.25,3.12,3.12,2.25,1.25,2.12,3.12,1.88,1.38,1.75,2.12,2.25,3.12,2.00,3.88,1.38,4.00,2.00,1.25,2.38,2.75,3.12,1.12,1.88,2.50,3.00,2.00,2.00,1.50,2.75,3.12,2.62,53,537
877,53-7081.00,Refuse and Recyclable Material Collectors,2.25,2.88,2.38,2.62,2.75,2.75,1.75,1.00,1.88,2.38,1.38,1.00,1.00,2.00,1.00,2.50,2.00,3.00,1.12,3.00,2.00,1.00,2.25,2.62,2.50,1.00,2.38,2.50,2.88,1.38,1.38,1.00,2.50,2.50,2.50,53,537


In [15]:
skills_importance_df.sort_values(by='Operations Analysis', ascending=False)[:30][['O*NET-SOC Code', 'Title', 'Operations Analysis']]

Element Name,O*NET-SOC Code,Title,Operations Analysis
123,17-1011.00,"Architects, Except Landscape and Naval",4.0
128,17-2011.00,Aerospace Engineers,3.88
152,17-2161.00,Nuclear Engineers,3.75
326,27-1027.00,Set and Exhibit Designers,3.75
130,17-2031.00,Bioengineers and Biomedical Engineers,3.75
117,15-2031.00,Operations Research Analysts,3.75
150,17-2141.02,Automotive Engineers,3.75
113,15-1299.08,Computer Systems Engineers/Architects,3.75
131,17-2041.00,Chemical Engineers,3.62
34,11-9111.00,Medical and Health Services Managers,3.62


In [16]:
# skills_importance_df.to_csv('../data/skills/skills_importance.csv', index=False)

# skill_based_risk

In [17]:
# Define your skill_categories mapping
skill_categories = {
    'basic_skills': [
        'Reading Comprehension', 'Active Listening', 'Writing', 'Speaking',
    ],
    'cognitive_skills': [
        'Critical Thinking', 'Active Learning', 'Learning Strategies',
        'Monitoring', 'Complex Problem Solving', 'Judgment and Decision Making', 'Operations Analysis'
    ],
    'social_skills': [
        'Social Perceptiveness', 'Coordination', 'Persuasion',
        'Negotiation', 'Instructing', 'Service Orientation'
    ],
    'operations_skills': [
        'Operation and Control', 'Operations Monitoring', 'Quality Control Analysis', 'Troubleshooting'
    ],
    'maintenance_skills': [
        'Equipment Selection', 'Installation', 'Equipment Maintenance', 'Repairing'
    ],
    'technical_skills': [
        'Technology Design', 'Programming', 'Mathematics', 'Science'
    ],
    'management_skills': [
        'Systems Analysis', 'Systems Evaluation', 'Time Management',
        'Management of Financial Resources', 'Management of Material Resources',
        'Management of Personnel Resources'
    ]
}

# Prepare a dict for the new DataFrame
new_data = {
    'O*NET-SOC Code': skills_importance_df['O*NET-SOC Code'],
    'Title': skills_importance_df['Title'],
}

# For each category, compute the mean of the mapped columns and add as a new column
for cat, skills in skill_categories.items():
    present_skills = [s for s in skills if s in skills_importance_df.columns]
    new_data[cat] = skills_importance_df[present_skills].mean(axis=1)

# Create the new DataFrame
skills_combined_df = pd.DataFrame(new_data)

# Display the first few rows to check
display(skills_combined_df.head())

Unnamed: 0,O*NET-SOC Code,Title,basic_skills,cognitive_skills,social_skills,operations_skills,maintenance_skills,technical_skills,management_skills
0,11-1011.00,Chief Executives,4.1225,3.928571,3.831667,1.815,1.03,2.0925,4.145
1,11-1011.03,Chief Sustainability Officers,4.03,3.68,3.521667,1.72,1.03,2.19,3.231667
2,11-1021.00,General and Operations Managers,3.875,3.462857,3.5,2.065,1.0,1.78,3.288333
3,11-2011.00,Advertising and Promotions Managers,3.905,3.357143,3.333333,1.31,1.03,2.03,3.038333
4,11-2021.00,Marketing Managers,3.7225,3.625714,3.478333,1.4075,1.0,2.0325,3.188333


In [18]:
skills_combined_df.columns

Index(['O*NET-SOC Code', 'Title', 'basic_skills', 'cognitive_skills',
       'social_skills', 'operations_skills', 'maintenance_skills',
       'technical_skills', 'management_skills'],
      dtype='object')

In [19]:
state = 'California' # The columns we are interested in are same across all the states
national_detailed_file_path = '../generation/national_summary_detailed.csv'
national_detailed_df = pd.read_csv(national_detailed_file_path)
national_detailed_df = national_detailed_df[national_detailed_df['AREA_TITLE'] == state]

In [20]:
national_detailed_df.head()

Unnamed: 0,AREA,AREA_TITLE,AREA_TYPE,PRIM_STATE,NAICS,NAICS_TITLE,I_GROUP,OWN_CODE,OCC_CODE,OCC_TITLE,O_GROUP,TOT_EMP,EMP_PRSE,JOBS_1000,LOC_QUOTIENT,PCT_TOTAL,PCT_RPT,H_MEAN,A_MEAN,MEAN_PRSE,H_PCT10,H_PCT25,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY,SOC_Code_Cleaned,tech_intensity_score,automation_susceptibility,automation_risk_category,major_group,estimated_zapier_apps_major,major_group_emp_total,emp_proportion,estimated_zapier_apps,zapier_apps_per_worker,zapier_apps_normalized,automation_susceptibility_norm,enhanced_automation_risk,enhanced_risk_category,economic_value,is_potentially_at_risk,is_currently_at_risk,potential_index,econ_potential_index,minor_group,minor_group_name,state
2682,6,California,2,CA,0,Cross-industry,cross-industry,1235,11-1011,Chief Executives,detailed,38920.0,2.4,2.169,1.56,,,135.11,281030.0,1.4,42.01,70.21,111.10,#,#,87390,146030,231080.0,#,#,,,111011,12.305882,37.703481,Low,11,0.0,1308810.0,0.029737,0.0,0.0,0,0.377035,28.277611,Low,10937690000.0,False,False,0.0,0.089501,111,111,California
2683,6,California,2,CA,0,Cross-industry,cross-industry,1235,11-1021,General and Operations Managers,detailed,273630.0,0.9,15.248,0.66,,,74.60,155170.0,4.2,29.57,39.52,58.72,86.00,#,61500,82210,122140.0,178870,#,,,111021,14.610314,43.238114,Moderate,11,0.0,1308810.0,0.209068,0.0,0.0,0,0.432381,32.428586,Low,42459170000.0,False,False,0.0,0.398438,111,111,California
2684,6,California,2,CA,0,Cross-industry,cross-industry,1235,11-1031,Legislators,detailed,2120.0,5.2,0.118,0.55,,,*,86890.0,2.9,*,*,*,*,*,34810,38940,62380.0,104450,136310,True,,111031,10.412132,32.905331,Low,11,0.0,1308810.0,0.00162,0.0,0.0,0,0.329053,24.678998,Low,184206800.0,False,False,0.0,0.001316,111,111,California
2685,6,California,2,CA,0,Cross-industry,cross-industry,1235,11-2011,Advertising and Promotions Managers,detailed,2880.0,7.2,0.16,1.18,,,84.18,175080.0,6.0,36.99,50.12,67.48,97.44,#,76950,104240,140360.0,202670,#,,,112011,13.092828,40.677276,Moderate,11,0.0,1308810.0,0.0022,0.0,0.0,0,0.406773,30.507957,Low,504230400.0,False,False,0.0,0.004451,112,112,California
2686,6,California,2,CA,0,Cross-industry,cross-industry,1235,11-2021,Marketing Managers,detailed,59830.0,2.8,3.334,1.37,,,92.66,192730.0,1.0,45.87,62.45,83.89,109.89,#,95400,129890,174480.0,228560,#,,,112021,18.60031,55.7113,Moderate,11,0.0,1308810.0,0.045713,0.0,0.0,0,0.557113,41.783475,Moderate,11531040000.0,True,False,100.0,0.139423,112,112,California


In [21]:
def get_automation_sustainability_score_component(onet_soc_code):
    global national_detailed_df
    if '.' in onet_soc_code:
        occ_code = onet_soc_code.split('.')[0]
    else:
        occ_code = onet_soc_code

    occ_df = national_detailed_df[national_detailed_df['OCC_CODE'] == occ_code]
    if occ_df.empty:
        print(f"No data found for {occ_code} in national_detailed_df")
        return None

    score = occ_df['automation_susceptibility_norm'].iloc[0]
    return float(score)

In [22]:
def add_automation_sustainability_score_component(df):
    global national_detailed_df
    
    top_10_perc_thresh = df["technical_skills"].quantile(0.9)
    df["automation_risk"] = df["intermediate_automation_risk"]
    ctr = 0
    for index, row in df.iterrows():
        technical_skills = row["technical_skills"]
        if technical_skills >= top_10_perc_thresh:
            automation_sustainability_norm = get_automation_sustainability_score_component(row["O*NET-SOC Code"])
            if automation_sustainability_norm is not None:
                ctr += 1
                df.at[index, "automation_risk"] = 0.2 * df.at[index, "intermediate_automation_risk"] + 0.8 * automation_sustainability_norm
    
    print(f"Updated {ctr} jobs with automation_susceptibility_norm score")
    df["automation_risk_score"] = (
        100 / (1 + np.exp(-4 * ((df["automation_risk"]) - 0.40)))
    ).round(1)
    return df


In [23]:
# import re
# ------------------------------------------------------------
# 1.  sector‑growth keywords for the WEF “discount” step
# ------------------------------------------------------------
# GROWTH_SECTOR_PATTERNS = re.compile(
#     r"(nurs|therap|counsel|teacher|educat|"      # care & education
#     r"ai\b|ml\b|machine learning|data|cyber|"    # digital / AI / security
#     r"engineer|developer|analyst|"               # generic digital titles
#     r"renewable|solar|wind|green|sustain|"       # green transition
#     r"project manager|operations manager)",      # leadership / project
#     flags=re.I
# )

# ------------------------------------------------------------
# 2.  helper to normalize each 1‑to‑5 skill bucket to 0‑1
# ------------------------------------------------------------
def _norm(series: pd.Series) -> pd.Series:
    return (series - 1.0) / 4.0

# ------------------------------------------------------------
# 3.  main scorer
# ------------------------------------------------------------
def add_intermediate_automation_risk(df: pd.DataFrame) -> pd.DataFrame:
    """Append a 0‑100 'automation_risk_score' column to the skills dataframe.

    Expected numeric columns (1‑5 scale):
      basic_skills, cognitive_skills, social_skills,
      operations_skills, maintenance_skills,
      technical_skills, management_skills
    """
    # 1. normalise buckets
    basic_skills  = _norm(df["basic_skills"])
    cognitive_skills  = _norm(df["cognitive_skills"])
    social_skills  = _norm(df["social_skills"])
    operations_skills  = _norm(df["operations_skills"])
    maintenance_skills  = _norm(df["maintenance_skills"])
    technical_skills  = _norm(df["technical_skills"])
    management_skills  = _norm(df["management_skills"])

    # 2. composite indices
    field_intensity = 0.3 * operations_skills + 0.7 * maintenance_skills
    # human_capital     = (
    #     0.60 * social_skills + 0.40 * management_skills # + 0.20 * cognitive_skills + 0.10 * basic_skills
    # )   # ↑ social/management weight, ↓ basic  #v1 - works for food grader

    human_capital     = (
        np.maximum(social_skills, management_skills) # any such skill heavily required would be a good indicator of human capital
    )   # ↑ social/management weight, ↓ basic
    
    tech_shield       = 0.30 * technical_skills + 0.10 * cognitive_skills 
    # tech shield is the sum of technical and cognitive skills - helps in longer term job creation but not in short term

    tech_composition = (technical_skills) / (technical_skills + management_skills + social_skills + maintenance_skills)

    # 3. raw risk (0‑1) with updated weights
    # Long-term
    # raw = (
    #     0.55 * routine_intensity
    #     + 0.30 * (1 - human_capital)
    #     + 0.15 * (1 - tech_shield)
    # )

    # Modified
    # Short-term
    raw = (
        0.5 * (tech_composition)  # The more technical the job, the higher the risk
        + 0.3 * (1 - field_intensity)
        + 0.2 * (1 - human_capital)
        # + 0.15 * (1 - tech_shield)
    )

    # 4. WEF growth‑sector discount (‑30 %) – stricter threshold
    # mask_growth = df["Title"].str.contains(GROWTH_SECTOR_PATTERNS, na=False) & (
    #     (social_skills >= 0.60) | (technical_skills >= 0.60)   # need ≥3.4 in raw 1‑5 scale
    # )

    # raw = raw.mask(mask_growth, raw * 0.70)

    # 5. scale to 0‑100
    df = df.copy()
    df["intermediate_automation_risk"] = (raw).round(2)
    # df["automation_risk_score"] = (
    #     100 / (1 + np.exp(-4 * (raw - 0.40)))
    # ).round(1)

    return df



In [24]:
def find_most_similar_job_from_skills_df(onet_soc_code, skills_df) -> str:
    """
        return the most similar job (O*NET-SOC Code) from skills_df to the given onet_soc_code (O*NET-SOC Code)
    """
    # Get all unique O*NET-SOC codes from the skills dataframe
    available_codes = skills_df['O*NET-SOC Code'].unique()
    
    # If the exact code exists, return it
    if onet_soc_code in available_codes:
        return onet_soc_code
    
    max_common_length = 0
    candidates = []
    
    # First pass: find all codes with the maximum common prefix length
    for code in available_codes:
        # Calculate common prefix length
        common_length = 0
        min_length = min(len(onet_soc_code), len(code))
        
        for i in range(min_length):
            if onet_soc_code[i] == code[i]:
                common_length += 1
            else:
                break
        
        # Update candidates if this has a longer common prefix
        if common_length > max_common_length:
            max_common_length = common_length
            candidates = [code]
        elif common_length == max_common_length:
            candidates.append(code)
    
    # If only one candidate, return it
    if len(candidates) == 1:
        return candidates[0]
    
    # Second pass: among candidates with same prefix length, find closest differing character
    best_match = None
    min_diff = float('inf')
    
    for code in candidates:
        if max_common_length < len(onet_soc_code) and max_common_length < len(code):
            # Compare the differing characters
            char1 = onet_soc_code[max_common_length]
            char2 = code[max_common_length]
            
            # If both are digits, compare numerically
            if char1.isdigit() and char2.isdigit():
                diff = abs(int(char1) - int(char2))
            else:
                # Fallback to ASCII difference for non-numeric characters
                diff = abs(ord(char1) - ord(char2))
            
            if diff < min_diff:
                min_diff = diff
                best_match = code
        else:
            # If one string is shorter, prefer the shorter one (more general)
            if best_match is None:
                best_match = code
    
    return best_match if best_match is not None else candidates[0]
    

def add_missing_jobs_from_national_detailed_df(national_detailed_df, skills_combined_df) -> pd.DataFrame:
    job_codes_indexed = set(skills_combined_df["O*NET-SOC Code"].values)

    national_detailed_df = national_detailed_df.drop_duplicates(subset="OCC_CODE", keep="first")[["OCC_CODE", "OCC_TITLE"]]
    national_detailed_df["O*NET-SOC Code"] = national_detailed_df["OCC_CODE"].apply(lambda x: f"{x}.00")
    
    # Add missing_job column if it doesn't exist
    if 'missing_job' not in skills_combined_df.columns:
        skills_combined_df['missing_job'] = False

    missing_jobs_onet_soc_codes = []
    rows_to_add = []
    
    for index, row in national_detailed_df.iterrows():
        code = row["O*NET-SOC Code"]
        title = row["OCC_TITLE"]

        if code in job_codes_indexed:
            continue

        # Find the most similar job
        replaced_code = find_most_similar_job_from_skills_df(code, skills_combined_df)
        if replaced_code is None:
            print(f"No similar job found for: {code} - {title}")
            continue
            
        # Get the closest job row
        closest_job_rows = skills_combined_df[skills_combined_df["O*NET-SOC Code"] == replaced_code]
        if closest_job_rows.empty:
            print(f"Replaced code {replaced_code} not found in dataframe")
            continue
            
        closest_job_row = closest_job_rows.iloc[0].copy()
        
        # Update the row with new information
        closest_job_row["O*NET-SOC Code"] = code
        closest_job_row["Title"] = title
        closest_job_row['missing_job'] = True
        
        rows_to_add.append(closest_job_row)

        print(f"Missing job: {code} - {title} --> replacing with most similar job {replaced_code} - {closest_job_rows.iloc[0]['Title']}")
        missing_jobs_onet_soc_codes.append(code)
    
    # Add all new rows at once using pd.concat (more efficient than append)
    if rows_to_add:
        new_rows_df = pd.DataFrame(rows_to_add)
        skills_combined_df = pd.concat([skills_combined_df, new_rows_df], ignore_index=True)
    
    print(f"Mapped {len(missing_jobs_onet_soc_codes)} missing jobs from skills.csv")

    return skills_combined_df

In [25]:
def add_adoption_rate_to_skills_df(skills_df):
    """
    Add adoption rate to skills_df
    """
    tech_intensity_df = pd.read_csv(TECH_INTENSITY_PATH)
    tech_intensity_df = tech_intensity_df.rename(columns={'hot_tech_ratio': 'adoption_rate'})  # hot tech ratio is a proxy
    
    tech_intensity_df = tech_intensity_df[['O*NET-SOC Code', 'adoption_rate']]
    skills_df = skills_df.merge(tech_intensity_df, on=['O*NET-SOC Code'], how='left')
    
    skills_df['adoption_rate'] = skills_df['adoption_rate'].fillna(skills_df['adoption_rate'].median())
    return skills_df

In [26]:
skills_combined_df[skills_combined_df["O*NET-SOC Code"] == "19-1099.00"]

Unnamed: 0,O*NET-SOC Code,Title,basic_skills,cognitive_skills,social_skills,operations_skills,maintenance_skills,technical_skills,management_skills


In [27]:
# ------------------------------------------------------------
# 4.  usage
# ------------------------------------------------------------
skills_combined_df = add_intermediate_automation_risk(skills_combined_df)

In [28]:
skills_combined_df = add_automation_sustainability_score_component(skills_combined_df)

No data found for 29-2036 in national_detailed_df
Updated 87 jobs with automation_susceptibility_norm score


In [29]:
skills_combined_df = add_missing_jobs_from_national_detailed_df(national_detailed_df, skills_combined_df)

Missing job: 11-1031.00 - Legislators --> replacing with most similar job 11-1021.00 - General and Operations Managers
Missing job: 11-2032.00 - Public Relations Managers --> replacing with most similar job 11-2021.00 - Marketing Managers
Missing job: 11-2033.00 - Fundraising Managers --> replacing with most similar job 11-2021.00 - Marketing Managers
Missing job: 11-9039.00 - Education Administrators, All Other --> replacing with most similar job 11-9033.00 - Education Administrators, Postsecondary
Missing job: 11-9072.00 - Entertainment and Recreation Managers, Except Gambling --> replacing with most similar job 11-9071.00 - Gambling Managers
Missing job: 11-9179.00 - Personal Service Managers, All Other --> replacing with most similar job 11-9179.01 - Fitness and Wellness Coordinators
Missing job: 11-9199.00 - Managers, All Other --> replacing with most similar job 11-9199.01 - Regulatory Affairs Managers
Missing job: 13-1020.00 - Buyers and Purchasing Agents --> replacing with most

In [None]:
skills_combined_df = add_adoption_rate_to_skills_df(skills_combined_df)

In [31]:
skills_combined_df.head()

Unnamed: 0,O*NET-SOC Code,Title,basic_skills,cognitive_skills,social_skills,operations_skills,maintenance_skills,technical_skills,management_skills,intermediate_automation_risk,automation_risk,automation_risk_score,missing_job,adoption_rate
0,11-1011.00,Chief Executives,4.1225,3.928571,3.831667,1.815,1.03,2.0925,4.145,0.4,0.4,50.0,False,0.346939
1,11-1011.03,Chief Sustainability Officers,4.03,3.68,3.521667,1.72,1.03,2.19,3.231667,0.46,0.46,56.0,False,0.666667
2,11-1021.00,General and Operations Managers,3.875,3.462857,3.5,2.065,1.0,1.78,3.288333,0.42,0.42,52.0,False,0.335616
3,11-2011.00,Advertising and Promotions Managers,3.905,3.357143,3.333333,1.31,1.03,2.03,3.038333,0.47,0.47,57.0,False,0.39726
4,11-2021.00,Marketing Managers,3.7225,3.625714,3.478333,1.4075,1.0,2.0325,3.188333,0.46,0.46,56.0,False,0.460526


In [32]:
skills_combined_df[skills_combined_df['missing_job'] == True]

Unnamed: 0,O*NET-SOC Code,Title,basic_skills,cognitive_skills,social_skills,operations_skills,maintenance_skills,technical_skills,management_skills,intermediate_automation_risk,automation_risk,automation_risk_score,missing_job,adoption_rate
879,11-1031.00,Legislators,3.8750,3.462857,3.500000,2.0650,1.0000,1.7800,3.288333,0.42,0.42,52.0,True,0.343750
880,11-2032.00,Public Relations Managers,3.7225,3.625714,3.478333,1.4075,1.0000,2.0325,3.188333,0.46,0.46,56.0,True,0.469697
881,11-2033.00,Fundraising Managers,3.7225,3.625714,3.478333,1.4075,1.0000,2.0325,3.188333,0.46,0.46,56.0,True,0.433333
882,11-9039.00,"Education Administrators, All Other",4.0300,3.590000,3.685000,1.3450,1.0000,1.8450,3.375000,0.43,0.43,53.0,True,0.388889
883,11-9072.00,"Entertainment and Recreation Managers, Except ...",3.6875,3.447143,3.583333,1.9675,1.0000,1.9975,3.226667,0.43,0.43,53.0,True,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
986,53-3099.00,"Motor Vehicle Operators, All Other",2.8425,2.427143,2.560000,2.9050,1.4700,1.4375,2.083333,0.42,0.42,52.0,True,0.388889
987,53-4099.00,"Rail Transportation Workers, All Other",3.2175,2.820000,2.811667,3.2200,1.8425,1.3450,2.186667,0.36,0.36,46.0,True,0.388889
988,53-6032.00,Aircraft Service Attendants,2.9025,2.535714,2.748333,2.7475,2.5950,1.6250,2.206667,0.35,0.35,45.0,True,0.333333
989,53-6099.00,"Transportation Workers, All Other",3.2500,2.427143,3.168333,2.2825,1.0000,1.4700,1.876667,0.43,0.43,53.0,True,0.388889


In [33]:
skills_combined_df[skills_combined_df['automation_risk'] != skills_combined_df['intermediate_automation_risk']]

Unnamed: 0,O*NET-SOC Code,Title,basic_skills,cognitive_skills,social_skills,operations_skills,maintenance_skills,technical_skills,management_skills,intermediate_automation_risk,automation_risk,automation_risk_score,missing_job,adoption_rate
35,11-9121.00,Natural Sciences Managers,3.9400,3.697143,3.230000,2.0300,1.0300,2.9350,3.080000,0.52,0.539570,63.6,False,0.655172
69,13-1081.01,Logistics Engineers,3.9675,3.608571,3.083333,2.3125,1.3775,2.9375,3.395000,0.47,0.430260,53.0,False,0.448980
92,13-2099.01,Financial Quantitative Analysts,3.7500,3.357143,2.980000,1.4975,1.0000,2.6900,2.833333,0.54,0.632993,71.7,False,0.777778
94,15-1211.00,Computer Systems Analysts,3.7525,3.410000,2.895000,2.9050,1.9075,2.8450,2.936667,0.45,0.667529,74.5,False,0.395498
95,15-1211.01,Health Informatics Specialists,4.0950,3.695714,3.186667,2.2475,1.2825,2.6875,3.020000,0.48,0.673529,74.9,False,0.412698
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
904,19-2099.00,"Physical Scientists, All Other",3.9100,3.518571,2.813333,2.0950,1.1250,3.2225,2.771667,0.57,0.600911,69.1,True,0.388889
934,29-1242.00,"Orthopedic Surgeons, Except Pediatric",4.0925,3.661429,3.666667,2.5925,1.4700,2.6900,2.873333,0.43,0.175086,28.9,True,0.250000
935,29-1243.00,Pediatric Surgeons,4.0925,3.661429,3.666667,2.5925,1.4700,2.6900,2.873333,0.43,0.175086,28.9,True,0.250000
936,29-1249.00,"Surgeons, All Other",4.0925,3.661429,3.666667,2.5925,1.4700,2.6900,2.873333,0.43,0.175086,28.9,True,0.388889


In [34]:
print(f"median: {skills_combined_df['automation_risk_score'].median()}")
print(f"std: {skills_combined_df['automation_risk_score'].std()}")
print(f"min: {skills_combined_df['automation_risk_score'].min()}")
print(f"max: {skills_combined_df['automation_risk_score'].max()}")
print(f"iqr: {skills_combined_df['automation_risk_score'].quantile(0.75) - skills_combined_df['automation_risk_score'].quantile(0.25)}")
print(f"mean: {skills_combined_df['automation_risk_score'].mean()}")

median: 54.0
std: 6.43425165416634
min: 28.9
max: 81.8
iqr: 7.899999999999999
mean: 53.854389505549946


```
Prior without automation_susceptibility_norm

median: 54.0
std: 5.709221485812173
min: 38.2
max: 71.5
iqr: 7.899999999999999
mean: 53.725142207053466
```

In [35]:
skills_combined_df.sort_values(by='automation_risk_score', ascending=False, inplace=True)

In [36]:
skills_combined_df[:40][['O*NET-SOC Code', 'Title', 'automation_risk_score', 'automation_risk', 'basic_skills', 'cognitive_skills', 'social_skills', 'operations_skills', 'maintenance_skills', 'technical_skills', 'management_skills']]

Unnamed: 0,O*NET-SOC Code,Title,automation_risk_score,automation_risk,basic_skills,cognitive_skills,social_skills,operations_skills,maintenance_skills,technical_skills,management_skills
107,15-1253.00,Software Quality Assurance Analysts and Testers,81.8,0.775459,3.8775,3.302857,2.603333,2.785,1.4675,2.935,2.415
893,15-1252.00,Software Developers,78.5,0.723413,3.375,3.195714,2.601667,2.28,1.0625,3.3425,2.706667
106,15-1251.00,Computer Programmers,78.5,0.723413,3.375,3.195714,2.601667,2.28,1.0625,3.3425,2.706667
104,15-1243.01,Data Warehousing Specialists,77.2,0.705171,3.5025,3.161429,2.788333,2.095,1.185,2.8125,2.563333
108,15-1254.00,Web Developers,77.0,0.701428,3.3725,3.392857,2.856667,2.185,1.3125,2.8425,2.5
103,15-1243.00,Database Architects,76.8,0.699171,3.5,3.43,2.896667,1.9375,1.37,2.815,2.75
894,15-1255.00,Web and Digital Interface Designers,76.3,0.692879,3.5,3.355714,2.958333,1.5925,1.06,2.815,2.753333
109,15-1255.01,Video Game Designers,76.3,0.692879,3.5,3.355714,2.958333,1.5925,1.06,2.815,2.753333
102,15-1242.00,Database Administrators,75.2,0.677884,3.5325,3.482857,2.73,2.3125,1.25,2.69,2.583333
95,15-1211.01,Health Informatics Specialists,74.9,0.673529,4.095,3.695714,3.186667,2.2475,1.2825,2.6875,3.02


In [37]:
skills_importance_df[
    skills_importance_df['O*NET-SOC Code'].isin(['31-9011.00', '29-1022.00', '27-2011.00' , '45-2021.00'])
]

Element Name,O*NET-SOC Code,Title,Active Learning,Active Listening,Complex Problem Solving,Coordination,Critical Thinking,Equipment Maintenance,Equipment Selection,Installation,Instructing,Judgment and Decision Making,Learning Strategies,Management of Financial Resources,Management of Material Resources,Management of Personnel Resources,Mathematics,Monitoring,Negotiation,Operation and Control,Operations Analysis,Operations Monitoring,Persuasion,Programming,Quality Control Analysis,Reading Comprehension,Repairing,Science,Service Orientation,Social Perceptiveness,Speaking,Systems Analysis,Systems Evaluation,Technology Design,Time Management,Troubleshooting,Writing,normalized_major_code,normalized_minor_code
327,27-2011.00,Actors,2.62,3.75,2.88,2.88,3.0,1.0,1.0,1.0,2.75,2.88,2.75,1.0,1.0,2.38,1.0,3.0,2.5,1.0,1.75,1.25,2.5,1.0,1.0,3.88,1.0,1.5,2.12,3.75,3.88,2.0,2.0,1.25,3.0,1.0,2.88,27,272
356,29-1022.00,Oral and Maxillofacial Surgeons,3.88,3.88,4.12,3.62,4.0,1.88,2.12,1.0,2.88,4.12,2.88,2.0,2.0,2.75,2.62,3.88,2.62,2.88,3.12,2.75,2.75,1.5,2.75,4.0,1.88,3.5,3.25,3.75,3.88,3.25,3.0,2.0,3.38,1.88,3.5,29,291
443,31-9011.00,Massage Therapists,3.0,3.62,3.0,2.38,3.0,1.0,1.0,1.0,2.25,3.12,2.75,1.38,1.62,1.75,2.0,3.0,2.0,1.0,1.62,1.38,2.0,1.0,1.75,3.0,1.0,1.75,3.25,3.25,3.62,2.12,2.25,1.75,2.88,1.38,3.0,31,319
604,45-2021.00,Animal Breeders,3.0,3.0,3.0,2.5,3.25,1.0,1.38,1.0,1.88,3.0,2.12,2.12,2.0,2.0,2.12,3.0,1.75,2.62,1.62,2.88,2.38,1.5,2.62,2.88,1.0,3.0,2.38,2.62,3.0,2.38,2.12,1.25,2.75,2.0,2.75,45,452


In [38]:
skills_combined_df.to_csv('../v2_assets/skills_based_risk.csv', index=False)

In [39]:
skills_combined_df.shape

(991, 14)