# About

This notebook maps each O*NET-SOC code to a BLS Super Sector code (2-digit NAICS).

In [81]:
from enum import Enum
import instructor
import pandas as pd
from openai import OpenAI
from pydantic import BaseModel, Field

import numpy as np

import industry_type

In [45]:
# Load the skills table, keeping only the first row seen for each O*NET-SOC code.
skills_df = pd.read_csv('Skills.csv').drop_duplicates(subset="O*NET-SOC Code", keep="first")

# One {'code', 'title'} record per remaining occupation.
skill_list_codes = [
    {'code': onet_code, 'title': onet_title}
    for onet_code, onet_title in zip(skills_df['O*NET-SOC Code'], skills_df['Title'])
]

In [83]:
# Wrap a default-constructed OpenAI client with instructor; this `client` is the one
# get_bls_super_sector calls with `response_model=`.
# NOTE(review): OpenAI() is given no arguments here, so credentials presumably come
# from the environment — confirm before running.
client = instructor.from_openai(OpenAI())

# System message sent with every mapping request made by get_bls_super_sector.
system_prompt = """
You are a helpful assistant that can help with mapping ONET-SOC code to BLS Super Sector code.
Please use your knowledge of the BLS Super Sector code to help with the mapping.
This is being done for jobs in the US. Use your best judgement to map the job to the correct BLS Super Sector code.
"""

# Structured response schema passed as `response_model` in get_bls_super_sector.
# Documented with comments rather than a class docstring on purpose: pydantic copies
# a model's docstring into its JSON schema, which would change what is sent to the LLM.
class BLSSuperSectorExtracted(BaseModel):
    # Short free-text justification for the chosen sector.
    explanation: str = Field(description="A short explanation of the mapping less than 50 tokens")
    # Sector constrained to the members of industry_type.BLSSuperSector;
    # get_bls_super_sector returns this field's `.value`.
    # The description previously read "pay more to the title" (missing "attention").
    bls_super_sector: industry_type.BLSSuperSector = Field(
        description="Assigned (modified) BLS Super Sector as per the interpretation of the job title and ONET-SOC code: pay more attention to the title and should be STRICTLY one of the following"
    )


def get_bls_super_sector(job_onet_soc_code, job_title, model="gpt-4.1-nano"):
    """Ask the LLM for the BLS Super Sector of a single occupation.

    Args:
        job_onet_soc_code: O*NET-SOC code, interpolated into the user message.
        job_title: Occupation title, interpolated into the user message.
        model: Chat model name forwarded to the client. Defaults to the value
            that was previously hard-coded in this function.

    Returns:
        The ``.value`` of the ``bls_super_sector`` field of the parsed response,
        or ``np.nan`` when the request or response parsing raises. (The return
        annotation used to claim ``BLSSuperSectorExtracted``, which was never
        what callers received, so it has been dropped.)
    """
    query = f"""
    Job ONET-SOC code: {job_onet_soc_code}
    Job title: {job_title}
    """
    request_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query},
    ]
    try:
        op: BLSSuperSectorExtracted = client.chat.completions.create(
            messages=request_messages,
            model=model,
            response_model=BLSSuperSectorExtracted,
        )
        return op.bls_super_sector.value
    except Exception as e:
        # Deliberately best-effort: one failed row must not abort the whole batch,
        # but the underlying error is reported so failures can be diagnosed.
        print(
            f"Error Extracting BLS Super Sector: Code: {job_onet_soc_code} "
            f"Title: {job_title} Error: {e!r}"
        )
        return np.nan


In [92]:
# Query the LLM once per unique occupation and collect one record per row.
# get_bls_super_sector already catches request failures itself and returns NaN,
# so no extra try/except is needed here and a missing sector is always NaN.
OUTPUT_COLUMNS = ['O*NET-SOC Code', 'Title', 'Modified BLS Super Sector']

new_df_data = []
for count, (onet_soc_code, title) in enumerate(
    zip(skills_df['O*NET-SOC Code'], skills_df['Title']), start=1
):
    new_df_data.append(
        {
            'O*NET-SOC Code': onet_soc_code,
            'Title': title,
            'Modified BLS Super Sector': get_bls_super_sector(onet_soc_code, title),
        }
    )

    # Progress indicator: each row costs one API call.
    if count % 50 == 0:
        print(f"Processed {count} rows")

# Explicit columns keep the CSV header intact even when there are no rows.
new_df = pd.DataFrame(new_df_data, columns=OUTPUT_COLUMNS)
new_df.to_csv('modified_bls_super_sector_df.csv', index=False)

Processed 50 rows
Processed 100 rows
Processed 150 rows
Processed 200 rows
Processed 250 rows
Processed 300 rows
Processed 350 rows
Processed 400 rows
Processed 450 rows
Processed 500 rows
Processed 550 rows
Processed 600 rows
Processed 650 rows
Processed 700 rows
Processed 750 rows
Processed 800 rows
Processed 850 rows


In [93]:
# Writes `new_df` to the same CSV path that the mapping cell above already writes to.
new_df.to_csv('modified_bls_super_sector_df.csv', index=False)