# About

This notebook maps each O*NET-SOC occupation code to a (modified) BLS Super Sector code (2-digit NAICS).

In [34]:
from enum import Enum
from typing import Union

import instructor
import numpy as np
import pandas as pd
from openai import OpenAI
from pydantic import BaseModel, Field

import industry_type

### Step 1: Collect all job codes (including those missing from Skills.csv)

In [35]:
# Load the O*NET skills table and keep a single row per occupation code.
skills_df = pd.read_csv('../data/ONET/Skills.csv').drop_duplicates(
    subset="O*NET-SOC Code", keep="first"
)

# Running registry of every occupation collected so far; later cells extend both.
skill_list_codes = []
seen_onet_soc_codes = set()

for code, job_title in zip(skills_df['O*NET-SOC Code'], skills_df['Title']):
    if code not in seen_onet_soc_codes:
        seen_onet_soc_codes.add(code)
        skill_list_codes.append({'code': code, 'title': job_title})

print(f'Jobs from skills.csv: {len(skill_list_codes)}')

Jobs from skills.csv: 879


In [36]:
# National occupational summary; a later cell uses it to add occupations that
# are not present in Skills.csv.
national_detailed_df = pd.read_csv('../generation/national_summary_detailed.csv')

# One row per occupation code. The explicit `.copy()` makes this an independent
# frame, so adding a column below cannot trigger pandas' SettingWithCopyWarning.
more_jobs_from_national_detailed_df = (
    national_detailed_df
    .drop_duplicates(subset="OCC_CODE", keep="first")[["OCC_CODE", "OCC_TITLE"]]
    .copy()
)

# Append ".00" to each OCC_CODE so the codes can be compared with the
# O*NET-SOC codes already collected from Skills.csv.
more_jobs_from_national_detailed_df["O*NET-SOC Code"] = (
    more_jobs_from_national_detailed_df["OCC_CODE"].map(lambda occ_code: f"{occ_code}.00")
)

In [37]:
# Add the occupations that appear in the national summary but have not been
# collected yet; `ctr` counts how many were newly added.
ctr = 0
national_pairs = zip(
    more_jobs_from_national_detailed_df["O*NET-SOC Code"],
    more_jobs_from_national_detailed_df["OCC_TITLE"],
)
for code, title in national_pairs:
    if code not in seen_onet_soc_codes:
        seen_onet_soc_codes.add(code)
        skill_list_codes.append({'code': code, 'title': title})
        ctr += 1

print(f'Jobs from national_detailed_df: {ctr}')

Jobs from national_detailed_df: 114


In [38]:
# Total number of unique occupations gathered from both sources
# (Skills.csv plus the national summary).
print(len(skill_list_codes))

993


In [6]:
import instructor
from anthropic import Anthropic
from pydantic import BaseModel


# instructor-wrapped Anthropic client; `get_bls_super_sector` calls
# `client.messages.create(..., response_model=...)` on it.
# NOTE(review): `Anthropic()` is constructed with no arguments — presumably the
# API key is picked up from the environment; confirm before running.
client = instructor.from_anthropic(Anthropic())

In [13]:
# Classification instructions passed as the system message by
# `get_bls_super_sector`: the rules for choosing exactly one sector, followed by
# a definition of every sector.
# NOTE(review): the sector names below presumably mirror the members of
# `industry_type.ModifiedBLSSuperSector` — keep the two in sync when editing.
system_prompt = """
# System Instructions  
You are a classifier that maps O*NET job titles into one—and only one—of the following Modified BLS Super-Sector categories.  
- If a title clearly fits one sector, choose that.  
- If it spans multiple, pick the sector with the strongest core focus.  
- Always return exactly the enum name (e.g. `MANUFACTURING`)

---

# ModifiedBLSSuperSector Definitions

- **AGRICULTURE_FORESTRY_FISHING_HUNTING**  
  Establishments primarily engaged in growing crops, raising animals, harvesting timber, and commercial fishing and hunting.

- **MINING_OIL_GAS_EXTRACTION**  
  Establishments that extract naturally occurring mineral solids, liquid minerals, and gases, including quarrying, oil and gas well operations, and related site services.

- **ENERGY**
  Establishments producing and distributing energy, including electric power generation, transmission and distribution; natural gas distribution; petroleum refining; and renewable energy operations.

- **CONSTRUCTION**  
  Establishments that build, repair, or renovate structures and engineering projects—residential, commercial, and civil—plus site preparation and specialized trades.

- **MANUFACTURING**  
  Establishments engaged in the mechanical, physical, or chemical transformation of materials, substances, or components into new products.

- **RETAIL_TRADE**  
  Establishments selling merchandise in small quantities to the general public—storefront and non-store retailers—and providing services incidental to the sale.

- **PACKAGING_AND_GOODS_TRANSPORTATION**
  Establishments providing freight transportation by air, rail, water, pipeline, and truck; warehousing and storage; packaging; and logistics support services.

- **PASSENGER_TRANSPORTATION**
  Establishments providing the movement of people by transit, ground passenger services, scenic and sightseeing transportation, and related support activities.

- **TECHNOLOGY_AND_SOFTWARE_DEVELOPMENT**
  Establishments designing, developing, and publishing software; computer systems design; data processing; and related information technology services.

- **FINANCE_AND_INSURANCE**  
  Establishments engaged in financial transactions and/or in facilitating financial transactions, including banking, credit intermediation, insurance underwriting, and related support activities.
  Additionally, Establishments offering accounting, tax preparation, bookkeeping, payroll processing, and auditing services.

- **REAL_ESTATE_RENTAL_LEASING**  
  Establishments renting, leasing, or otherwise allowing the use of real estate, tangible assets (e.g., vehicles, equipment), and intangible assets (e.g., intellectual property).

- **LEGAL_SERVICES**  
  Establishments providing legal advice and representation in civil and criminal matters, corporate law, estate planning, and related services.

- **ARCHITECTURAL_ENGINEERING_SERVICES**  
  Establishments providing architectural, engineering, and related design services for buildings, infrastructure, and specialized projects.

- **SCIENTIFIC_RESEARCH_AND_DEVELOPMENT**  
  Establishments conducting research and experimental development in the physical, engineering, and life sciences fields.

- **MANAGEMENT_CONSULTING_SERVICES**  
  Establishments providing advice and assistance on management, strategy, organizational structure, and environmental compliance.

- **MEDIA_AND_COMMUNICATIONS_SERVICES**
  Establishments engaged in content creation and distribution—publishing, motion picture and sound recording, broadcasting, telecommunications, and internet data processing.
  This category should also be used for jobs in marketing, advertising, and public relations.

- **MANAGEMENT**  
  Establishments holding the securities of (or other equity interests in) companies or enterprises for the purpose of owning a controlling interest, and those administering, overseeing, and managing these companies’ operations.
  ATTENTION!! This category should be used ONLY for jobs that are top level executives. If the job is related to field specialists, they should be mapped to a specific sector. DO NOT USE THIS CATEGORY FOR FIELD SPECIALIST MANAGERS.
  Example: CEO, CFO, CTO, CMO, etc.
  Counter Example: Lodging Managers should NOT be mapped to this category. Instead 'Lodging Managers' should be mapped to the industry, which for this example is TRAVEL_AND_ENTERTAINMENT_SERVICES

- **ADMINISTRATIVE_SUPPORT_WASTE_SERVICES**  
  Establishments performing routine support activities such as office administration, staffing services, facilities support, security services, and waste management and remediation.

- **EDUCATIONAL_SERVICES**  
  Establishments providing instruction and training through schools, colleges, universities, and training centers, including both academic and vocational education.

- **HEALTHCARE**  
  Establishments providing health care and social assistance to individuals, including ambulatory care, hospitals, nursing and residential care, and social service agencies.

- **TRAVEL_AND_ENTERTAINMENT_SERVICES**
  Establishments offering lodging (hotels, motels), food services (restaurants, bars), and cultural (art exhibits), entertainment, and recreational activities (live performances, museums, amusement parks).

- **GOVERNMENT**  
  Establishments of federal, state, and local government agencies that administer, oversee, and manage public programs and services. Military and defence services included.

- **SPECIAL_INDUSTRIES**
  Establishments operating in niche or highly specialized markets not captured by any other sector above.
  Example jobs: Recycling and Reclamation Workers, Hazardous Materials Removal Workers, etc.

"""

# Structured response expected from the model for a single occupation; passed as
# `response_model` in `get_bls_super_sector`.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose — pydantic presumably surfaces a model's docstring in its JSON schema,
# which would change what is sent to the LLM; confirm before adding one.
class BLSSuperSectorExtracted(BaseModel):
    # Short free-text justification; declared before the sector field.
    explanation: str = Field(description="A short explanation of the mapping in less than 50 tokens")
    # The chosen sector, constrained to the members of `industry_type.ModifiedBLSSuperSector`.
    bls_super_sector: industry_type.ModifiedBLSSuperSector = Field(
        description="Assign (modified) BLS Super Sector as per the interpretation of the job title and ONET-SOC code: pay more to the title and should be STRICTLY one of the following"
    )


def get_bls_super_sector(job_onet_soc_code, job_title) -> Union[str, float]:
    """Ask the LLM to classify one occupation into a Modified BLS Super Sector.

    Args:
        job_onet_soc_code: O*NET-SOC code of the occupation (e.g. "11-1031.00").
        job_title: Title of the occupation.

    Returns:
        The `.value` of the `industry_type.ModifiedBLSSuperSector` member chosen
        by the model (a string such as "GOVERNMENT" in the recorded outputs),
        or `np.nan` if the request or the parsing of the response fails.

    Any `Exception` raised while calling the model is printed and reported as
    NaN instead of propagating, so callers looping over many occupations can
    continue past a failed item.
    """
    query = f"""
    Job ONET-SOC code: {job_onet_soc_code}
    Job title: {job_title}
    """
    try:
        # NOTE(review): the prompt is passed as a "system"-role message; this
        # presumably relies on instructor moving it into Anthropic's top-level
        # `system` parameter — confirm against the installed instructor version.
        extracted: BLSSuperSectorExtracted = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=1024,
            temperature=0.0,
            messages=[
                {
                    "role": "system",
                    "content": system_prompt,
                },
                {
                    "role": "user",
                    "content": query,
                }
            ],
            response_model=BLSSuperSectorExtracted,
        )
        # Return the plain enum value, not the pydantic object or enum member.
        return extracted.bls_super_sector.value
    except Exception as e:
        print(f"Error Extracting BLS Super Sector: Code: {job_onet_soc_code} Title: {job_title}")
        print(e)
    return np.nan


In [40]:
# Only the occupations appended from the national summary are classified here.
# They sit after the Skills.csv entries in `skill_list_codes`, and those occupy
# exactly one slot per row of the de-duplicated `skills_df`, so its length is
# the offset of the first occupation to classify (replaces a hard-coded 879).
first_unmapped = len(skills_df)

new_df_data = []
count = 0
for count, item in enumerate(skill_list_codes[first_unmapped:], start=1):
    code = item['code']
    title = item['title']

    # `get_bls_super_sector` prints any failure itself and returns NaN, so no
    # additional try/except is needed around the call.
    bls_super_sector = get_bls_super_sector(code, title)

    new_df_data.append(
        {'O*NET-SOC Code': code, 'Title': title, 'Modified BLS Super Sector': bls_super_sector}
    )

    # Lightweight progress indicator for the slow, per-item API calls.
    if count % 50 == 0:
        print(f"Processed {count} items")



Processed 50 items
Processed 100 items


In [41]:
# One row per newly classified occupation, built in a single step from the
# list of record dicts collected in the loop above.
new_df_1 = pd.DataFrame(new_df_data)

In [42]:
# Eyeball the first mappings as a sanity check on the model's choices.
new_df_1.head(20)

Unnamed: 0,O*NET-SOC Code,Title,Modified BLS Super Sector
0,11-1031.00,Legislators,GOVERNMENT
1,11-2032.00,Public Relations Managers,MEDIA_AND_COMMUNICATIONS_SERVICES
2,11-2033.00,Fundraising Managers,MANAGEMENT_CONSULTING_SERVICES
3,11-9039.00,"Education Administrators, All Other",EDUCATIONAL_SERVICES
4,11-9072.00,"Entertainment and Recreation Managers, Except ...",TRAVEL_AND_ENTERTAINMENT_SERVICES
5,11-9199.00,"Managers, All Other",MANAGEMENT
6,13-1020.00,Buyers and Purchasing Agents,ADMINISTRATIVE_SUPPORT_WASTE_SERVICES
7,13-1082.00,Project Management Specialists,MANAGEMENT_CONSULTING_SERVICES
8,13-1199.00,"Business Operations Specialists, All Other",MANAGEMENT_CONSULTING_SERVICES
9,13-2020.00,Property Appraisers and Assessors,REAL_ESTATE_RENTAL_LEASING


In [43]:
# Combine the previously mapped occupations with the newly mapped ones.
# NOTE(review): `new_df` is not defined in any cell shown in this notebook —
# presumably it holds the mappings for the Skills.csv occupations from an
# earlier run; confirm and add the cell that creates it so the notebook
# survives Restart & Run All.
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating the row labels of each input frame.
data_df = pd.concat([new_df, new_df_1], ignore_index=True)

In [44]:
# The row count should equal the total number of collected occupations
# printed earlier by `len(skill_list_codes)`.
data_df.shape

(993, 3)

In [45]:
# Rows classified in this run: one per occupation added from the national summary.
new_df_1.shape

(114, 3)

In [46]:
# Persist the combined mapping; index=False keeps the DataFrame index out of the CSV.
data_df.to_csv('../v2_assets/modified_bls_super_sector_df.csv', index=False)