# About

Maps each O*NET-SOC code to a (modified) BLS Super Sector category (2-digit NAICS level).

In [1]:
from enum import Enum
import instructor
import pandas as pd
from openai import OpenAI
from pydantic import BaseModel, Field

import numpy as np

import industry_type

In [2]:
# Load Skills.csv and keep only the first row seen for each O*NET-SOC Code,
# so every occupation code appears exactly once.
skills_df = pd.read_csv('Skills.csv').drop_duplicates(
    subset="O*NET-SOC Code", keep="first"
)

# One {'code', 'title'} record per remaining row, in row order.
skill_list_codes = [
    {'code': onet_code, 'title': onet_title}
    for onet_code, onet_title in zip(
        skills_df['O*NET-SOC Code'], skills_df['Title']
    )
]

In [10]:
# OpenAI client wrapped by instructor; get_bls_super_sector calls
# `client.chat.completions.create(..., response_model=...)` on it.
# NOTE(review): OpenAI() is built with no arguments, so credentials are
# presumably taken from the environment -- confirm before running.
client = instructor.from_openai(OpenAI())

# System message sent with every classification request. It lists each
# category name together with a short definition and tells the model to
# answer with exactly one enum name.
# NOTE(review): the names below are presumably the members of
# industry_type.ModifiedBLSSuperSector (not visible in this file); keep the
# two in sync when either changes.
system_prompt = """
# System Instructions  
You are a classifier that maps O*NET job titles into one—and only one—of the following Modified BLS Super-Sector categories.  
- If a title clearly fits one sector, choose that.  
- If it spans multiple, pick the sector with the strongest core focus.  
- Always return exactly the enum name (e.g. `MANUFACTURING`)

---

# ModifiedBLSSuperSector Definitions

- **AGRICULTURE_FORESTRY_FISHING_HUNTING**  
  Establishments primarily engaged in growing crops, raising animals, harvesting timber, and commercial fishing and hunting.

- **MINING_OIL_GAS_EXTRACTION**  
  Establishments that extract naturally occurring mineral solids, liquid minerals, and gases, including quarrying, oil and gas well operations, and related site services.

- **ENERGY**
  Establishments producing and distributing energy, including electric power generation, transmission and distribution; natural gas distribution; petroleum refining; and renewable energy operations.

- **CONSTRUCTION**  
  Establishments that build, repair, or renovate structures and engineering projects—residential, commercial, and civil—plus site preparation and specialized trades.

- **MANUFACTURING**  
  Establishments engaged in the mechanical, physical, or chemical transformation of materials, substances, or components into new products.

- **RETAIL_TRADE**  
  Establishments selling merchandise in small quantities to the general public—storefront and non-store retailers—and providing services incidental to the sale.

- **PACKAGING_AND_GOODS_TRANSPORTATION**
  Establishments providing freight transportation by air, rail, water, pipeline, and truck; warehousing and storage; packaging; and logistics support services.

- **PASSENGER_TRANSPORTATION**
  Establishments providing the movement of people by transit, ground passenger services, scenic and sightseeing transportation, and related support activities.

- **TECHNOLOGY_AND_SOFTWARE_DEVELOPMENT**
  Establishments designing, developing, and publishing software; computer systems design; data processing; and related information technology services.

- **FINANCE_AND_INSURANCE**  
  Establishments engaged in financial transactions and/or in facilitating financial transactions, including banking, credit intermediation, insurance underwriting, and related support activities.

- **REAL_ESTATE_RENTAL_LEASING**  
  Establishments renting, leasing, or otherwise allowing the use of real estate, tangible assets (e.g., vehicles, equipment), and intangible assets (e.g., intellectual property).

- **LEGAL_SERVICES**  
  Establishments providing legal advice and representation in civil and criminal matters, corporate law, estate planning, and related services.

- **ACCOUNTING_AND_AUDITING_SERVICES**  
  Establishments offering accounting, tax preparation, bookkeeping, payroll processing, and auditing services.

- **ARCHITECTURAL_ENGINEERING_SERVICES**  
  Establishments providing architectural, engineering, and related design services for buildings, infrastructure, and specialized projects.

- **SCIENTIFIC_RESEARCH_AND_DEVELOPMENT**  
  Establishments conducting research and experimental development in the physical, engineering, and life sciences fields.

- **MANAGEMENT_CONSULTING_SERVICES**  
  Establishments providing advice and assistance on management, strategy, organizational structure, and environmental compliance.

- **MEDIA_AND_COMMUNICATIONS_SERVICES**
  Establishments engaged in content creation and distribution—publishing, motion picture and sound recording, broadcasting, telecommunications, and internet data processing.
  This category should also be used for jobs in marketing, advertising, and public relations.

- **MANAGEMENT**  
  Establishments holding the securities of (or other equity interests in) companies or enterprises for the purpose of owning a controlling interest, and those administering, overseeing, and managing these companies’ operations.
  This category should be used for jobs that are top level executives or directors. If they are field specialists, they should be mapped to a specific sector. DO NOT USE THIS CATEGORY FOR FIELD SPECIALIST MANAGERS.


- **ADMINISTRATIVE_SUPPORT_WASTE_SERVICES**  
  Establishments performing routine support activities such as office administration, staffing services, facilities support, security services, and waste management and remediation.

- **EDUCATIONAL_SERVICES**  
  Establishments providing instruction and training through schools, colleges, universities, and training centers, including both academic and vocational education.

- **HEALTHCARE**  
  Establishments providing health care and social assistance to individuals, including ambulatory care, hospitals, nursing and residential care, and social service agencies.

- **TRAVEL_AND_ENTERTAINMENT_SERVICES**
  Establishments offering lodging (hotels, motels), food services (restaurants, bars), and cultural (art exhibits), entertainment, and recreational activities (live performances, museums, amusement parks).

- **GOVERNMENT**  
  Establishments of federal, state, and local government agencies that administer, oversee, and manage public programs and services. Military and defence services included.

- **SPECIAL_INDUSTRIES**
  Establishments operating in niche or highly specialized markets not captured by any other sector above.

"""

# Structured response schema: get_bls_super_sector passes this class as
# `response_model`, and reads `.bls_super_sector.value` from the result.
# NOTE(review): documented with comments rather than a class docstring
# because pydantic includes a model's docstring in the JSON schema it
# generates, so adding one would change the schema built from this class.
class BLSSuperSectorExtracted(BaseModel):
    # Free-text justification for the chosen sector; declared before the
    # sector field.
    explanation: str = Field(description="A short explanation of the mapping less than 50 tokens")
    # The assigned category, restricted to members of
    # industry_type.ModifiedBLSSuperSector.
    # NOTE(review): "pay more to the title" presumably means "pay more
    # attention to the title"; the wording is left untouched here because it
    # is runtime text that ends up in the schema.
    bls_super_sector: industry_type.ModifiedBLSSuperSector = Field(
        description="Assigned (modified) BLS Super Sector as per the interpretation of the job title and ONET-SOC code: pay more to the title and should be STRICTLY one of the following"
    )


def get_bls_super_sector(job_onet_soc_code, job_title) -> object:
    """Classify one occupation into a modified BLS Super Sector.

    Sends the O*NET-SOC code and job title as the user message, with the
    module-level ``system_prompt`` as the system message, through the
    module-level instructor ``client`` and requests a
    ``BLSSuperSectorExtracted`` response.

    Args:
        job_onet_soc_code: O*NET-SOC code, interpolated into the user message.
        job_title: Occupation title, interpolated into the user message.

    Returns:
        The ``.value`` of the enum member the model assigned, or ``np.nan``
        if the request or the handling of its response raised an
        ``Exception``. The previous annotation claimed the whole
        ``BLSSuperSectorExtracted`` object was returned, which was never the
        case.
    """
    # Built with explicit escapes so the text sent to the model stays
    # byte-identical to the original triple-quoted f-string, independent of
    # how this function body is indented.
    query = (
        f"\n Job ONET-SOC code: {job_onet_soc_code}"
        f"\n Job title: {job_title}"
        "\n "
    )
    try:
        op: BLSSuperSectorExtracted = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": query},
            ],
            model="gpt-4.1-nano",
            response_model=BLSSuperSectorExtracted,
        )
        return op.bls_super_sector.value
    except Exception as e:
        # Best-effort by design: one failed row must not abort a long batch.
        # Report the cause as well so failures can actually be diagnosed.
        print(
            f"Error Extracting BLS Super Sector: Code: {job_onet_soc_code} "
            f"Title: {job_title} Error: {e!r}"
        )
        return np.nan


In [13]:
# Classify every occupation in skills_df and save the mapping as a CSV.
#
# get_bls_super_sector catches its own exceptions and returns NaN on failure,
# so no extra try/except is needed here; the previous outer handler (and its
# "" fallback value) could never be reached.
new_df_data = []
count = 0  # remains 0 when skills_df has no rows
codes_and_titles = zip(skills_df['O*NET-SOC Code'], skills_df['Title'])
for count, (onet_soc_code, title) in enumerate(codes_and_titles, start=1):
    new_df_data.append(
        {
            'O*NET-SOC Code': onet_soc_code,
            'Title': title,
            'Modified BLS Super Sector': get_bls_super_sector(onet_soc_code, title),
        }
    )
    # Progress heartbeat: each row costs one model request.
    if count % 50 == 0:
        print(f"Processed {count} rows")

new_df = pd.DataFrame(new_df_data)
new_df.to_csv('modified_bls_super_sector_df.csv', index=False)

Processed 50 rows
Processed 100 rows
Processed 150 rows
Processed 200 rows
Processed 250 rows
Processed 300 rows
Processed 350 rows
Processed 400 rows
Processed 450 rows
Processed 500 rows
Processed 550 rows
Processed 600 rows
Processed 650 rows
Processed 700 rows
Processed 750 rows
Processed 800 rows
Processed 850 rows


In [14]:
# Writes new_df to the same file, with the same arguments, as the export at
# the end of the previous cell; running it overwrites that file.
new_df.to_csv('modified_bls_super_sector_df.csv', index=False)