In [194]:
import pandas as pd
import numpy as np
import re
from thefuzz import fuzz, process
from thefuzz import process
from typing import Optional, Union, Literal, Iterable, Any, Dict, Tuple

# Get Possible Names of Each Job to ISCO Code

We’ll load the PSOC Excel file and focus on only two columns: Job Title and ISCO code. The ISCO code serves as the key that links PSOC to O*NET, where we can attach the AIOE and Complementarity scores. Note that the PSA lists job titles in a clear cycle: each section starts with the main job title in all caps, followed by a description of tasks, and then introduces related titles with the phrase “Examples of the occupations classified here:”. The list of related titles ends at the phrase “Some related occupations classified elsewhere:” (or “Related occupation classified elsewhere:”), and the cycle repeats with a new ISCO code whenever a new all-caps job title appears. We use this structure to build a one-to-many mapping from each ISCO code to its job titles.

In [195]:
def add_job_isco(df_map: pd.DataFrame, job_isco: Dict[str, int]) -> Dict[str, int]:
    """
    Build a mapping of job titles (main + examples) to their ISCO codes.

    The rows are scanned in order with a small state machine:

    * an all-caps title is a main job title: it is recorded and example
      collection is switched off (the task description that follows is
      therefore ignored);
    * a row containing "Examples of the occupations classified here:"
      switches example collection on;
    * a row containing one of the "classified elsewhere" phrases switches
      it off again;
    * any other row is recorded only while collection is on.

    Rows whose title is not a string (e.g. a blank cell read as NaN) are
    skipped instead of raising ``AttributeError``.

    Parameters
    ----------
    df_map : pd.DataFrame
        DataFrame with columns ["Job Title", "ISCO"].
    job_isco : dict
        Dictionary to update with mappings {job_title: isco_code}.
        It is modified in place; a title seen more than once keeps the
        code of its last occurrence.

    Returns
    -------
    dict
        Updated mapping {job_title: isco_code} (the same object as
        ``job_isco``). Codes are stored exactly as found in the column.
    """
    examples_marker = "Examples of the occupations classified here:"
    stop_markers = (
        "Some related occupations classified elsewhere:",
        "Related occupation classified elsewhere:",
    )
    collecting = False

    # Iterating the two columns directly avoids building a Series per row.
    for title, code in zip(df_map["Job Title"], df_map["ISCO"]):
        # Blank cells arrive as NaN (a float), which has no string methods.
        if not isinstance(title, str):
            continue

        if title.isupper():
            # Main job title: record it, and do not collect the task text
            job_isco[title] = code
            collecting = False
        elif examples_marker in title:
            # The rows that follow are example job titles
            collecting = True
        elif any(marker in title for marker in stop_markers):
            # Titles listed after this phrase belong to other ISCO codes
            collecting = False
        elif collecting:
            # Example job title for the current ISCO code
            job_isco[title] = code

    return job_isco

In [196]:
# Load the PSOC workbook, keeping only the job-title and ISCO-code columns
filename = '2022-Updates-to-the-2012-PSOC.xlsx'
names = ['Job Title', 'ISCO']
relevant_cols = [3, 5]  # positional indices of the two columns in each sheet

read_options = dict(
    usecols=relevant_cols,
    names=names,
    sheet_name=None,             # read every sheet; the result is iterated with .items() below
    dtype=dict.fromkeys(names, str),  # keep both columns as text
)
df_maps = pd.read_excel(filename, **read_options)

In [197]:
# For each sheet in df_maps, collect its job-title -> ISCO pairs
job_isco = {}

for sheet_name, df_map in df_maps.items():
    isco_codes = (
        df_map["ISCO"]
        # Treat empty / whitespace-only cells as missing
        .replace(r"^\s*$", np.nan, regex=True)
        # Carry each code down to the rows that follow it.
        # (Assigned through the chain rather than `ffill(inplace=True)` on a
        # column selection, which newer pandas warns about and which has no
        # effect under Copy-on-Write.)
        .ffill()
        # Keep only the first run of four digits; anything else becomes NaN
        .astype(str)
        .str.extract(r"(\d{4})")[0]
    )

    # Build a cleaned copy so the raw sheets in df_maps stay untouched and
    # this cell gives the same result every time it is re-run.
    df_clean = (
        df_map.assign(ISCO=isco_codes)
        # Drop rows missing either the job title or a 4-digit ISCO code
        .dropna()
        # Drop exact duplicate (job title, ISCO) rows
        .drop_duplicates(ignore_index=True)
    )

    add_job_isco(df_clean, job_isco)


In [198]:
# Turn the {job title: ISCO code} dictionary into a two-column table
title_code_pairs = [(job_title, isco_code) for job_title, isco_code in job_isco.items()]
mapping = pd.DataFrame(title_code_pairs, columns=names)

# The Major Group is the first digit of the ISCO code
leading_digit = mapping['ISCO'].astype(str).str.get(0)
mapping['Group'] = leading_digit.astype(int)
mapping.head()

Unnamed: 0,Job Title,ISCO,Group
0,LEGISLATORS,1111,1
1,City/Municipal Councilor,1111,1
2,Congressman,1111,1
3,Member of the Barangay Council (Sangguniang Pa...,1111,1
4,Member of the Provincial Board (Sangguniang Pa...,1111,1


# Map each job in MCA to the closest ISCO code based on Job Title

Each job in the MCA dataset is mapped to the closest ISCO code by comparing its job title against the list of PSOC job titles. The comparison uses fuzzy string matching from the `thefuzz` library (the successor to `fuzzywuzzy`), which scores similarity based on character-level differences and selects the best-scoring match.

In [213]:
filename = 'FINAL (WIP)-MCA Job list August 2025(USE_final).csv'

# Read the MCA job list and normalise the two text columns used for matching:
# sectors are trimmed and lower-cased, titles are only trimmed.
mca_df = pd.read_csv(filename, encoding='latin1').assign(**{
    'Job Sector': lambda frame: frame['Job Sector'].str.strip().str.lower(),
    'Job Title': lambda frame: frame['Job Title'].str.strip(),
})

mca_df.head()

Unnamed: 0,Job Title,Job Sector,Job Subsector,Educational Pathway,HEI with PRC (Professional Regulation Commission) Exam,Some HEI
0,2D Echocardiography Technician,human health and social work activities,Medical and dental practice activities,Higher Education,No,Yes
1,2D Game Artist,"arts, entertainment and recreation","Creative, arts and entertainment activities",Not Higher Education,No,No
2,3D Game Artist,"arts, entertainment and recreation","Creative, arts and entertainment activities",Not Higher Education,No,No
3,3D Modeller,"arts, entertainment and recreation","Creative, arts and entertainment activities",Not Higher Education,No,No
4,3D Printing Designer,manufacturing,Manufacture of other fabricated metal products...,Higher Education,No,No


In [215]:
# Hand-picked MCA job titles to inspect
jobs_to_keep = [
    'Cold Storage Technician',
    'Commercial Diver',
    'Dental Laboratory Technician',
    'Digital Background Painting',
    'EV Battery Automotive Mechanic',
    'Food Safety Auditor',
    'HACCP Coordinator',
    'Junior Airline Accounting Clerk',
    'Maintenance Laborer',
    'Milk Processing Technician',
    'Product Development Technician',
    'R&D Food Specialist',
    'Semiconductor Operator',
    'Wastewater Treatment Specialist',
    'Water Quality Engineer',
]

# Keep only the rows whose title is in the list above
keep_mask = mca_df["Job Title"].isin(jobs_to_keep)
filtered_df = mca_df[keep_mask]
filtered_df

Unnamed: 0,Job Title,Job Sector,Job Subsector,Educational Pathway,HEI with PRC (Professional Regulation Commission) Exam,Some HEI
190,Cold Storage Technician,manufacturing,Installation of industrial machinery and equip...,Higher Education,No,No
192,Commercial Diver,transportation and storage,"Sea and coastal water transport, Inland water ...",Not Higher Education,No,No
289,Dental Laboratory Technician,human health and social work activities,Medical and dental practice activities,Not Higher Education,No,No
298,Digital Background Painting,information and communication,"Motion picture, video and television programme...",Higher Education,No,No
376,EV Battery Automotive Mechanic,manufacturing,Manufacture of batteries and accumulators,Not Higher Education,No,No
434,Food Safety Auditor,manufacturing,Manufacture of other food products,Higher Education,Yes,No
478,HACCP Coordinator,manufacturing,Manufacture of other food products,Higher Education,No,No
570,Junior Airline Accounting Clerk,transportation and storage,"Passenger air transport, Freight air transport",Higher Education,Yes,No
680,Maintenance Laborer,construction,Construction of buildings,Not Higher Education,No,No
738,Milk Processing Technician,manufacturing,Manufacture of dairy products,Higher Education,No,No


In [216]:
# Allowed Major Groups per job sector.
# Keys are the lower-cased, stripped sector names; values are the leading
# ISCO digits (the `Group` column of `mapping`) that a job in that sector
# may be matched against.
sector_to_major_group = dict([
    ("human health and social work activities", [2, 3, 5]),
    ("arts, entertainment and recreation", [2, 3, 5]),
    ("manufacturing", [7, 8]),
    ("agriculture, forestry, and fishing", [6, 9]),
    ("professional, scientific and technical activities", [2, 3]),
    ("construction", [7, 9]),
    ("financial and insurance activities", [1, 2, 4]),
    ("administrative and support service activities", [4, 5]),
    ("other service activities", [4, 5]),
    ("information and communication", [2, 3, 4]),
    ("transportation and storage", [8, 9]),
    ("education", [2, 3]),
    ("accommodation and food service activities", [5]),
    ("wholesale and retail trade; repair of motor vehicles and motorcycles", [5, 9]),
    ("public administration and defense; compulsory social security", [1, 2, 3]),
    ("electricity, gas, steam and air conditioning supply", [7, 8]),
    ("water supply; sewerage, waste management and remediation", [7, 8]),
    ("real estate activities", [1, 2, 4]),
    ("mining and quarrying", [7, 8, 9]),
    ("armed forces", [0]),
])


In [217]:
def map_to_isco(
    mca_title: str, job_isco: Dict[str, int]
) -> Tuple[Optional[str], Optional[int], int]:
    """
    Map an MCA job title to the closest PSOC job title and return its ISCO code.

    The closest title is chosen by ``process.extractOne`` using its default
    scorer and processor.

    Args:
        mca_title (str): The MCA job title to map.
        job_isco (Dict[str, int]): Dictionary of PSOC job titles to ISCO codes.

    Returns:
        Tuple[Optional[str], Optional[int], int]: Best matching PSOC job title,
        its ISCO code (as stored in ``job_isco``), and similarity score.
        When there is nothing to match against, ``(None, None, 0)`` is
        returned instead of raising.
    """
    no_match = (None, None, 0)

    # With no candidates extractOne has nothing to return, and unpacking its
    # result would fail; report "no match" explicitly instead.
    if not job_isco:
        return no_match

    psoc_titles = list(job_isco)
    result = process.extractOne(mca_title, psoc_titles)
    if result is None:
        return no_match

    best_match, score = result
    return best_match, job_isco[best_match], score


In [218]:
# Prepare new columns
mca_df['Best Match'] = ""
mca_df['ISCO Code'] = 0
mca_df['Similarity Score'] = 0

for idx, job in mca_df.iterrows():
    title, sector = job['Job Title'], job['Job Sector']
    groups = sector_to_major_group[sector]

    # Filter mapping by allowed Major Groups
    filt = mapping['Group'].isin(groups)
    filtered = mapping[filt][['Job Title', 'ISCO']]

    # Create filtered dictionary
    filt_job_to_isco = dict(zip(filtered['Job Title'], filtered['ISCO']))

    # Map MCA job title to closest PSOC title
    if filt_job_to_isco:  # only if there are filtered titles
        best_match, isco_code, score = map_to_isco(title, filt_job_to_isco)
    else:
        best_match, isco_code, score = None, None, 0  # fallback if no titles

    # Assign to new columns
    mca_df.at[idx, 'Best Match'] = best_match
    mca_df.at[idx, 'ISCO Code'] = isco_code
    mca_df.at[idx, 'Similarity Score'] = score


  mca_df.at[idx, 'ISCO Code'] = isco_code


In [219]:
# Inspect the MCA table with its new match columns (pandas truncates the display to the first and last rows)
mca_df

Unnamed: 0,Job Title,Job Sector,Job Subsector,Educational Pathway,HEI with PRC (Professional Regulation Commission) Exam,Some HEI,Best Match,ISCO Code,Similarity Score
0,2D Echocardiography Technician,human health and social work activities,Medical and dental practice activities,Higher Education,No,Yes,2D animators,2166,86
1,2D Game Artist,"arts, entertainment and recreation","Creative, arts and entertainment activities",Not Higher Education,No,No,Website game developers,2513,86
2,3D Game Artist,"arts, entertainment and recreation","Creative, arts and entertainment activities",Not Higher Education,No,No,Website game developers,2513,86
3,3D Modeller,"arts, entertainment and recreation","Creative, arts and entertainment activities",Not Higher Education,No,No,Financial modelers,2413,72
4,3D Printing Designer,manufacturing,Manufacture of other fabricated metal products...,Higher Education,No,No,HANDICRAFT AND PRINTING WORKERS,7234,86
...,...,...,...,...,...,...,...,...,...
1146,Window Cleaner,accommodation and food service activities,Other accommodation,Higher Education,No,No,Dorm caretaker,5153,57
1147,Wood Carver,manufacturing,Manufacture of furniture,Not Higher Education,No,No,Wood carver,7317,100
1148,Workforce Management Analyst,administrative and support service activities,"Business support service activities, n.e.c.",Higher Education,No,No,Fireman,5411,64
1149,Workforce Wellness Coordinator,human health and social work activities,Other social work activities without accommoda...,Higher Education,No,Yes,Events coordinator,3332,86


In [225]:
# Re-run the filter now that mca_df carries the match columns.
# `jobs_to_keep` is the list defined in the earlier cell that first builds
# `filtered_df`, so it is reused here rather than copy-pasted.
filtered_df = mca_df[mca_df["Job Title"].isin(jobs_to_keep)]
filtered_df

Unnamed: 0,Job Title,Job Sector,Job Subsector,Educational Pathway,HEI with PRC (Professional Regulation Commission) Exam,Some HEI,Best Match,ISCO Code,Similarity Score
190,Cold Storage Technician,manufacturing,Installation of industrial machinery and equip...,Higher Education,No,No,Automotive brakes systems service technician,7231,86
192,Commercial Diver,transportation and storage,"Sea and coastal water transport, Inland water ...",Not Higher Education,No,No,Bus driver,8331,68
289,Dental Laboratory Technician,human health and social work activities,Medical and dental practice activities,Not Higher Education,No,No,Medical laboratory technician,3212,88
298,Digital Background Painting,information and communication,"Motion picture, video and television programme...",Higher Education,No,No,Digital artist,2166,86
376,EV Battery Automotive Mechanic,manufacturing,Manufacture of batteries and accumulators,Not Higher Education,No,No,Automotive mechanic,7231,90
434,Food Safety Auditor,manufacturing,Manufacture of other food products,Higher Education,Yes,No,"FOOD PROCESSING, WOOD WORKING, GARMENT AND OTH...",7422,86
478,HACCP Coordinator,manufacturing,Manufacture of other food products,Higher Education,No,No,Cooper,7522,68
570,Junior Airline Accounting Clerk,transportation and storage,"Passenger air transport, Freight air transport",Higher Education,Yes,No,Mining driller,8111,58
680,Maintenance Laborer,construction,Construction of buildings,Not Higher Education,No,No,Dams maintenance laborer,9312,95
738,Milk Processing Technician,manufacturing,Manufacture of dairy products,Higher Education,No,No,Drain technician,7126,86


In [None]:
# NOTE(review): this cell held the unfinished name `fil`, which raises
# NameError on a fresh run; completed to `filtered_df` as the likely intent.
filtered_df

In [221]:
# Look up the match for a single job title
is_water_quality_engineer = mca_df['Job Title'].eq('Water Quality Engineer')
mca_df.loc[is_water_quality_engineer]

Unnamed: 0,Job Title,Job Sector,Job Subsector,Educational Pathway,HEI with PRC (Professional Regulation Commission) Exam,Some HEI,Best Match,ISCO Code,Similarity Score
1133,Water Quality Engineer,"water supply; sewerage, waste management and r...","Water collection, treatment and supply",Higher Education,Yes,No,Aircraft airframes maintenance engineer,7232,86


In [222]:
# Show the filtered subset of hand-picked jobs again (same frame as built in the cell that defines `filtered_df`)
filtered_df

Unnamed: 0,Job Title,Job Sector,Job Subsector,Educational Pathway,HEI with PRC (Professional Regulation Commission) Exam,Some HEI,Best Match,ISCO Code,Similarity Score
190,Cold Storage Technician,manufacturing,Installation of industrial machinery and equip...,Higher Education,No,No,Automotive brakes systems service technician,7231,86
192,Commercial Diver,transportation and storage,"Sea and coastal water transport, Inland water ...",Not Higher Education,No,No,Bus driver,8331,68
289,Dental Laboratory Technician,human health and social work activities,Medical and dental practice activities,Not Higher Education,No,No,Medical laboratory technician,3212,88
298,Digital Background Painting,information and communication,"Motion picture, video and television programme...",Higher Education,No,No,Digital artist,2166,86
376,EV Battery Automotive Mechanic,manufacturing,Manufacture of batteries and accumulators,Not Higher Education,No,No,Automotive mechanic,7231,90
434,Food Safety Auditor,manufacturing,Manufacture of other food products,Higher Education,Yes,No,"FOOD PROCESSING, WOOD WORKING, GARMENT AND OTH...",7422,86
478,HACCP Coordinator,manufacturing,Manufacture of other food products,Higher Education,No,No,Cooper,7522,68
570,Junior Airline Accounting Clerk,transportation and storage,"Passenger air transport, Freight air transport",Higher Education,Yes,No,Mining driller,8111,58
680,Maintenance Laborer,construction,Construction of buildings,Not Higher Education,No,No,Dams maintenance laborer,9312,95
738,Milk Processing Technician,manufacturing,Manufacture of dairy products,Higher Education,No,No,Drain technician,7126,86


In [223]:
# Show the full matched MCA table again (pandas truncates the display to the first and last rows)
mca_df

Unnamed: 0,Job Title,Job Sector,Job Subsector,Educational Pathway,HEI with PRC (Professional Regulation Commission) Exam,Some HEI,Best Match,ISCO Code,Similarity Score
0,2D Echocardiography Technician,human health and social work activities,Medical and dental practice activities,Higher Education,No,Yes,2D animators,2166,86
1,2D Game Artist,"arts, entertainment and recreation","Creative, arts and entertainment activities",Not Higher Education,No,No,Website game developers,2513,86
2,3D Game Artist,"arts, entertainment and recreation","Creative, arts and entertainment activities",Not Higher Education,No,No,Website game developers,2513,86
3,3D Modeller,"arts, entertainment and recreation","Creative, arts and entertainment activities",Not Higher Education,No,No,Financial modelers,2413,72
4,3D Printing Designer,manufacturing,Manufacture of other fabricated metal products...,Higher Education,No,No,HANDICRAFT AND PRINTING WORKERS,7234,86
...,...,...,...,...,...,...,...,...,...
1146,Window Cleaner,accommodation and food service activities,Other accommodation,Higher Education,No,No,Dorm caretaker,5153,57
1147,Wood Carver,manufacturing,Manufacture of furniture,Not Higher Education,No,No,Wood carver,7317,100
1148,Workforce Management Analyst,administrative and support service activities,"Business support service activities, n.e.c.",Higher Education,No,No,Fireman,5411,64
1149,Workforce Wellness Coordinator,human health and social work activities,Other social work activities without accommoda...,Higher Education,No,Yes,Events coordinator,3332,86
