In [40]:
import pandas as pd
import numpy as np

from typing import Optional, Union, Literal, Iterable, Any, Dict, Tuple

# Map Every Possible Job Title to Its ISCO Code

We’ll load the PSOC Excel file and focus only on two columns: Job Title and ISCO code. The ISCO code will serve as the key to link PSOC with O*NET, where we can attach the AIOE and Complementarity scores.

In [42]:
def add_job_isco(df_map: pd.DataFrame, job_isco: Dict[str, int]) -> Dict[str, int]:
    """
    Build a mapping of job titles (main + examples) to their ISCO codes.

    The rows of ``df_map`` are scanned top to bottom with a small state
    machine:

    * an all-caps title is a main job title: it is recorded and example
      collection is switched off (so the task description that follows
      is skipped);
    * a row containing "Examples of the occupations classified here:"
      switches example collection on;
    * a row containing "Some related occupations classified elsewhere:" or
      "Related occupation classified elsewhere:" switches it off again;
    * any other row is recorded only while collection is on.

    Titles are stripped of surrounding whitespace before being stored.
    Rows whose title is not a string (e.g. NaN or a numeric cell) or is
    blank are skipped instead of raising.

    Parameters
    ----------
    df_map : pd.DataFrame
        DataFrame with columns ["Job Title", "ISCO"].
    job_isco : dict
        Dictionary to update with mappings {job_title: isco_code}.
        It is modified in place; a later title overwrites an earlier
        identical one.

    Returns
    -------
    dict
        The same ``job_isco`` object, updated. Codes are stored exactly
        as they appear in the "ISCO" column (no type conversion).
    """
    examples_marker = "Examples of the occupations classified here:"
    stop_markers = (
        "Some related occupations classified elsewhere:",
        "Related occupation classified elsewhere:",
    )

    collecting = False

    for raw_title, code in zip(df_map["Job Title"], df_map["ISCO"]):
        # Missing (NaN) or numeric cells carry no usable title.
        if not isinstance(raw_title, str):
            continue

        title = raw_title.strip()
        if not title:
            continue

        if title.isupper():
            # Main job title; the rows that follow describe tasks and
            # must not be collected.
            job_isco[title] = code
            collecting = False
        elif examples_marker in title:
            # Header row itself is not a title; collect what follows.
            collecting = True
        elif any(marker in title for marker in stop_markers):
            # Titles after this header belong to other ISCO codes.
            collecting = False
        elif collecting:
            job_isco[title] = code

    return job_isco

In [43]:
# Read every sheet of the PSOC workbook, keeping only the job-title and ISCO columns
filename = '2022-Updates-to-the-2012-PSOC.xlsx'
names = ['Job Title', 'ISCO']
relevant_cols = [3, 5]  # zero-based column positions within each sheet

# sheet_name=None yields a dict of {sheet name: DataFrame}, one entry per sheet
df_maps = pd.read_excel(filename, sheet_name=None, usecols=relevant_cols, names=names)

In [51]:
# For each sheet in df_maps, clean it and collect its job-title -> ISCO pairs
job_isco = {}

for sheet_name, df_map in df_maps.items():
    isco_codes = (
        df_map["ISCO"]
        # Treat empty / whitespace-only cells as missing
        .replace(r"^\s*$", np.nan, regex=True)
        # Carry each code down to the rows beneath it. Done on the Series and
        # assigned back (not `df_map["ISCO"].ffill(inplace=True)`): an in-place
        # call on a column selection is chained assignment, which warns on
        # recent pandas and does nothing under Copy-on-Write.
        .ffill()
        # Keep only the first run of 4 digits; anything without one becomes NaN.
        # NOTE(review): a code stored as a number would lose a leading zero
        # (0110 -> "110") and fail this pattern -- confirm the workbook stores
        # such codes as text.
        .astype(str)
        .str.extract(r"(\d{4})")[0]
    )

    df_map = (
        df_map
        .assign(ISCO=isco_codes)
        # Drop rows missing EITHER the job title or a 4-digit ISCO code
        .dropna()
        # Drop exact duplicate (job title, ISCO) rows
        .drop_duplicates(ignore_index=True)
    )

    # Keep df_maps holding the cleaned sheets, as the in-place version did
    df_maps[sheet_name] = df_map

    add_job_isco(df_map, job_isco)


In [52]:
# Turn the {job title: ISCO code} mapping into a two-column table
title_code_pairs = [(job_title, isco_code) for job_title, isco_code in job_isco.items()]
data = pd.DataFrame(title_code_pairs, columns=names)
data

Unnamed: 0,Job Title,ISCO
0,LEGISLATORS,1111
1,City/Municipal Councilor,1111
2,Congressman,1111
3,Member of the Barangay Council (Sangguniang Pa...,1111
4,Member of the Provincial Board (Sangguniang Pa...,1111
...,...,...
3607,Golf-range attendant,9629
3608,Parking attendant,9629
3609,Recreation facilities attendant,9629
3610,Ticket collector,9629


In [29]:
# Spot-check: look up one example title (exact match) in the final table
filt = data['Job Title'].eq('Fairground attendant')
data.loc[filt]

Unnamed: 0,Job Title,ISCO


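The PSA lists job titles in a clear cycle. Each section starts with the main job title in all caps, followed by a description of its tasks. It then introduces the example titles with the phrase “Examples of the occupations classified here:”. A section may also contain the phrase “Some related occupations classified elsewhere:” (or “Related occupation classified elsewhere:”); the titles listed after it belong to other ISCO codes, so they are skipped. The cycle repeats with a new ISCO code whenever a new all-caps job title appears.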