In [63]:
import pandas as pd
import numpy as np

# Mapping of PSOC Code to ISCO Code

In [64]:
# Read every sheet of the PSOC update workbook, keeping only the two code
# columns (picked by position) and loading both of them as strings.
filename = '2022-Updates-to-the-2012-PSOC.xlsx'
relevant_cols = [2, 5]
names = ['PSOC', 'ISCO']
df_maps = pd.read_excel(
    filename,
    sheet_name=None,  # None -> one DataFrame per sheet, keyed by sheet name
    usecols=relevant_cols,
    names=names,
    dtype=dict.fromkeys(names, str),
)

# Fold all sheets into a single PSOC -> ISCO lookup.
# Rows missing either code are dropped from each sheet (in place); if a PSOC
# code shows up more than once, the pair seen last overwrites earlier ones.
psoc_isco = {}
for df_map in df_maps.values():
    df_map.dropna(inplace=True)
    psoc_isco.update(zip(df_map['PSOC'], df_map['ISCO']))

# Using FINAL (WIP)-MCA Job list

Two sheets of the Excel file *FINAL (WIP)-MCA Job list August 2025.xlsx* are of interest to us: **USE_final** and **Pass 3 - Sector reports**. The former is important because it contains the final list of jobs that should have the ordered pair of exposure vs. complementarity scores. The latter is an auxiliary sheet that contains the PSOC codes of jobs. We noticed that the overlap between the job lists of these two sheets is over 97%.

In [76]:
# Open the MCA job-list workbook.
filename = 'FINAL (WIP)-MCA Job list August 2025.xlsx'

# Final job list: first sheet of the workbook (selected by position).
final_jobs_df = pd.read_excel(filename, sheet_name=0)

# Sheet that carries the PSOC codes: fifth sheet of the workbook (by position).
pass_df = pd.read_excel(filename, sheet_name=4)

# Compare titles with missing values removed and surrounding whitespace
# trimmed, so that stray leading/trailing spaces do not make otherwise
# identical titles count as different jobs.
final_jobs = set(final_jobs_df['Job Title'].dropna().str.strip())
pass_jobs = set(pass_df['Job Title'].dropna().str.strip())

# Jaccard index = |A ∩ B| / |A ∪ B|; defined as 0 when both lists are empty
# to avoid a ZeroDivisionError.
inter = len(final_jobs & pass_jobs)
union = len(final_jobs | pass_jobs)
jaccard = inter / union if union else 0.0

# Format with the '%' presentation type instead of multiplying a rounded
# float by 100, which can print artefacts such as 56.99999999999999.
print(f'The two lists share over {round(jaccard, 2):.1%} of jobs in common.')

The two lists share over 98.0% of jobs in common.


In [77]:
# Build the job-title -> PSOC lookup from the sheet that carries PSOC codes.
# Keys are whitespace-trimmed titles.
job_psoc = dict(zip(pass_df['Job Title'].str.strip(), pass_df['PSOC Code']))

# Trim the final list's titles the same way before looking them up; otherwise
# a title with stray leading/trailing spaces can never match a trimmed key and
# is wrongly reported as having no PSOC code. Titles absent from the lookup
# become NaN.
final_jobs_df['PSOC'] = final_jobs_df['Job Title'].str.strip().map(job_psoc)

# Report the actual number of unmatched jobs rather than a hard-coded count,
# so the message stays correct when the workbook changes.
n_missing = int(final_jobs_df['PSOC'].isna().sum())
print(f"Only {n_missing} jobs do not have a PSOC Code. "
      "My simple answer would be to just drop, "
      "but I don't know if that is the best solution.")

# Per-column count of missing values (displayed as the cell output).
final_jobs_df.isnull().sum()

Only 15 jobs do not have a PSOC Code. My simple answer would be to just drop, but I don't know if that is the best solution.


Job Title                                                  0
Job Sector                                                 0
Job Subsector                                              0
Educational Pathway                                        0
HEI with PRC (Professional Regulation Commission) Exam     0
Some HEI                                                   0
PSOC                                                      15
dtype: int64

In [75]:
# Whitespace-trimmed titles of the jobs whose PSOC column is still empty.
(
    final_jobs_df
    .loc[final_jobs_df['PSOC'].isna(), 'Job Title']
    .str.strip()
    .to_list()
)

['Cold Storage Technician',
 'Commercial Diver',
 'Dental Laboratory Technician',
 'Digital Background Painting',
 'EV Battery Automotive Mechanic',
 'Food Safety Auditor',
 'HACCP Coordinator',
 'Junior Airline Accounting Clerk',
 'Maintenance Laborer',
 'Milk Processing Technician',
 'Product Development Technician',
 'R&D Food Specialist',
 'Semiconductor Operator',
 'Wastewater Treatment Specialist',
 'Water Quality Engineer']

In [None]:
# NOTE(review): unexecuted scratch cell. It is a bare list literal holding a
# single job title; running it only echoes the list and assigns nothing.
# Presumably leftover exploration -- confirm, then remove or replace with a TODO.
['Cold Storage Technician']