## Setup

In [None]:
from tqdm import tqdm
import time
from tenacity import retry

In [None]:
# copy from https://github.com/onetcenter/web-services-samples/blob/master/python-3/OnetWebService.py
import urllib.request, urllib.parse, urllib.error
import urllib.request, urllib.error, urllib.parse
import base64
import json

class OnetWebService:
    """Minimal client for the O*NET Web Services REST API.

    Requests are sent with HTTP Basic authentication and ask for JSON.
    Failures are reported as a dict with an ``'error'`` key instead of being
    raised, so callers must inspect the result.
    """

    def __init__(self, username, password):
        """Build the request headers for *username* / *password* and select
        the default (unversioned) API root."""
        credentials = (username + ':' + password).encode()
        self._headers = {
            'User-Agent': 'python-OnetWebService/1.00 (bot)',
            'Authorization': 'Basic ' + base64.standard_b64encode(credentials).decode(),
            'Accept': 'application/json' }
        self.set_version()

    def set_version(self, version = None):
        """Select the API root URL.

        With ``version=None`` the unversioned ``/ws/`` root is used;
        otherwise *version* (a string) is inserted as ``/v<version>/ws/``.
        """
        if version is None:
            self._url_root = 'https://services.onetcenter.org/ws/'
        else:
            self._url_root = 'https://services.onetcenter.org/v' + version + '/ws/'

    def call(self, path, *query):
        """GET ``path`` relative to the API root and return the decoded JSON.

        path:  resource path appended to the API root.
        query: zero or more ``(name, value)`` tuples, URL-encoded into the
               query string (sequence values are expanded, ``doseq=True``).

        Returns the parsed JSON body for HTTP 200 and 422 responses. Any
        other HTTP status or a connection failure yields a dict with an
        ``'error'`` message. For an unexpected non-error status the dict
        also carries the still-open response under ``'urllib2_info'``.
        """
        url = self._url_root + path
        if query:
            url += '?' + urllib.parse.urlencode(query, True)
        req = urllib.request.Request(url, None, self._headers)
        try:
            handle = urllib.request.urlopen(req)
        except urllib.error.HTTPError as e:
            # HTTPError wraps the response; release it once we are done
            # with the body so the connection is not leaked.
            try:
                if e.code == 422:
                    # 422 responses carry a JSON body; return it to the caller.
                    return json.load(e)
                return { 'error': 'Call to ' + url + ' failed with error code ' + str(e.code) }
            finally:
                e.close()
        except urllib.error.URLError as e:
            return { 'error': 'Call to ' + url + ' failed with reason: ' + str(e.reason) }
        code = handle.getcode()
        if code not in (200, 422):
            # The response is handed back to the caller, so it is left open
            # here on purpose.
            return { 'error': 'Call to ' + url + ' failed with error code ' + str(code),
                     'urllib2_info': handle }
        # Close the response as soon as the body has been parsed.
        with handle:
            return json.load(handle)

In [None]:
# API_KEYS is a local module; only its ONET attribute is read here, so the
# password itself is not written in this notebook.
import API_KEYS 

# O*NET Web Services account name and password used for HTTP Basic auth.
username = "a_study_on_occupatio"
password = API_KEYS.ONET

In [None]:
# Shared client instance used by every API call in the cells below.
onet_ws = OnetWebService(username, password)

In [None]:
def check_for_error(service_result):
    """Raise RuntimeError if *service_result* contains an ``'error'`` entry.

    The error text stored under ``'error'`` becomes the exception message.
    Returns None when no error is present.
    """
    if 'error' not in service_result:
        return
    raise RuntimeError(service_result['error'])

# Smoke test: query the 'about' resource and report the API version in use.
vinfo = onet_ws.call('about')
check_for_error(vinfo)
print(f"Connected to O*NET Web Services version {vinfo['api_version']}")
print()

## Crosswalk from O*NET-SOC to SOC 

Using the CSV files from the taxonomy page is more convenient than using the API.

In [None]:
# O*NET-SOC 2019 (Code;Title;Description)
# Downloaded straight from the O*NET taxonomy page as CSV.
# NOTE(review): `pd` is not imported in any cell shown here — confirm that
# pandas is imported elsewhere (e.g. an IPython startup file).
onetsoc = pd.read_csv("https://www.onetcenter.org/taxonomy/2019/list/2019_Occupations.csv?fmt=csv")

In [None]:
# O*NET-SOC 2019 -> 2018 SOC (Code;Title)
# The "O*NET-SOC 2019 Code" column of this table is what later cells use to
# enumerate occupations and to join the skill scores back to SOC codes.
crosswalk = pd.read_csv("https://www.onetcenter.org/taxonomy/2019/soc/2019_to_SOC_Crosswalk.csv?fmt=csv")

In [None]:
# Display the crosswalk table for a quick visual check.
crosswalk

## Occupation Report

https://services.onetcenter.org/reference/online/occupation/details/full_report

This response returns information from the occupation overview and all of the specific Details report services.

- "code": "O*NET-SOC Code"
- "display" : "short"
  - In most report sections, up to 10 items are returned by default. To see all available items in each report section, set the optional display parameter to long. This is equivalent to setting the parameter on each of the individual report services.
- "occupation": 
  - 'code'; 'title'; 'description'; 'sample_of_reported_job_titles'; 
  - 'also_see' (more detailed occupation code not in SOC)
  - 'updated'
  - 'summary_resources'; 'details_resources'; 'custom_resources'
- ... 
- "tools_technology":
  - Important: The response may include a tools_technology element, which is included for backward compatibility only. It will be removed in a future release and should not be used in new applications. Please use the technology_skills and tools_used elements instead.

**Lise & Postel-Vinay (2020, AER)**
- descriptors from [skills, abilities, knowledge, work activities, work context] are used to form skill requirements
- [job interests, work values, work styles] are less directly interpretable in terms of skill requirements
- [experience/education requirements] are used to construct the cognitive, manual, and interpersonal scores for each education level (take the average value from the education requirements of each occupation in ONET)

In [None]:
def path(code):
    """Return the API path of the full details report for one occupation.

    code: O*NET-SOC code, e.g. "17-2051.00".
    The result is relative to the web-service root URL.
    """
    return f"online/occupations/{code}/details/"

In [None]:
# Fetch one sample report (code 17-2051.00) with display=long so every
# item in each report section is returned, not only the default subset.
r = onet_ws.call(path("17-2051.00"), ('display', "long"))

In [None]:
# List the top-level sections present in the sample report.
list(r)

In [None]:
# Show every element of the "skills" section of the sample report.
# A bare expression inside a loop is not echoed by the notebook, so each
# element is printed explicitly.
for i in r["skills"]['element']:
    print(i)

In [None]:
# Lookup table from descriptor id to its readable name, collected from the
# five report sections used as skill-requirement descriptors.
descriptors_map = {}
for category in ["skills", "abilities", "knowledge", "work_activities", "work_context"]:
    elements = r[category]['element']
    descriptors_map.update({element['id']: element['name'] for element in elements})

In [None]:
# Scratch check: a list of dicts becomes one DataFrame row per dict, with the
# dict keys as columns.
# NOTE(review): `d` is not defined by any earlier cell shown here (its only
# assignment above is commented out) — confirm what this cell should use.
pd.DataFrame([d,d], index=[1,2])

In [None]:
from tenacity import stop_after_attempt, wait_exponential


# A bare @retry retries forever with no delay, which would hammer the service
# and never terminate on a permanent error (e.g. an unknown occupation code).
# Bound the attempts, back off between them, and re-raise the last error.
@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, max=30), reraise=True)
def access_onet(onet, occ):
    """Fetch the long-form details report for one occupation.

    onet: client exposing ``call(path, *query)`` (an OnetWebService).
    occ:  O*NET-SOC code of the occupation.

    Returns the decoded report dict. Raises RuntimeError (from
    check_for_error) if the service still reports an error after the
    final attempt.
    """
    report = onet.call(path(occ), ('display', "long"))
    check_for_error(report)
    return report

def get_occ_matrix(onet, occs, categories,):
    """Build an occupation-by-descriptor score matrix from O*NET reports.

    onet:       client passed through to access_onet.
    occs:       iterable of O*NET-SOC codes; also used as the row index.
    categories: report sections whose elements are collected
                (e.g. "skills", "abilities").

    Returns a DataFrame with one row per occupation and one column per
    descriptor id. A descriptor without a score is stored as None, and
    descriptors absent from an occupation's report end up as NaN.
    """
    rows = []
    for occ in tqdm(occs):
        report = access_onet(onet, occ)
        scores = {}
        for category in categories:
            if category not in report:
                # Some occupations lack a category. Skip only that one:
                # stopping here would also discard every later category
                # that the report does contain.
                continue
            for element in report[category]['element']:
                scores[element['id']] = (
                    element['score']['value'] if 'score' in element else None
                )
        rows.append(scores)
    occ_matrix = pd.DataFrame(rows, index=occs)
    return occ_matrix

In [None]:
# Every O*NET-SOC 2019 code listed in the crosswalk table.
occ_all = crosswalk["O*NET-SOC 2019 Code"].values
# Report sections used as skill-requirement descriptors.
categories = ["skills", "abilities", "knowledge", "work_activities", "work_context"]

# One report is requested per code, so this cell is slow; tqdm shows progress.
occ_matrix = get_occ_matrix(onet_ws, occ_all, categories)

In [None]:
# Preview the occupations that have a value for every descriptor
# (the result is only displayed here, not stored).
occ_matrix.dropna()

## PCA

Three exclusion restrictions in LPV (Lise & Postel-Vinay 2020):
- (i) the mathematics score only reflects cognitive skill requirements; ('2.A.1.e')
- (ii) the mechanical knowledge score only reflects manual skill requirements; ('2.C.3.e')
- (iii) the social perceptiveness score only reflects interpersonal skill requirements. ('2.B.1.a')

In [None]:
# Keep only occupations with no missing descriptor.
occm = occ_matrix.dropna()

In [None]:
# Move the three anchor descriptors (mathematics, mechanical, social
# perceptiveness) to the first three columns, keeping the others in order.
dimen_occs = ['2.A.1.e','2.C.3.e','2.B.1.a']
other_cols = [col for col in occm.columns if col not in dimen_occs]
reorder_cols = dimen_occs + other_cols
occm = occm.loc[:, reorder_cols]

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Reduce the descriptor matrix to three principal components.
n_components = 3
pca = PCA(n_components=n_components)
# Column means of the descriptor matrix; not referenced again in the cells
# shown here.
# NOTE(review): `np` is not imported in any cell shown here — confirm that
# numpy is imported elsewhere.
mu = np.mean(occm.values, axis=0)
# X holds the component scores, one row per occupation.
X = pca.fit_transform(occm.values)
# Not the last expression of the cell, so the notebook does not echo it.
X.shape
# components_ is the loading, 
# see e.g. https://scentellegher.github.io/machine-learning/2020/01/27/pca-loadings-sklearn.html 
# or https://stats.stackexchange.com/questions/229092/how-to-reverse-pca-and-reconstruct-original-variables-from-several-principal-com
pca.components_.shape 

In [None]:
# Multiply the component scores by the loadings of the first three columns
# of occm. Assuming the reorder cell above has run, those columns are the
# three anchor descriptors, so XT is their mean-centred reconstruction from
# the three components — TODO confirm this is the intended LPV rotation.
XT = X @ pca.components_[:,:3]

In [None]:
# linear rescaling to (0,1)
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler rescales each column independently using that column's own
# minimum and maximum.
scaler = MinMaxScaler()
XT_norm = scaler.fit_transform(XT)

In [None]:
# Display the rescaled scores.
XT_norm

In [None]:
# Label the three rescaled columns; their order follows the anchor order
# (mathematics, mechanical, social perceptiveness) set in dimen_occs.
occm_XTn =pd.DataFrame(XT_norm, index=occm.index, columns=["cognitive", "manual", "interpersonal"])
# Attach the scores to the crosswalk rows by O*NET-SOC code. merge() defaults
# to an inner join, so occupations without scores are dropped.
occm_cw = crosswalk.set_index("O*NET-SOC 2019 Code").merge(occm_XTn, left_index=True, right_index=True,) # .merge(occm, left_index=True, right_index=True,)

In [None]:
# It is somewhat odd that engineers come out with the highest cognitive rank,
# but overall the cognitive, manual and interpersonal measures look sensible.
# They are numerically precise yet remain rough proxies for those
# interpretations.
# NOTE(review): `prints` is not defined in any cell shown here — presumably a
# display helper defined elsewhere; confirm.
prints(occm_cw.sort_values("cognitive", ascending=False))
# prints(occm_cw.sort_values("manual", ascending=False))
# prints(occm_cw.sort_values("interpersonal", ascending=False))

In [None]:
# Check some specific occupations: rows whose 2018 SOC code starts with "15".
prints(occm_cw[occm_cw["2018 SOC Code"].str.startswith("15")])