In [11]:
"""Evaluation analysis - 

1) We compare lightcast's algorithm with our lightcast mapped skills at the skill level. We guarantee to map to skills based on setting the minimum cosine similarity threshold to 0;
2) We also compare top extracted skills per occupation with ESCO's essential skills.
"""

"Evaluation analysis - \n\n1) We compare lightcast's algorithm with our lightcast mapped skills at the skill level. We guarantee to map to skills based on setting the minimum cosine similarity threshold to 0;\n2) We also compare top extracted skills per occupation with ESCO's essential skills.\n"

In [22]:
from ojd_daps_skills import config, bucket_name, logger
from ojd_daps_skills.getters.data_getters import (
    get_s3_resource,
    load_s3_data,
    save_to_s3,
)
import pandas as pd
import random

### 0. Relevant functions for analysis.

In [7]:
def percent_overlap(ojo_skills, lightcast_skills):
    """Calculate the percent overlap between two lists"""
    
    if len(ojo_skills) and len(lightcast_skills) > 0:

        setA = set(ojo_skills)
        setB = set(lightcast_skills)

        overlap = setA & setB
        universe = setA | setB


        result1 = float(len(overlap)) / len(setA) * 100
        result2 = float(len(overlap)) / len(setB) * 100
        result3 = float(len(overlap)) / len(universe) * 100

        return result1, result2, result3
    
    else:
        return 100, 100, 100

### 1. Relevant parameters and datasets for analysis.

In [23]:
s3 = get_s3_resource()
ojo_lightcast_skills = load_s3_data(s3, bucket_name, 'escoe_extension/outputs/evaluation/ojo_esmi_skills/ojo_lightcast_skills_20221115.json')
esco_occupations = load_s3_data(s3, bucket_name, 'escoe_extension/outputs/evaluation/aggregate_ojo_esco/ojo_esco_occupation_skills_results_v2.json')
esco_occupations_df = pd.DataFrame(esco_occupations).T.sort_values('skills_in_ojo_esco_percent', ascending=False).reset_index().rename(columns={'index':'occupation'})

### 2. Lightcast comparison analysis

In [24]:
for job_id, skill_info in ojo_lightcast_skills.items():
    comps = percent_overlap(skill_info['ojo_skills'], skill_info['lightcast_skills'])
    for comp_type, comp in zip(('ojo_skills_overlap', 'lightcast_skills_overlap', 'universal_overlap'), comps):
        skill_info[comp_type] = comp    

In [25]:
ojo_lightcast_skills_df = pd.DataFrame(ojo_lightcast_skills).T
ojo_lightcast_skills_df = ojo_lightcast_skills_df.sort_values('lightcast_skills_overlap', ascending=False)
ojo_lightcast_skills_df = ojo_lightcast_skills_df[~(ojo_lightcast_skills_df['ojo_skills'].str.len() == 0) & (ojo_lightcast_skills_df['lightcast_skills'].str.len() != 0)]

In [26]:
print('percent overlap analysis')

print(f"the % of job adverts with no skills overlap is: {len(ojo_lightcast_skills_df[ojo_lightcast_skills_df['ojo_skills_overlap'] == 0.0])/len(ojo_lightcast_skills_df)}")
print(f"the average # of lightcast skills we extract is: {ojo_lightcast_skills_df.ojo_skills.apply(lambda x: len(x)).mean()}")
print(f"the median # of lightcast skills we extract is: {ojo_lightcast_skills_df.ojo_skills.apply(lambda x: len(x)).median()}")

print(f"the average # of lightcast skills lightcast extracts is: {ojo_lightcast_skills_df.lightcast_skills.apply(lambda x: len(x)).mean()}")
print(f"the median # of lightcast skills lightcast extracts is: {ojo_lightcast_skills_df.lightcast_skills.apply(lambda x: len(x)).median()}")

print(f"of the job adverts with overlap, on average, {ojo_lightcast_skills_df[ojo_lightcast_skills_df['lightcast_skills_overlap'] != 0.0].lightcast_skills_overlap.mean()} of lightcast skills are present in our current approach.")
print(f"of the job adverts with overlap, the median is {ojo_lightcast_skills_df[ojo_lightcast_skills_df['lightcast_skills_overlap'] != 0.0].lightcast_skills_overlap.median()} of lightcast skills are present in our current approach.")

print(f"of the job adverts with overlap, on average, {ojo_lightcast_skills_df[ojo_lightcast_skills_df['ojo_skills_overlap'] != 0.0].ojo_skills_overlap.mean()} of our skills are present in lighcast skills.")
print(f"of the job adverts with overlap, the median is {ojo_lightcast_skills_df[ojo_lightcast_skills_df['ojo_skills_overlap'] != 0.0].ojo_skills_overlap.median()} of our skills are present in lightcast skills.")

percent overlap analysis
the % of job adverts with no skills overlap is: 0.425531914893617
the average # of lightcast skills we extract is: 10.872340425531915
the median # of lightcast skills we extract is: 10.0
the average # of lightcast skills lightcast extracts is: 6.74468085106383
the median # of lightcast skills lightcast extracts is: 5.0
of the job adverts with overlap, on average, 39.318289194020196 of lightcast skills are present in our current approach.
of the job adverts with overlap, the median is 33.33333333333333 of lightcast skills are present in our current approach.
of the job adverts with overlap, on average, 25.087275154612616 of our skills are present in lighcast skills.
of the job adverts with overlap, the median is 21.428571428571427 of our skills are present in lightcast skills.


### 3. ESCO occupations comparison analysis

In [27]:
print(f"{len(esco_occupations_df)} occupations (with 100 or more job adverts) in ESCO were also found in OJO.")
print(f"the average # of adverts per occupation (with 100 or more job adverts) is {esco_occupations_df.no_of_job_adverts.mean()}")
print(f"on average, {esco_occupations_df['skills_in_ojo_esco_percent'].mean()} percent of essential ESCO skills per occupation are were extracted from our algorithm.")
print(f"the median percent of essential ESCO skills per occupation are were extracted from our algorithm is {esco_occupations_df['skills_in_ojo_esco_percent'].median()}.")

print(f"the maximum % of skills mentioned in essential ESCO skills in OJO job adverts is {esco_occupations_df.iloc[0].skills_in_ojo_esco_percent}, for the occupation {esco_occupations_df.iloc[0].occupation}.")
print(f"there are {len(esco_occupations_df[esco_occupations_df['skills_in_ojo_esco_percent'] == 0.0])} occupations with no overlap.")

58 occupations (with 100 or more job adverts) in ESCO were also found in OJO.
the average # of adverts per occupation (with 100 or more job adverts) is 345.5344827586207
on average, 19.007832037090004 percent of essential ESCO skills per occupation are were extracted from our algorithm.
the median percent of essential ESCO skills per occupation are were extracted from our algorithm is 17.94871794871795.
the maximum % of skills mentioned in essential ESCO skills in OJO job adverts is 54.54545454545454, for the occupation project manager.
there are 1 occupations with no overlap.


In [36]:
random.seed(42)
occupation_name = random.choice(esco_occupations_df.occupation)

print(f'--top OJO skills for "{occupation_name}" that are not essential ESCO skills--')
print(list(esco_occupations_df[esco_occupations_df.occupation == occupation_name].in_ojo_not_esco))
print(f'--essential ESCO skills for "{occupation_name}" that were not extracted--')
print(list(esco_occupations_df[esco_occupations_df.occupation == occupation_name].in_esco_not_ojo))

--top OJO skills for "security officer" that are not essential ESCO skills--
[['thinking creatively', 'advocating for individual or community needs', 'promote and sell a product', 'monitoring procedures', 'processing information, ideas and concepts', 'work with colleagues', 'utilise management skills', 'operating metal, plastic or rubber forming equipment', 'write English', 'market research', 'managing costs', 'administering human resources', 'negotiating and managing contracts and agreements', 'ensure customers are satisfied', 'develop digital marketing strategies', 'using foreign languages', 'managing a website', 'have good computer literacy', 'monitoring and evaluating the performance of individuals', 'mathematics', 'acute care', 'manage maintenance operations', 'manage a website', 'assure safety in the production area', 'providing medical, dental and nursing care', 'managing customer service', 'undertake patrols', 'Jenkins CI', 'collaborate with stakeholders', 'protection of person

In [37]:
random.seed(54)
occupation_name = random.choice(esco_occupations_df.occupation)

print(f'--top OJO skills for "{occupation_name}" that are not essential ESCO skills--')
print(list(esco_occupations_df[esco_occupations_df.occupation == occupation_name].in_ojo_not_esco))
print(f'--essential ESCO skills for "{occupation_name}" that were not extracted--')
print(list(esco_occupations_df[esco_occupations_df.occupation == occupation_name].in_esco_not_ojo))

--top OJO skills for "marketing manager" that are not essential ESCO skills--
[['advocating for individual or community needs', 'supervising educational staff', 'utilise management skills', 'organise projects', 'communicate information on insurance products', 'managing costs', 'managing a website', 'work in multidisciplinary health-teams', 'liaise with owners', 'offer specialist advice on medication', 'managing customer service', 'work together with industry experts', 'maintain several projects', 'customer-oriented management', 'principles of project control', 'oversee infrastructure', 'design computer network', 'Scala', 'aspects of services', 'managing a team', 'clean kitchen preparation, production & storage areas', 'golf', 'employment legislation', 'determining values of goods or services', 'marketing administration', 'monitoring merchandise delivery', 'customer service', 'negotiating with employment agencies', 'manage accounts department', 'assisting in the practical actions for de

In [38]:
random.seed(72)
occupation_name = random.choice(esco_occupations_df.occupation)

print(f'--top OJO skills for "{occupation_name}" that are not essential ESCO skills--')
print(list(esco_occupations_df[esco_occupations_df.occupation == occupation_name].in_ojo_not_esco))
print(f'--essential ESCO skills for "{occupation_name}" that were not extracted--')
print(list(esco_occupations_df[esco_occupations_df.occupation == occupation_name].in_esco_not_ojo))

--top OJO skills for "software engineer" that are not essential ESCO skills--
[['thinking creatively', 'advocating for individual or community needs', 'monitoring procedures', 'utilise management skills', 'write English', 'organise projects', 'communicate information on insurance products', 'managing costs', 'negotiating and managing contracts and agreements', 'develop digital marketing strategies', 'using foreign languages', 'managing a website', 'have good computer literacy', 'acute care', 'manage maintenance operations', 'liaise with owners', 'assure safety in the production area', 'managing customer service', 'work together with industry experts', 'Jenkins CI', 'protection of persons and property', 'customer-oriented management', 'principles of project control', 'use pneumatic systems', 'oversee infrastructure', 'design computer network', 'an organisational structure', 'Scala', 'monitor court procedures', 'operate CAD software', 'record keeping in a medical environment', 'clean kit