# data preparation

### import the best model

ATTENTION! in order to comply with the git-hub file limits, the file *ft_models/vectors_2018.vec* is compressed. you need to unzip it before loading the model.

In [1]:
from gensim.models import KeyedVectors
import pandas as pd

model_file = 'ft_models/vectors_2018.vec'
model = KeyedVectors.load_word2vec_format(model_file, binary=False)



### import esco occupations
The list of ESCO occupations has been downloaded from the official [ESCO](https://ec.europa.eu/esco/) website.


In [2]:
import pandas as pd
esco = pd.read_csv('data/occupations_en.csv', dtype={'iscoGroup': object})

### select only ICT jobs
In the paper we didn't apply TaxoRef to the whole ESCO, but on a subrgoup of it. We focused on ICT occupations, since it is the field in which we (authors) have more experience. As a consequence, we can better evaluate the results. Below you can find the ESCO codes corresponding yo ICT occupations

In [3]:
ICT_jobs = [1330, 2152, 2511, 2512, 2513, 2514, 2519, 2521, 2522, 2523, 2529, 3511, 3512, 3513, 3514, 3521, 3522]
ict = esco[esco.iscoGroup.astype(int).isin(ICT_jobs)]
ict.head(1)

Unnamed: 0,conceptType,conceptUri,iscoGroup,preferredLabel,altLabels,hiddenLabels,status,modifiedDate,regulatedProfessionNote,scopeNote,definition,inScheme,description,code
62,Occupation,http://data.europa.eu/esco/occupation/0464b062...,2529,ICT security administrator,system security administrator\nnetwork securit...,,released,2016-07-05T17:02:06Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/occu...,ICT security administrators plan and carry out...,2529.5


### explode alternative labels
each ESCO label (column preferredLabel) has a number of alternative lables, which will undergo the refinement process as well.

In [4]:
pd.options.mode.chained_assignment = None
ict_alt = ict.copy()
ict_alt['altLabels'] = ict_alt['altLabels'].str.split('\n')
ict_alt = ict_alt.explode('altLabels')
ict_alt = ict_alt[['iscoGroup', 'preferredLabel', 'altLabels']]
ict_alt.head(1)

Unnamed: 0,iscoGroup,preferredLabel,altLabels
62,2529,ICT security administrator,system security administrator


### filter occupations and join vector representation
Some occupations do not appear in the Job Vacancies corpus, others appear rarely. In the training phase of the vector models, we set a thresold to filter out occupations that appeared less than 20 times.
In the following cell, we select only the occupations that have a vector representation, thus that appeared at least 20 times in the job vacancies.

In [5]:
ict_vectors = pd.DataFrame(columns=['iscoGroup', 'preferredLabel', 'altLabels', 'vector', 'sample'])
for index, row in ict_alt.iterrows():
    alt = ('_').join(row['altLabels'].split())
    if alt in model:
        row['vector'] = model[alt]
        sample = list(model[alt])
        sample.append(row['iscoGroup'])
        row['sample'] = sample
        ict_vectors.loc[ict_vectors.shape[0]] = row
ict_vectors

Unnamed: 0,iscoGroup,preferredLabel,altLabels,vector,sample
0,2529,ICT security administrator,system security administrator,"[0.0073804, 0.035756, -2.5797, -0.87285, 0.147...","[0.0073804, 0.035756, -2.5797, -0.87285, 0.147..."
1,2529,ICT security administrator,network security administrator,"[-0.42783, 0.063033, -3.258, -0.57337, 0.28755...","[-0.42783, 0.063033, -3.258, -0.57337, 0.28755..."
2,2512,software analyst,programming analyst,"[1.1001, -0.54524, -2.7346, -0.34688, 0.32395,...","[1.1001, -0.54524, -2.7346, -0.34688, 0.32395,..."
3,2512,software analyst,software requirement analyst,"[0.10966, -0.45945, -1.4405, -0.88691, 0.3842,...","[0.10966, -0.45945, -1.4405, -0.88691, 0.3842,..."
4,2512,software analyst,application analyst,"[1.508, -0.53921, -2.3979, -1.3048, 0.85856, 1...","[1.508, -0.53921, -2.3979, -1.3048, 0.85856, 1..."
...,...,...,...,...,...
129,2512,software developer,software engineer,"[1.1506, -0.69103, -2.5371, 0.34477, -0.28465,...","[1.1506, -0.69103, -2.5371, 0.34477, -0.28465,..."
130,2512,software developer,application developer,"[0.56242, 0.066944, -2.2078, -0.16042, -0.2329...","[0.56242, 0.066944, -2.2078, -0.16042, -0.2329..."
131,2511,user experience analyst,user experience officer,"[0.12692, 0.52161, -1.8411, -2.1805, -1.1691, ...","[0.12692, 0.52161, -1.8411, -2.1805, -1.1691, ..."
132,2511,user experience analyst,usability analyst,"[-0.049365, -0.77993, -2.2167, -0.75653, -0.84...","[-0.049365, -0.77993, -2.2167, -0.75653, -0.84..."


In [6]:
counts = ict_vectors.groupby(['iscoGroup'])['altLabels'].count()
keep = counts[counts > 2].index.to_list()

In [7]:
ict_vectors = ict_vectors[ict_vectors['iscoGroup'].isin(keep)]

# taxonomy refinement

### select data
For each word, we use its vector elements as features and its ESCO group as class

In [8]:
import numpy as np
data = ict_vectors['sample'].tolist()
data = np.array(data).astype(float)

### compute class statistics

In [9]:
# dataset statistics
# Function to compute mean, standard deviation and length for each column in a dataset
def summary_stats(data):
    summaries = [(np.mean(column), np.std(column), len(column)) for column in zip(*data)]
    del(summaries[-1])
    return summaries

# class statistics
# Function to slpit the data by class
def split_by_class(data):
    splitted = dict()
    for i in range(len(data)):
        vector = data[i]
        class_value = vector[-1]
        if (class_value not in splitted):
            splitted[class_value] = list()
        splitted[class_value].append(vector)
    return splitted

# Function to compute mean, standard deviation and length for each column AND FOR EACH CLASS in the dataset
def summary_class_stats(data):
    splitted = split_by_class(data)
    summaries = dict()
    for class_value, rows in splitted.items():
        summaries[class_value] = summary_stats(rows)
    return summaries

In [10]:
#compute summary and class statistics
stats = summary_stats(data)
class_stats = summary_class_stats(data)

### compute gaussian probability

In [11]:
from math import exp, sqrt, pi

#Gaussian probability denfity function
def gaussian_probability(x, mean, stdev):
    exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

In [12]:
# For each word (rowe), compute its class probability
def class_probabilities(stats, row):
    total_rows = sum([stats[label][0][2] for label in stats])
    probabilities = dict()
    for class_value, class_stats in stats.items():
        probabilities[class_value] = stats[class_value][0][2]/float(total_rows)
        for i in range(len(class_stats)):
            mean, stdev, count = class_stats[i]
            probabilities[class_value] *= gaussian_probability(row[i], mean, stdev)
    return probabilities

### for each row (word) predict the class based on its word vector

In [13]:
predicted = []
for w in range(len(data)):
    scores = class_probabilities(class_stats, data[w])
    max_score = max(scores, key=scores.get)
    predicted.append(max_score)

### add the predicted class to the occupations dataframe
we also add the name of the original isco group and the suggested isco group to help the user evaluation

In [14]:
names =[
[1330,'ICT service managers'],
[2152,'Electronics engineers'],
[2511,'Systems analysts'],
[2512,'Software developers'],
[2513,'Web and multimedia developers'],
[2514,'Applications programmers'],
[2519,'Software and applications developers and analysts not elsewhere classified'],
[2521,'Database designers and administrators'],
[2522,'Systems administrators'],
[2523,'Computer network professionals'],
[2529,'Database and network professionals not elsewhere classified'],
[3511,'ICT operations technicians'],
[3512,'ICT user support technicians'],
[3513,'Computer network and systems technicians'],
[3514,'Web technicians'],
[3521,'Broadcasting and audiovisual technicians']]

In [15]:
refinement = ict_vectors[['preferredLabel', 'altLabels', 'iscoGroup']]
refinement['TaxoRefGroup'] = predicted
refinement['iscoGroup'] = refinement['iscoGroup'].astype(float)

iscoNames = pd.DataFrame(columns=['iscoGroup', 'iscoName'], data=names)
refinement = pd.merge(refinement, iscoNames, on='iscoGroup')

TaxorRefNames = pd.DataFrame(columns=['TaxoRefGroup', 'TaxorRefName'], data=names)
refinement = pd.merge(refinement, TaxorRefNames, on='TaxoRefGroup')

refinement

Unnamed: 0,preferredLabel,altLabels,iscoGroup,TaxoRefGroup,iscoName,TaxorRefName
0,ICT security administrator,system security administrator,2529.0,2529.0,Database and network professionals not elsewhe...,Database and network professionals not elsewhe...
1,ICT security administrator,network security administrator,2529.0,2529.0,Database and network professionals not elsewhe...,Database and network professionals not elsewhe...
2,digital forensics expert,digital forensics analyst,2529.0,2529.0,Database and network professionals not elsewhe...,Database and network professionals not elsewhe...
3,ethical hacker,vulnerability analyst,2529.0,2529.0,Database and network professionals not elsewhe...,Database and network professionals not elsewhe...
4,ICT security manager,information security manager,2529.0,2529.0,Database and network professionals not elsewhe...,Database and network professionals not elsewhe...
...,...,...,...,...,...,...
125,ICT system administrator,network administrator,2522.0,2522.0,Systems administrators,Systems administrators
126,ICT system administrator,sysadmin,2522.0,2522.0,Systems administrators,Systems administrators
127,webmaster,website administrator,3514.0,3514.0,Web technicians,Web technicians
128,webmaster,web administrator,3514.0,3514.0,Web technicians,Web technicians


# Example of refinement
Below we select a single ESCO group to better observe the refinement suggested by taxoref 

Note that: <br>
1\. Not alle the group have suggested refinements. In those cases TaxoRef is 100\% in accordance with ESCO <br>
2\. Results may vary slghtly from the ones presented in the paper for many reasons, for instance (i) the the randomness intrinsic to the embeddings training or (ii) the use of different underlying data which could contain different terms and relations, due to the evolutionary nature of the labour market. <br>

In [16]:
group = 2511
group_ref = refinement[refinement.iscoGroup == group]
group_ref

Unnamed: 0,preferredLabel,altLabels,iscoGroup,TaxoRefGroup,iscoName,TaxorRefName
11,integration engineer,software integration engineer,2511.0,2511.0,Systems analysts,Systems analysts
12,integration engineer,system integration engineer,2511.0,2511.0,Systems analysts,Systems analysts
13,embedded system designer,embedded system developer,2511.0,2511.0,Systems analysts,Systems analysts
14,data scientist,research data scientist,2511.0,2511.0,Systems analysts,Systems analysts
15,data scientist,data expert,2511.0,2511.0,Systems analysts,Systems analysts
16,ICT system analyst,system analyst,2511.0,2511.0,Systems analysts,Systems analysts
17,ICT system analyst,network analyst,2511.0,2511.0,Systems analysts,Systems analysts
18,ICT system developer,system developer,2511.0,2511.0,Systems analysts,Systems analysts
19,IT auditor,information technology auditor,2511.0,2511.0,Systems analysts,Systems analysts
20,data analyst,data warehousing analyst,2511.0,2511.0,Systems analysts,Systems analysts
