In [252]:
import pandas as pd
from torch_geometric.data import HeteroData
import torch
import torch_geometric.transforms as T
pd.set_option('display.max_rows', 50)

In [253]:
# only use skill nodes which have normalized_name != NaN, this is some indication of quality skill (?)
skill_nodes = pd.read_csv('fullgraphdata/neo4jgraph/skills.csv').dropna(subset=['normalized_name']).reset_index()
job_nodes = pd.read_csv('fullgraphdata/neo4jgraph/onet_skills_unique.csv')

# drop some skills "or"
skill_nodes = skill_nodes.loc[~skill_nodes.skill.isin(['or','technology'])]

In [254]:
# There are duplicate normalized names
skill_nodes.shape[0]-skill_nodes.normalized_name.unique().shape[0]

38692

In [255]:
# There are not as many skill names which are duplicate
skill_nodes.shape[0]-skill_nodes.skill.unique().shape[0]

2483

In [256]:
# we can not use normalized name instead of skill, because it is ambiguous, e.g. communication points to different normalized names
skill_nodes.loc[skill_nodes.skill=='communication']

Unnamed: 0,index,skill,category,normalized_name
695,2229,communication,communication,Third-Party Provider Communication
1292,4059,communication,healthcare,Communication (Including SBAR)
4228,12919,communication,communication,Friendly Communication
5528,16927,communication,communication,radio/telephone communication
6311,19452,communication,communication,communication (phone and email)
...,...,...,...,...
223829,759817,communication,communication,Calling/Applying
224556,762531,communication,communication,Communication
238581,818822,communication,communication,Email/Phone Communication
245411,848577,communication,soft skills,Communication (Phone/Face-to-Face)


In [257]:
skill_nodes.drop_duplicates(subset='skill', inplace=True)

In [258]:

skill_job_edges = pd.read_csv('fullgraphdata/neo4jgraph/tfidf_skill_job_edge.csv')
#skill_job_edges = skill_job_edges.loc[skill_job_edges.scaled_tfidf>8]
# only use edges where we have the skill and job for from the other files
skill_job_edges = skill_job_edges.loc[skill_job_edges['skill'].isin(skill_nodes['skill'])]
skill_job_edges = skill_job_edges.loc[skill_job_edges['alt_title'].isin(job_nodes.index)]

In [259]:
skill_job_edges

Unnamed: 0,alt_title,skill,scaled_tfidf,n_jobdesc_used
1,55010,design,9.887307,240
5,55010,cg,8.744163,240
10,55010,visual effects,6.299518,240
11,55010,software,5.288013,240
12,55010,unity,5.278638,240
...,...,...,...,...
7926039,15285,analysis,6.147100,1
7926040,15285,software,6.013723,1
7926041,15285,engineering,5.864380,1
7926050,15285,development,4.434249,1


In [260]:
#for each alt title select the first 20 skill_job edges, ordered by tfidf
skill_job_edges = skill_job_edges.groupby('alt_title').apply(lambda group: group.nlargest(20,'scaled_tfidf')).reset_index(drop=True)

In [261]:
skill_job_edges

Unnamed: 0,alt_title,skill,scaled_tfidf,n_jobdesc_used
0,7,development,35.545516,1
1,7,physical work environment,14.444801,1
2,7,microsoft teams,13.682348,1
3,7,assessment process,11.763047,1
4,7,limited supervision,10.088181,1
...,...,...,...,...
282867,55652,communications,6.736089,7
282868,55652,systems,6.629133,7
282869,55652,driving,6.265465,7
282870,55652,highly specialized,5.501605,7


In [262]:
skillmapping ={}
for i,skill in enumerate(skill_job_edges.skill.unique()):
    skillmapping[skill] =i
    
jobmapping ={}
for i,title in enumerate(skill_job_edges.alt_title.unique()):
    jobmapping[title] =i
    
inverted_skillmapping = {v:k for k,v in skillmapping.items()}
inverted_jobmapping = {v:k for k,v in jobmapping.items()}

In [263]:
skill_job_edges['skill_dst'] = skill_job_edges['skill'].apply(lambda x:skillmapping[x])
skill_job_edges['job_src'] = skill_job_edges['alt_title'].apply(lambda x:jobmapping[x])

In [264]:
onet_alttitles = pd.read_csv('fullgraphdata/neo4jgraph/onet_alt_titles_unique.csv')
del onet_alttitles['Unnamed: 0']

In [265]:
onet_alttitle_str_mapping = {}
for i,row in onet_alttitles.iterrows():
    onet_alttitle_str_mapping[row['index']] = row['Alternate Title']

In [266]:
from sentence_transformers import SentenceTransformer, util
embedder = SentenceTransformer('all-MiniLM-L6-v2')


In [267]:
# create alttitle sbert embeddings

temp = onet_alttitle_str_mapping.items()
alttitle_sbert_embeddings = embedder.encode([v for k,v in temp], convert_to_tensor=False)
alttitle_sbert_indices = [k for k,v in temp]
#corpus_embeddings = util.normalize_embeddings(corpus_embeddings)

In [268]:
import numpy as np
v = alttitle_sbert_embeddings[0]
np.matmul(v.T,v)

0.99999994

In [269]:
temp = skillmapping.items()
skill_sbert_embeddings = embedder.encode([k for k,v in temp], convert_to_tensor=False)
skill_sbert_indices = [v for k,v in temp]

In [270]:

from sklearn.decomposition import PCA
X = np.concatenate([alttitle_sbert_embeddings,skill_sbert_embeddings])

print('Original:',X.shape[1])
for variance_retained in [0.99,0.95,0.9,0.8,0.75,0.7]:
    pca = PCA(n_components=variance_retained)
    pca.fit(X)
    n_components_retained = pca.n_components_
    print(n_components_retained,' components retained', variance_retained, ' variance retained')

Original: 384
311  components retained 0.99  variance retained
230  components retained 0.95  variance retained
182  components retained 0.9  variance retained


KeyboardInterrupt: 

In [None]:
# choose 128
pca = PCA(n_components=128)
pca.fit(X)
alttitle_sbert_embeddings_reduced = pca.transform(alttitle_sbert_embeddings)
skill_sbert_embeddings_reduced = pca.transform(skill_sbert_embeddings)
alttitle_sbert_reduced_mapping = {k:v for k,v in zip(alttitle_sbert_indices,alttitle_sbert_embeddings_reduced)}
skill_sbert_reduced_mapping = {k:v for k,v in zip(skill_sbert_indices,skill_sbert_embeddings_reduced)}

In [None]:
alttitle_sbert_embeddings_reduced.shape, skill_sbert_embeddings_reduced.shape

((55653, 128), (23025, 128))

In [None]:
data = HeteroData()
data['skill'] =

SyntaxError: invalid syntax (1433060194.py, line 2)

In [None]:
job_nodes


Unnamed: 0,index,O*NET-SOC Code,Title,Alternate Title,Short Title,Source(s)
0,6,11-1011.00,Chief Executives,Business Development Executive (BD Executive),BD Executive,9
1,7,11-1011.00,Chief Executives,Business Development Officer (BD Officer),BD Officer,9
2,10,11-1011.00,Chief Executives,CEO (Chief Executive Officer),CEO,02040810
3,11,11-1011.00,Chief Executives,Chief Administrative Officer (CAO),CAO,9
4,12,11-1011.00,Chief Executives,Chief Diversity Officer (CDO),CDO,2
...,...,...,...,...,...,...
2975,53556,53-7065.00,Stockers and Order Fillers,Quality Control Clerk (QC Clerk),QC Clerk,8
2976,53651,53-7065.00,Stockers and Order Fillers,Warehouse Technician (Warehouse Tech),Warehouse Tech,0208
2977,53681,53-7071.00,Gas Compressor and Gas Pumping Station Operators,Liquefied Natural Gas Plant Operator (LNG Plan...,LNG Plant Operator,0204
2978,53815,53-7121.00,"Tank Car, Truck, and Ship Loaders",PVC Loader (Polyvinyl Chloride Loader),PVC Loader,2
