Analyze the results, i.e. map the node embeddings back to their names and check whether they are sensible.

In [2]:
import torch
from torch_geometric.data import HeteroData
import os

filename = 'Job_Skill_HeteroData_v3.pt'
# Load the serialized graph only when the file is present in the working
# directory. When it is missing nothing happens here and `data` stays
# undefined, so later cells that use `data` will fail.
if os.path.exists(f'./{filename}'):
    data = HeteroData.from_dict(torch.load(f'./{filename}'))

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from torch_geometric.nn.conv import SimpleConv
from typing import List, Optional, Union, Tuple

import torch
import torch.nn.functional as F
from torch import Tensor
from torch_geometric.nn.dense.linear import Linear
from torch_geometric.nn.aggr import Aggregation
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.typing import (
    Adj,
    OptPairTensor,
    OptTensor,
    Size,
    SparseTensor,
    torch_sparse,
)
from torch_geometric.utils import add_self_loops, spmm

class WeightedGraphSageConv(SimpleConv):
    """GraphSAGE-style convolution layered on top of ``SimpleConv``.

    Processing steps of :meth:`forward`:

    1. If ``max_pool`` is truthy, source features go through
       ``lin_for_max_pool`` followed by a ReLU.
    2. Messages are propagated with the aggregation given by ``aggr``;
       ``edge_weight`` is handed to ``propagate`` unchanged.
    3. If ``combine_root`` is ``'sum'`` or ``'cat'`` and destination features
       are available, the aggregate is projected by ``lin_j_out``, the
       destination features by ``lin_i_out``, and both are added or
       concatenated.
    4. A ReLU is applied and, if ``normalize`` is truthy, every output row is
       scaled to unit L2 norm.

    Args:
        in_channels: Feature size, or ``(source size, destination size)``.
        out_channels: Output size. Must be even when ``combine_root='cat'``
            because each half of the concatenation gets ``out_channels // 2``.
        normalize: Whether to L2-normalize the output rows.
        max_pool: Whether to transform source features before aggregation.
        combine_root: ``None``, ``'sum'``, ``'cat'`` or ``'self_loop'``;
            forwarded to ``SimpleConv``.
        aggr: Aggregation scheme forwarded to ``SimpleConv``.
        bias: Whether ``lin_j_out`` and ``lin_i_out`` carry a bias term.
        **kwargs: Forwarded to ``SimpleConv``.

    Note:
        ``lin_i_out`` is sized for the destination features and ``lin_j_out``
        for the aggregated features (``out_channels`` wide when ``max_pool``
        is on). For equal source/destination sizes with
        ``in_channels == out_channels`` the parameter shapes are the same as
        before, so existing checkpoints of that configuration still load.
    """

    def __init__(self, in_channels: Union[int, Tuple[int, int]], out_channels: int, normalize, max_pool, combine_root: Optional[str] = None, aggr: str = 'add', bias: bool = True, **kwargs):
        super().__init__(aggr, combine_root, **kwargs)

        # from GraphConv https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/nn/conv/graph_conv.html#GraphConv
        self.in_channels = in_channels
        self.out_channels = out_channels

        if isinstance(in_channels, int):
            in_channels = (in_channels, in_channels)

        if self.combine_root == 'cat':
            # Root and neighbourhood parts each get half of the output width.
            assert out_channels%2==0 and out_channels!=-1, 'number of out_channels must be even, and no lazy initialization (-1) is supported'
            to_out_channels = out_channels//2
        else:
            to_out_channels = out_channels

        # With max_pool the source features are already projected to
        # `out_channels` before aggregation, so that is the width lin_j_out sees.
        aggregated_channels = out_channels if max_pool else in_channels[0]

        self.lin_j_out = Linear(aggregated_channels, to_out_channels, bias=bias)
        # lin_i_out is applied to the destination (root) features.
        self.lin_i_out = Linear(in_channels[1], to_out_channels, bias=bias)
        # Created unconditionally so the set of parameters does not depend on `max_pool`.
        self.lin_for_max_pool = Linear(in_channels[0], out_channels, bias=True)
        self.normalize = normalize
        self.max_pool = max_pool
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialize the parent module and the three linear layers."""
        super().reset_parameters()
        self.lin_j_out.reset_parameters()
        self.lin_i_out.reset_parameters()
        self.lin_for_max_pool.reset_parameters()

    def forward(self, x: Union[Tensor, OptPairTensor], edge_index: Adj,
                edge_weight: OptTensor = None, size: Size = None) -> Tensor:
        """Run one convolution step.

        Args:
            x: Node features, or a ``(source, destination)`` pair.
            edge_index: Edge indices as a tensor or ``SparseTensor``.
            edge_weight: Optional per-edge weights passed on to ``propagate``.
            size: Optional ``(num_source, num_destination)`` sizes.

        Returns:
            ReLU-activated (and optionally L2-normalized) destination features.

        Raises:
            ValueError: If ``combine_root='self_loop'`` is used with a feature
                pair or with unequal ``size`` entries.
        """
        if self.combine_root == 'self_loop':
            if not isinstance(x, Tensor) or (size is not None
                                             and size[0] != size[1]):
                raise ValueError("Cannot use `combine_root='self_loop'` "
                                 "for bipartite message passing")
            if isinstance(edge_index, Tensor):
                edge_index, edge_weight = add_self_loops(
                    edge_index, edge_weight, num_nodes=x.size(0))
            elif isinstance(edge_index, SparseTensor):
                edge_index = torch_sparse.set_diag(edge_index)

        if isinstance(x, Tensor):
            x = (x, x)

        if self.max_pool:
            # Only the source side is transformed; destination features stay raw.
            x = (self.lin_for_max_pool(x[0]).relu(), x[1])
        # propagate_type: (x: OptPairTensor, edge_weight: OptTensor)
        out = self.propagate(edge_index, x=x, edge_weight=edge_weight,
                             size=size)

        x_dst = x[1]
        if x_dst is not None and self.combine_root in ('sum', 'cat'):
            x_dst = self.lin_i_out(x_dst)
            out = self.lin_j_out(out)
            if self.combine_root == 'sum':
                out = out + x_dst
            else:
                out = torch.cat([x_dst, out], dim=-1)

        out = torch.relu(out)

        if self.normalize:
            out = F.normalize(out, p=2., dim=-1)
        return out


# GraphSAGE convolution as in the paper, configured here as the max-pooling
# variant (max_pool=True together with aggr='max').
# normalize=True rescales every output embedding to unit L2 length.
# Passing edge weights to forward() is optional; once they are passed the
# layer is no longer exactly the one from the paper.
conv = WeightedGraphSageConv(
    in_channels=256,
    out_channels=256,
    normalize=True,
    max_pool=True,
    combine_root='cat',
    aggr='max',
    bias=True,
)

In [4]:
from typing import Tuple, Union
from torch import Tensor
import torch
import torch_geometric
from torch_geometric.nn import to_hetero, HeteroDictLinear, Linear
from torch_geometric.nn.conv import GraphConv, SAGEConv, SimpleConv, HeteroConv

from torch_geometric.typing import Adj, OptPairTensor, OptTensor, Size
from torch_geometric import seed_everything
from torch_geometric.utils import trim_to_layer

class WeightedSkillSAGE(torch.nn.Module):
    """Heterogeneous encoder for ``Skill`` and ``Job`` nodes.

    Pipeline: a per-node-type input ``Linear`` + ReLU, then ``n_conv_layers``
    ``HeteroConv`` layers made of ``WeightedGraphSageConv`` modules, then a
    per-node-type output ``Linear`` + ReLU.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, n_conv_layers):
        super().__init__()

        self.lin_in = torch.nn.ModuleDict({
            'Skill': Linear(in_channels, hidden_channels),
            'Job': Linear(in_channels, hidden_channels),
        })

        def make_conv():
            # Every relation uses the same hyper-parameters.
            return WeightedGraphSageConv(
                (hidden_channels, hidden_channels),
                hidden_channels,
                normalize=True,
                max_pool=True,
                combine_root='cat',
                aggr='max',
                bias=True,
            )

        self.hetero_convs = torch.nn.ModuleList()
        for _ in range(n_conv_layers):
            # One module instance serves a similarity relation and its reverse.
            shared_skill_conv = make_conv()
            shared_job_conv = make_conv()
            relation_convs = {
                ('Job', 'REQUIRES', 'Skill'): make_conv(),
                ('Skill', 'rev_REQUIRES', 'Job'): make_conv(),
                ('Skill', 'IS_SIMILAR_SKILL', 'Skill'): shared_skill_conv,
                ('Skill', 'rev_IS_SIMILAR_SKILL', 'Skill'): shared_skill_conv,
                ('Job', 'IS_SIMILAR_JOB', 'Job'): shared_job_conv,
                ('Job', 'rev_IS_SIMILAR_JOB', 'Job'): shared_job_conv,
            }
            self.hetero_convs.append(HeteroConv(relation_convs, aggr='sum'))

        self.lin_out = torch.nn.ModuleDict({
            'Skill': Linear(hidden_channels, out_channels),
            'Job': Linear(hidden_channels, out_channels),
        })

    def forward(self, x_dict, edge_index_dict, edge_weight_dict, num_sampled_edges_dict, num_sampled_nodes_dict):
        """Compute embeddings for every node type present in ``x_dict``.

        Before each layer the inputs are trimmed with ``trim_to_layer`` using
        the per-hop sample counts, so only representations still needed are
        computed (see
        https://pytorch-geometric.readthedocs.io/en/latest/advanced/hgam.html).

        Returns:
            Dict mapping node type to its ReLU-activated output features.
        """
        hidden = {}
        for node_type, feats in x_dict.items():
            hidden[node_type] = F.relu(self.lin_in[node_type](feats))

        for depth, hetero_conv in enumerate(self.hetero_convs):
            hidden, edge_index_dict, edge_weight_dict = trim_to_layer(
                layer=depth,
                num_sampled_nodes_per_hop=num_sampled_nodes_dict,
                # sampled edges per edge type and hop, e.g.
                # ('Job', 'REQUIRES', 'Skill'): [3083, 14514]
                num_sampled_edges_per_hop=num_sampled_edges_dict,
                x=hidden,
                edge_index=edge_index_dict,
                edge_attr=edge_weight_dict,
            )
            # The conv layers already end in a ReLU, so none is added here.
            hidden = hetero_conv(hidden, edge_index_dict, edge_weight_dict)

        result = {}
        for node_type, feats in hidden.items():
            result[node_type] = F.relu(self.lin_out[node_type](feats))
        return result

# Seed before building the model so its initial weights are reproducible.
seed_everything(14)
model = WeightedSkillSAGE(
    in_channels=132,
    hidden_channels=256,
    out_channels=256,
    n_conv_layers=2,
)

In [5]:

# map_location='cpu' lets a checkpoint written on a GPU machine be read on a
# CPU-only machine as well. The model is not moved to a GPU in the cells shown
# here, and load_state_dict copies the values into the model's own parameters.
checkpoint = torch.load(
    'runs/WeightedSkillSAGE_lr_2emin7_1lin_1lin_256dim_edgeweight_checkpoints/checkpoint_ep2.pt',
    map_location='cpu',
)
# Only the model weights are restored; an optimizer state (presumably stored
# under checkpoint['optimizer_state_dict']) is not needed for inference.
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [6]:
from torch_geometric import seed_everything
import torch_geometric.transforms as T


# Link-level split of the graph into train / validation / test data.
# Each entry of `rev_edge_types` is the reverse relation of the entry at the
# same position in `edge_types`.
transform = T.RandomLinkSplit(
    is_undirected=True,
    edge_types=[
        ('Job', 'REQUIRES', 'Skill'),
        ('Skill', 'IS_SIMILAR_SKILL', 'Skill'),
        ('Job', 'IS_SIMILAR_JOB', 'Job')
        ],
    rev_edge_types=[
        ('Skill', 'rev_REQUIRES', 'Job'),
        ('Skill', 'rev_IS_SIMILAR_SKILL', 'Skill'),
        ('Job', 'rev_IS_SIMILAR_JOB', 'Job')
    ],
    num_val=0.005,
    num_test=0.01,
    add_negative_train_samples=False, # negatives are only generated for val and test here; training negatives are meant to be added by a LinkNeighborLoader (not part of this notebook), so they differ per training batch while the val/test negatives stay the same
    neg_sampling_ratio=1.0,
    disjoint_train_ratio=0, # training edges are shared for message passing and supervision
    

    )

# Re-seed right before applying the transform so the split is reproducible.
seed_everything(14)
train_data, val_data, test_data = transform(data)

In [43]:
# Lookup tables used to translate node indices back to skill names and job titles.
mapping = torch.load('Job_Skill_HeteroData_name_mappings_v3.pt')

In [46]:
mapping['inverted_skillmapping']

{0: 'bilingual communication skills',
 1: 'leadership-focused curriculum',
 2: 'multivendor network services',
 3: 'working in shift',
 4: 'healthcare data understanding',
 5: 'pharmaceutical regulations and guidelines',
 6: 'digital creators',
 7: 'data mapping/design',
 8: 'protocol drafting',
 9: 'turbomachinery operation',
 10: 'cost decrease',
 11: 'integrated solutions',
 12: 'pi software',
 13: 'building envelop systems',
 14: 'full-stack solutions development',
 15: 'oracle controls auditing',
 16: 'operating system',
 17: 'quality programs/standards',
 18: 'cscs card',
 19: 'recruitment accessibility',
 20: 'grading operations',
 21: 'machinery installation and alignment',
 22: 'complex selling',
 23: 'stock and product management',
 24: 'tax and pay adjustments',
 25: 'engineering/mathematical background',
 26: 'correlation rules',
 27: 'personal financial counseling',
 28: 'internet server maintenance',
 29: 'program assessment solutions',
 30: 'middlewares',
 31: 'knowledge

In [8]:
mapping.keys()

dict_keys(['onet_alttitle_str_mapping', 'skillmapping', 'inverted_skillmapping', 'jobmapping', 'inverted_jobmapping', 'jobmapping_index_to_title_alttile'])

In [34]:
from tqdm.auto import tqdm
import numpy as np
from torch_geometric.loader import NeighborLoader
import gc

def get_entity_embedding(model, data, node_type, num_neighbors, node_ids,
                         batch_size=64, num_norm_iterations=20):
    """Compute model embeddings for the given nodes of one node type.

    The nodes are used as seeds of a ``NeighborLoader``; for every mini-batch
    the model is run ``num_norm_iterations`` times on the same sampled
    subgraph and the outputs are averaged.

    Args:
        model: Module called as ``model(x_dict, edge_index_dict,
            edge_weight_dict, num_sampled_edges_dict, num_sampled_nodes_dict)``
            and returning a dict of tensors keyed by node type.
        data: Graph handed to ``NeighborLoader``.
        node_type: Node type of the seed nodes, e.g. ``'Job'``.
        num_neighbors: Neighbour counts per hop, forwarded to the loader.
        node_ids: Indices of the nodes to embed.
        batch_size: Maximum number of seed nodes per mini-batch.
        num_norm_iterations: Forward passes averaged per mini-batch.
            NOTE(review): the passes reuse one sampled batch and the model is
            in eval mode, so for a model without stochastic layers they are
            presumably identical and ``1`` would suffice; the default keeps
            the previous behaviour.

    Returns:
        ``numpy`` array with one row per entry of ``node_ids``, in the same
        order.

    Raises:
        ValueError: If ``node_ids`` is empty or ``num_norm_iterations < 1``.
    """
    if len(node_ids) == 0:
        raise ValueError("node_ids must contain at least one node id")
    if num_norm_iterations < 1:
        raise ValueError("num_norm_iterations must be at least 1")

    input_nodes = (node_type, torch.LongTensor(node_ids))
    loader = NeighborLoader(
        data,
        num_neighbors=num_neighbors,
        input_nodes=input_nodes,
        batch_size=min(len(node_ids), batch_size),
        shuffle=False,  # keep the output rows aligned with node_ids
        # Keep the trailing partial batch: with drop_last=True the last
        # len(node_ids) % batch_size nodes silently got no embedding.
        drop_last=False,
        num_workers=0,
        directed=True,  # contains only edges which are followed, False: contains full node induced subgraph
        pin_memory=True,  # faster data transfer to gpu
    )

    all_embeddings = []
    model.eval()
    # Inference only, so no autograd graph needs to be recorded.
    with torch.no_grad():
        for batch in tqdm(loader):
            torch.cuda.empty_cache()
            gc.collect()

            # The seed nodes are the first `batch_size` rows of their node
            # type; any further rows belong to sampled neighbours.
            num_seeds = batch[node_type].batch_size

            passes = []
            for _ in range(num_norm_iterations):
                out = model(batch.x_dict, batch.edge_index_dict, batch.edge_weight_dict, batch.num_sampled_edges_dict, batch.num_sampled_nodes_dict)
                passes.append(out[node_type][:num_seeds].detach().cpu().numpy())

            averaged = np.sum(np.stack(passes, axis=0), axis=0) / num_norm_iterations
            all_embeddings.append(averaged)

    return np.concatenate(all_embeddings, axis=0)

# Quick check on two job nodes: the result should have one row per requested node.
num_neighbors = [5,4]
a = get_entity_embedding(model,data, 'Job', num_neighbors=num_neighbors, node_ids=[5,8])
a.shape

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:01<00:00,  1.03s/it]


(2, 256)

In [28]:
def get_node_neighbors(data, node_type, node_ids, num_neighbors=None):
    """Return the sampled neighbourhood subgraph around the given nodes.

    All entries of ``node_ids`` are placed in a single mini-batch, so the
    returned batch covers every requested node (previously only the first id
    was used). For a single-element ``node_ids`` the result is unchanged.

    Args:
        data: Graph handed to ``NeighborLoader``.
        node_type: Node type of the seed nodes, e.g. ``'Job'``.
        node_ids: Indices of the seed nodes.
        num_neighbors: Neighbour counts per hop. Defaults to ``[1000000]``,
            i.e. one hop with a very large cap.

    Returns:
        The single mini-batch produced by the loader.

    Raises:
        StopIteration: If ``node_ids`` is empty (the loader yields nothing).
    """
    if num_neighbors is None:
        num_neighbors = [1000000]

    input_nodes = (node_type, torch.LongTensor(node_ids))
    loader = NeighborLoader(
        data,
        num_neighbors=num_neighbors,
        input_nodes=input_nodes,
        # One batch holding every seed; max(..., 1) keeps the loader
        # constructible for an empty id list.
        batch_size=max(len(node_ids), 1),
        shuffle=False,
        drop_last=False,
        num_workers=0,
        directed=True,  # contains only edges which are followed, False: contains full node induced subgraph
        pin_memory=True,  # faster data transfer to gpu
    )
    return next(iter(loader))

In [41]:
data['Skill'].x.shape[0]

120415

In [None]:
data['Skill'].x.shape[0]

In [35]:
num_neighbors = [5,4]
# Every job node id known to the inverted job mapping.
node_ids = list(mapping['inverted_jobmapping'])
# Only the first quarter of the ids is embedded.
a = node_ids[:len(node_ids) // 4]
# NOTE(review): later cells index job_embeddings1 by node id, which assumes
# the ids in `a` are 0, 1, 2, ... in order -- confirm against the mapping.
job_embeddings1 = get_entity_embedding(model, data, 'Job', num_neighbors=num_neighbors, node_ids=a)

  0%|          | 0/217 [00:00<?, ?it/s]

100%|██████████| 217/217 [03:44<00:00,  1.04s/it]


In [11]:
print(len(list(mapping['inverted_jobmapping'].keys())))

55653


In [60]:
data['Job'].shape

AttributeError: 'NodeStorage' object has no attribute 'shape'

In [56]:
get_node_neighbors(data, 'Job', node_ids=[1])['Skill','rev_REQUIRES','Job']

{'edge_index': tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0]]), 'edge_weight': tensor([0.0102, 0.0145, 0.0226, 0.0112, 0.0206, 0.0995, 0.0127, 0.0177, 0.0135,
        0.0087, 0.0094, 0.0199, 0.0517, 0.0213, 0.0115, 0.0164, 0.0203, 0.0199,
        0.0130, 0.0172, 0.0120, 0.0188]), 'e_id': tensor([ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
        27, 28, 29, 30]), 'num_sampled_edges': [22]}

In [None]:
get_node_neighbors

In [47]:
mapping['inverted_skillmapping']

{0: 'bilingual communication skills',
 1: 'leadership-focused curriculum',
 2: 'multivendor network services',
 3: 'working in shift',
 4: 'healthcare data understanding',
 5: 'pharmaceutical regulations and guidelines',
 6: 'digital creators',
 7: 'data mapping/design',
 8: 'protocol drafting',
 9: 'turbomachinery operation',
 10: 'cost decrease',
 11: 'integrated solutions',
 12: 'pi software',
 13: 'building envelop systems',
 14: 'full-stack solutions development',
 15: 'oracle controls auditing',
 16: 'operating system',
 17: 'quality programs/standards',
 18: 'cscs card',
 19: 'recruitment accessibility',
 20: 'grading operations',
 21: 'machinery installation and alignment',
 22: 'complex selling',
 23: 'stock and product management',
 24: 'tax and pay adjustments',
 25: 'engineering/mathematical background',
 26: 'correlation rules',
 27: 'personal financial counseling',
 28: 'internet server maintenance',
 29: 'program assessment solutions',
 30: 'middlewares',
 31: 'knowledge

In [20]:
# For every normalized job index, look up the title string (values look like
# "Occupation/Alternate title") through the original index stored in
# inverted_jobmapping.
normindex_to_jobtitle = {
    norm_index: mapping['jobmapping_index_to_title_alttile'][raw_index]
    for norm_index, raw_index in mapping['inverted_jobmapping'].items()
}

In [21]:
def find_jobtitle(dict_, s):
    """Print every ``key value`` pair whose value contains ``s``.

    The match is a case-insensitive substring test, so ``'Air'`` also matches
    ``'Affairs'``. Nothing is returned.
    """
    hits = (
        (idx, title)
        for idx, title in dict_.items()
        if s.lower() in title.lower()
    )
    for idx, title in hits:
        print(idx, title)


In [37]:
from sklearn.metrics.pairwise import cosine_similarity
def find_similar(embedding, embeddings, top_k=10):
    """Return the rows of ``embeddings`` most cosine-similar to ``embedding``.

    Args:
        embedding: Query vector.
        embeddings: 2-D collection of candidate vectors.
        top_k: Number of results to return; fewer are returned when
            ``embeddings`` has fewer rows.

    Returns:
        Tuple ``(indices, similarities)`` ordered from most to least similar.
        If the query is itself a row of ``embeddings`` it is part of the
        result.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        # A slice of [-0:] would return every row instead of none.
        raise ValueError("top_k must be a positive integer")

    similarities = cosine_similarity([embedding], embeddings)[0]

    # argsort is ascending: take the last top_k entries and reverse them.
    top_indices = np.argsort(similarities)[-top_k:][::-1]

    return top_indices, similarities[top_indices]

In [36]:
# Query: job 5391 = "Data Scientists/Data Scientist".
# NOTE(review): row positions of job_embeddings1 are used as job indices here;
# this assumes the embedded node ids were 0, 1, 2, ... in order -- confirm.
index = 5391
similar, cosine = find_similar(job_embeddings1[index], job_embeddings1)
print('==',normindex_to_jobtitle[index],'==')
# One line per neighbour: cosine similarity, index, title.
for i, cos in zip(similar,cosine):
    print(cos,i, normindex_to_jobtitle[i])

== Data Scientists/Data Scientist ==
1.0 5391 Data Scientists/Data Scientist
0.9882052 11789 Library Science Teachers, Postsecondary/Medical Record Librarians Teacher
0.98751557 8535 Clinical and Counseling Psychologists/Behavioral Analyst
0.987507 955 Farmers, Ranchers, and Other Agricultural Managers/Animal Husbandry Manager
0.9872364 8817 Social Scientists and Related Workers, All Other/Social Psychologist
0.9871361 5393 Data Scientists/Data Visualization Developer
0.98713434 5807 Civil Engineers/Hydraulic Engineer
0.98656833 8367 Hydrologists/Hydraulic Engineer
0.9864949 964 Farmers, Ranchers, and Other Agricultural Managers/Crop or Livestock Tenant Farmer
0.98641163 6447 Nuclear Engineers/Nuclear Process Engineer


In [39]:
# Query: job 5393 = "Data Scientists/Data Visualization Developer".
# NOTE(review): row positions of job_embeddings1 are used as job indices here;
# this assumes the embedded node ids were 0, 1, 2, ... in order -- confirm.
index = 5393
similar, cosine = find_similar(job_embeddings1[index], job_embeddings1)
print('==',normindex_to_jobtitle[index],'==')
# One line per neighbour: cosine similarity, index, title.
for i, cos in zip(similar,cosine):
    print(cos,i, normindex_to_jobtitle[i])

== Data Scientists/Data Visualization Developer ==
0.9999999 5393 Data Scientists/Data Visualization Developer
0.9902977 5807 Civil Engineers/Hydraulic Engineer
0.9899486 5041 Geographic Information Systems Technologists and Technicians/Geospatial Technologist
0.98981136 5024 Geographic Information Systems Technologists and Technicians/Geographic Information Systems Analyst (GIS Analyst)
0.9896761 8367 Hydrologists/Hydraulic Engineer
0.98967564 5191 Blockchain Engineers/Security Engineer
0.98957133 8374 Hydrologists/Research Hydrologist
0.9894539 2090 Brownfield Redevelopment Specialists and Site Managers/Cleanup Monitor
0.9893178 7770 Biological Scientists, All Other/Paleobotanist
0.9891199 5321 Operations Research Analysts/Research Specialist


In [33]:
get_node_neighbors(data,node_type='Job',node_ids=[5391])

HeteroData(
  Skill={
    x=[38, 132],
    n_id=[38],
    num_sampled_nodes=[2],
  },
  Job={
    x=[1, 132],
    n_id=[1],
    num_sampled_nodes=[2],
    input_id=[1],
    batch_size=1,
  },
  (Job, REQUIRES, Skill)={
    edge_index=[2, 0],
    edge_weight=[0],
    e_id=[0],
    num_sampled_edges=[1],
  },
  (Skill, IS_SIMILAR_SKILL, Skill)={
    edge_index=[2, 0],
    edge_weight=[0],
    e_id=[0],
    num_sampled_edges=[1],
  },
  (Job, IS_SIMILAR_JOB, Job)={
    edge_index=[2, 0],
    edge_weight=[0],
    e_id=[0],
    num_sampled_edges=[1],
  },
  (Skill, rev_REQUIRES, Job)={
    edge_index=[2, 38],
    edge_weight=[38],
    e_id=[38],
    num_sampled_edges=[1],
  },
  (Skill, rev_IS_SIMILAR_SKILL, Skill)={
    edge_index=[2, 0],
    edge_weight=[0],
    e_id=[0],
    num_sampled_edges=[1],
  },
  (Job, rev_IS_SIMILAR_JOB, Job)={
    edge_index=[2, 0],
    edge_weight=[0],
    e_id=[0],
    num_sampled_edges=[1],
  }
)

In [17]:
find_jobtitle(normindex_to_jobtitle,'Air')

23 Chief Executives/Consumer Affairs Director
332 Public Relations Managers/Public Affairs Director
723 Transportation, Storage, and Distribution Managers/Air Export Logistics Manager
724 Transportation, Storage, and Distribution Managers/Airport Manager
733 Transportation, Storage, and Distribution Managers/Car Inspection and Repair Manager
795 Transportation, Storage, and Distribution Managers/Railroad Car Inspection and Repair Regional Superintendent
966 Farmers, Ranchers, and Other Agricultural Managers/Dairy Farm Manager
1126 Education Administrators, Postsecondary/Academic Affairs Dean
1127 Education Administrators, Postsecondary/Academic Affairs Director
1128 Education Administrators, Postsecondary/Academic Affairs Vice President (Academic Affairs VP)
1176 Education Administrators, Postsecondary/Student Affairs Dean
1177 Education Administrators, Postsecondary/Student Affairs Director
1178 Education Administrators, Postsecondary/Student Affairs Vice President (Student Affairs VP

In [None]:
# For every job-related mapping, show its name and its largest key.
for k,v in mapping.items():
    if 'job' not in k.lower():
        continue
    print(k)
    print(max(v.keys()))

jobmapping
55656
inverted_jobmapping
55652
jobmapping_index_to_title_alttile
55656


In [18]:
max(list(mapping['jobmapping'].keys()))

55656

In [19]:
mapping['jobmapping'][max(list(mapping['jobmapping'].keys()))]

55652