In [1]:
import pandas as pd
from torch_geometric.data import HeteroData
import torch
import torch_geometric.transforms as T
pd.set_option('display.max_rows', 50)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# only use skill nodes which have normalized_name != NaN, this is some indication of quality skill (?)
skill_nodes = pd.read_csv('fullgraphdata/neo4jgraph/skills.csv').dropna(subset=['normalized_name']).reset_index()
job_nodes = pd.read_csv('fullgraphdata/neo4jgraph/onet_skills_unique.csv')

# drop some skills "or"
skill_nodes = skill_nodes.loc[~skill_nodes.skill.isin(['or','technology'])]

In [3]:
# There are duplicate normalized names
skill_nodes.shape[0]-skill_nodes.normalized_name.unique().shape[0]

38692

In [4]:
# There are not as many skill names which are duplicate
skill_nodes.shape[0]-skill_nodes.skill.unique().shape[0]

2483

In [5]:
# we can not use normalized name instead of skill, because it is ambiguous, e.g. communication points to different normalized names
skill_nodes.loc[skill_nodes.skill=='communication']

Unnamed: 0,index,skill,category,normalized_name
695,2229,communication,communication,Third-Party Provider Communication
1292,4059,communication,healthcare,Communication (Including SBAR)
4228,12919,communication,communication,Friendly Communication
5528,16927,communication,communication,radio/telephone communication
6311,19452,communication,communication,communication (phone and email)
...,...,...,...,...
223829,759817,communication,communication,Calling/Applying
224556,762531,communication,communication,Communication
238581,818822,communication,communication,Email/Phone Communication
245411,848577,communication,soft skills,Communication (Phone/Face-to-Face)


In [6]:
skill_nodes.drop_duplicates(subset='skill', inplace=True)

In [7]:

skill_job_edges = pd.read_csv('fullgraphdata/neo4jgraph/tfidf_skill_job_edge.csv')
#skill_job_edges = skill_job_edges.loc[skill_job_edges.scaled_tfidf>8]
# only use edges where we have the skill and job for from the other files
skill_job_edges = skill_job_edges.loc[skill_job_edges['skill'].isin(skill_nodes['skill'])]
skill_job_edges = skill_job_edges.loc[skill_job_edges['alt_title'].isin(job_nodes.index)]

In [8]:
skill_job_edges

Unnamed: 0,alt_title,skill,scaled_tfidf,n_jobdesc_used
1,55010,design,9.887307,240
5,55010,cg,8.744163,240
10,55010,visual effects,6.299518,240
11,55010,software,5.288013,240
12,55010,unity,5.278638,240
...,...,...,...,...
7926039,15285,analysis,6.147100,1
7926040,15285,software,6.013723,1
7926041,15285,engineering,5.864380,1
7926050,15285,development,4.434249,1


In [9]:
#for each alt title select the first 20 skill_job edges, ordered by tfidf
skill_job_edges = skill_job_edges.groupby('alt_title').apply(lambda group: group.nlargest(20,'scaled_tfidf')).reset_index(drop=True)

In [10]:
skill_job_edges

Unnamed: 0,alt_title,skill,scaled_tfidf,n_jobdesc_used
0,7,development,35.545516,1
1,7,physical work environment,14.444801,1
2,7,microsoft teams,13.682348,1
3,7,assessment process,11.763047,1
4,7,limited supervision,10.088181,1
...,...,...,...,...
282867,55652,communications,6.736089,7
282868,55652,systems,6.629133,7
282869,55652,driving,6.265465,7
282870,55652,highly specialized,5.501605,7


In [11]:
skillmapping ={}
for i,skill in enumerate(skill_nodes.skill.unique()):
    skillmapping[skill] =i
    
jobmapping ={}
for i,index in enumerate(job_nodes['index'].unique()):
    jobmapping[index] =i
    
inverted_skillmapping = {v:k for k,v in skillmapping.items()}
inverted_jobmapping = {v:k for k,v in jobmapping.items()}

In [12]:
skill_job_edges['skill_dst'] = skill_job_edges['skill'].apply(lambda x:skillmapping[x])
skill_job_edges['job_src'] = skill_job_edges['alt_title'].apply(lambda x:jobmapping[x])

In [13]:
onet_alttitles = pd.read_csv('fullgraphdata/neo4jgraph/onet_alt_titles_unique.csv')
del onet_alttitles['Unnamed: 0']

In [14]:
onet_alttitle_str_mapping = {}
for i,row in onet_alttitles.iterrows():
    onet_alttitle_str_mapping[row['index']] = row['Alternate Title']

In [15]:
from sentence_transformers import SentenceTransformer, util
embedder = SentenceTransformer('all-MiniLM-L6-v2')


In [16]:
# create alttitle sbert embeddings to get pca dim

alttitle_sbert_embeddings = embedder.encode(list(onet_alttitle_str_mapping.values()), convert_to_tensor=False)
#alttitle_sbert_indices = [k for k,v in temp]
#corpus_embeddings = util.normalize_embeddings(corpus_embeddings)

In [17]:
import numpy as np
v = alttitle_sbert_embeddings[0]
np.matmul(v.T,v)

0.99999994

In [18]:
skill_sbert_embeddings = embedder.encode(list(skillmapping.keys()), convert_to_tensor=False)

In [19]:

from sklearn.decomposition import PCA
X = np.concatenate([alttitle_sbert_embeddings,skill_sbert_embeddings])

# print('Original:',X.shape[1])
# for variance_retained in [0.99,0.95,0.9,0.8,0.75,0.7]:
#     pca = PCA(n_components=variance_retained)
#     pca.fit(X)
#     n_components_retained = pca.n_components_
#     print(n_components_retained,' components retained', variance_retained, ' variance retained')

In [20]:
# choose 128
pca = PCA(n_components=128)
pca.fit(X)

skill_sbert_embeddings = pca.transform(embedder.encode(skill_nodes['skill'].tolist(), convert_to_numpy=True))
job_sbert_embeddings = pca.transform(embedder.encode(job_nodes['Alternate Title'].tolist(), convert_to_numpy=True))

In [21]:
# add job-job edges, dataset see https://www.onetcenter.org/dictionary/26.3/excel/related_occupations.html
job_job_edges = pd.read_csv('fullgraphdata/neo4jgraph/onet_related_occupations.csv')

In [22]:
job_job_edges

Unnamed: 0,O*NET-SOC Code,Title,Related O*NET-SOC Code,Related Title,Relatedness Tier,index_x,index_y
0,11-1011.00,Chief Executives,11-1021.00,General and Operations Managers,Primary-Short,54641,54643
1,11-1031.00,Legislators,11-1021.00,General and Operations Managers,Supplemental,54644,54643
2,11-2021.00,Marketing Managers,11-1021.00,General and Operations Managers,Supplemental,54646,54643
3,11-2022.00,Sales Managers,11-1021.00,General and Operations Managers,Primary-Long,54647,54643
4,11-2032.00,Public Relations Managers,11-1021.00,General and Operations Managers,Primary-Long,54648,54643
...,...,...,...,...,...,...,...
18379,53-2022.00,Airfield Operations Specialists,53-2011.00,"Airline Pilots, Copilots, and Flight Engineers",Primary-Short,55590,55587
18380,53-2031.00,Flight Attendants,53-2011.00,"Airline Pilots, Copilots, and Flight Engineers",Supplemental,55591,55587
18381,53-5021.00,"Captains, Mates, and Pilots of Water Vessels",53-2011.00,"Airline Pilots, Copilots, and Flight Engineers",Primary-Short,55608,55587
18382,53-5022.00,Motorboat Operators,53-2011.00,"Airline Pilots, Copilots, and Flight Engineers",Supplemental,55609,55587


In [23]:
job_job_edges['job_src'] = job_job_edges['index_x'].apply(lambda x: jobmapping[x])
job_job_edges['job_dst'] = job_job_edges['index_y'].apply(lambda x: jobmapping[x])
relatedness_weight = {
    'Supplemental':1,
    'Primary-Long':2,
    'Primary-Short':4
}
job_job_edges['relatedness_weight'] = job_job_edges['Relatedness Tier'].apply(lambda x: relatedness_weight[x])

In [24]:
skill_skill_edges = pd.read_csv('fullgraphdata/neo4jgraph/skill_skill_edges.csv')


In [25]:
#filter out potentially bad skills (which are not in our original skillmapping)
skill_skill_edges = skill_skill_edges.loc[(skill_skill_edges.skill.isin(list(skillmapping.keys()))) & (skill_skill_edges.related_skill.isin(list(skillmapping.keys())))]

In [26]:
skill_skill_edges['skill_src'] = skill_skill_edges['skill'].apply(lambda x: skillmapping[x])
skill_skill_edges['skill_dst'] = skill_skill_edges['related_skill'].apply(lambda x: skillmapping[x])

In [27]:
data = HeteroData()
data['Skill'].x = torch.tensor(skill_sbert_embeddings)
data['Job'].x = torch.tensor(job_sbert_embeddings)

data['Job','REQUIRES','Skill'].edge_index = torch.tensor(skill_job_edges[['job_src','skill_dst']].to_numpy().T)
data['Skill','IS_SIMILAR_SKILL','Skill'].edge_index = torch.tensor(skill_skill_edges[['skill_src','skill_dst']].to_numpy().T)
data['Job','IS_SIMILAR_JOB','Job'].edge_index = torch.tensor(job_job_edges[['job_src','job_dst']].to_numpy().T)


data['Job','REQUIRES','Skill'].edge_weight = torch.tensor(skill_job_edges['scaled_tfidf'].to_numpy()).to(torch.float)
data['Skill','IS_SIMILAR_SKILL','Skill'].edge_weight = torch.tensor(skill_skill_edges['cosine_sim_score'].to_numpy()).to(torch.float)
data['Job','IS_SIMILAR_JOB','Job'].edge_weight = torch.tensor(job_job_edges['relatedness_weight'].to_numpy()).to(torch.float)

In [28]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [29]:
data.has_isolated_nodes(), data.has_self_loops()

(True, False)

In [30]:
#data = data.to(device)

In [31]:
import torch_geometric.transforms as T

transform = T.Compose([
       T.RemoveIsolatedNodes(),
       T.RemoveDuplicatedEdges(),
       T.ToUndirected(merge=False) # don't merge reversed edges into the original edge type
])

data = transform(data)


In [32]:
transform = T.RandomLinkSplit(
    is_undirected=True,
    edge_types=[
        ('Job', 'REQUIRES', 'Skill'),
        ('Skill', 'IS_SIMILAR_SKILL', 'Skill'),
        ('Job', 'IS_SIMILAR_JOB', 'Job')
        ],
    # rev_edge_types=[
    #     ('Skill', 'rev_REQUIRES', 'Job'),
    #     ('Skill', 'rev_IS_SIMILAR_SKILL', 'Skill'),
    #     ('Job', 'rev_IS_SIMILAR_JOB', 'Job')
    # ],
    num_val=0.1,
    num_test=0.1,
    add_negative_train_samples=False, # only adds neg samples for val and test, neg train are added by LinkNeighborLoader. This means for each train batch, negs. are different, for val and train they stay the same
    neg_sampling_ratio=1.0,
    disjoint_train_ratio=0 #  training edges are shared for message passing and supervision
    
    )
train_data, val_data, test_data = transform(data)

In [33]:
# from torch_geometric.loader import NeighborLoader

# train_loader = NeighborLoader(
#     train_data,
#     # Sample 15 neighbors for each node and each edge type for 2 iterations:
#     num_neighbors={
#          ('Job', 'REQUIRES', 'Skill'):[1000,10], # [add x neighbors, add y neighbors for every x neighbor]
#          ('Skill', 'rev_REQUIRES', 'Job'):[10,0],
#         ('Skill', 'IS_SIMILAR_SKILL', 'Skill'):[10,10],
#         ('Skill', 'rev_IS_SIMILAR_SKILL', 'Skill'):[0,0],
#         ('Job', 'IS_SIMILAR_JOB', 'Job'):[0,20], # can't sample job-job in first iteration
#         ('Job', 'rev_IS_SIMILAR_JOB', 'Job'):[0,20],
#          },
#     # num_neighbors = [10,10],
#     # Use a batch size of 128 for sampling training nodes of type "paper":
#     batch_size=200,
#     input_nodes='Job', #if not set, we consider all nodes
#     shuffle=True,
#     drop_last=True,
#     num_workers=4,
#     directed=True,  # contains only edges which are followed randomly, False: contains full node induced subgraph
# )


In [54]:
from itertools import cycle
from typing import Tuple, List, Union
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.sampler import NegativeSampling

def create_loader(data:HeteroData, edge:Tuple[str,str,str], num_neighbors:List[int], batch_size:int, is_training:bool)->LinkNeighborLoader:
    
    print('create mini-batches for', edge)
    
    negative_sampling = NegativeSampling(
        mode='binary',
        amount=20  # ratio, like Graphsage
        #weight=  # "Probabilities" of nodes to be sampled: Node degree follows power law distribution
        )
    
    loader = LinkNeighborLoader(
        data,
        num_neighbors={
            ('Job', 'REQUIRES', 'Skill'):num_neighbors,
            ('Skill', 'rev_REQUIRES', 'Job'):num_neighbors,
            ('Skill', 'IS_SIMILAR_SKILL', 'Skill'):num_neighbors, # In this example, index 0 will never be used, since neighboring edge to a job node can't be a skill-skill edge
            ('Skill', 'rev_IS_SIMILAR_SKILL', 'Skill'):num_neighbors,
            ('Job', 'IS_SIMILAR_JOB', 'Job'):num_neighbors, 
            ('Job', 'rev_IS_SIMILAR_JOB', 'Job'):num_neighbors,
        },
        edge_label_index=(edge, None), # None means all edges are considered
        #edge_label =train_data[edge].edge_label,
        neg_sampling=negative_sampling, # adds negative samples
        batch_size=batch_size,
        shuffle=is_training,
        #drop_last=True,
        num_workers=4,
        directed=True,  # contains only edges which are followed, False: contains full node induced subgraph
        #disjoint=True # sampled seed node creates its own, disjoint from the rest, subgraph, will add "batch vector" to loader output 
    )
    
    return loader
    
    
batch_size=256
num_neighbors = [5,2]

train_loaders, val_loaders, test_loaders = [], [], []
for edge_type in train_data.edge_types:
    # create mini-batches for each edge type, because LinkNeighborLoader only allows one target edge type
    
    datasets = {
        'train':train_data,
        'val': val_data,
        'test': test_data
    }
    loader = create_loader(
        data=train_data,
        edge=edge_type,
        num_neighbors=num_neighbors,
        batch_size=batch_size,
        is_training=True
    )
    train_loaders.append(loader)
    
    loader = create_loader(
        data=val_data,
        edge=edge_type,
        num_neighbors=num_neighbors,
        batch_size=batch_size,
        is_training=False
    )
    
    val_loaders.append(loader)
    
    loader = create_loader(
        data=test_data,
        edge=edge_type,
        num_neighbors=num_neighbors,
        batch_size=batch_size,
        is_training=False
    )
    
    test_loaders.append(loader)


def combined_iterator(iterables):
    iterators = [cycle(it) for it in iterables]
    
    while True:
        yield tuple(next(it) for it in iterators)
        
train_iterator = combined_iterator(train_loaders)
val_iterator = combined_iterator(val_loaders)
test_iterator = combined_iterator(test_loaders)


create mini-batches for ('Job', 'REQUIRES', 'Skill')
create mini-batches for ('Job', 'REQUIRES', 'Skill')
create mini-batches for ('Job', 'REQUIRES', 'Skill')
create mini-batches for ('Skill', 'IS_SIMILAR_SKILL', 'Skill')
create mini-batches for ('Skill', 'IS_SIMILAR_SKILL', 'Skill')
create mini-batches for ('Skill', 'IS_SIMILAR_SKILL', 'Skill')
create mini-batches for ('Job', 'IS_SIMILAR_JOB', 'Job')
create mini-batches for ('Job', 'IS_SIMILAR_JOB', 'Job')
create mini-batches for ('Job', 'IS_SIMILAR_JOB', 'Job')
create mini-batches for ('Skill', 'rev_REQUIRES', 'Job')
create mini-batches for ('Skill', 'rev_REQUIRES', 'Job')
create mini-batches for ('Skill', 'rev_REQUIRES', 'Job')
create mini-batches for ('Skill', 'rev_IS_SIMILAR_SKILL', 'Skill')
create mini-batches for ('Skill', 'rev_IS_SIMILAR_SKILL', 'Skill')
create mini-batches for ('Skill', 'rev_IS_SIMILAR_SKILL', 'Skill')
create mini-batches for ('Job', 'rev_IS_SIMILAR_JOB', 'Job')
create mini-batches for ('Job', 'rev_IS_SIMILAR_

	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [35]:
next(iter(train_loaders[0]))

HeteroData(
  [1mSkill[0m={
    x=[32801, 128],
    n_id=[32801]
  },
  [1mJob[0m={
    x=[9500, 128],
    n_id=[9500]
  },
  [1m(Job, REQUIRES, Skill)[0m={
    edge_index=[2, 12279],
    edge_weight=[12279],
    edge_label=[2688],
    edge_label_index=[2, 2688],
    e_id=[12279],
    input_id=[128]
  },
  [1m(Skill, IS_SIMILAR_SKILL, Skill)[0m={
    edge_index=[2, 28932],
    edge_weight=[28932],
    edge_label=[540489],
    edge_label_index=[2, 540489],
    e_id=[28932]
  },
  [1m(Job, IS_SIMILAR_JOB, Job)[0m={
    edge_index=[2, 1806],
    edge_weight=[1806],
    edge_label=[7437],
    edge_label_index=[2, 7437],
    e_id=[1806]
  },
  [1m(Skill, rev_REQUIRES, Job)[0m={
    edge_index=[2, 15624],
    edge_weight=[15624],
    e_id=[15624]
  },
  [1m(Skill, rev_IS_SIMILAR_SKILL, Skill)[0m={
    edge_index=[2, 26216],
    edge_weight=[26216],
    e_id=[26216]
  },
  [1m(Job, rev_IS_SIMILAR_JOB, Job)[0m={
    edge_index=[2, 1808],
    edge_weight=[1808],
    e_id=[1808]


In [36]:
# helpful article
# https://medium.com/stanford-cs224w/a-tour-of-pygs-data-loaders-9f2384e48f8f

# some info

# HeteroData(
#   Job={
#     x=[9222, 128], # node features
#     n_id=[9222] # the ids of the nodes in the original train_data set
#   },
#   (Job, REQUIRES, Skill)={
#     edge_index=[2, 14498], # sampled edges
#     edge_attr=[14498, 1],  # edge attributes of sampled edges
#     edge_label=[509170], # 1 if it is a true edge, 0 if it is a false
#     edge_label_index=[2, 509170], # all edges?
#     e_id=[14498] # edge ids of edges in the original train_data set



# if batchsize is 16 for the edge and we have neg_sampling=binary, we will have
# this many jobs:
#  Job={
#     x=[64, 128],
#     n_id=[64]
#   },
# since we sample a negative and a positive edge each, and each edge has 2 Job nodes (if our target is the job nodes)

# LinkNeighborloader will sample negative edges for the target edges only, as we expect it
# so for the "neighbor"-edges we get only positive ones

In [58]:
from typing import Tuple, Union
from torch import Tensor
from torch_geometric.nn import to_hetero, HeteroDictLinear, Linear
from torch_geometric.nn.conv import GraphConv, SAGEConv, SimpleConv
import torch.nn.functional as F
from torch_geometric.typing import Adj, OptPairTensor, OptTensor, Size




# PyG does not implement the exact max pooling aggregation as in the GraphSage paper
# with GraphConvWithPool we manually extend it by adding a linear layer on x before .propagate
# as our activation function is monotonically increasing, this modification corresponds to the max pooling aggregation

class GraphConvWithPool(GraphConv):
    def __init__(self, in_channels, out_channels: int, aggr: str = 'add', bias: bool = True, **kwargs):
        super().__init__(in_channels, out_channels, aggr, bias, **kwargs)
        self.linear = torch.nn.Linear(in_channels, in_channels, bias=False)
    
    def forward(self, x: Union[Tensor, OptPairTensor], edge_index: Adj,
                edge_weight: OptTensor = None, size: Size = None) -> Tensor:

        if isinstance(x, Tensor):
            x: OptPairTensor = (x, x)
            
        x = self.linear(x) # added this

        out = self.propagate(edge_index, x=x, edge_weight=edge_weight,
                             size=size)
        out = self.lin_rel(out)

        x_r = x[1]
        if x_r is not None:
            out = out + self.lin_root(x_r)

        return out
    

    
class WeightedSkillSage(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, aggregator='max'):
        super().__init__()
        #self.linear1 = Linear(-1,-1)
        #self.conv1 = SimpleConv(aggr='sum')
        self.conv1 = GraphConv(in_channels=-1, out_channels=hidden_channels)
        self.conv2 = GraphConv(in_channels=hidden_channels, out_channels=hidden_channels)
        self.linear3 = Linear(hidden_channels,out_channels)
        
    def forward(self, x: HeteroData, edge_index, edge_weight):
        x = self.conv1(x, edge_index, edge_weight=edge_weight)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_weight=edge_weight)
        x = F.relu(x)
        x = self.linear3(x)
        return x

model = WeightedSkillSage(hidden_channels=64, out_channels=64)
model = to_hetero(model, train_data.metadata(), aggr='sum')
model = model.to(device)


In [59]:
import gc
import os 
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# with torch.no_grad():  # Initialize lazy modules.
#      out = model(batch.x_dict, batch.edge_index_dict, batch.edge_weight_dict)
def get_supervision_edge_type(heterodata):
    for edge_type in heterodata.edge_types:
        if 'input_id' in heterodata[edge_type].keys():
            return edge_type

def free_memory():
  """Clears the GPU cache and triggers garbage collection, to reduce OOMs."""
  torch.cuda.empty_cache()
  gc.collect()
  
free_memory()

optimizer = torch.optim.Adam(model.parameters(), lr=0.000001)
loss_fn = torch.nn.CrossEntropyLoss()
model.train()
for edge_batches in train_iterator:
    batch_loss = 0
    
    
    # each batch here is one edge type, since we want to learn for all edge types
    for batch in edge_batches:
        batch = batch.to(device)
        hetero_out = model(batch.x_dict, batch.edge_index_dict, batch.edge_weight_dict)
        
        supervision_edge_type = get_supervision_edge_type(batch)
        src_type, dst_type = supervision_edge_type[0], supervision_edge_type[2]
        edge_label = batch[supervision_edge_type].edge_label
        edge_label_index = batch[supervision_edge_type].edge_label_index
        src_node_embeddings = hetero_out[src_type][edge_label_index[0]]
        dst_node_embeddings = hetero_out[dst_type][edge_label_index[1]]
        logits = F.cosine_similarity(src_node_embeddings, dst_node_embeddings, dim=-1)
        loss = loss_fn(logits, edge_label)
        batch_loss += loss
        
    batch_loss.backward()
    optimizer.step()
    
    print('mini-batch loss:',float(batch_loss))
        


        

mini-batch loss: 13271.404296875
mini-batch loss: 13289.005859375
mini-batch loss: 13281.390625
mini-batch loss: 13287.20703125
mini-batch loss: 13274.556640625
mini-batch loss: 13274.1611328125
mini-batch loss: 13277.517578125
mini-batch loss: 13279.625
mini-batch loss: 13289.06640625


KeyboardInterrupt: 

In [41]:
free_memory()

In [None]:
supervision_edge_type = get_supervision_edge_type(batch)
src_type, dst_type = supervision_edge_type[0], supervision_edge_type[2]
edge_label = batch[supervision_edge_type].edge_label
edge_label_index = batch[supervision_edge_type].edge_label_index
src_node_embeddings = out[src_type][edge_label_index[0]]
dst_node_embeddings = out[dst_type][edge_label_index[1]]
torch.min(F.cosine_similarity(src_node_embeddings, dst_node_embeddings, dim=-1))

torch.Size([5376])

In [None]:
import gc

gc.collect()

torch.cuda.empty_cache()

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


: 

In [None]:
batch[supervision_edge_type].edge_label_index

tensor([[4460,  237,  316,  ..., 2804, 1099, 3134],
        [ 158, 1595, 2606,  ...,  108, 1347, 4633]], device='cuda:0')

In [None]:
J2S = ('Job','REQUIRES','Skill')
batch[J2S].edge_label_index

tensor([[4460,  237,  316,  ..., 2804, 1099, 3134],
        [ 158, 1595, 2606,  ...,  108, 1347, 4633]], device='cuda:0')

In [None]:
batch['Job','REQUIRES','Skill'].

tensor([134718,  60771, 109334, 210252, 135223, 172799,   4604,  92422,  35868,
          2600, 129587,   6922, 222748,  81280, 139478, 167126,  69862, 160250,
         81644,    101, 100083, 193585, 172025, 216250,  93113, 192029, 106241,
        158043, 157450, 158744,  31111, 195675, 144262,  23150, 104022, 184564,
         72108,  85275, 101698, 143524, 161261, 213126,  23844, 143555, 124899,
         59291,  65414,   9667, 138614,  20424,  34069, 104404,  18711,  82948,
        156289, 103813,  12223, 201018, 212482, 175602, 200273, 219751, 170198,
         77534, 206897, 132175, 114306,  95478, 198122,  50403,  60504,  37108,
         25215, 133788,  94194, 100135, 192085,  81763,   5819, 132452,  95506,
         19953,  85212, 185467,  74795, 177952,  65368,  87641,  95378, 111448,
        167820,   1067,  99695,  85312, 172930, 218172, 217609, 100899, 112814,
        178490, 175506, 144506, 185587,  78916, 118564, 155939, 218726, 122990,
         95823,  20725, 142548,   5871, 

In [None]:
batch = next(iter(train_loaders[0]))


In [None]:
batch

HeteroData(
  [1mSkill[0m={
    x=[44584, 128],
    n_id=[44584]
  },
  [1mJob[0m={
    x=[11936, 128],
    n_id=[11936]
  },
  [1m(Job, REQUIRES, Skill)[0m={
    edge_index=[2, 18210],
    edge_weight=[18210],
    edge_label=[5376],
    edge_label_index=[2, 5376],
    e_id=[18210],
    input_id=[256]
  },
  [1m(Skill, IS_SIMILAR_SKILL, Skill)[0m={
    edge_index=[2, 48111],
    edge_weight=[48111],
    edge_label=[540489],
    edge_label_index=[2, 540489],
    e_id=[48111]
  },
  [1m(Job, IS_SIMILAR_JOB, Job)[0m={
    edge_index=[2, 2394],
    edge_weight=[2394],
    edge_label=[7437],
    edge_label_index=[2, 7437],
    e_id=[2394]
  },
  [1m(Skill, rev_REQUIRES, Job)[0m={
    edge_index=[2, 27294],
    edge_weight=[27294],
    e_id=[27294]
  },
  [1m(Skill, rev_IS_SIMILAR_SKILL, Skill)[0m={
    edge_index=[2, 42699],
    edge_weight=[42699],
    e_id=[42699]
  },
  [1m(Job, rev_IS_SIMILAR_JOB, Job)[0m={
    edge_index=[2, 2402],
    edge_weight=[2402],
    e_id=[2402

In [None]:
batch.edge_weight_dict

{('Job',
  'REQUIRES',
  'Skill'): tensor([33.3133,  2.4102,  8.3549,  ...,  6.3921, 24.6312,  3.6748],
        dtype=torch.float64),
 ('Skill',
  'IS_SIMILAR_SKILL',
  'Skill'): tensor([0.7200, 0.5300, 0.5100,  ..., 0.5400, 0.5200, 0.5800],
        dtype=torch.float64),
 ('Job', 'IS_SIMILAR_JOB', 'Job'): tensor([1, 1, 4,  ..., 2, 1, 4]),
 ('Skill',
  'rev_REQUIRES',
  'Job'): tensor([ 22.1621,  17.0839, 105.2547,  ...,   2.0872,   1.0532,   0.9343],
        dtype=torch.float64),
 ('Skill',
  'rev_IS_SIMILAR_SKILL',
  'Skill'): tensor([0.5300, 0.7200, 0.5300,  ..., 0.5500, 0.6000, 0.6600],
        dtype=torch.float64),
 ('Job', 'rev_IS_SIMILAR_JOB', 'Job'): tensor([1, 2, 1,  ..., 4, 1, 2])}

In [None]:
index = batch['Job','IS_SIMILAR_JOB','Job'].e_id
labels = batch['Job','IS_SIMILAR_JOB','Job'].edge_label

NameError: name 'batch' is not defined

In [None]:
import torch_geometric.transforms as T
from torch_geometric.datasets import OGB_MAG
from torch_geometric.nn import SAGEConv, to_hetero


dataset = OGB_MAG(root='./data', preprocess='metapath2vec', transform=T.ToUndirected())
data = dataset[0]

class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(-1, hidden_channels)
        self.conv2 = SAGEConv(-1, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


model = GNN(hidden_channels=64, out_channels=dataset.num_classes)
model = to_hetero(model, data.metadata(), aggr='sum')

In [None]:
#https://colab.research.google.com/drive/1GrAxHyZCZ13jpTkMy9vVO_v_U9nHDdvB#scrollTo=wmiFKI0ovYN4


In [None]:
# intially we use this GraphConv layer and aggregate using mean
# this layer allows the addition of edge weights: the adjacency matrix simply consists not of 1s and 0s but the corresponding weights
#https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.GraphConv.html

# using max pool
# https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.pool.global_max_pool.html#torch_geometric.nn.pool.global_max_pool