In [27]:
import os
import sys
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm
from google.colab import drive

# Mount Google Drive so the project archive can be read from the Colab runtime.
drive.mount('/content/drive', force_remount=True)
DRIVE_FOLDER = Path('/content/drive/MyDrive/DataExplorationProject/Skill_Ontology_GNN')

# Prefix for the unpacked dataset files: absolute inside Colab, relative otherwise.
colab_path = '/content/' if 'google.colab' in sys.modules else ''

Mounted at /content/drive


In [None]:
import torch
from matplotlib import pyplot as plt
import gc

class Trainer:
    """Minimal supervised training helper around a model, a loss and an optimizer.

    Batches yielded by the dataloaders are expected to be ``(data, target)`` pairs.
    Per-batch losses are appended to ``train_losses`` / ``val_losses``.
    """

    def __init__(self, model, criterion, optimizer, device):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.device = device
        self.train_losses = []
        self.val_losses = []

    @staticmethod
    def free_memory():
        """Clears the GPU cache and triggers garbage collection, to reduce OOMs."""
        # Collect first so memory held by unreachable tensors can be returned to the cache
        # before the cache itself is released.
        gc.collect()
        torch.cuda.empty_cache()

    def train(self, dataloader, save_interval, save_path):
        """Run one pass over ``dataloader`` and update the model after every batch.

        Args:
            dataloader: Iterable of ``(data, target)`` batches.
            save_interval: Write a checkpoint every ``save_interval`` batches
                (batch 0 included). ``0`` or ``None`` disables checkpointing.
            save_path: Directory the checkpoints are written to.
        """
        self.free_memory()
        self.model.train()
        for batch_idx, (data, target) in enumerate(dataloader):
            data, target = data.to(self.device), target.to(self.device)
            self.optimizer.zero_grad()
            output = self.model(data)
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
            self.train_losses.append(loss.item())
            # Guard against save_interval == 0, which would raise ZeroDivisionError.
            if save_interval and batch_idx % save_interval == 0:
                self.save_checkpoint(batch_idx, save_path)

    def validate(self, dataloader):
        """Evaluate the model on ``dataloader`` without gradients, recording each batch loss."""
        self.model.eval()
        with torch.no_grad():
            for data, target in dataloader:
                data, target = data.to(self.device), target.to(self.device)
                output = self.model(data)
                loss = self.criterion(output, target)
                self.val_losses.append(loss.item())

    def save_checkpoint(self, batch_idx, save_path):
        """Save model/optimizer state and the loss histories to ``save_path/checkpoint_<batch_idx>.pt``."""
        if save_path:
            # torch.save does not create missing directories.
            os.makedirs(save_path, exist_ok=True)
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'train_losses': self.train_losses,
            'val_losses': self.val_losses
        }, os.path.join(save_path, f'checkpoint_{batch_idx}.pt'))

    def load_checkpoint(self, load_path):
        """Restore model/optimizer state and loss histories from a file written by ``save_checkpoint``."""
        # map_location lets a checkpoint written on a GPU be restored on this trainer's device.
        checkpoint = torch.load(load_path, map_location=self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.train_losses = checkpoint['train_losses']
        self.val_losses = checkpoint['val_losses']

    def plot_losses(self):
        """Plot the recorded training and validation losses in one figure."""
        plt.figure(figsize=(10,5))
        plt.title("Training and Validation Loss")
        plt.plot(self.train_losses,label="train")
        plt.plot(self.val_losses,label="val")
        plt.xlabel("iterations")
        plt.ylabel("Loss")
        plt.legend()
        plt.show()

In [None]:
# Install required packages.
import os
import torch
# Export the installed torch version so the pip lines below can substitute it
# into the wheel index URL via ${TORCH}.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# torch-scatter / torch-sparse are looked up on the PyG wheel index page for this torch version.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
# PyTorch Geometric itself is installed from the GitHub repository (no version pin).
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

In [None]:
# Provides the SentenceTransformer embedder used further below (no version pin).
!pip install sentence-transformers

In [10]:
# unpack datasets
if not 'unzipped' in globals():
  !unzip /content/drive/MyDrive/DataExplorationProject/Skill_Ontology_GNN/neo4jgraph.zip
  unzipped =True

In [11]:
import pandas as pd
from torch_geometric.data import HeteroData
import torch
import torch_geometric.transforms as T
# Cap rendered DataFrames at 50 rows (pandas 'display.max_rows' option).
pd.set_option('display.max_rows', 50)

In [12]:
# List the contents of the project folder on the mounted Drive.
!ls /content/drive/MyDrive/DataExplorationProject/Skill_Ontology_GNN

neo4jgraph.zip


In [30]:
# Keep only skills with a non-null normalized_name (possibly an indicator of skill quality)
# and drop the noise entries 'or' and 'technology'.
skill_nodes = (
    pd.read_csv(colab_path + 'neo4jgraph/skills.csv')
    .dropna(subset=['normalized_name'])
    .reset_index()
    .loc[lambda frame: ~frame.skill.isin(['or', 'technology'])]
)
job_nodes = pd.read_csv(colab_path + 'neo4jgraph/onet_skills_unique.csv')

In [31]:
# Number of rows whose normalized_name repeats one that is already present.
len(skill_nodes) - skill_nodes['normalized_name'].nunique(dropna=False)

38830

In [32]:
# Far fewer rows repeat a raw skill name than repeat a normalized name.
len(skill_nodes) - skill_nodes['skill'].nunique(dropna=False)

2488

In [33]:
# normalized_name cannot replace skill as the key because it is ambiguous:
# e.g. the skill 'communication' maps to many different normalized names.
skill_nodes[skill_nodes['skill'] == 'communication']

Unnamed: 0,index,skill,category,normalized_name
695,2229,communication,communication,Third-Party Provider Communication
1294,4059,communication,healthcare,Communication (Including SBAR)
4231,12919,communication,communication,Friendly Communication
5532,16927,communication,communication,radio/telephone communication
6315,19452,communication,communication,communication (phone and email)
...,...,...,...,...
223945,759817,communication,communication,Calling/Applying
224672,762531,communication,communication,Communication
238708,818822,communication,communication,Email/Phone Communication
245547,848577,communication,soft skills,Communication (Phone/Face-to-Face)


In [34]:
# Keep the first row per skill name. Rebinding (instead of inplace=True) avoids mutating
# a frame that was itself produced by a .loc filter.
skill_nodes = skill_nodes.drop_duplicates(subset='skill')

In [35]:

skill_job_edges = pd.read_csv(colab_path+'neo4jgraph/tfidf_skill_job_edge.csv')
#skill_job_edges = skill_job_edges.loc[skill_job_edges.scaled_tfidf>8]
# Only keep edges whose skill and job both exist in the node tables.
# Jobs are matched against the 'index' column (the same key jobmapping is built from),
# not against the DataFrame's positional row index.
has_known_skill = skill_job_edges['skill'].isin(skill_nodes['skill'])
has_known_job = skill_job_edges['alt_title'].isin(job_nodes['index'])
skill_job_edges = skill_job_edges.loc[has_known_skill & has_known_job]

In [36]:
# Inspect the skill-job edges that remain after filtering by known skills and jobs.
skill_job_edges

Unnamed: 0,alt_title,skill,scaled_tfidf,n_jobdesc_used
1,55010,design,9.887307,240
5,55010,cg,8.744163,240
10,55010,visual effects,6.299518,240
11,55010,software,5.288013,240
12,55010,unity,5.278638,240
...,...,...,...,...
7926039,15285,analysis,6.147100,1
7926040,15285,software,6.013723,1
7926041,15285,engineering,5.864380,1
7926050,15285,development,4.434249,1


In [37]:
# For each alt title keep its 20 skill-job edges with the highest scaled_tfidf.
def _top_skills_by_tfidf(group):
    """Return the 20 rows of `group` with the largest scaled_tfidf."""
    return group.nlargest(20, 'scaled_tfidf')

skill_job_edges = (
    skill_job_edges
    .groupby('alt_title')
    .apply(_top_skills_by_tfidf)
    .reset_index(drop=True)
)

In [38]:
# Inspect the edges after the per-job top-20 selection.
skill_job_edges

Unnamed: 0,alt_title,skill,scaled_tfidf,n_jobdesc_used
0,7,development,35.545516,1
1,7,physical work environment,14.444801,1
2,7,microsoft teams,13.682348,1
3,7,assessment process,11.763047,1
4,7,limited supervision,10.088181,1
...,...,...,...,...
282883,55652,communications,6.736089,7
282884,55652,systems,6.629133,7
282885,55652,driving,6.265465,7
282886,55652,highly specialized,5.501605,7


In [39]:
# Consecutive integer ids (0..n-1), assigned in order of first appearance.
skillmapping = {skill_name: node_id for node_id, skill_name in enumerate(skill_nodes.skill.unique())}
jobmapping = {job_index: node_id for node_id, job_index in enumerate(job_nodes['index'].unique())}

# Reverse lookups: integer node id -> original skill name / job index.
inverted_skillmapping = dict(zip(skillmapping.values(), skillmapping.keys()))
inverted_jobmapping = dict(zip(jobmapping.values(), jobmapping.keys()))

In [40]:
# Translate edge endpoints to node ids; an unknown key raises KeyError.
skill_job_edges = skill_job_edges.assign(
    skill_dst=skill_job_edges['skill'].apply(skillmapping.__getitem__),
    job_src=skill_job_edges['alt_title'].apply(jobmapping.__getitem__),
)

In [41]:
# Load the O*NET alternate titles and discard the unnamed column left over from the CSV export.
onet_alttitles = (
    pd.read_csv(colab_path + 'neo4jgraph/onet_alt_titles_unique.csv')
    .drop(columns=['Unnamed: 0'])
)

In [42]:
# job index -> alternate title string (a later row wins if an index repeats)
onet_alttitle_str_mapping = dict(
    zip(onet_alttitles['index'], onet_alttitles['Alternate Title'])
)

In [45]:
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model used below to encode job titles and skill names.
embedder = SentenceTransformer('all-MiniLM-L6-v2')


In [47]:
# Encode every O*NET alternate title with SBERT; together with the skill embeddings
# these vectors form the matrix the PCA is fitted on further below.

alttitle_sbert_embeddings = embedder.encode(list(onet_alttitle_str_mapping.values()), convert_to_tensor=False)
#alttitle_sbert_indices = [k for k,v in temp]
#corpus_embeddings = util.normalize_embeddings(corpus_embeddings)

In [48]:
import numpy as np
# Squared L2 norm of the first title embedding; a value of ~1 means the vector is unit length.
v = alttitle_sbert_embeddings[0]
v @ v

0.99999994

In [None]:
# Encode every distinct skill name, in skillmapping order.
skill_sbert_embeddings = embedder.encode(list(skillmapping.keys()), convert_to_tensor=False)

In [None]:

from sklearn.decomposition import PCA
# Stack title and skill embeddings row-wise so the PCA is fitted on both node types.
X = np.concatenate([alttitle_sbert_embeddings,skill_sbert_embeddings])

# Disabled experiment: number of PCA components needed to retain a given fraction of variance.
# print('Original:',X.shape[1])
# for variance_retained in [0.99,0.95,0.9,0.8,0.75,0.7]:
#     pca = PCA(n_components=variance_retained)
#     pca.fit(X)
#     n_components_retained = pca.n_components_
#     print(n_components_retained,' components retained', variance_retained, ' variance retained')

In [None]:
# Reduce the SBERT embeddings to 128 dimensions with a PCA fitted on the stacked matrix X.
pca = PCA(n_components=128)
pca.fit(X)

# Node features are encoded from the node tables (row order = node id order) and then projected.
skill_sbert_embeddings, job_sbert_embeddings = (
    pca.transform(embedder.encode(node_frame[text_column].tolist(), convert_to_numpy=True))
    for node_frame, text_column in ((skill_nodes, 'skill'), (job_nodes, 'Alternate Title'))
)

In [None]:
# add job-job edges, dataset see https://www.onetcenter.org/dictionary/26.3/excel/related_occupations.html
# The columns index_x, index_y and 'Relatedness Tier' of this table are used below.
job_job_edges = pd.read_csv(colab_path+'neo4jgraph/onet_related_occupations.csv')

In [None]:
# Inspect the raw related-occupation edges.
job_job_edges

In [None]:
# Edge weight per O*NET relatedness tier.
relatedness_weight = {
    'Supplemental': 1,
    'Primary-Long': 2,
    'Primary-Short': 4,
}
# Translate both endpoints to job node ids and attach the tier weight;
# an unknown job index or tier raises KeyError.
job_job_edges = job_job_edges.assign(
    job_src=job_job_edges['index_x'].apply(jobmapping.__getitem__),
    job_dst=job_job_edges['index_y'].apply(jobmapping.__getitem__),
    relatedness_weight=job_job_edges['Relatedness Tier'].apply(relatedness_weight.__getitem__),
)

In [None]:
# Skill-skill similarity edges; the columns skill, related_skill and cosine_sim_score are used below.
skill_skill_edges = pd.read_csv(colab_path+'neo4jgraph/skill_skill_edges.csv')


In [None]:
# Filter out potentially bad skills (those that are not in our original skillmapping).
known_skills = list(skillmapping.keys())
skill_skill_edges = skill_skill_edges.loc[
    skill_skill_edges.skill.isin(known_skills)
    & skill_skill_edges.related_skill.isin(known_skills)
].copy()  # independent copy: new columns are assigned to this frame afterwards

In [None]:
# Translate both endpoints to skill node ids; an unknown skill raises KeyError.
skill_skill_edges = skill_skill_edges.assign(
    skill_src=skill_skill_edges['skill'].apply(skillmapping.__getitem__),
    skill_dst=skill_skill_edges['related_skill'].apply(skillmapping.__getitem__),
)

In [None]:
def _edge_index(edges, src_column, dst_column):
    """Return a [2, num_edges] int64 tensor built from two node-id columns of `edges`."""
    return torch.tensor(edges[[src_column, dst_column]].to_numpy().T, dtype=torch.long)

def _edge_weight(edges, column):
    """Return the given column of `edges` as a float32 tensor."""
    return torch.tensor(edges[column].to_numpy(), dtype=torch.float)

data = HeteroData()
# Node features are cast to float32 explicitly (as the edge weights are), so they match the
# model parameters regardless of the dtype the PCA step returns.
data['Skill'].x = torch.tensor(skill_sbert_embeddings, dtype=torch.float)
data['Job'].x = torch.tensor(job_sbert_embeddings, dtype=torch.float)

data['Job','REQUIRES','Skill'].edge_index = _edge_index(skill_job_edges, 'job_src', 'skill_dst')
data['Skill','IS_SIMILAR_SKILL','Skill'].edge_index = _edge_index(skill_skill_edges, 'skill_src', 'skill_dst')
data['Job','IS_SIMILAR_JOB','Job'].edge_index = _edge_index(job_job_edges, 'job_src', 'job_dst')


data['Job','REQUIRES','Skill'].edge_weight = _edge_weight(skill_job_edges, 'scaled_tfidf')
data['Skill','IS_SIMILAR_SKILL','Skill'].edge_weight = _edge_weight(skill_skill_edges, 'cosine_sim_score')
data['Job','IS_SIMILAR_JOB','Job'].edge_weight = _edge_weight(job_job_edges, 'relatedness_weight')

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Check whether the graph contains isolated nodes or self loops.
data.has_isolated_nodes(), data.has_self_loops()

In [None]:
#data = data.to(device)

In [None]:
import torch_geometric.transforms as T

# Graph clean-up, applied in this order.
cleanup_steps = [
    T.RemoveIsolatedNodes(),
    T.RemoveDuplicatedEdges(),
    # merge=False: don't merge reversed edges into the original edge type
    T.ToUndirected(merge=False),
]
transform = T.Compose(cleanup_steps)

data = transform(data)


In [None]:
# Split the supervision edges into train / val / test.
# ToUndirected(merge=False) created a separate 'rev_*' edge type for every edge type, so the
# reverse types are passed as rev_edge_types: the reverse copy of each held-out edge is then
# split along with it and cannot leak into the message-passing graph of another split.
transform = T.RandomLinkSplit(
    is_undirected=True,  # not applied to edge types that have a distinct reverse edge type
    edge_types=[
        ('Job', 'REQUIRES', 'Skill'),
        ('Skill', 'IS_SIMILAR_SKILL', 'Skill'),
        ('Job', 'IS_SIMILAR_JOB', 'Job')
        ],
    rev_edge_types=[
        ('Skill', 'rev_REQUIRES', 'Job'),
        ('Skill', 'rev_IS_SIMILAR_SKILL', 'Skill'),
        ('Job', 'rev_IS_SIMILAR_JOB', 'Job')
    ],
    num_val=0.1,
    num_test=0.1,
    # Negatives are only added for val and test here; training negatives are added by
    # LinkNeighborLoader. So negatives differ for every train batch, while they stay
    # the same for val and test.
    add_negative_train_samples=False,
    neg_sampling_ratio=1.0,
    disjoint_train_ratio=0 #  training edges are shared for message passing and supervision

    )
train_data, val_data, test_data = transform(data)

In [None]:
# from torch_geometric.loader import NeighborLoader

# train_loader = NeighborLoader(
#     train_data,
#     # Sample 15 neighbors for each node and each edge type for 2 iterations:
#     num_neighbors={
#          ('Job', 'REQUIRES', 'Skill'):[1000,10], # [add x neighbors, add y neighbors for every x neighbor]
#          ('Skill', 'rev_REQUIRES', 'Job'):[10,0],
#         ('Skill', 'IS_SIMILAR_SKILL', 'Skill'):[10,10],
#         ('Skill', 'rev_IS_SIMILAR_SKILL', 'Skill'):[0,0],
#         ('Job', 'IS_SIMILAR_JOB', 'Job'):[0,20], # can't sample job-job in first iteration
#         ('Job', 'rev_IS_SIMILAR_JOB', 'Job'):[0,20],
#          },
#     # num_neighbors = [10,10],
#     # Use a batch size of 128 for sampling training nodes of type "paper":
#     batch_size=200,
#     input_nodes='Job', #if not set, we consider all nodes
#     shuffle=True,
#     drop_last=True,
#     num_workers=4,
#     directed=True,  # contains only edges which are followed randomly, False: contains full node induced subgraph
# )


In [None]:
from itertools import cycle
from typing import Tuple, List, Union
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.sampler import NegativeSampling

def create_loader(data:HeteroData, edge:Tuple[str,str,str], num_neighbors:List[int], batch_size:int, is_training:bool)->LinkNeighborLoader:
    """Build a ``LinkNeighborLoader`` that supervises a single edge type.

    Args:
        data: Graph split (train, val or test) to sample from.
        edge: ``(src, relation, dst)`` edge type whose links are the prediction targets.
        num_neighbors: Neighbors sampled per hop; the same list is used for every edge type.
        batch_size: Number of supervision edges per mini-batch.
        is_training: ``True`` shuffles and samples fresh negatives for every batch.
            ``False`` uses the split's own ``edge_label_index`` / ``edge_label`` when the
            edge type has them, and falls back to the training behavior otherwise.

    Returns:
        The configured loader.
    """

    print('create mini-batches for', edge)

    store = data[edge]
    has_split_labels = 'edge_label_index' in store.keys() and 'edge_label' in store.keys()

    if not is_training and has_split_labels:
        # Evaluation split: supervise on the held-out edges and the fixed negatives stored
        # with the split. Passing None here would select `edge_index` instead, i.e. the
        # message-passing edges rather than the held-out ones.
        edge_label_index = (edge, store.edge_label_index)
        edge_label = store.edge_label
        negative_sampling = None
    else:
        edge_label_index = (edge, None)  # None means all edges are considered
        edge_label = None
        negative_sampling = NegativeSampling(
            mode='binary',
            amount=20  # ratio, like Graphsage
            #weight=  # "Probabilities" of nodes to be sampled: Node degree follows power law distribution
            )

    loader = LinkNeighborLoader(
        data,
        num_neighbors={
            ('Job', 'REQUIRES', 'Skill'):num_neighbors,
            ('Skill', 'rev_REQUIRES', 'Job'):num_neighbors,
            ('Skill', 'IS_SIMILAR_SKILL', 'Skill'):num_neighbors, # In this example, index 0 will never be used, since neighboring edge to a job node can't be a skill-skill edge
            ('Skill', 'rev_IS_SIMILAR_SKILL', 'Skill'):num_neighbors,
            ('Job', 'IS_SIMILAR_JOB', 'Job'):num_neighbors,
            ('Job', 'rev_IS_SIMILAR_JOB', 'Job'):num_neighbors,
        },
        edge_label_index=edge_label_index,
        edge_label=edge_label,
        neg_sampling=negative_sampling, # adds negative samples (training only)
        batch_size=batch_size,
        shuffle=is_training,
        #drop_last=True,
        num_workers=4,
        directed=True,  # contains only edges which are followed, False: contains full node induced subgraph
        #disjoint=True # sampled seed node creates its own, disjoint from the rest, subgraph, will add "batch vector" to loader output
    )

    return loader


batch_size=256
num_neighbors = [5,2]

train_loaders, val_loaders, test_loaders = [], [], []
datasets = {
    'train': train_data,
    'val': val_data,
    'test': test_data
}
loaders_by_split = {
    'train': train_loaders,
    'val': val_loaders,
    'test': test_loaders
}

# LinkNeighborLoader only allows one target edge type, so one loader per split is created
# for each edge type (train first, then val, then test).
for edge_type in train_data.edge_types:
    for split_name, split_data in datasets.items():
        loaders_by_split[split_name].append(
            create_loader(
                data=split_data,
                edge=edge_type,
                num_neighbors=num_neighbors,
                batch_size=batch_size,
                is_training=(split_name == 'train')
            )
        )


def combined_iterator(iterables):
    """Yield tuples holding one item from each iterable, forever.

    Every iterable is restarted independently once it is exhausted, so shorter ones wrap
    around while longer ones continue. Re-iterable inputs (e.g. data loaders) are iterated
    anew on every pass instead of being cached, so each pass can produce fresh batches and
    no batch is kept in memory. One-shot iterators cannot be restarted and are cached
    with ``itertools.cycle``.

    Raises:
        ValueError: if one of the iterables yields no items at all.
    """
    from collections.abc import Iterator

    def _endless(iterable):
        """Loop over `iterable` indefinitely; stops only if it is empty."""
        if isinstance(iterable, Iterator):
            yield from cycle(iterable)
            return
        while True:
            produced_any = False
            for item in iterable:
                produced_any = True
                yield item
            if not produced_any:
                return

    streams = [_endless(it) for it in iterables]

    while True:
        items = []
        for stream in streams:
            try:
                items.append(next(stream))
            except StopIteration:
                raise ValueError('combined_iterator() received an empty iterable') from None
        yield tuple(items)

# One endless combined stream per split; every item is a tuple with one batch per loader.
train_iterator, val_iterator, test_iterator = (
    combined_iterator(split_loaders)
    for split_loaders in (train_loaders, val_loaders, test_loaders)
)


In [None]:
# Peek at the first mini-batch of the first training loader.
next(iter(train_loaders[0]))

In [None]:
# helpful article
# https://medium.com/stanford-cs224w/a-tour-of-pygs-data-loaders-9f2384e48f8f

# some info

# HeteroData(
#   Job={
#     x=[9222, 128], # node features
#     n_id=[9222] # the ids of the nodes in the original train_data set
#   },
#   (Job, REQUIRES, Skill)={
#     edge_index=[2, 14498], # sampled edges
#     edge_attr=[14498, 1],  # edge attributes of sampled edges
#     edge_label=[509170], # 1 if it is a true edge, 0 if it is a false
#     edge_label_index=[2, 509170], # all edges?
#     e_id=[14498] # edge ids of edges in the original train_data set



# if batchsize is 16 for the edge and we have neg_sampling=binary, we will have
# this many jobs:
#  Job={
#     x=[64, 128],
#     n_id=[64]
#   },
# since we sample a negative and a positive edge each, and each edge has 2 Job nodes (if our target is the job nodes)

# LinkNeighborLoader samples negative edges only for the target (supervision) edges, as expected,
# so the sampled "neighbor" (message-passing) edges are always positive ones.

In [None]:
from typing import Tuple, Union
from torch import Tensor
from torch_geometric.nn import to_hetero, HeteroDictLinear, Linear
from torch_geometric.nn.conv import GraphConv, SAGEConv, SimpleConv
import torch.nn.functional as F
from torch_geometric.typing import Adj, OptPairTensor, OptTensor, Size




# PyG does not implement the exact max pooling aggregation as in the GraphSage paper
# with GraphConvWithPool we manually extend it by adding a linear layer on x before .propagate
# as our activation function is monotonically increasing, this modification corresponds to the max pooling aggregation

class GraphConvWithPool(GraphConv):
    """``GraphConv`` that applies a bias-free linear layer to neighbor features before aggregation.

    Only the source-side (neighbor) features pass through the extra layer; the root term uses
    the unmodified destination features. The aggregation is whatever ``aggr`` selects
    (default ``'add'``), so pass ``aggr='max'`` for a max-pooling style aggregation.

    Args:
        in_channels: Input feature size, either an int or a ``(src, dst)`` pair.
        out_channels: Output feature size.
        aggr: Aggregation scheme forwarded to ``GraphConv``.
        bias: Forwarded to ``GraphConv``.
    """

    def __init__(self, in_channels, out_channels: int, aggr: str = 'add', bias: bool = True, **kwargs):
        super().__init__(in_channels, out_channels, aggr, bias, **kwargs)
        # The pre-aggregation layer only sees source-node features, so size it by the source dimension.
        src_channels = in_channels[0] if isinstance(in_channels, (tuple, list)) else in_channels
        self.linear = torch.nn.Linear(src_channels, src_channels, bias=False)

    def forward(self, x: Union[Tensor, OptPairTensor], edge_index: Adj,
                edge_weight: OptTensor = None, size: Size = None) -> Tensor:

        if isinstance(x, Tensor):
            x = (x, x)

        # x is a (source, destination) pair here; transform the source side only.
        # Calling the layer on the pair itself would fail because it is not a Tensor.
        x = (self.linear(x[0]), x[1])

        out = self.propagate(edge_index, x=x, edge_weight=edge_weight,
                             size=size)
        out = self.lin_rel(out)

        x_r = x[1]
        if x_r is not None:
            out = out + self.lin_root(x_r)

        return out



class WeightedSkillSage(torch.nn.Module):
    """Two edge-weighted ``GraphConv`` layers, each followed by ReLU, and a final linear projection.

    NOTE(review): ``aggregator`` is accepted but never used; both ``GraphConv`` layers are
    created with their default aggregation. Confirm whether it should be forwarded.
    """

    def __init__(self, hidden_channels, out_channels, aggregator='max'):
        super().__init__()
        # in_channels=-1: the input size of the first layer is inferred on first use.
        self.conv1 = GraphConv(in_channels=-1, out_channels=hidden_channels)
        self.conv2 = GraphConv(in_channels=hidden_channels, out_channels=hidden_channels)
        self.linear3 = Linear(hidden_channels,out_channels)

    def forward(self, x: HeteroData, edge_index, edge_weight):
        hidden = self.conv1(x, edge_index, edge_weight=edge_weight).relu()
        hidden = self.conv2(hidden, edge_index, edge_weight=edge_weight).relu()
        return self.linear3(hidden)

# Build the model, convert it with to_hetero using the node/edge types of train_data
# (aggr='sum'), then move it to the selected device.
model = WeightedSkillSage(hidden_channels=64, out_channels=64)
model = to_hetero(model, train_data.metadata(), aggr='sum')
model = model.to(device)


In [None]:

class GNNTrainer(Trainer):
    """Trainer subclass with its own train/validate loops over ``(data, target)`` batches.

    The loops mirror the base class, except that ``train`` does not call ``free_memory`` first.
    """

    def __init__(self, model, criterion, optimizer, device):
        super().__init__(model, criterion, optimizer, device)

    def train(self, dataloader, save_interval, save_path):
        """Run one optimisation step per batch; checkpoint every ``save_interval`` batches."""
        self.model.train()
        for step, (inputs, labels) in enumerate(dataloader):
            inputs = inputs.to(self.device)
            labels = labels.to(self.device)
            self.optimizer.zero_grad()
            batch_loss = self.criterion(self.model(inputs), labels)
            batch_loss.backward()
            self.optimizer.step()
            self.train_losses.append(batch_loss.item())
            if step % save_interval == 0:
                self.save_checkpoint(step, save_path)

    def validate(self, dataloader):
        """Record the loss of every batch in ``dataloader`` without computing gradients."""
        self.model.eval()
        with torch.no_grad():
            for inputs, labels in dataloader:
                inputs = inputs.to(self.device)
                labels = labels.to(self.device)
                batch_loss = self.criterion(self.model(inputs), labels)
                self.val_losses.append(batch_loss.item())

In [None]:



import gc
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# with torch.no_grad():  # Initialize lazy modules.
#      out = model(batch.x_dict, batch.edge_index_dict, batch.edge_weight_dict)
def get_supervision_edge_type(heterodata):
    """Return the first edge type whose storage has an 'input_id' key, or None if there is none.

    Presumably 'input_id' is attached by LinkNeighborLoader to the edge type it supervises;
    confirm against the loader output.
    """
    for edge_type in heterodata.edge_types:
        if 'input_id' in heterodata[edge_type].keys():
            return edge_type

def free_memory():
  """Clears the GPU cache and triggers garbage collection, to reduce OOMs."""
  torch.cuda.empty_cache()
  gc.collect()

free_memory()

optimizer = torch.optim.Adam(model.parameters(), lr=0.000001)
criterion = torch.nn.CrossEntropyLoss()
# The trainer needs the criterion and optimizer, so it is created after them.
trainer = Trainer(model, criterion, optimizer, device)

model.train()
# NOTE: train_iterator never ends; interrupt the cell to stop training.
for edge_batches in train_iterator:
    # Reset gradients so every step uses only the gradients of the current mini-batch.
    optimizer.zero_grad()
    minibatch_loss = 0

    # each batch here is one edge type, since we want to learn for all edge types
    for batch in edge_batches:
        batch = batch.to(device)
        hetero_out = model(batch.x_dict, batch.edge_index_dict, batch.edge_weight_dict)

        supervision_edge_type = get_supervision_edge_type(batch)
        src_type, dst_type = supervision_edge_type[0], supervision_edge_type[2]
        edge_label = batch[supervision_edge_type].edge_label
        edge_label_index = batch[supervision_edge_type].edge_label_index
        # Score every supervision edge by the cosine similarity of its endpoint embeddings.
        src_node_embeddings = hetero_out[src_type][edge_label_index[0]]
        dst_node_embeddings = hetero_out[dst_type][edge_label_index[1]]
        logits = F.cosine_similarity(src_node_embeddings, dst_node_embeddings, dim=-1)
        # NOTE(review): CrossEntropyLoss is applied to one score per edge with 0/1 labels;
        # confirm this is the intended objective rather than a binary loss.
        loss = criterion(logits, edge_label)
        minibatch_loss += loss

    minibatch_loss.backward()
    optimizer.step()

    # Keep the loss history on the trainer so plot_losses() has data to show.
    trainer.train_losses.append(float(minibatch_loss))
    print('mini-batch loss:',float(minibatch_loss))


# NOTE(review): `train_dataloader` and `val_dataloader` are not defined in any visible cell,
# and the loop above does not terminate on its own, so the lines below are not reached as
# written. Confirm what they are meant to run on.
trainer.train(train_dataloader, save_interval=100, save_path='./checkpoints')
trainer.validate(val_dataloader)
trainer.plot_losses()
trainer.load_checkpoint('./checkpoints/checkpoint_100.pt')

In [None]:
import gc
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# with torch.no_grad():  # Initialize lazy modules.
#      out = model(batch.x_dict, batch.edge_index_dict, batch.edge_weight_dict)
def get_supervision_edge_type(heterodata):
    """Return the first edge type whose storage contains an 'input_id' key, else None."""
    matching_types = (
        candidate
        for candidate in heterodata.edge_types
        if 'input_id' in heterodata[candidate].keys()
    )
    return next(matching_types, None)





optimizer = torch.optim.Adam(model.parameters(), lr=0.000001)
loss_fn = torch.nn.CrossEntropyLoss()
model.train()
# NOTE: train_iterator never ends; interrupt the cell to stop training.
for edge_batches in train_iterator:
    # Reset gradients so every step uses only the gradients of the current mini-batch.
    optimizer.zero_grad()
    minibatch_loss = 0


    # each batch here is one edge type, since we want to learn for all edge types
    for batch in edge_batches:
        batch = batch.to(device)
        hetero_out = model(batch.x_dict, batch.edge_index_dict, batch.edge_weight_dict)

        supervision_edge_type = get_supervision_edge_type(batch)
        src_type, dst_type = supervision_edge_type[0], supervision_edge_type[2]
        edge_label = batch[supervision_edge_type].edge_label
        edge_label_index = batch[supervision_edge_type].edge_label_index
        # Score every supervision edge by the cosine similarity of its endpoint embeddings.
        src_node_embeddings = hetero_out[src_type][edge_label_index[0]]
        dst_node_embeddings = hetero_out[dst_type][edge_label_index[1]]
        logits = F.cosine_similarity(src_node_embeddings, dst_node_embeddings, dim=-1)
        loss = loss_fn(logits, edge_label)
        minibatch_loss += loss

    minibatch_loss.backward()
    optimizer.step()

    # The summed loss of this step (the name `batch_loss` was never defined).
    print('mini-batch loss:',float(minibatch_loss))





In [None]:
# Release cached GPU memory and run garbage collection.
free_memory()

In [None]:
# Compute embeddings for the current `batch` here: `out` was never assigned in any
# executable cell. No gradients are needed for inspection.
batch = batch.to(device)
with torch.no_grad():
    out = model(batch.x_dict, batch.edge_index_dict, batch.edge_weight_dict)

supervision_edge_type = get_supervision_edge_type(batch)
src_type, dst_type = supervision_edge_type[0], supervision_edge_type[2]
edge_label = batch[supervision_edge_type].edge_label
edge_label_index = batch[supervision_edge_type].edge_label_index
src_node_embeddings = out[src_type][edge_label_index[0]]
dst_node_embeddings = out[dst_type][edge_label_index[1]]
# Smallest cosine similarity among the supervision edges of this batch.
torch.min(F.cosine_similarity(src_node_embeddings, dst_node_embeddings, dim=-1))

In [None]:
import gc

# Manual clean-up: collect unreachable Python objects first ...
gc.collect()

# ... then release the cached GPU memory.
torch.cuda.empty_cache()

In [None]:
# edge_label_index of the supervised edge type in the current batch.
batch[supervision_edge_type].edge_label_index

In [None]:
# Shorthand for the Job -> Skill edge type.
J2S = ('Job','REQUIRES','Skill')
batch[J2S].edge_label_index

In [None]:
# Show the Job -> Skill edge storage of the current batch
# (the dangling '.' after the expression was a syntax error).
batch['Job','REQUIRES','Skill']

In [None]:
# Draw one mini-batch from the first training loader for inspection.
batch = next(iter(train_loaders[0]))


In [None]:
# Show the structure of the sampled batch.
batch

In [None]:
# Edge weights of the sampled batch, per edge type.
batch.edge_weight_dict

In [None]:
# NOTE(review): edge_label presumably exists only on the edge type the loader supervises;
# confirm that the loader which produced `batch` targets ('Job','IS_SIMILAR_JOB','Job').
index = batch['Job','IS_SIMILAR_JOB','Job'].e_id
labels = batch['Job','IS_SIMILAR_JOB','Job'].edge_label

In [None]:
import torch_geometric.transforms as T
from torch_geometric.datasets import OGB_MAG
from torch_geometric.nn import SAGEConv, to_hetero


# Example on the OGB-MAG dataset.
# NOTE: this rebinds the global `data`, which earlier cells use for the Job/Skill graph.
dataset = OGB_MAG(root='./data', preprocess='metapath2vec', transform=T.ToUndirected())
data = dataset[0]

class GNN(torch.nn.Module):
    """Two-layer SAGEConv network with a ReLU between the layers."""

    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        # -1 lets each layer infer its input size on first use.
        self.conv1 = SAGEConv(-1, hidden_channels)
        self.conv2 = SAGEConv(-1, out_channels)

    def forward(self, x, edge_index):
        hidden = self.conv1(x, edge_index)
        hidden = hidden.relu()
        return self.conv2(hidden, edge_index)


# NOTE: this rebinds the global `model`, replacing the WeightedSkillSage model built earlier.
model = GNN(hidden_channels=64, out_channels=dataset.num_classes)
model = to_hetero(model, data.metadata(), aggr='sum')

In [None]:
#https://colab.research.google.com/drive/1GrAxHyZCZ13jpTkMy9vVO_v_U9nHDdvB#scrollTo=wmiFKI0ovYN4


In [None]:
# Initially we use this GraphConv layer and aggregate using mean.
# this layer allows the addition of edge weights: the adjacency matrix simply consists not of 1s and 0s but the corresponding weights
#https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.GraphConv.html

# using max pool
# https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.pool.global_max_pool.html#torch_geometric.nn.pool.global_max_pool