In [1]:
# https://docs.dgl.ai/en/latest/tutorials/models/1_gnn/4_rgcn.html#sphx-glr-tutorials-models-1-gnn-4-rgcn-py
# https://github.com/dmlc/dgl/tree/master/examples/pytorch/rgcn
# https://github.com/dmlc/dgl/blob/master/python/dgl/contrib/data/knowledge_graph.py
# https://docs.dgl.ai/en/latest/api/python/graph.html

In [2]:
import os

# Restrict the CUDA devices visible to this process to GPUs 0 through 3.
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, range(4)))

In [13]:
import re
import time
import pickle
import numpy as np
import pandas as pd
from time import time
from collections import Counter
from tqdm.notebook import tqdm

In [3]:
from datasets import load_dataset

# Load the "conll2003" dataset through the `datasets` library; later cells
# index the result by split name ('train', 'validation', 'test').
dataset = load_dataset("conll2003")

Reusing dataset conll2003 (/opt/tmp/huggingface/datasets/conll2003/conll2003/1.0.0/26b70ce2b0f32cb35a27151dbfa2dbe88c82bcdaf8f29433bcdc612a9b314e83)


In [9]:
# Collect, for every entity type, the set of distinct words tagged with it
# across all three splits. The part of the tag before the last '-' (if any)
# is dropped, so prefixed variants of a tag share one entry.
words_per_label = {}
for split in ('train', 'validation', 'test'):
    documents = tqdm(dataset[split], desc=split.upper())
    for doc in documents:
        for word, label in zip(doc['words'], doc['ner']):
            label = label.rsplit('-', 1)[-1]
            words_per_label.setdefault(label, set()).add(word)

HBox(children=(FloatProgress(value=0.0, description='TRAIN', max=14041.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='VALIDATION', max=3250.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='TEST', max=3453.0, style=ProgressStyle(description_width=…




In [10]:
# Show which entity types were collected.
words_per_label.keys()

dict_keys(['ORG', 'O', 'MISC', 'PER', 'LOC'])

In [12]:
# Number of distinct words tagged 'MISC'.
len(words_per_label['MISC'])

1343

# Model Definition

In [18]:
# Install the pre-release `dgl-cu101` wheel, then list /usr/local/cuda to
# show which CUDA directory it points to.
! pip install --pre dgl-cu101
! ls -al /usr/local/cuda

Collecting dgl-cu101
  Downloading dgl_cu101-0.6a201201-cp38-cp38-manylinux1_x86_64.whl (20.9 MB)
[K     |████████████████████████████████| 20.9 MB 4.1 MB/s eta 0:00:01
Installing collected packages: dgl-cu101
Successfully installed dgl-cu101-0.6a201201
lrwxrwxrwx 1 root root 21 19 févr.  2020 /usr/local/cuda -> /usr/local/cuda-10.1/


In [19]:
# Install the pre-release `dgl` package.
# NOTE(review): this runs after `dgl-cu101` was installed; presumably both
# wheels provide the same `dgl` module, so this may replace the CUDA build -
# confirm which one is intended and keep only that install.
! pip install --pre dgl

Collecting dgl
  Downloading dgl-0.6a201201-cp38-cp38-manylinux1_x86_64.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 4.1 MB/s eta 0:00:01
Installing collected packages: dgl
Successfully installed dgl-0.6a201201


In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
import dgl.function as fn
from functools import partial

class RGCNLayer(nn.Module):
    """One relational graph convolution layer operating in place on a DGL graph.

    Every relation type owns an ``(in_feat, out_feat)`` weight matrix. When
    ``num_bases < num_rels`` those matrices are linear combinations of
    ``num_bases`` shared basis matrices (equation (3) in the comments below);
    otherwise each relation has its own independent matrix.

    Parameters
    ----------
    in_feat : int
        Input feature size. For an input layer the node ids in
        ``edges.src['id']`` are combined with this value to index the weight
        table, so it is expected to be the number of nodes.
    out_feat : int
        Output feature size.
    num_rels : int
        Number of relation types.
    num_bases : int, optional
        Number of basis matrices. Values ``<= 0`` or ``> num_rels`` fall back
        to ``num_rels`` (no decomposition).
    bias : bool or None, optional
        If truthy, a learnable bias of size ``out_feat`` is added to the
        aggregated node features. ``self.bias`` is ``None`` otherwise.
    activation : callable or None, optional
        Applied to the node features after aggregation (and bias).
    is_input_layer : bool, optional
        If true, messages are produced by an embedding lookup keyed on the
        relation type and the source node id instead of a matrix product.
    """

    def __init__(self, in_feat, out_feat, num_rels, num_bases=-1, bias=None,
                 activation=None, is_input_layer=False):
        super(RGCNLayer, self).__init__()
        self.in_feat = in_feat
        self.out_feat = out_feat
        self.num_rels = num_rels
        self.num_bases = num_bases
        self.activation = activation
        self.is_input_layer = is_input_layer

        # sanity check
        if self.num_bases <= 0 or self.num_bases > self.num_rels:
            self.num_bases = self.num_rels

        relu_gain = nn.init.calculate_gain('relu')

        # weight bases in equation (3)
        self.weight = nn.Parameter(torch.Tensor(self.num_bases, self.in_feat,
                                                self.out_feat))
        nn.init.xavier_uniform_(self.weight, gain=relu_gain)

        if self.num_bases < self.num_rels:
            # linear combination coefficients in equation (3)
            self.w_comp = nn.Parameter(torch.Tensor(self.num_rels, self.num_bases))
            nn.init.xavier_uniform_(self.w_comp, gain=relu_gain)

        # Optional bias. It is zero-initialised because Xavier initialisation
        # is undefined for 1-D tensors (it raised in the previous version),
        # and presence is tracked with None rather than tensor truthiness,
        # which is ambiguous for multi-element tensors.
        if bias:
            self.bias = nn.Parameter(torch.zeros(out_feat))
        else:
            self.bias = None

    def _relation_weights(self):
        """Return the ``(num_rels, in_feat, out_feat)`` stack of relation weights."""
        if self.num_bases < self.num_rels:
            # generate all weights from bases (equation (3)):
            # W_r = sum_b w_comp[r, b] * weight[b]. Flattening each basis
            # keeps the basis axis first so relations and inputs are not mixed.
            bases = self.weight.view(self.num_bases, self.in_feat * self.out_feat)
            return torch.matmul(self.w_comp, bases).view(
                self.num_rels, self.in_feat, self.out_feat)
        return self.weight

    def forward(self, g):
        """Run one round of message passing on ``g``.

        Reads ``rel_type`` and ``norm`` from the edge data and either ``id``
        (input layer) or ``h`` (other layers) from the source nodes. The
        summed messages are written to the node field ``h``. Nothing is
        returned; ``g`` is updated in place.
        """
        weight = self._relation_weights()

        if self.is_input_layer:
            def message_func(edges):
                # for input layer, matrix multiply can be converted to be
                # an embedding lookup using source node id
                embed = weight.view(-1, self.out_feat)
                index = edges.data['rel_type'] * self.in_feat + edges.src['id']
                return {'msg': embed[index] * edges.data['norm']}
        else:
            def message_func(edges):
                w = weight[edges.data['rel_type']]
                # (E, 1, in_feat) @ (E, in_feat, out_feat) -> (E, 1, out_feat).
                # Squeeze only the singleton middle axis so a single edge or
                # out_feat == 1 keeps its (E, out_feat) shape.
                msg = torch.bmm(edges.src['h'].unsqueeze(1), w).squeeze(1)
                msg = msg * edges.data['norm']
                return {'msg': msg}

        def apply_func(nodes):
            h = nodes.data['h']
            if self.bias is not None:
                h = h + self.bias
            if self.activation:
                h = self.activation(h)
            return {'h': h}

        g.update_all(message_func, fn.sum(msg='msg', out='h'), apply_func)

In [21]:
class Model(nn.Module):
    """Stack of RGCNLayer modules applied in sequence to a graph.

    The stack consists of one input layer, ``num_hidden_layers`` hidden
    layers and one output layer. Node input "features" are simply the node
    indices ``0 .. num_nodes - 1``.
    """

    def __init__(self, num_nodes, h_dim, out_dim, num_rels,
                 num_bases=-1, num_hidden_layers=1):
        super().__init__()
        self.num_nodes, self.h_dim, self.out_dim = num_nodes, h_dim, out_dim
        self.num_rels, self.num_bases = num_rels, num_bases
        self.num_hidden_layers = num_hidden_layers

        # Layers are built first, then the per-node input ids.
        self.build_model()
        self.features = self.create_features()

    def build_model(self):
        """Create ``self.layers``: input layer, hidden layers, output layer."""
        stack = [self.build_input_layer()]
        for _ in range(self.num_hidden_layers):
            stack.append(self.build_hidden_layer())
        stack.append(self.build_output_layer())
        self.layers = nn.ModuleList(stack)

    def create_features(self):
        """Return the tensor ``[0, 1, ..., num_nodes - 1]`` used as node ids."""
        return torch.arange(self.num_nodes)

    def build_input_layer(self):
        """Input layer: maps node ids to ``h_dim`` features, ReLU activation."""
        return RGCNLayer(self.num_nodes, self.h_dim, self.num_rels,
                         self.num_bases,
                         activation=F.relu,
                         is_input_layer=True)

    def build_hidden_layer(self):
        """Hidden layer: ``h_dim`` to ``h_dim`` features, ReLU activation."""
        return RGCNLayer(self.h_dim, self.h_dim, self.num_rels,
                         self.num_bases,
                         activation=F.relu)

    def build_output_layer(self):
        """Output layer: ``h_dim`` to ``out_dim``, softmax over dimension 1."""
        softmax_over_classes = partial(F.softmax, dim=1)
        return RGCNLayer(self.h_dim, self.out_dim, self.num_rels,
                         self.num_bases,
                         activation=softmax_over_classes)

    def forward(self, g):
        """Apply every layer to ``g`` in order and return the node field ``h``.

        If ``self.features`` is set it is stored as the node field ``id``
        first. The final ``h`` is popped from ``g.ndata`` before returning.
        """
        node_ids = self.features
        if node_ids is not None:
            g.ndata['id'] = node_ids
        for rgcn_layer in self.layers:
            rgcn_layer(g)
        return g.ndata.pop('h')


In [22]:
from dgl.contrib.data import load_data

# Load the 'aifb' dataset through DGL's contrib loader and unpack its sizes.
data = load_data(dataset='aifb')
num_nodes, num_rels, num_classes = data.num_nodes, data.num_rels, data.num_classes

# Hold out the first fifth of the training indices for validation and keep
# the remainder for training.
holdout = len(data.train_idx) // 5
val_idx, train_idx = data.train_idx[:holdout], data.train_idx[holdout:]

# Per-edge relation type, and per-edge normalization factor as a column.
edge_type = torch.from_numpy(data.edge_type)
edge_norm = torch.from_numpy(data.edge_norm).unsqueeze(1)

# Labels flattened to one dimension.
labels = torch.from_numpy(data.labels).view(-1)

Downloading /home/semantic/.dgl/aifb.tgz from https://data.dgl.ai/dataset/aifb.tgz...
Extracting file to /home/semantic/.dgl/aifb
Loading dataset aifb
Graph loaded, frequencies counted.
Number of nodes:  8285
Number of relations:  91
Number of edges:  66371
4 classes: {'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance', 'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance', 'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance', 'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance'}
Loading training set
Loading test set
Number of classes:  4
removing nodes that are more than 3 hops away


In [24]:
# Print each loaded quantity on its own line, ending with the shape of labels.
print(num_nodes, num_rels, num_classes, labels, edge_type, edge_norm,
      labels.shape, sep='\n')

8285
91
4
tensor([0, 0, 0,  ..., 0, 0, 0])
tensor([ 0,  2,  5,  ...,  0, 21,  0])
tensor([[1.0000],
        [0.0192],
        [0.0217],
        ...,
        [1.0000],
        [1.0000],
        [1.0000]])
torch.Size([8285])


In [28]:
# Count how many nodes carry each label value.
Counter(labels.numpy())

Counter({0: 8182, 2: 60, 1: 28, 3: 15})

In [33]:
# Count how many edges share each normalization factor (first column of edge_norm).
Counter(edge_norm.numpy()[:,0])

Counter({1.0: 27409,
         0.01923077: 208,
         0.02173913: 92,
         0.25: 2840,
         0.125: 1032,
         0.5: 6494,
         0.16666667: 1392,
         0.07692308: 507,
         0.33333334: 3786,
         0.14285715: 1176,
         0.1: 660,
         0.025: 120,
         0.041666668: 240,
         0.2: 1615,
         0.024390243: 205,
         0.035714287: 84,
         0.05882353: 204,
         0.06666667: 240,
         0.09090909: 737,
         0.0056497175: 177,
         0.014492754: 69,
         0.083333336: 456,
         0.11111111: 531,
         0.020833334: 192,
         0.021276595: 188,
         0.01754386: 57,
         0.020408163: 147,
         0.017857144: 56,
         0.045454547: 220,
         0.014705882: 68,
         0.014925373: 134,
         0.071428575: 266,
         0.00591716: 338,
         0.0069444445: 144,
         0.0037453184: 267,
         0.04347826: 138,
         0.037037037: 135,
         0.015151516: 66,
         0.013333334: 150,
      

In [36]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

In [35]:
# Install SPARQLWrapper, which the `from SPARQLWrapper import ...` cell needs.
! pip install SPARQLWrapper

Collecting SPARQLWrapper
  Downloading SPARQLWrapper-1.8.5-py3-none-any.whl (26 kB)
Installing collected packages: SPARQLWrapper
Successfully installed SPARQLWrapper-1.8.5
