In [1]:
import  torch
from    torch_geometric.data import Data

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
os.chdir('..') # set directoroy to AIDAVA-KGE-Framework (run once)

In [3]:
import  pickle 

from    src.gcn import GCN

import  torch.nn.functional as F
import  tqdm

### Load and prepare data

In [4]:
data_path   = './data/m2skg/'
filename    = 'assertion_train.pkl'

node_embed_dim  = 100

In [5]:
with open(data_path+filename,'rb') as f:
    data = pickle.load(f)

In [6]:
entities= {e : i for i,e in enumerate( list(set([s for s,t in data])))}
types   = {t : i for i,t in enumerate( list(set([t for s,t in data])))}

type_list = {key : []  for key in types.keys()}

# create an edge between all nodes of the same type
entity_types = []
for i,(entity,type) in enumerate(data): 
    type_list[type] += [entity]
    entity_types += [types[type]]


edges = []


# as a toy example, fully connect all nodes with the same type (star topology)
for key,entity_list in type_list.items():

    for source in entity_list:
        for target in entity_list:
            if source != target:
                edges.append( [  entities[source] , entities[target]  ]   )

edges = torch.tensor(edges).T


In [8]:
types # toy data from one of SPHN classes

{'AdministrativeCase': 0,
 'DrugPrescription': 1,
 'OrganSupport': 2,
 'SubjectPseudoIdentifier': 3,
 'Code': 4,
 'LabTestEvent': 5}

### GCN for Node Prediction (demo)

In [7]:
pyg_df = Data(x = torch.nn.Embedding(len(entities), 
                                     node_embed_dim), 
                                     edge_index = edges, 
                                     y = torch.tensor(entity_types)  )

In [11]:
pyg_df

Data(x=Embedding(1253, 100), edge_index=[2, 463624], y=[1253])

In [12]:
model = GCN(node_embed_dim,hid_dim=100,num_classes=len(types))
print(model)

GCN(
  (conv1): GCNConv(100, 100)
  (conv2): GCNConv(100, 100)
  (classifier): Linear(in_features=100, out_features=6, bias=True)
)


In [13]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

In [14]:
epochs  = 20

for i in tqdm.tqdm(range(epochs)):

    model.train()
    optimizer.zero_grad()

    #embeds = model(node_embeds, edge_index)
    out,emb_out = model(pyg_df.x.weight,pyg_df.edge_index)

    #loss =  criterion(out, data.y)
    loss = criterion(out , pyg_df.y)

    acc  = (F.softmax(out,dim=1).argmax(dim=1) == pyg_df.y).numpy().sum() / len(pyg_df.y)

    loss.backward()
    optimizer.step()

    print(f'loss {loss.item():0.02f} acc {acc:0.02f}')

  5%|▌         | 1/20 [00:00<00:09,  1.93it/s]

loss 1.79 acc 0.13


 10%|█         | 2/20 [00:00<00:08,  2.18it/s]

loss 1.67 acc 0.49


 15%|█▌        | 3/20 [00:01<00:07,  2.26it/s]

loss 1.53 acc 0.49


 20%|██        | 4/20 [00:01<00:06,  2.33it/s]

loss 1.54 acc 0.49


 25%|██▌       | 5/20 [00:02<00:06,  2.39it/s]

loss 1.54 acc 0.49


 30%|███       | 6/20 [00:02<00:05,  2.39it/s]

loss 1.50 acc 0.49


 35%|███▌      | 7/20 [00:02<00:05,  2.40it/s]

loss 1.51 acc 0.49


 40%|████      | 8/20 [00:03<00:04,  2.42it/s]

loss 1.52 acc 0.49


 45%|████▌     | 9/20 [00:03<00:04,  2.42it/s]

loss 1.52 acc 0.49


 50%|█████     | 10/20 [00:04<00:04,  2.43it/s]

loss 1.50 acc 0.49


 55%|█████▌    | 11/20 [00:04<00:03,  2.42it/s]

loss 1.49 acc 0.49


 60%|██████    | 12/20 [00:05<00:03,  2.42it/s]

loss 1.50 acc 0.49


 65%|██████▌   | 13/20 [00:05<00:02,  2.43it/s]

loss 1.50 acc 0.49


 70%|███████   | 14/20 [00:05<00:02,  2.44it/s]

loss 1.50 acc 0.49


 75%|███████▌  | 15/20 [00:06<00:02,  2.44it/s]

loss 1.49 acc 0.49


 80%|████████  | 16/20 [00:06<00:01,  2.45it/s]

loss 1.49 acc 0.49


 85%|████████▌ | 17/20 [00:07<00:01,  2.45it/s]

loss 1.49 acc 0.49


 90%|█████████ | 18/20 [00:07<00:00,  2.45it/s]

loss 1.50 acc 0.49


 95%|█████████▌| 19/20 [00:07<00:00,  2.42it/s]

loss 1.50 acc 0.49


100%|██████████| 20/20 [00:08<00:00,  2.40it/s]

loss 1.49 acc 0.49





### TransE for Node Prediction (demo)

In [7]:
from    src.TransE import TransE

In [8]:
device = torch.device('cpu')

In [9]:
trans_e = TransE(num_entities = len(entities),
       num_relations = 1,
       device = device)

In [10]:
triples = []

r = 0 # one relation, i.e. rdfs:type 

for i,(entity,type) in enumerate(data): 

    triples.append( ( entities[entity], 0 , types[type] ) )

In [11]:
triples[0] # head, relation, tail

(627, 0, 4)

In [12]:
trans_e._train(triples);

  return torch.tensor(self.edges_index[idx][0]), self.edges_index[idx][1],   torch.tensor(self.edges_index[idx][2]), torch.tensor(neg_sample[0][0]) , torch.tensor(neg_sample[1]) , torch.tensor(neg_sample[0][1])


epoch 0,	 train loss 1.03
epoch 10,	 train loss 1.03
epoch 20,	 train loss 0.98
epoch 30,	 train loss 0.92
epoch 40,	 train loss 0.87
epoch 50,	 train loss 0.84
epoch 60,	 train loss 0.82
epoch 70,	 train loss 0.78
epoch 80,	 train loss 0.77
epoch 90,	 train loss 0.76


In [13]:
trans_e._eval(triples)

hits@1 0.00 hits@10 0.20  MR 26.89 MRR  0.07
