In [1]:
import torch
from torch import optim
from torch.optim.lr_scheduler import StepLR
from torch.nn.init import xavier_normal_

import numpy as np
import pandas as pd
from numba import jit
import matplotlib.pyplot as plt

import pickle
import time
from timeit import default_timer as timer

from general_functions1 import sqrt_err_relative, check_coo_tensor, gen_coo_tensor
import evaluation_functions as ef

#from general_functions1 import create_filter, hr

from t_alg import mttcrp, mttcrp1, get_elem_deriv_tensor, factors_to_tensor
from t_alg import gcp_grad, multi_ind_to_indices, indices_to_multi_ind

from samplings import give_ns, generate_data
from elementwise_grads import bernoulli_logit_loss, bernoulli_logit_loss_grad

In [2]:
def gcp_grad(coo, val, shape, a, b, l2, loss_function, loss_function_grad, device):
    """
        GCP loss function and gradient calculation.
        All the tensors have the same coordinate set: coo_tensor.
    """

    # Construct sparse kruskal tensor
    kruskal_val = torch.sum((a[coo[:,0], :] * b[coo[:,1], :] * a[coo[:,2], :]),1)
    #factors_to_tensor(coo_tensor, vals, a, b, c)
    
    # Calculate mean loss on known entries
    loss = loss_function(val, kruskal_val)
    # Compute the elementwise derivative tensor
    deriv_tensor_val = loss_function_grad(val, kruskal_val)
    
    #print ("in qcp_grad in deriv_tensor_val ", deriv_tensor_val)
    # Calculate gradients w.r.t. a, b, c factor matrices
    g_a = mttcrp1(coo, deriv_tensor_val, shape, 0, b, a, device)
    g_b = mttcrp1(coo, deriv_tensor_val, shape, 1, a, a, device)
    g_c = mttcrp1(coo, deriv_tensor_val, shape, 2, a, b, device)
    
    #print ("\n\n")
    
    
    # Add L2 regularization
    if l2 != 0:
        
        # Before !!!!!
        #g_a += l2 * a[coo[0], :]
        #g_b += l2 * b[coo[1], :]
        #g_c += l2 * c[coo[2], :]
        
        # After !!!!!
        g_a += l2 * a[coo[:, 0], :]
        g_b += l2 * b[coo[:, 1], :]
        g_c += l2 * a[coo[:, 2], :]
        
    
    return loss, g_a, g_b, g_c

In [3]:
!nvidia-smi -L

GPU 0: NVIDIA GeForce RTX 3090 (UUID: GPU-51be6691-df05-e69e-fc9c-a0a7181655ed)
GPU 1: NVIDIA GeForce RTX 3090 (UUID: GPU-f2d18c53-39cb-e6ee-cfe5-dcca14c2fce6)


In [4]:
path_data = "/notebook/Relations_Learning/Link_Prediction_Data/FB15K237/"
entity_list = pickle.load(open(path_data + 'entity_list', 'rb'))
relation_list = pickle.load(open(path_data + 'relation_list', 'rb'))

train_triples = pickle.load(open(path_data + 'train_triples', 'rb'))
valid_triples = pickle.load(open(path_data + 'valid_triples', 'rb'))
test_triples = pickle.load(open(path_data + 'test_triples', 'rb'))
train_valid_triples = pickle.load(open(path_data + 'train_valid_triples', 'rb'))

entity_map = pickle.load(open(path_data + 'entity_map', 'rb'))
relation_map = pickle.load(open(path_data + 'relation_map', 'rb'))

all_triples = train_valid_triples + test_triples

In [5]:
%%time
ft = ef.create_filter(all_triples)

CPU times: user 274 ms, sys: 2.7 ms, total: 277 ms
Wall time: 276 ms


In [6]:
values = [1] * len(train_triples)
values = np.array(values, dtype=np.int64)

coords = np.array(train_triples, dtype=np.int64)
nnz = len(train_triples)
data_shape = (len(entity_list), len(relation_list), len(entity_list))

print(f"KG shape (sub., rel., obj.): {data_shape};\nNum train samples: {values.shape};")

KG shape (sub., rel., obj.): (14541, 237, 14541);
Num train samples: (272115,);


In [7]:
coo_tensor = coords
vals = values
shape = data_shape
loss_function = bernoulli_logit_loss
loss_function_grad = bernoulli_logit_loss_grad

In [8]:
num_epoch = 4
l2 =  1e-2
lr = 1e-2 
seed = 13 
hm = 1000
how_many = 2
batch_size = 64#56

device=torch.device("cuda:1")
random_state = np.random.seed(seed)

In [9]:
init_mind_set = set(indices_to_multi_ind(coo_tensor, shape))
coo_ns = np.empty((how_many * len(init_mind_set) + vals.size, 3), dtype=np.int64)
vals_ns = np.empty((how_many * len(init_mind_set) + vals.size,), dtype=np.float64)

err_arr = np.empty((num_epoch*vals_ns.shape[0]//batch_size + 1, ), dtype=np.float64)

error = 0.0
it = 0

num_ent = 14541
dim_emb = 200
num_rel = 237

a_torch = torch.empty((num_ent, dim_emb), requires_grad=True, device=device)
xavier_normal_(a_torch)
a_torch.grad = torch.zeros(a_torch.shape, device=device)

b_torch = torch.empty((num_rel, dim_emb), requires_grad=True, device=device)
xavier_normal_(b_torch)
b_torch.grad = torch.zeros(b_torch.shape, device=device)

optimizer = optim.Adam([a_torch, b_torch], lr=1e-3)
scheduler = StepLR(optimizer, step_size=2, gamma=0.5)

show_iter = True

start = timer()
for epoch in range(num_epoch):
    
    #get negative samples:
    coo_ns, vals_ns = generate_data(coo_tensor, vals, init_mind_set, shape, how_many, epoch)
    coo_ns = torch.tensor(coo_ns, device=device)
    vals_ns = torch.tensor(vals_ns, device = device)
    
    shuffler = np.random.permutation(vals_ns.shape[0])
    coo_ns = coo_ns[shuffler]
    vals_ns = vals_ns[shuffler]
    
    #idxs = np.random.permutation(vals_ns.shape[0])
    print (vals_ns.shape[0], batch_size, vals_ns.shape[0]//batch_size)
    err_list = []
    for i in range(vals_ns.shape[0]//batch_size):
        # Get loss and gradients per sample
        # print ("coo_ns[i], vals_ns[i]", coo_ns[i], vals_ns[i])
        end = min(vals_ns.shape[0] - 1, (i+1)*batch_size)
        loss, g_a, g_b, g_c = gcp_grad(
            coo_ns[i*batch_size : end],
            vals_ns[i*batch_size : end],
            shape,
            a_torch, b_torch,
            l2, loss_function,
            loss_function_grad,
            device,
        )
        err_list.append(loss.cpu().detach().numpy().mean())

        a_elems = coo_ns[i*batch_size : end, 0]
        b_elems = coo_ns[i*batch_size : end, 1]
        c_elems = coo_ns[i*batch_size : end, 2]
        
        a_torch.grad[a_elems, :] = g_a
        b_torch.grad[b_elems, :] = g_b
        a_torch.grad[c_elems, :] = g_c
        
        optimizer.step()
       
        a_torch.grad = torch.zeros(a_torch.shape, device = device)
        b_torch.grad = torch.zeros(b_torch.shape, device = device)
        
        err_arr[it] = np.mean(err_list)
        if show_iter and i%2000 == 0:
            print("Iter: ", it, "; Error: ", np.mean(np.array(err_list)))
        it += 1
        
    scheduler.step()
    a = a_torch.cpu().data.numpy()
    b = b_torch.cpu().data.numpy()
    c = a_torch.cpu().data.numpy()
    print("count hr")
    hr_result = ef.hr(ft, valid_triples, a, b, c, [1, 3, 10])
    print(hr_result)
    end = timer()
    print(end - start, "\n")

816345 64 12755
Iter:  0 ; Error:  0.693138504830813
Iter:  2000 ; Error:  0.6931470939910155
Iter:  4000 ; Error:  0.6931471378645807
Iter:  6000 ; Error:  0.6931471515728426
Iter:  8000 ; Error:  0.6931471587513257
Iter:  10000 ; Error:  0.6931471628938952
Iter:  12000 ; Error:  0.693147165786125
count hr
[0.00136869 0.00182492 0.00222412 0.00164164]
243.64902688000075 

816345 64 12755
Iter:  12755 ; Error:  0.6931471483337449


KeyboardInterrupt: 

In [12]:
%%time
ef.hr(ft, valid_triples, a, b, c, [1, 3, 10])

NameError: name 'a' is not defined

In [32]:
%%time
ef.hr(ft, test_triples, a, b, c, [1, 3, 10])

CPU times: user 8.75 s, sys: 8.16 s, total: 16.9 s
Wall time: 3.24 s


array([0.00000000e+00, 9.77230529e-05, 3.90892211e-04])

# WARP:

In [21]:
import torch
import torch.nn as nn
from torch.autograd import Variable, Function
import random

import math

class WARP(Function): 
    '''
    autograd function of WARP loss
    '''
    @staticmethod
    def forward(input, target, max_num_trials = None):
        
        batch_size = target.size()[0]
        
        if max_num_trials is None: 
            max_num_trials = target.size()[1] - 1
        
        positive_indices = torch.zeros(input.size())
        negative_indices = torch.zeros(input.size())
        L = torch.zeros(input.size()[0])
        
        all_labels_idx = np.arange(target.size()[1])
        
        Y = float(target.size()[1])
        J = torch.nonzero(target)

        for i in range(batch_size): 
            
            msk = np.ones(target.size()[1], dtype = bool)
            
            # Find the positive label for this example
            j = J[i, 1]
            positive_indices[i, j] = 1
            msk[j] = False
            
            # initialize the sample_score_margin
            sample_score_margin = -1
            num_trials = 0
            
            neg_labels_idx = all_labels_idx[msk]

            while ((sample_score_margin < 0) and (num_trials < max_num_trials)):
                 
                #randomly sample a negative label
                neg_idx = random.sample(list(neg_labels_idx), 1)[0]
                msk[neg_idx] = False
                neg_labels_idx = all_labels_idx[msk]
                
                num_trials += 1
                # calculate the score margin 
                sample_score_margin = 1 + input[i, neg_idx] - input[i, j] 
            
            if sample_score_margin < 0:
                # checks if no violating examples have been found 
                continue
            else: 
                loss_weight = np.log(math.floor((Y-1)/(num_trials)))
                L[i] = loss_weight
                negative_indices[i, neg_idx] = 1
                
        loss = L * (1-torch.sum(positive_indices*input, dim = 1) + torch.sum(negative_indices*input, dim = 1))
        
        # ctx.save_for_backward(input, target)
        # ctx.L = L
        # ctx.positive_indices = positive_indices
        # ctx.negative_indices = negative_indices
        
        return torch.sum(loss , dim = 0, keepdim = True)

    # This function has only a single output, so it gets only one gradient 
    @staticmethod
    def backward(input, target, grad_output):
        #input, target = ctx.saved_variables
        L = Variable(torch.unsqueeze(ctx.L, 1), requires_grad = False)

        positive_indices = Variable(ctx.positive_indices, requires_grad = False) 
        negative_indices = Variable(ctx.negative_indices, requires_grad = False)
        grad_input = grad_output*L*(negative_indices - positive_indices)

        return grad_input, None, None    

      
class WARPLoss(nn.Module): 
    def __init__(self, max_num_trials = None): 
        super(WARPLoss, self).__init__()
        self.max_num_trials = max_num_trials
        
    def forward(self, input, target): 
        return WARP.apply(input, target, self.max_num_trials)

a = np.load('/notebook/Relations_Learning/a200.npz.npy')
b = np.load('/notebook/Relations_Learning/b200.npz.npy')
c = np.load('/notebook/Relations_Learning/c200.npz.npy')

In [22]:
lr_warp = 0.001
epoch =0

wp = WARP()
while (epoch < 1):
    print ("warp loss!")
    a_torch = torch.tensor(a, requires_grad=True)
    b_torch = torch.tensor(b, requires_grad=True)
    c_torch = torch.tensor(c, requires_grad=True)
    list_of_inputs = []
    list_of_targets = []
    for entity, filt in zip(test_triples, test_filter):
        p = entity[0]
        q = entity[1]
        r = entity[2]

        candidate_values = torch.sum(a_torch[p, :] * b_torch[q, :] * c_torch, axis=1)

        for obj in filt:
            idxs = (candidate_values == obj).nonzero(as_tuple=False)
            candidate_values[idxs] = 0.0
            
        candidate_values = torch.sigmoid(candidate_values)

        target = torch.zeros(len(candidate_values))
        target[r] = 1.0
        list_of_inputs.append(candidate_values)
        list_of_targets.append(target)

    inputs = torch.stack(list_of_inputs)
    print (list_of_inputs[0].shape, inputs.shape) #should be batch_size*
    targets = torch.stack(list_of_targets)
    print (list_of_targets[0].shape, targets.shape) #should be batch_size*
    
    #batch_size = 16
    #for i in range(inputs.shape[0]//batch_size):
        #print (i)
        #end = min(inputs.shape[0] - 1, (i+1)*batch_size)
    loss = wp.forward(inputs, targets) 
    print ("warp loss is counted", loss)

        #if (i ==0):
    loss.backward()
    

    a = a - lr_warp*a_torch.grad.data.numpy()
    b = b - lr_warp*b_torch.grad.data.numpy()
    c = c - lr_warp*c_torch.grad.data.numpy()
    epoch += 1

warp loss!
torch.Size([14541]) torch.Size([20466, 14541])
torch.Size([14541]) torch.Size([20466, 14541])
warp loss is counted tensor([36423.6797], grad_fn=<SumBackward1>)


In [None]:
#@jit(nopython=True)
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

#@jit(nopython=True)
def hr(test_filter, test_triples, a, b, c,
       how_many=[1, 3, 10], iter_show=False, freq=3000):
    """ Calculate HR@[how_many] and MRR using filter """
    
    total = len(test_triples)
    hit = [0, 0, 0, 0]
    iteration = 0
    for entity, filt in zip(test_triples, test_filter):
        p = entity[0]
        q = entity[1]
        r = entity[2]

        candidate_values = np.sum(a[p, :] * b[q, :] * c, axis=1)
        candidate_values = sigmoid(candidate_values)
        
        top = np.argsort(candidate_values)[::-1]
        top = list(top)
        
        for obj in filt:
            top.remove(obj)
        
        ind = top.index(r)
        for i, h in enumerate(how_many):
            if ind < h:
                hit[i] += 1
        hit[3] += 1 / (1 + ind)    
        
        iteration += 1
        if iter_show:
            if iteration % freq == 0:
                print(hit[2] / iteration, hit[2], iteration)
            
    return hit[0] / total, hit[1] / total, hit[2] / total, hit[3] / total

In [None]:
a = np.load('/notebook/wikidata_tensor/embeddings_tucker_als/embedding_size_variation/200/u0_200_237.npz.npy')
b = np.load('/notebook/wikidata_tensor/embeddings_tucker_als/embedding_size_variation/200/u1_200_237.npz.npy')
c = np.load('/notebook/wikidata_tensor/embeddings_tucker_als/embedding_size_variation/200/u2_200_237.npz.npy')

In [None]:
%%time
shape = (100, 100, 100)
coo, vals = gen_coo_tensor(init_shape, density=0.02)
assert check_coo_tensor(coo)!= "Bad"

In [None]:
max_iter = 20
rank = 3

In [None]:
%%time

a, b, c, err, it = gcp_gd(
    coo, vals, shape,
    bernoulli_logit_loss,
    bernoulli_logit_loss_grad,
    rank=rank,
    lr=0.1,
    l2=0,
    max_iter=max_iter,
    tol=1e-8,
    seed=13,
    show_iter=False,
    it_over=True,
)

In [None]:
plt.xlabel("Iteration")
plt.ylabel("Relative error")
plt.title(f"Random tensor / CP-ALS3(R={rank})")
#plt.xticks(np.arange(max_iter))
plt.yscale("log")
plt.plot(np.arange(max_iter), err[:max_iter], 'g-*')