In [38]:
import numpy as np # best linear algebra
import torch # same as numpy + GPU
from torch import optim # different optimizers
from torch.optim.lr_scheduler import StepLR # adjust the learning rate
from sklearn.preprocessing import normalize # to normalize final factor matrices

import argparse # add terminal arguments
import sys; sys.path.append('/notebook/Link-prediction/gpu/') # to use files from this directory
import os

import datetime
import pickle
import time
from timeit import default_timer as timer
from ipypb import track

import evaluation_functions as ef



from t_alg import mttcrp, mttcrp1, get_elem_deriv_tensor
from t_alg import factors_to_tensor, gcp_grad, multi_ind_to_indices, indices_to_multi_ind
from samplings import give_ns, generate_data
from elementwise_grads import bernoulli_logit_loss, bernoulli_logit_loss_grad, bernoulli_loss, bernoulli_loss_grad
from general_functions1 import sqrt_err_relative, check_coo_tensor, gen_coo_tensor
from general_functions1 import create_filter, hr
from decimal import Decimal
from experiments import data_storage, Trainer, run_epoch
from model import FoxIE

import wandb
from util import import_source_as_module

In [2]:
def static_vars(**kwargs):
    """Build a decorator that attaches ``kwargs`` as attributes of a function.

    Each keyword becomes an attribute on the decorated function, which lets
    the function keep state between calls (e.g. ``func.counter``).

    Returns:
        A decorator that sets the attributes and returns the same function
        object (it is not wrapped).
    """
    def decorate(func):
        for attr_name, attr_value in kwargs.items():
            setattr(func, attr_name, attr_value)
        return func

    return decorate


@static_vars(fail_count=0)
def check_early_stop(target_score, previous_best, margin=0, max_attempts=1000):
    """Count consecutive non-improving values of a lower-is-better quantity.

    The counter lives on the function itself (``check_early_stop.fail_count``)
    and therefore persists between calls.

    Args:
        target_score: the value observed now.
        previous_best: the best (lowest) value seen before this call.
        margin: tolerated excess over the best value; a negative margin
            disables the check (the counter is reset on every call).
        max_attempts: number of consecutive failures that triggers the stop.

    Raises:
        StopIteration: once ``fail_count`` reaches ``max_attempts``.
    """
    # A new minimum becomes the reference value for the comparison below.
    reference = min(previous_best, target_score)
    exceeded = (margin >= 0) and (target_score > reference + margin)

    check_early_stop.fail_count = check_early_stop.fail_count + 1 if exceeded else 0

    if check_early_stop.fail_count < max_attempts:
        return
    print('Interrupted due to early stopping condition.', check_early_stop.fail_count, flush = True)
    raise StopIteration

@static_vars(fail_count_score=0)
def check_early_stop_score(target_score, previous_best, margin=0, max_attempts=3000):
    """Count consecutive epochs in which a higher-is-better score fails to improve.

    The counter lives on the function itself
    (``check_early_stop_score.fail_count_score``) and persists between calls.

    An epoch counts as a failure when ``target_score < previous_best + margin``.
    The earlier version first lowered ``previous_best`` to ``target_score``
    whenever the score regressed; with ``margin == 0`` that made the comparison
    ``target_score < target_score`` and reset the counter on every regression,
    so the stop could never trigger. For ``margin > 0`` the result is unchanged.

    Args:
        target_score: the score observed now.
        previous_best: the best (highest) score seen before this call.
        margin: minimal gain over ``previous_best`` that counts as progress;
            a negative margin disables the check (the counter is reset).
        max_attempts: number of consecutive failures that triggers the stop.

    Raises:
        StopIteration: once ``fail_count_score`` reaches ``max_attempts``.
    """
    no_progress = (margin >= 0) and (target_score < previous_best + margin)
    if no_progress:
        check_early_stop_score.fail_count_score += 1
    else:
        check_early_stop_score.fail_count_score = 0

    if check_early_stop_score.fail_count_score >= max_attempts:
        print(
            'Interrupted due to early stopping scoring condition.',
            check_early_stop_score.fail_count_score, flush = True,
        )
        raise StopIteration

In [None]:
# in yaml
# num_epoch = args.n_epoch
# rank = args.dim 
# lr = args.lr
# batch_size = args.batch_size
# step_size=args.scheduler_step
# gamma=args.scheduler_gamma
# momentum = args.momentum
# opt_type = args.opt_type (SGD, ADAM, AdamW)
# output_file = args.out_file

#import_source_as_module('/notebook/Relations_Learning/grid_search/configs/adam_grid.py')
# `__file__` only exists when this code runs as a script or module; inside a
# notebook kernel it is undefined, so fall back to the working directory.
cur_dir = (
    os.path.dirname(os.path.abspath(__file__))
    if "__file__" in globals()
    else os.getcwd()
)

#import adam_grid  # python file with default hyperparameters
# Set up your default hyperparameters
#hyperparameters = adam_grid

# Pass them to wandb.init
#wandb.init(project = 'FOxIE', entity = 'sayankotor')
# Access all hyperparameter values through wandb.config
#config = wandb.config

## Parse the parameters:

In [6]:
def _str2bool(value):
    """Convert a command-line token to bool.

    ``type=bool`` is a trap with argparse: ``bool("False")`` is ``True``
    because any non-empty string is truthy. This parser accepts the usual
    spellings and rejects everything else.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    if token in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % (value,))


# Command-line interface of the experiment.
# NOTE(review): inside a notebook kernel `parse_args()` reads the kernel's own
# argv; the `Parser` class defined in a later cell is the in-notebook stand-in.
parser = argparse.ArgumentParser()
# `required=True` made the default unreachable; the default now applies when
# the flag is omitted.
parser.add_argument("--n_epoch", type=int, default=200)
parser.add_argument("--lr", type=float, required=True) # depends on choice of data pack
parser.add_argument("--path_data", type=str, default="/notebook/Relations_Learning/Link_Prediction_Data/FB15K237/")
#parser.add_argument("--path_filters", type=str, default="/notebook/Relations_Learning/")
parser.add_argument("--batch_size", type=int, default=32)
parser.add_argument("--opt_type", type=str, default='adam')

parser.add_argument('--dim', type = int, default = 200)
parser.add_argument('--l2', type = float, default = 0.0)
parser.add_argument('--scheduler_step', type=int, default=2, help="Scheduler step size")
parser.add_argument("--scheduler_gamma", type=float, default = 0.5, help="scheduler_gamma")
parser.add_argument('--momentum', type=float, default=0.9, help='momentum in Sgd')
parser.add_argument('--nesterov', type=_str2bool, default=False, help='nesterov momentum in Sgd')
# Both options below are read later in the notebook (`args.weight_decay`,
# `args.how_many`) but were never declared; the defaults mirror `Parser`.
parser.add_argument('--weight_decay', type=float, default=0.1, help='weight decay in Sgd')
parser.add_argument('--how_many', type=int, default=2, help='value passed on as how_many')
parser.add_argument(
    '--out_file', type=str,
    default='/notebook/Link-prediction/Warp_grid_search/output/result.txt',
    help='path to output_file',
)

args = parser.parse_args()

_StoreAction(option_strings=['--out_file'], dest='out_file', nargs=None, const=None, default='/notebook/Link-prediction/Warp_grid_search/output/result.txx', type=<class 'str'>, choices=None, help='path tot output_file', metavar=None)

In [61]:
class Parser(object):
    """Plain attribute holder that mimics the parsed command-line arguments.

    Every constructor argument is stored unchanged as an instance attribute of
    the same name, so the object can be used wherever ``args.<option>`` is
    read. ``n_epoch`` and ``lr`` are mandatory; the rest have defaults.
    """

    def __init__(self, n_epoch, lr,
                 path_data="/notebook/Relations_Learning/Link_Prediction_Data/FB15K237/",
                 batch_size=32, opt_type='adam', dim=200, l2=0.0, scheduler_step=2,
                 scheduler_gamma=0.5, momentum=0.9, weight_decay=0.1, nesterov=False, how_many=2,
                 out_file='/notebook/Link-prediction/Warp_grid_search/output/result.txt'):
        # (attribute name, value) pairs, listed in the order they are set.
        settings = (
            ("n_epoch", n_epoch),
            ("lr", lr),
            ("path_data", path_data),
            ("batch_size", batch_size),
            ("opt_type", opt_type),
            ("dim", dim),
            ("l2", l2),
            ("scheduler_step", scheduler_step),
            ("scheduler_gamma", scheduler_gamma),
            ("momentum", momentum),
            ("weight_decay", weight_decay),
            ("nesterov", nesterov),
            ("how_many", how_many),
            ("out_file", out_file),
        )
        for option_name, option_value in settings:
            setattr(self, option_name, option_value)

In [63]:
# In-notebook replacement for the command-line arguments: the two mandatory
# values are passed positionally, every other option is spelled out by name.
args = Parser(
    50,      # n_epoch
    0.0001,  # lr
    path_data="/notebook/Relations_Learning/Link_Prediction_Data/FB15K237/",
    out_file='/notebook/Link-prediction/Warp_grid_search/output/result.txt',
    opt_type='adam',
    batch_size=32,
    dim=200,
    how_many=2,
    l2=0.0,
    weight_decay=0.01,
    momentum=0.9,
    nesterov=False,
    scheduler_step=2,
    scheduler_gamma=0.5,
)

## Load the data:

In [43]:
# Display the kernel's current working directory.
os.getcwd()

'/notebook/Link-prediction/Warp_grid_search'

In [44]:
# Use the actual working directory instead of a user-specific absolute path,
# so the notebook runs on any machine.
cur_dir = os.getcwd()

# Text results go to <cwd>/output/result_<timestamp>.txt; create the folder up
# front so that opening the file for writing cannot fail on a fresh checkout.
output_dir = os.path.join(cur_dir, "output")
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "result_" + str(datetime.datetime.now()) + ".txt") # files for text output
#output_folder = args.out_folder # folder where best factors are stored
#output_folder = args.out_folder # folder where best factors are stored

In [45]:
# Open the text log for this run in write mode. The handle is kept open across
# cells on purpose; the training cells write to it and close it when they finish.
file_out = open(output_file, "w")

In [46]:
path_data = args.path_data


def _load_pickle(name):
    """Load the pickled object stored as ``name`` inside ``path_data``.

    The file is opened in a ``with`` block so the handle is closed right after
    loading (the previous ``pickle.load(open(...))`` form never closed it).
    ``os.path.join`` also makes a trailing slash on ``path_data`` optional.
    """
    # NOTE(review): unpickling can execute arbitrary code - only point
    # `path_data` at files from a trusted source.
    with open(os.path.join(path_data, name), 'rb') as handle:
        return pickle.load(handle)


entity_list = _load_pickle('entity_list')
relation_list = _load_pickle('relation_list')

train_triples = _load_pickle('train_triples')
valid_triples = _load_pickle('valid_triples')
test_triples = _load_pickle('test_triples')
train_valid_triples = _load_pickle('train_valid_triples')

entity_map = _load_pickle('entity_map')
relation_map = _load_pickle('relation_map')

all_triples = train_valid_triples + test_triples

# Filters built by the project helper: one from the training triples only and
# one from every known triple.
sample_positive = ef.create_filter(train_triples)
ft = ef.create_filter(all_triples)

In [58]:
file_out.write("loaded1_\n")

# Prefer the second GPU (index 1) as before, but only when it really exists;
# otherwise use the first GPU, or the CPU when CUDA is unavailable. A
# hard-coded "cuda:1" fails on single-GPU hosts as soon as a tensor is moved.
if not torch.cuda.is_available():
    device = torch.device("cpu")
elif torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
else:
    device = torch.device("cuda:0")

# Shortcuts for the hyperparameters used by the cells below.
num_epoch = args.n_epoch
rank = args.dim 
lr = args.lr
batch_size = args.batch_size 
how_many = args.how_many
l2 = args.l2

seed = 13
# Apply the seed before any model is created or any data is shuffled, so that
# runs are repeatable.
np.random.seed(seed)
torch.manual_seed(seed)

In [56]:
# Sparse (COO-style) description of the training data: one coordinate row per
# training triple, each with value 1.
nnz = len(train_triples)
vals = np.ones(nnz, dtype=np.int64)
coo_tensor = np.array(train_triples, dtype=np.int64)

# Tensor dimensions: (number of entities, number of relations, number of entities).
n_entities_total = len(entity_list)
shape = (n_entities_total, len(relation_list), n_entities_total)

print(shape, flush = True)

(14541, 237, 14541)


In [52]:
import GCP_WARP
import WARPLoss

In [59]:
# Model dimensions: entity and relation counts come from the tensor shape,
# the number of latent factors is the requested rank.
n_entity, n_relation = shape[0], shape[1]
n_factors = rank

# Build the model and move it to the selected device.
model = GCP_WARP.GCP_WARP(n_entity, n_relation, n_factors, device=device)
model = model.to(device)
print(model)

GCP_WARP()


def train_batch_warp(train_data, data_shape, sample_positive, model,
                     loss_fn, optimizer, un_pair, bs=64,
                     show=True, device='cpu'):
    """Run one shuffled pass of mini-batch training over the rows of ``un_pair``.

    For every batch of pairs, a 0/1 target matrix with ``data_shape[2]``
    columns is built from ``sample_positive``. The model is then scored for
    every (pair, column) combination and one optimizer step is taken on
    ``loss_fn(prediction, target)``.

    Args:
        train_data: object with a ``shape``; only ``shape[0]`` is used, to
            derive the number of batches (``shape[0] // bs``).
        data_shape: indexable; ``data_shape[2]`` is the number of columns scored.
        sample_positive: mapping from ``tuple(pair)`` to the column indices
            whose target is set to 1.
        model: callable taking three index tensors (first pair column, second
            pair column, candidate column) and returning one score per triple.
        loss_fn: callable ``loss_fn(prediction, target)`` returning a scalar
            tensor that supports ``backward()``.
        optimizer: optimizer providing ``zero_grad()`` and ``step()``.
        un_pair: 2-D NumPy integer array; columns 0 and 1 are fed to the model.
        bs: divisor used to derive the number of batches.
        show: print the loss on every 10th batch.
        device: device on which the target matrix is allocated.

    Returns:
        None.
    """
    n_pairs = len(un_pair)
    order = np.arange(n_pairs)
    np.random.shuffle(order)

    # NOTE(review): as before, the batch count is derived from `train_data`
    # rather than from `un_pair`, so batches usually hold fewer than `bs`
    # pairs - confirm this is intended. `max(1, ...)` avoids asking
    # `array_split` for zero sections when there are fewer rows than `bs`.
    n_batches = max(1, train_data.shape[0] // bs)
    batches = np.array_split(order, n_batches)

    cols = torch.arange(data_shape[2])
    n_cols = cols.shape[0]

    for i, batch in enumerate(batches):
        if batch.size == 0:
            # array_split yields empty chunks when there are more sections
            # than pairs; there is nothing to train on.
            continue

        # Index once with the NumPy array: cheaper than re-indexing for every
        # use, and the result stays 2-D even for a one-element batch.
        pairs = un_pair[batch]
        n_rows = pairs.shape[0]

        # Target matrix: 1.0 at every known positive column, 0.0 elsewhere.
        rating = torch.zeros((n_rows, data_shape[2])).to(device)
        for j in range(n_rows):
            rating[j][sample_positive[tuple(pairs[j].tolist())]] = 1.0

        # Score every pair against every column, then fold the flat result
        # back into a (pairs, columns) matrix.
        prediction = model(
            torch.repeat_interleave(torch.tensor(pairs[:, 0]), n_cols),
            torch.repeat_interleave(torch.tensor(pairs[:, 1]), n_cols),
            torch.tile(cols, (n_rows, )),
        ).view(n_rows, n_cols).to(torch.float64)

        loss = loss_fn(prediction, rating)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if show and (i % 10 == 0):
            # The total used to come from an undefined global `coords`
            # (NameError on the very first batch); progress is counted in
            # pairs, so the total is the number of pairs.
            current = i * n_rows
            print(f"loss: {loss.item():>7f}  [{current:>5d}/{n_pairs:>5d}]")

In [None]:
# Select the optimizer together with the early-stopping settings that belong
# to it. The name is matched case-insensitively; the historical misspelling
# 'sdg' is still accepted next to the correct 'sgd'.
opt_name = str(args.opt_type).lower()

if opt_name == 'adam':
    score_margin_ = 0.01
    score_attempts_ = 15
    optimizer = optim.Adam(model.parameters(), lr=args.lr) #lr=5e-4

elif opt_name in ('sgd', 'sdg'):
    score_margin_ = 0.0001
    score_attempts_ = 25
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr, momentum=args.momentum,
                          # getattr: the argparse namespace may lack these options
                          weight_decay=getattr(args, 'weight_decay', 0.0),
                          nesterov=getattr(args, 'nesterov', False))

elif opt_name == 'adamw':
    score_margin_ = 0.01
    score_attempts_ = 15
    optimizer = optim.AdamW(model.parameters(), lr = args.lr)

else:
    # Without this branch `optimizer` stayed undefined and the scheduler line
    # below failed with an unrelated NameError.
    raise ValueError(
        "Unsupported opt_type %r; expected 'adam', 'sgd' or 'adamw'" % (args.opt_type,)
    )

scheduler = StepLR(optimizer, step_size=args.scheduler_step, gamma=args.scheduler_gamma)

# `import WARPLoss` binds the module, and a module object is not callable.
# NOTE(review): presumably the loss class is `WARPLoss.WARPLoss` (same layout
# as `GCP_WARP.GCP_WARP`) - confirm. If the name is already bound to the class
# itself, the getattr fallback uses it unchanged.
warp_loss_cls = getattr(WARPLoss, 'WARPLoss', WARPLoss)
loss_fn = warp_loss_cls(20)
un_pair = np.array(list(sample_positive.keys()))
hr_k = (1, 3, 10)

In [None]:
# Train `model` for up to `num_epoch` epochs, evaluate after each one, log the
# metrics and stop early when hit@10 stops improving.
# NOTE(review): `data_s` and `trainer` are only created in a later cell of this
# notebook - that setup has to be executed before this cell can run.
show_iter = True

# An earlier training run closes the log file when it finishes; reopen it in
# append mode so previously written results are kept.
if file_out.closed:
    file_out = open(output_file, "a")

# Start this run from a clean state: no best score yet, and no failures left
# over in the function-level early-stopping counter.
best_hit_10 = 0.0
best_tuple = (0.0, 0.0, 0.0, 0.0)
check_early_stop_score.fail_count_score = 0

epoch = -1  # stays -1 when num_epoch == 0, keeping the summary line valid
start = timer()
try:
    for epoch in range(num_epoch):
        try:
            run_epoch(data_s, epoch, device, model, optimizer, scheduler, batch_size, trainer,
                      show_iter=show_iter, fout=file_out)
        except StopIteration:  # early stopping condition met inside the epoch
            print("early_stoping loss", flush=True)
            break

        hit3, hit5, hit10, mrr = model.evaluate(data_s)

        metrics = {
            "hit3": hit3,
            "hit5": hit5,
            "hit10": hit10,
            "mrr": mrr,
            # mean of the last (up to) five recorded errors; `[-5:]` is also
            # correct while fewer than five values exist
            "loss": np.mean(trainer.err_arr[-5:]),
        }
        # wandb.log() raises when no run was started (wandb.init is commented
        # out above), so only log into an active run.
        if wandb.run is not None:
            wandb.log(metrics)
        print(hit3, hit5, hit10, mrr, flush=True)

        file_out.write('%s %s %s %s \n' % (hit3, hit5, hit10, mrr))
        file_out.flush()

        # early stopping by hit@10
        try:
            check_early_stop_score(hit10, best_hit_10, margin=score_margin_, max_attempts=score_attempts_)
        except StopIteration:  # early stopping condition met
            file_out.write("\n")
            file_out.write("early_stoping score\n")
            print("early_stoping score", flush=True)
            break

        # if hit@10 grows update checkpoint
        if hit10 > best_hit_10:
            best_hit_10 = hit10
            best_tuple = (hit3, hit5, hit10, mrr)
            #np.save('/notebook/Relations_Learning/gpu/gpu_a.npz', a_torch.cpu().data.numpy())
            #np.save('/notebook/Relations_Learning/gpu/gpu_b.npz', b_torch.cpu().data.numpy())
            #np.save('/notebook/Relations_Learning/gpu/gpu_c.npz', a_torch.cpu().data.numpy())
finally:
    # The summary is written exactly once and the file is closed exactly once,
    # whichever way the loop ended. (The old early-stop branch called write()
    # with two arguments and closed the file before the final writes.)
    end = timer()
    elapsed = end - start  # not named `time`: that would shadow the imported module
    file_out.write("In %s epoch; time %s \n" % (epoch, elapsed))
    file_out.write("Best scores %s %s %s %s \n" % best_tuple)
    file_out.write("\n")
    file_out.flush()
    file_out.close()

In [None]:
# Seed NumPy's global RNG. np.random.seed() returns None, so `random_state`
# is always None; the name is kept only for compatibility.
random_state = np.random.seed(seed)

# specify property of data
init_mind_set = set(indices_to_multi_ind(coo_tensor, shape))
# Buffers with room for `how_many` extra rows per distinct index plus the
# original values.
n_rows_total = how_many * len(init_mind_set) + vals.size
coo_ns = np.empty((n_rows_total, 3), dtype=np.int64)
vals_ns = np.empty((n_rows_total,), dtype=np.float64)

# The call used to reference `coords`, `values`, `data_shape` and
# `valid_filter`, none of which exist in this notebook (NameError). They are
# replaced by the objects built in the cells above.
# NOTE(review): `ft` (the filter built from all triples) is assumed to be the
# intended validation filter - confirm against `data_storage`.
data_s = data_storage(
    sparse_coords=coo_tensor,
    sparse_vals=vals,
    mind_set=init_mind_set,
    shape=shape,
    how_many=how_many,  # was hard-coded to 2, unlike the buffers above
    valid_filters=ft,
    valid_triples=valid_triples,
)

# specify property of training
err_arr = []
error = 0.0
it = 0
previous_best_loss = 100000.0
best_tuple = (0.0, 0.0, 0.0, 0.0)
best_hit_10 = 0.0
# specify training class
trainer = Trainer(best_hit_10, previous_best_loss, err_arr, it)

start = timer()

model = FoxIE(rank=rank, shape=shape, given_loss=bernoulli_logit_loss, given_loss_grad=bernoulli_logit_loss_grad, device=device)
model.init()

# Optimizer and matching early-stopping settings. As before, Adam is the
# fallback for any unrecognised name; the correct spelling 'sgd' is now
# accepted next to the historical 'sdg', and matching is case-insensitive.
opt_name = str(args.opt_type).lower()
factor_params = [model.a_torch, model.b_torch]

if opt_name in ('sgd', 'sdg'):
    score_margin_ = 0.0001
    score_attempts_ = 25
    optimizer = optim.SGD(factor_params, lr = args.lr, momentum = args.momentum)

elif opt_name == 'adamw':
    score_margin_ = 0.01
    score_attempts_ = 15
    optimizer = optim.AdamW(factor_params, lr = args.lr)

else:
    score_margin_ = 0.01
    score_attempts_ = 15
    optimizer = optim.Adam(factor_params, lr = args.lr)

scheduler = StepLR(optimizer, step_size=args.scheduler_step, gamma=args.scheduler_gamma)

# Train `model` for up to `num_epoch` epochs, evaluate after each one, log the
# metrics and stop early when hit@10 stops improving.
show_iter = True

# An earlier training run closes the log file when it finishes; reopen it in
# append mode so previously written results are kept.
if file_out.closed:
    file_out = open(output_file, "a")

# The early-stopping counter is stored on the function and would otherwise
# carry failures over from a previous run.
check_early_stop_score.fail_count_score = 0

epoch = -1  # stays -1 when num_epoch == 0, keeping the summary line valid
start = timer()
try:
    for epoch in range(num_epoch):
        try:
            run_epoch(data_s, epoch, device, model, optimizer, scheduler, batch_size, trainer,
                      show_iter=show_iter, fout=file_out)
        except StopIteration:  # early stopping condition met inside the epoch
            print("early_stoping loss", flush=True)
            break

        hit3, hit5, hit10, mrr = model.evaluate(data_s)

        metrics = {
            "hit3": hit3,
            "hit5": hit5,
            "hit10": hit10,
            "mrr": mrr,
            # mean of the last (up to) five recorded errors; `[-5:]` is also
            # correct while fewer than five values exist
            "loss": np.mean(trainer.err_arr[-5:]),
        }
        # wandb.log() raises when no run was started (wandb.init is commented
        # out above), so only log into an active run.
        if wandb.run is not None:
            wandb.log(metrics)
        print(hit3, hit5, hit10, mrr, flush=True)

        file_out.write('%s %s %s %s \n' % (hit3, hit5, hit10, mrr))
        file_out.flush()

        # early stopping by hit@10
        try:
            check_early_stop_score(hit10, best_hit_10, margin=score_margin_, max_attempts=score_attempts_)
        except StopIteration:  # early stopping condition met
            file_out.write("\n")
            file_out.write("early_stoping score\n")
            print("early_stoping score", flush=True)
            break

        # if hit@10 grows update checkpoint
        if hit10 > best_hit_10:
            best_hit_10 = hit10
            best_tuple = (hit3, hit5, hit10, mrr)
            #np.save('/notebook/Relations_Learning/gpu/gpu_a.npz', a_torch.cpu().data.numpy())
            #np.save('/notebook/Relations_Learning/gpu/gpu_b.npz', b_torch.cpu().data.numpy())
            #np.save('/notebook/Relations_Learning/gpu/gpu_c.npz', a_torch.cpu().data.numpy())
finally:
    # The summary is written exactly once and the file is closed exactly once,
    # whichever way the loop ended. (The old early-stop branch called write()
    # with two arguments and closed the file before the final writes.)
    end = timer()
    elapsed = end - start  # not named `time`: that would shadow the imported module
    file_out.write("In %s epoch; time %s \n" % (epoch, elapsed))
    file_out.write("Best scores %s %s %s %s \n" % best_tuple)
    file_out.write("\n")
    file_out.flush()
    file_out.close()
                  