In [1]:
# Standard library
import gc
import os
import pickle
import random
import re
import time
from datetime import date
from typing import List, Tuple

# Third-party
import numpy as np
import torch
from nodevectors import Node2Vec
from scipy import sparse
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Project helpers. These are star imports, so their position matters: the explicit
# imports that follow them must stay below so those names keep taking precedence
# over anything the helper modules happen to export.
# from utils_ml import * 
from utils_word import *
from utils_graph import * 

from timeit import default_timer as timer
from nodevectors import ProNE
import joblib
import traceback
from sklearn.preprocessing import normalize

#m7 from paper 

# baseline node2vec

In [2]:
# Open the concept array read-only. np.memmap defaults to mode="r+", which maps the
# file writable, so an accidental in-place assignment would modify the saved file on
# disk. The two np.load calls below already use read-only memory maps.
concept_filtered_arr = np.memmap("saved_files/embedding_concept_arr.dat", mode="r", shape=(10225,), dtype="<U55")
ngram_abstracts = np.load("saved_files/ngram_abstracts.npy", mmap_mode="r")
saved_year_arr = np.load("saved_files/year_arr.npy", mmap_mode="r")

# One graph vertex per tracked concept; used as the adjacency-matrix dimension below.
NUM_OF_VERTICES = len(concept_filtered_arr)

print("Concepts which were tracked",concept_filtered_arr.shape)
print("Abstracts",ngram_abstracts.shape)
print("Year associated to abstract",saved_year_arr.shape)

Concepts which were tracked (10225,)
Abstracts (66839,)
Year associated to abstract (66839,)


# Create Graph

In [3]:
# Flip to False to rebuild the co-occurrence graph and rewrite the edge pickle.
# With the default (True) this cell does nothing.
load = True
if not load:
    word_co_occurrences = get_word_co_occurrences(
        concept_filtered_arr,
        ngram_abstracts,
        saved_year_arr,
    )
    graph = create_graph_from_dict(word_co_occurrences)
    save_graph_edges(graph, 'saved_files/graph_edges.pkl')

    # Read the freshly written file back as a sanity check.
    with open('saved_files/graph_edges.pkl', 'rb') as f:
        edge_list = pickle.load(f)

    # First edge, last edge and total edge count.
    print(edge_list[0])
    print(edge_list[-1])
    print(len(edge_list))


# Neural Network Train model

In [4]:
def train_model(model_semnet, data_train0, data_train1, data_test0, data_test1, lr_enc, batch_size, data_source,
                solution_train_smaller, solution_test, data_testr):
    """
    Train the link-prediction network on class-balanced random mini-batches.

    :param model_semnet: torch module mapping a batch of feature rows to one score per row.
    :param data_train0: training feature rows whose label is 0 (pair stays unconnected).
    :param data_train1: training feature rows whose label is 1 (pair becomes connected).
    :param data_test0: held-out feature rows with label 0.
    :param data_test1: held-out feature rows with label 1.
    :param lr_enc: learning rate for the Adam optimizer.
    :param batch_size: number of rows sampled (with replacement) from each class per step.
    :param data_source: unused here; kept for interface compatibility.
    :param solution_train_smaller: unused here; kept for interface compatibility.
    :param solution_test: labels aligned with ``data_testr``, passed to ``calculate_ROC``.
    :param data_testr: all held-out feature rows (both classes), used for the periodic AUC check.
    :return: always ``True``; the model is trained in place.

    NOTE(review): ``plt`` is not imported in the visible cells; presumably it is supplied
    by one of the star imports - confirm.
    """

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # device = torch.device("cpu")
    size_of_loss_check = 2000  # rows per dataset used for the per-iteration loss check

    optimizer_predictor = torch.optim.Adam(model_semnet.parameters(), lr=lr_enc)

    data_train0 = torch.tensor(data_train0, dtype=torch.float).to(device)
    data_test0 = torch.tensor(data_test0, dtype=torch.float).to(device)

    data_train1 = torch.tensor(data_train1, dtype=torch.float).to(device)
    data_test1 = torch.tensor(data_test1, dtype=torch.float).to(device)

    # The full held-out set does not change during training, so convert it once
    # instead of on every AUC evaluation.
    data_test = torch.tensor(data_testr, dtype=torch.float).to(device)

    test_loss_total = []
    moving_avg = []
    criterion = torch.nn.MSELoss()
    aucs = []

    # There are many more vertex pairs that won't be connected (0) than ones
    # that will be connected (1). However, training with an equally weighted
    # training set (same number of examples for (0) and (1)) was observed to be more stable.
    # (Imagine we have 1,000,000 non-connected and 10,000 connected pairs.)
    #
    # For that reason there are no true 'epochs' (where each example from the training set
    # has been used in the training). Rather, in each iteration we sample batch_size
    # random training examples from data_train0 and from data_train1.

    for iteration in range(500000):  # should be much larger, with good early stopping criteria
        model_semnet.train()
        total_loss = 0
        for label, data_set in enumerate((data_train0, data_train1)):
            idx = torch.randint(0, len(data_set), (batch_size,))
            calc_properties = model_semnet(data_set[idx])
            # Build the constant target with the same shape/dtype/device as the model
            # output so MSELoss never has to broadcast a 1-D target against the output.
            curr_pred = torch.full_like(calc_properties, float(label))
            real_loss = criterion(calc_properties, curr_pred)
            total_loss += torch.clamp(real_loss, min=0., max=50000.).double()

        optimizer_predictor.zero_grad()
        total_loss.backward()
        optimizer_predictor.step()

        # Evaluating the current quality.
        with torch.no_grad():
            model_semnet.eval()
            # Order matters: even positions hold label-0 data, odd positions label-1 data.
            eval_datasets = [data_train0, data_train1, data_test0, data_test1]
            all_real_loss = []

            for idx_dataset, data_set in enumerate(eval_datasets):
                calc_properties = model_semnet(data_set[0:size_of_loss_check])
                curr_pred = torch.full_like(calc_properties, float(idx_dataset % 2))
                real_loss = criterion(calc_properties, curr_pred)
                all_real_loss.append(real_loss.detach().cpu().numpy())

            train_loss = np.mean(all_real_loss[0]) + np.mean(all_real_loss[1])
            test_loss = np.mean(all_real_loss[2]) + np.mean(all_real_loss[3])
            test_loss_total.append(test_loss)

            if iteration % 2500 == 0:
                info_str = 'iteration: ' + str(iteration) + ' - train loss: ' + str(
                    train_loss) + '; test loss: ' + str(test_loss)
                print('    train_model: ' + str(info_str))

            if iteration % 2500 == 0 and iteration > 1:
                # Occasional AUC evaluation on the complete held-out set
                # (the model is already in eval mode here).
                all_predictions_test = flatten(model_semnet(data_test).detach().cpu().numpy())
                sorted_predictions_test = np.flip(np.argsort(all_predictions_test, axis=0))
                AUC_test = calculate_ROC(sorted_predictions_test, solution_test)

                aucs.append(AUC_test)
                print("AUCS so far: " + str(aucs))

                model_semnet.train()

                # Stop once the AUC has dropped on two consecutive evaluations.
                if len(aucs) > 3:
                    if aucs[-1] < aucs[-2] < aucs[-3]:
                        print('    Early stopping kicked in')
                        break

            if len(test_loss_total) > 200:  # early stopping
                # Windowed sum (not a mean) of the last 100 test losses.
                test_loss_moving_avg = sum(test_loss_total[-100:])
                moving_avg.append(test_loss_moving_avg)
                # The comparison looks 25 entries back, so wait until that many exist;
                # checking earlier raised IndexError on moving_avg[-25].
                if len(moving_avg) >= 25:
                    if moving_avg[-1] > moving_avg[-5] and moving_avg[-1] > moving_avg[-25]:
                        print('    Early stopping kicked in')
                        break

    plt.plot(test_loss_total)
    plt.show()

    plt.plot(test_loss_total[500:])
    plt.show()

    plt.plot(moving_avg)
    plt.show()

    return True


def timp(start, end):
    """Return the elapsed time ``end - start`` rounded to two decimals, as a string."""
    elapsed = end - start
    return f"{round(elapsed, 2)}"


def create_model(graph, save_location, dimensions=128, walk_length=80, num_walks=20, p=1, q=1,
                          window=100,
                          negative=5, epochs=5, batch_words=1000):
    """
    Fit a Node2Vec embedder on ``graph`` and return the fitted embedder.

    The embedder is constructed with ``Node2Vec()``, i.e. with the library's default
    settings. ``save_location`` and every hyper-parameter argument (``dimensions``,
    ``walk_length``, ``num_walks``, ``p``, ``q``, ``window``, ``negative``, ``epochs``,
    ``batch_words``) are accepted for interface compatibility but are not used.

    :param graph: graph object handed to ``Node2Vec.fit``.
    :return: the fitted ``Node2Vec`` instance.
    """
    # Time the (cheap) construction step separately from the fit.
    construct_started = timer()
    embedder = Node2Vec()
    construct_finished = timer()
    print("Done! Operation took " + timp(construct_started, construct_finished) + " seconds.")

    # Embed nodes
    print("Embedding nodes")
    fit_started = timer()
    embedder.fit(graph)
    fit_finished = timer()
    print("Done! Operation took " + timp(fit_started, fit_finished) + " seconds.")

    return embedder


def get_save_location():
    """Return the fixed file name under which the fitted embedders are cached."""
    name_parts = ["saved_embeddings", str(128), str(3), str(1), str(1), "great.pkl"]
    return "_".join(name_parts)


def compute_all_properties_of_list_node2vec(all_sparse, vlist, suffix, dimensions=128, walk_length=80,
                                            num_walks=20,
                                            p=1, q=1, window=100,
                                            negative=5, epochs=5, batch_words=1000):
    """
    Build one feature row per vertex pair from three node-embedding models.

    The three embedders are loaded from the cache file named by ``get_save_location()``
    if it exists and can be read; otherwise one embedder is fitted per graph in
    ``all_sparse[0..2]`` via ``create_model`` and the triple is written to the cache.

    For every pair ``(v1, v2)`` in ``vlist`` the row is the concatenation
    ``[emb0(v1), emb0(v2), emb1(v1), emb1(v2), emb2(v1), emb2(v2)]``; each row is then
    L1-normalised.

    NOTE(review): the cache file name does not depend on ``all_sparse``. Once a cache
    file exists, every later call - including calls with different graphs and later
    notebook runs - reuses the embedders stored in it. Confirm this is intended.

    :param all_sparse: sequence of at least three graphs; only indices 0, 1, 2 are used.
    :param vlist: sequence of vertex pairs; ``vlist[i][0]`` and ``vlist[i][1]`` are looked
        up with the embedders' ``predict``.
    :param suffix: only printed when embeddings are generated.
    :param dimensions, walk_length, num_walks, p, q, window, negative, epochs, batch_words:
        forwarded unchanged to ``create_model``.
    :return: 2-D numpy array with one L1-normalised feature row per pair.
    """
    # 1. attempt to find saved embeddings, if they exist.
    save_location = get_save_location()

    generate_embeddings = False
    edges_embs0, edges_embs1, edges_embs2 = None, None, None

    start = timer()
    print("Searching for saved embeddings...")
    if os.path.isfile(save_location):
        print("Found saved embeddings! Trying to load them.")

        try:
            edges_embs0, edges_embs1, edges_embs2 = joblib.load(save_location)
        except (AttributeError, EOFError, ImportError, IndexError):
            print("ERROR: Got an unpicking error.")
            # format_exc() takes no exception argument (its first parameter is `limit`);
            # it formats the exception currently being handled.
            print(traceback.format_exc())
            print("Re-generating embeddings to be safe.")
            generate_embeddings = True
        except Exception:
            print("ERROR: Got an unknown unpicking error.")
            print(traceback.format_exc())
            print("Re-generating embeddings to be safe.")
            generate_embeddings = True
    else:
        print("Could not find saved embeddings.")
        generate_embeddings = True

    end = timer()
    print("Done deciding how to load embeddings, took " + timp(start, end) + " seconds.")

    # Actual embedding generation
    if generate_embeddings:
        start = timer()
        print("Creating embeddings from scratch! This might take a while...",
                     )
        print("Suffix: " + str(suffix))
        model_kwargs = dict(dimensions=dimensions, walk_length=walk_length, num_walks=num_walks, p=p,
                            q=q,
                            window=window,
                            negative=negative, epochs=epochs, batch_words=batch_words)
        # One embedder per graph snapshot, fitted in index order 0, 1, 2.
        edges_embs0, edges_embs1, edges_embs2 = [
            create_model(all_sparse[graph_idx], '', **model_kwargs) for graph_idx in range(3)
        ]
        end = timer()
        print("Done creating embeddings!!! Took " + timp(start, end) + " seconds.")

        start = timer()
        print("Saving embeddings for future use.")
        joblib.dump([edges_embs0, edges_embs1, edges_embs2], save_location)
        end = timer()
        print("Done saving embeddings; took " + timp(start, end) + " seconds.")

    all_properties = []
    print('Computed all node2vec embeddings!')

    print("Getting embeddings for each edge...")
    start = timer()

    embedders = (edges_embs0, edges_embs1, edges_embs2)
    for ii in range(len(vlist)):
        first_vertex, second_vertex = vlist[ii][0], vlist[ii][1]

        # Per embedder: vector of the first vertex followed by that of the second.
        pieces = []
        for embedder in embedders:
            pieces.append(embedder.predict(first_vertex))
            pieces.append(embedder.predict(second_vertex))
        all_properties.append(np.concatenate(pieces))

        if ii % 10 == 0:
            end = timer()
            print('compute_all_properties_of_list progress: ', ii, '/', len(vlist), '    time elapsed: ',
                         timp(start, end),
                         ' seconds')

    end = timer()
    print("Done. Took " + timp(start, end) + " seconds.")

    np_props = np.array(all_properties)

    # Scale every row so that the absolute values of its entries sum to 1.
    np_normed = normalize(np_props, norm='l1', axis=1)
    return np_normed


def flatten(t):
    """Flatten one nesting level: return a list of every item of every element of ``t``."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

# Embedding hyper-parameters. link_prediction_embednet passes these to
# compute_all_properties_of_list_node2vec, which forwards them to create_model.
# NOTE(review): create_model builds Node2Vec() without arguments, so these values
# do not currently reach the embedder - confirm whether that is intended.
DIMENSIONS = 128
WALK_LENGTH = 80
NUM_WALKS = 20

P = 1.0
Q = 1.0
WINDOW = 100
NEGATIVE = 5
EPOCHS = 5
BATCH_WORDS = 1000
# SIZE and OSIZE are not referenced anywhere in the visible cells.
SIZE = 1000
OSIZE = 30

# results_log_file is passed as `file=` to print(); None makes print() write to
# sys.stdout. log_file is not referenced in the visible cells.
log_file = None
results_log_file = None


def _adjacency_before_year_end(edge_array, year, day_origin):
    """Return the CSR adjacency matrix of the edges time-stamped before 31 Dec of ``year``."""
    cutoff_days = (date(year, 12, 31) - day_origin).days
    edges_curr = edge_array[edge_array[:, 2] < cutoff_days]
    return sparse.csr_matrix(
        (np.ones(len(edges_curr)), (edges_curr[:, 0], edges_curr[:, 1])),
        shape=(NUM_OF_VERTICES, NUM_OF_VERTICES))


def link_prediction_embednet(full_dynamic_graph_sparse, unconnected_vertex_pairs, year_start, years_delta,
                             vertex_degree_cutoff, min_edges, hyper_parameters, data_source):
    """
    Rank unconnected vertex pairs by how likely they are to be connected at a later time.

    :param full_dynamic_graph_sparse: numpy array of shape (n, 3) with rows
            [vertex v1, vertex v2, time stamp t] (t in days after 1990-01-01).
    :param unconnected_vertex_pairs: numpy array of vertex pairs [v1, v2] to rank.
    :param year_start: integer year of the last snapshot used for prediction.
    :param years_delta: integer number of years between a snapshot and its target year.
    :param vertex_degree_cutoff: minimal vertex degree, forwarded to
            ``create_training_data_biased``.
    :param min_edges: forwarded to ``create_training_data_biased``.
    :param hyper_parameters: sequence
            ``[edges_used, percent_positive_examples, batch_size, lr_enc, rnd_seed]``.
            ``percent_positive_examples`` is the probability with which each label-0
            training pair is kept (label-1 pairs are always kept).
    :param data_source: forwarded to ``create_training_data_biased`` and to the
            feature computation.
    :return: numpy array of indices into ``unconnected_vertex_pairs``, sorted from the
            highest to the lowest predicted score.

    Workflow:
    1) Learn from the period ``year_start - years_delta`` -> ``year_start``.
          1.1) ``create_training_data_biased`` yields the training graph, candidate
               pairs and their 0/1 solutions. Adjacency matrices are built for the
               training year and the two years before it.
          1.2) Pairs are shuffled and split 90% / 10% into a training and a held-out
               part; label-0 training pairs are sub-sampled. Features for each pair are
               computed by ``compute_all_properties_of_list_node2vec``.
          1.3) An ``MLP`` is trained with ``train_model``.
          1.4) The AUC on the training and held-out part is computed with
               ``calculate_ROC``.
    2) Build adjacency matrices for ``year_start`` and the two years before it, compute
       features for ``unconnected_vertex_pairs`` and score them with the trained network.
    3) Return the index order from highest to lowest score.

    NOTE(review): ``compute_all_properties_of_list_node2vec`` caches its embedders under
    a fixed file name, so the step-2 call may reuse embedders fitted on the training
    snapshots instead of fitting new ones on the evaluation snapshots - verify.
    """

    edges_used, percent_positive_examples, batch_size, lr_enc, rnd_seed = hyper_parameters

    random.seed(rnd_seed)
    torch.manual_seed(rnd_seed)
    np.random.seed(rnd_seed)

    train_year = year_start - years_delta

    print(
        '1) Learns to predict using training data from ' + str(train_year) + ' -> ' + str(year_start),
        )

    print('1.1) Create training data for ' + str(train_year))

    day_origin = date(1990, 1, 1)

    train_dynamic_graph_sparse, train_edges_for_checking, train_edges_solution = create_training_data_biased(
        full_dynamic_graph_sparse, train_year, years_delta, min_edges=min_edges,
        edges_used=edges_used,
        vertex_degree_cutoff=vertex_degree_cutoff, data_source=data_source,
        NUM_OF_VERTICES=NUM_OF_VERTICES)

    # Three yearly snapshots: the training year and the two years before it.
    train_sparse = []
    for yy in [train_year, train_year - 1, train_year - 2]:
        print('    Create Graph for ', yy)
        train_sparse.append(_adjacency_before_year_end(train_dynamic_graph_sparse, yy, day_origin))

    print('    Shuffle training data...')
    train_fraction = 0.9  # remaining 10% is held out for testing
    # Shuffle with the seeded `random` module so runs are reproducible.
    shuffled_idx = list(range(len(train_edges_for_checking)))
    random.shuffle(shuffled_idx)
    train_edges_for_checking = train_edges_for_checking[shuffled_idx]
    train_edges_solution = train_edges_solution[shuffled_idx]

    print('    Split dataset...')
    idx_traintest = int(len(train_edges_for_checking) * train_fraction)

    data_edges_train = train_edges_for_checking[0:idx_traintest]
    solution_train = train_edges_solution[0:idx_traintest]

    data_edges_test = train_edges_for_checking[idx_traintest:]
    solution_test = train_edges_solution[idx_traintest:]

    print('1.2) Compute 15 network properties for training data y=' + str(train_year))

    print('    Prepare data for equally distributed training...')
    print('    This is an important design choice for training the NN.')
    print('    Note that the evaluation set (also for the competition) is NOT equally distributed!',
                 )

    # Keep every label-1 pair; keep a label-0 pair only with probability
    # percent_positive_examples. random.random() is drawn only for label-0 pairs.
    data_edges_train_smaller = []
    solution_train_smaller = []
    for edge, label in zip(data_edges_train, solution_train):
        if label == 1 or (label == 0 and random.random() < percent_positive_examples):
            data_edges_train_smaller.append(edge)
            solution_train_smaller.append(label)

    embedding_kwargs = dict(dimensions=DIMENSIONS, walk_length=WALK_LENGTH,
                            num_walks=NUM_WALKS,
                            p=P, q=Q, window=WINDOW,
                            negative=NEGATIVE, epochs=EPOCHS, batch_words=BATCH_WORDS)

    print('Computing properties for Training data')
    data_train = compute_all_properties_of_list_node2vec(train_sparse, data_edges_train_smaller, data_source,
                                                         **embedding_kwargs)

    # Split the feature rows by label (label 1 vs. everything else).
    train_is_positive = np.asarray(solution_train_smaller).reshape(-1) == 1
    data_train1 = data_train[train_is_positive]
    data_train0 = data_train[~train_is_positive]

    print('Computing properties for Test data')
    data_test = compute_all_properties_of_list_node2vec(train_sparse, data_edges_test, data_source,
                                                        **embedding_kwargs)
    test_is_positive = np.asarray(solution_test).reshape(-1) == 1
    data_test1 = data_test[test_is_positive]
    data_test0 = data_test[~test_is_positive]

    print('1.3) Train Neural Network')
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Size the network from the feature matrix instead of hard-coding the width.
    model_semnet = MLP(input_size=data_train.shape[1]).to(device)

    model_semnet.train()
    train_model(model_semnet, data_train0, data_train1, data_test0, data_test1, lr_enc, batch_size, data_source,
                solution_train_smaller, solution_test, data_test)

    print('1.4) Computes the AUC for training and test data using calculate_ROC.')
    model_semnet.eval()

    # Inference only: no autograd graph is needed for scoring.
    with torch.no_grad():
        train_tensor = torch.tensor(data_train, dtype=torch.float).to(device)
        all_predictions_train = flatten(model_semnet(train_tensor).cpu().numpy())
    sorted_predictions_train = np.flip(np.argsort(all_predictions_train, axis=0))
    AUC_train = calculate_ROC(sorted_predictions_train, solution_train_smaller)
    print('    AUC_train: ' + str(AUC_train))
    print('AUC_train: ' + str(AUC_train), file=results_log_file)

    with torch.no_grad():
        test_tensor = torch.tensor(data_test, dtype=torch.float).to(device)
        all_predictions_test = flatten(model_semnet(test_tensor).cpu().numpy())
    sorted_predictions_test = np.flip(np.argsort(all_predictions_test, axis=0))
    AUC_test = calculate_ROC(sorted_predictions_test, solution_test)
    print('    AUC_test: ' + str(AUC_test))
    print('AUC_test: ' + str(AUC_test), file=results_log_file)

    # Create properties for evaluation
    print('2) Makes predictions for ' + str(year_start) + ' -> ' + str(year_start + years_delta) + ' data.',
                 )

    print('2.1) Computes the 15 properties for the ' + str(year_start) + ' data.')
    eval_sparse = []
    for yy in [year_start, year_start - 1, year_start - 2]:
        print('    Create Graph for ' + str(yy))
        eval_sparse.append(_adjacency_before_year_end(full_dynamic_graph_sparse, yy, day_origin))

    print('    compute all properties for evaluation')
    eval_examples = compute_all_properties_of_list_node2vec(eval_sparse, unconnected_vertex_pairs, data_source,
                                                            **embedding_kwargs)

    print('2.2) Uses the trained network to predict whether edges are created by ' + str(
        year_start + years_delta) + '.',
                 )
    with torch.no_grad():
        eval_tensor = torch.tensor(eval_examples, dtype=torch.float).to(device)
        all_predictions_eval = flatten(model_semnet(eval_tensor).cpu().numpy())

    print(
        '3) Creates a sorted index list, from highest predicted vertex pair to least predicted one (sorted_predictions)',
        )
    sorted_predictions_eval = np.flip(np.argsort(all_predictions_eval, axis=0))

    print('4) Returns sorted_predictions')
    return sorted_predictions_eval

In [5]:
import argparse
import os
import pickle

# Parameter grid; one saved dataset file is expected per combination.
cutoff_list=[1] # vertex degree is 1, means that the vertex is connected to at least one other vertex
delta_list=[3] # time gap, unconnected pairs in y, the corresponding connection in y+delta
min_edges_list=[1] # every two vertices should have at least 1 edge between them
year_start=2023-3  # replaced by the value stored in the dataset file once it is loaded

for current_delta in delta_list:
    for curr_vertex_degree_cutoff in cutoff_list:
        for current_min_edges in min_edges_list:

            print(os.getcwd(), )
            data_source = "saved_files/SemanticGraph_year_2020_delta_" + str(current_delta) + "_cutoff_" + str(
                curr_vertex_degree_cutoff) + "_minedge_" + str(current_min_edges) + ".pkl"
            print('Data source: ' + data_source + "\n", file=results_log_file)

            # Guard clause: nothing to do for this combination without its dataset.
            if not os.path.isfile(data_source):
                print('File ', data_source, ' does not exist. Proceed to next parameter setting.', )
                continue

            # NOTE: pickle.load can execute arbitrary code; only load files you created.
            with open(data_source, "rb") as pkl_file:
                (full_dynamic_graph_sparse,
                 unconnected_vertex_pairs,
                 unconnected_vertex_pairs_solution,
                 year_start,
                 years_delta,
                 vertex_degree_cutoff,
                 min_edges) = pickle.load(pkl_file)

            full_rnd_seed = [42]

            eu  = 100000     # edges_used
            con = 100000     # number of evaluation pairs taken from the front of the list
            ppe = 1          # percent_positive_examples
            bs = 400         # batch_size
            lr = 0.000003    # lr_enc

            for rnd_seed in full_rnd_seed:
                hyper_paramters = [eu, ppe, bs, lr, rnd_seed]

                all_idx = link_prediction_embednet(
                    full_dynamic_graph_sparse,
                    unconnected_vertex_pairs[:con],
                    year_start,
                    years_delta,
                    vertex_degree_cutoff,
                    min_edges,
                    hyper_paramters,
                    data_source,
                )

                AUC = calculate_ROC(all_idx, np.array(unconnected_vertex_pairs_solution[:con]))
                print('Area Under Curve for Evaluation: ', AUC, '\n\n\n', )

                print("---\n")
                print("AUC=" + str(AUC) + "\n\n")

/home/felix/vscodeProjects/arxiv_nlp
Data source: saved_files/SemanticGraph_year_2020_delta_3_cutoff_1_minedge_1.pkl

1) Learns to predict using training data from 2017 -> 2020
1.1) Create training data for 2017


in create_training_data_biased 
Creating the following data: 
    year_start:  2017
    years_delta:  3
    min_edges:  1
    edges_used:  100000
    vertex_degree_cutoff:  1
    Create Graph for  2017
          num of edges:  394439
    Done: Create Graph for  2017
    Create Graph for  2020
          num of edges:  504671
    Done: Create Graph for  2020

len(all_vertices): 10225
len(vertex_large_degs): 9921

    edge progress (0.036576032638549805sec): 0.01M/0.1M; True: 19; False: 9981
    edge progress (0.03616142272949219sec): 0.02M/0.1M; True: 50; False: 19950
    edge progress (0.03537297248840332sec): 0.03M/0.1M; True: 71; False: 29929
    edge progress (0.035333871841430664sec): 0.04M/0.1M; True: 93; False: 39907
    edge progress (0.0381317138671875sec): 0.05M/0.1M;