In [6]:
import pandas as pd
import os
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
# Import the NetworkX package
import networkx as nx
import torch
print("PyTorch has version {}".format(torch.__version__))
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

PyTorch has version 1.10.2+cu102


In [3]:
train_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset1/train.csv")
test_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset1/test.csv")
upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset1/upload.csv")
content_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset1/content.csv", header=None)

In [4]:
print("training data:", len(train_csv))
print(train_csv.head())
print("testing data:", len(test_csv))
print(test_csv.head())
print("upload data:", len(upload_csv))
print(upload_csv.head())
print("content data:", len(content_csv))

training data: 8686
       id    to  from  label
0  E10311  2399  2339      0
1  E10255  2397  1144      1
2  E10667   854  1726      0
3   E9395   872   702      0
4   E5926  2450  1312      1
testing data: 2172
       id    to  from
0  E10559  2323  2673
1   E4849    81  1634
2   E3964  2405  1765
3    E542  2114   498
4    E331  1013   849
upload data: 2172
       id  prob
0  E10559   0.5
1   E4849   0.5
2   E3964   0.5
3    E542   0.5
4    E331   0.5
content data: 2708


### Create Graph

In [5]:
train_csv[train_csv['to'] ==816]

Unnamed: 0,id,to,from,label
24,E9183,816,1769,0
3942,E5420,816,2461,0


In [6]:
train_csv['node_pair'] = train_csv[['to', 'from']].apply(tuple, axis=1)
train_csv['node_pair'].values[0]

(2399, 2339)

In [7]:
edge_csv = train_csv[train_csv["label"]==1]
print("edge_csv data:", len(edge_csv))

edge_csv data: 4324


In [7]:
class ReadData:
    def __init__(self, dataset):
        self.__edges = list()
        self.__feature = dict()
        
        train_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/{}/train.csv".format(dataset))
        test_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/{}/test.csv".format(dataset))
        content_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/{}/content.csv".format(dataset), header=None)
        edge_csv = train_csv[train_csv["label"]==1]
        train_csv['node_pair'] = train_csv[['to', 'from']].apply(tuple, axis=1)
        test_csv['node_pair'] = test_csv[['to', 'from']].apply(tuple, axis=1)
        
        # get feature
        for line in content_csv.to_numpy():
            l = line[0].split("\t")
            self.__feature[int(l[0])] = list(map(int,l[1:]))
        
        # Create an undirected graph G
        # read node pair
        a_list, b_list, Gtext = [], [], []
        for line in edge_csv.to_numpy():
            a_list.append(line[1])
            b_list.append(line[2])
            Gtext.append((line[1], line[2]))
        self.__edges = [a_list+b_list, b_list+a_list]
        #self.__edges = [a_list, b_list]
        
        
        self.__Graph = nx.Graph(Gtext)
        self.__nodes_list = [self.__feature[node] for node in self.__Graph.nodes]
        
        # get common neighbors
        _, common_neighbors = self.get_common_neighbors(train_csv['node_pair'].values)
        _, test_common_neighbors = self.get_common_neighbors(test_csv['node_pair'].values)
        train_csv["ComNei"] = common_neighbors
        test_csv["ComNei"] = test_common_neighbors
        
        # get Preferential Attachment
        _, preatt = self.get_preatt(train_csv['node_pair'].values)
        _, test_preatt = self.get_preatt(test_csv['node_pair'].values)
        train_csv["PreAtt"] = preatt
        test_csv["PreAtt"] = test_preatt
        
        # get Jaccard's Coefficient
        _, jaccard = self.get_jaccard(train_csv['node_pair'].values)
        _, test_jaccard = self.get_jaccard(test_csv['node_pair'].values)
        train_csv["Jaccard"] = jaccard
        test_csv["Jaccard"] = test_jaccard
        
        # get Adamic/Adar
        _, aa = self.get_aa(train_csv['node_pair'].values)
        _, test_aa = self.get_aa(test_csv['node_pair'].values)
        train_csv["AA"] = aa
        test_csv["AA"] = test_aa
        
#         # get Salton index
#         _, salton = self.get_salton(train_csv['node_pair'].values)
#         _, test_salton = self.get_salton(test_csv['node_pair'].values)
#         train_csv["Salton"] = salton
#         test_csv["Salton"] = test_salton
        
#         # get SΦrensen index
#         _, sphi = self.get_sphi(train_csv['node_pair'].values)
#         _, test_sphi = self.get_sphi(test_csv['node_pair'].values)
#         train_csv["Sphi"] = sphi
#         test_csv["Sphi"] = test_sphi
        
#         # get hub promoted index
#         _, hub = self.get_hubpro(train_csv['node_pair'].values)
#         _, test_hub = self.get_hubpro(test_csv['node_pair'].values)
#         train_csv["HubPro"] = hub
#         test_csv["HubPro"] = test_hub
        
#         # get hub depressed index
#         _, hub = self.get_hubdep(train_csv['node_pair'].values)
#         _, test_hub = self.get_hubdep(test_csv['node_pair'].values)
#         train_csv["HubDep"] = hub
#         test_csv["HubDep"] = test_hub
        
#         # get Leicht-Holme-Newman Index
#         _, leicht = self.get_leicht(train_csv['node_pair'].values)
#         _, test_leicht = self.get_leicht(test_csv['node_pair'].values)
#         train_csv["Leicht"] = leicht
#         test_csv["Leicht"] = test_leicht
        
#         # get Resource allocation
#         _, res = self.get_resource(train_csv['node_pair'].values)
#         _, test_res = self.get_resource(test_csv['node_pair'].values)
#         train_csv["Res"] = res
#         test_csv["Res"] = test_res
        
        # # get Random walk with restart
        # edge_file_name = "edge_{}.txt".format(dataset)
        # self.write_txt(edge_file_name)
        # train_csv["Rwr"] = self.get_rwr_scores(train_csv, edge_file_name)
        # test_csv["Rwr"] =self.get_rwr_scores(test_csv, edge_file_name)
        
            
        # print("training dataframe: ")
        # print(train_csv.sort_values(by="Rwr", ascending=False).head())
        # print("\ntesting dataframe: ")
        # print(test_csv.sort_values(by="Rwr", ascending=False).head())
        self.__data_array = train_csv.to_numpy()
        self.__test_array = test_csv.to_numpy()
    
    def get_nodes(self):
        return torch.Tensor(self.__nodes_list)

    def get_edges(self):
        return np.array(self.__edges)
    
    def get_torch_edges(self):
        return torch.tensor(self.__edges, dtype=torch.long)
    
    def get_nodes_x(self):
        features = self.get_feature()
        training_array = [features[i] for i in range(len(features.keys()))]
        
        return torch.tensor(training_array, dtype=torch.float)
    
        
    def get_num_feature(self):
        return self.__data_array[:, 5:].shape[1]
    
    def get_feature(self):
        return self.__feature
    
    def get_common_neighbors(self, node_pairs):
        common_neighbors = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                common_neighbors.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            common_neighbors.append(len(set(to_nlist)&set(from_nlist)))
            
        return torch.Tensor(common_neighbors), common_neighbors
    
    # Preferential Attachment
    def get_preatt(self, node_pairs):
        att = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                att.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            att.append(len(to_nlist) * len(from_nlist))
            
        return  torch.Tensor(att), att
    
    # Jaccard's Coeffieient
    def get_jaccard(self, node_pairs):
        jaccard = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                jaccard.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            jaccard.append(round(len(set(to_nlist)&set(from_nlist)) / len(set(to_nlist)|set(from_nlist)), 3))
            
        return torch.Tensor(jaccard), jaccard
    
    # Adamic/Adar
    def get_aa(self, node_pairs):
        aa = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                aa.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            neighbors = set(to_nlist)&set(from_nlist)
            aa_score = 0
            if len(neighbors) != 0:
                for neighbor in neighbors:
                    neighbor_friends = [i for i in self.__Graph.neighbors(neighbor)]
                    if len(neighbor_friends)==1:
                        continue
                    aa_score += 1/ math.log(len(neighbor_friends))
            aa.append(round(aa_score, 3))
            
        return torch.Tensor(aa), aa
    
    # Salton
    def get_salton(self, node_pairs):
        salton = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                salton.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            salton.append(round(len(set(to_nlist)&set(from_nlist)) / (len(set(to_nlist))*len(set(from_nlist))) ** 0.5, 3))
            
        return torch.Tensor(salton), salton 
    
    # SΦrensen index
    def get_sphi(self, node_pairs):
        sphi = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                sphi.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            sphi.append(round(len(set(to_nlist)&set(from_nlist)) * 2 / (len(set(to_nlist)) + len(set(from_nlist))), 3))
            
        return torch.Tensor(sphi), sphi 
    
    # Hub Promoted
    def get_hubpro(self, node_pairs):
        hub = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                hub.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            hub.append(round(len(set(to_nlist)&set(from_nlist)) * 2 / min(len(set(to_nlist)), len(set(from_nlist))), 3))
            
        return torch.Tensor(hub), hub 
    
    # Hub Depressed
    def get_hubdep(self, node_pairs):
        hub = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                hub.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            hub.append(round(len(set(to_nlist)&set(from_nlist)) * 2 / max(len(set(to_nlist)), len(set(from_nlist))), 3))
            
        return torch.Tensor(hub), hub 
    
    # Leicht-Holme-Newman
    def get_leicht(self, node_pairs):
        leicht = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                leicht.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            leicht.append(round(len(set(to_nlist)&set(from_nlist)) / (len(set(to_nlist))*len(set(from_nlist))), 3))
            
        return torch.Tensor(leicht), leicht 
    
    # Resource allocation
    def get_resource(self, node_pairs):
        res = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                res.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            neighbors = set(to_nlist)&set(from_nlist)
            res_score = 0
            if len(neighbors) != 0:
                for neighbor in neighbors:
                    neighbor_friends = [i for i in self.__Graph.neighbors(neighbor)]
                    if len(neighbor_friends)==1:
                        continue
                    res_score += 1/ len(neighbor_friends)
            res.append(round(res_score, 3))
            
        return torch.Tensor(res), res
    
    def get_training_data(self):
        target = self.__data_array[:, 3]
        training_array = self.__data_array[:, 5:].astype(float)
        training_array = self.normalize(training_array)
        
        return training_array, target
    
    def get_testing_data(self):
        testing_array = self.__test_array[:, 4:].astype(float)
        testing_array = self.normalize(testing_array)
        
        return testing_array
    
    def normalize(self, dataset):
        mean = np.mean(dataset, axis=0)
        std  = np.std(dataset, axis=0)
        
        return (dataset - mean)/std
    
    def get_graph(self):
        return self.__Graph
    
    # Random Work with restart
    def get_rwr_scores(self, df, edge_file_name):
        rrwr = RWR()
        rrwr.read_graph("./{}".format(edge_file_name), "undirected")
        max_size, start_node = max(self.__Graph.nodes) + 1, min(self.__Graph.nodes)
        edges = self.get_edges()
        rwr_score = np.zeros((len(df), 1))
        for i in tqdm(range(start_node, max_size), desc="rwr scores.."):
            t = df[df['from']==i]
            if t.empty is False:
                df_idx = t.index
                # i_scores = rrwr.compute(seed=i)
                i_scores = rwr(edges, i, max_size, alpha = 0.85, max_iter = 100)
                for idx, j in enumerate(t.to.values):
                    # j -= start_node
                    rwr_score[idx] = np.round(i_scores[j], 3)
        
        return rwr_score
    
    def write_txt(self, output_path):
        f = open(output_path, "w")
        for a, b in self.__Graph.edges:
            f.write(f"{a}\t{b}\n")
        f.close()
        
    def get_test_nodepairs(self):
        node_pairs = self.__test_array[:, 1:3].astype(int)
        
        return torch.tensor(node_pairs.T, dtype=torch.long)
    
    def get_training_nodepairs(self):
        node_pairs = self.__data_array[:, 1:3].astype(int)
        
        return torch.tensor(node_pairs.T, dtype=torch.long)
          
import os
import sys
sys.path.append(os.path.abspath("./pyrwr/"))
from pyrwr.rwr import RWR
data = ReadData("dataset2")
print(data.get_edges().shape)
print(data.get_nodes().shape)
#print(data.get_common_neighbors())
print(len(data.get_feature()[351]))

(2, 7472)
torch.Size([2994, 3703])
3703


### DirectedGraph

In [187]:
from copy import copy

class DirectedGraph:
    def __init__(self, dataset):
        self.__feature = dict()
        train_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/{}/train.csv".format(dataset))
        test_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/{}/test.csv".format(dataset))
        content_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/{}/content.csv".format(dataset), header=None)
        edge_csv = train_csv[train_csv["label"]==1]
        
        # get feature
        for line in content_csv.to_numpy():
            l = line[0].split("\t")
            self.__feature[int(l[0])] = np.array(list(map(int,l[1:])))
            
        # Create an directed graph G
        # from --> to
        a_list, b_list, Gtext = [], [], []
        for line in edge_csv.to_numpy():
            a_list.append(line[2])
            b_list.append(line[1])
            Gtext.append((line[2], line[1]))
            Gtext.append((line[1], line[2]))
        self.__edges = [a_list+b_list, b_list+a_list]
            
        self.__Graph = nx.DiGraph(Gtext)
        self.set_feature_nodes()
        self.set_feature_weight()
        self.set_edges_weight()
        
        # Random walk with restart
        # train_csv['Rwr'] = self.get_rwr_scores(train_csv)
        test_csv["Rwr"] =self.get_rwr_scores(test_csv)
        # print("training dataframe: ")
        # print(train_csv.sort_values(by="Rwr", ascending=False).head())
        print("testing dataframe: ")
        print(test_csv.sort_values(by="Rwr", ascending=False).head())
            
    def get_feature(self):
        return self.__feature
    
    def get_edges(self):
        return self.__edges
    
    def set_feature_nodes(self):
        for k in self.__feature.keys():
            attributes = np.nonzero(self.__feature[k])
            for att in attributes[0]:
                self.__Graph.add_edge(f"a{att}", k)
    
    def set_feature_weight(self):
        for node in self.__Graph.nodes:
            if 'a' in str(node):
                att_node_nei = [i for i in self.__Graph.neighbors(node)]
                for n in att_node_nei:
                    self.__Graph[node][n]['weight'] = 1/len(att_node_nei)
                    
    def set_edges_weight(self, lam=0.2):
        for node in self.__edges[0]:
            u_list, a_list = [], []
            for n in self.__Graph.neighbors(node):
                if 'a' in str(n):
                    a_list.append(n)
                else:
                    u_list.append(n)
            if len(u_list)!=0:
                for u in u_list:
                    self.__Graph[node][u]['weight'] = (1-lam)/len(u_list)
            if len(a_list)!=0:
                for a in a_list:
                    self.__Graph[node][a]['weight'] = lam/len(a_list)
    
    def get_rwr_scores(self, df):
        edges_list = copy(self.__edges)
        att_nodes = [node for node in self.__Graph.nodes if 'a' in str(node)]
        int_nodes = [node for node in self.__Graph.nodes if 'a' not in str(node)]
        
        data = [self.__Graph[edges_list[0][i]][edges_list[1][i]]['weight'] for i in range(len(edges_list[0]))]
        for i, att in enumerate(att_nodes):
            for n in self.__Graph.neighbors(att):
                edges_list[0].append(max(int_nodes)+i+1)
                edges_list[1].append(n)
                data.append(self.__Graph[att][n]['weight'])
        
        max_size, start_node = max(int_nodes) + 1, min(int_nodes)
        matrix_size = max(int_nodes)+len(att_nodes)+1
        rwr_score = np.zeros((len(df), 1))
        for i in tqdm(range(start_node, max_size), desc="rwr scores.."):
            t = df[df['from']==i]
            if t.empty is False:
                df_idx = t.index
                i_scores = rwr(np.array(edges_list), i, matrix_size, data=data, max_iter = 100) 
                for idx, j in enumerate(t.to.values):
                    rwr_score[idx] = i_scores[j]
        
        return rwr_score

        
dataset = DirectedGraph("dataset3")
feature_node = dataset.get_feature()


# G = nx.Graph([(1, 2), (1, 3), (2, 1)])
# G[1][2]['weight'] = 0.5
# G[2][1]['weight'] = 0.3
# print(G[1][2]['weight'])
# print(G[2][1]['weight'])
# nx.draw(G, with_labels=True)

rwr scores..: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 877/877 [00:11<00:00, 82.29it/s]

testing dataframe: 
        id   to  from           Rwr
5    E1391  117   793  1.185581e-04
4    E2161  466   199  8.880946e-08
2    E3190  739   468  2.318347e-10
0     E370   26   317  0.000000e+00
434  E1511  522   773  0.000000e+00





In [188]:
edges_list = dataset.get_edges()
edges_array = np.array(edges_list)
print(edges_array[:, 0])
r = rwr(edges_array, 1144, 4140)
np.sort(r, axis=0)[::-1]

[ 59 612]


array([[0.9],
       [0. ],
       [0. ],
       ...,
       [0. ],
       [0. ],
       [0. ]])

### 1. Randomwalk with restart

In [186]:
from scipy import sparse 
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from numpy.linalg import norm

def rwr(edges, start, matrix_size, alpha = 0.1, epsilon=1e-6, max_iter = 100, data=None):
    rows, cols = edges[0,:], edges[1,:]
    if data is None:
        data = np.ones(edges.shape[1])
    adj_mat = csr_matrix((data, (rows, cols)),shape=(matrix_size, matrix_size))
    adj_mat_norm = normalize(adj_mat, norm='l1', axis=0)
    # create starting vector
    start_vec = np.zeros((matrix_size, 1))
    start_vec[start] = 1
    # init score vector
    # old_vec = np.full((matrix_size, 1), 0.8)
    old_vec = start_vec
    
    residuals = np.zeros(max_iter)
    for i in range(max_iter):
        score_vec = alpha*(adj_mat_norm.dot(old_vec)) + (1-alpha)*start_vec
        residuals[i] = norm(score_vec - old_vec, 1) 
        if residuals[i] < epsilon:
            break
        old_vec = score_vec
        
    return score_vec


## 1. Simple NN

In [341]:
import torch
from torchsummary import summary
from torch.nn import Linear, Sequential, Sigmoid

In [342]:
class NN(torch.nn.Module):
    def __init__(self, feature):
        super(NN, self).__init__()
        torch.manual_seed(12345)
        self.nn = Sequential(
            Linear(feature, 32),
            torch.nn.LeakyReLU(),
            Linear(32, 64), 
            torch.nn.LeakyReLU(),
            Linear(64, 128),
            torch.nn.LeakyReLU(),
            Linear(128, 64),
            torch.nn.LeakyReLU(),
            Linear(64, 32),
            torch.nn.LeakyReLU(),
            Linear(32, 16),
            torch.nn.LeakyReLU(),
            Linear(16, 1)
        )
        
    def forward(self, x):
        out = self.nn(x)
        
        return out
    
training_array, target = data.get_training_data()
training_array = torch.tensor(training_array).float()
target = torch.Tensor(target.astype(int))
X_train, X_val, y_train, y_val = train_test_split(training_array, target, test_size=0.1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
nn_model = NN(X_train.shape[1]).to("cpu")
summary(nn_model, input_size = X_train.shape, device="cpu")
#print(nn_model)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1             [-1, 6789, 32]             160
         LeakyReLU-2             [-1, 6789, 32]               0
            Linear-3             [-1, 6789, 64]           2,112
         LeakyReLU-4             [-1, 6789, 64]               0
            Linear-5            [-1, 6789, 128]           8,320
         LeakyReLU-6            [-1, 6789, 128]               0
            Linear-7             [-1, 6789, 64]           8,256
         LeakyReLU-8             [-1, 6789, 64]               0
            Linear-9             [-1, 6789, 32]           2,080
        LeakyReLU-10             [-1, 6789, 32]               0
           Linear-11             [-1, 6789, 16]             528
        LeakyReLU-12             [-1, 6789, 16]               0
           Linear-13              [-1, 6789, 1]              17
Total params: 21,473
Trainable params: 

In [343]:
import wandb
import torch.nn.functional as F

def train(X_train, X_val, y_train, y_val):
    # setting
    lr = wandb.config["learning_rate"]
    MAX_ITERATION = wandb.config["MAX_ITERATION"]
    
    model = NN(X_train.shape[1])
    criterion = torch.nn.CrossEntropyLoss(reduction='sum')  # Define loss criterion.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(MAX_ITERATION):
        optimizer.zero_grad() # clear existing gradients
        # compute BC ranking score
        out= model(X_train)
        # compute loss
        #yij, bij = sample_node(y_hat, edges)
        loss = F.binary_cross_entropy_with_logits(out, y_train.reshape(-1, 1), reduction="sum")
        if epoch % 500 == 0:
            print("[{}/{}] Loss:{:.4f}".format(epoch, MAX_ITERATION, loss.item()))
        loss.backward()
        # validation
        val_loss = validation(model, X_val, y_val)
        
        wandb.log({"training_loss": loss}, step=epoch)
        wandb.log({"val_loss": val_loss}, step=epoch)
        optimizer.step()
    
    #evaluation 
    y_train_hat = model(X_train)
    y_val_hat = model(X_val)
    training_auc, training_pre, val_auc, val_pre = evaluation(y_train.detach().numpy(),
                                                              y_val.detach().numpy(),
                                                              y_train_hat.detach().numpy(),
                                                              y_val_hat.detach().numpy())
    wandb.summary["Training AUC"] = training_auc
    wandb.summary["Training PRE"] = training_pre
    wandb.summary["Validation AUC"] = val_auc
    wandb.summary["Validation PRE"] = val_pre
    return model
        
def validation(model, X_val, y_val):
    model.eval()
    
    with torch.no_grad():
        out = model(X_val)
        loss = F.binary_cross_entropy_with_logits(out, y_val.reshape(-1, 1), reduction="sum")
    
    return loss.item()



### Training

In [16]:
wandb.init(project='Link_Prediction@MLG', entity="baron", group="NN")
wandb.config["learning_rate"] = 0.0001
wandb.config["MAX_ITERATION"] = 10000
wandb.config["Dataset"] = "dataset1"

data = ReadData(wandb.config["Dataset"])
training_array, target = data.get_training_data()
training_array = torch.tensor(training_array).float()
target = torch.Tensor(target.astype(int))
X_train, X_val, y_train, y_val = train_test_split(training_array, target, test_size=0.1)
nn_model1 = train(X_train, X_val, y_train, y_val)

wandb.finish()




0,1
training_loss,█▄▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
training_loss,3424.33008
val_loss,399.47021


rwr scores..: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2708/2708 [00:10<00:00, 256.06it/s]
rwr scores..: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2708/2708 [00:06<00:00, 440.57it/s]



testing dataframe: 
          id    to  from     node_pair  ComNei  PreAtt  Jaccard     AA    Rwr
2      E3964  2405  1765  (2405, 1765)       0       6    0.000  0.000  0.026
1      E4849    81  1634    (81, 1634)       0       6    0.000  0.000  0.011
1451   E5663  2104   593   (2104, 593)       0       0    0.000  0.000  0.000
1445  E10013  2033    63    (2033, 63)       1       8    0.125  0.417  0.000
1446   E3714  1130  2283  (1130, 2283)       0       6    0.000  0.000  0.000
[0/10000] Loss:5427.9141
[500/10000] Loss:3602.1384
[1000/10000] Loss:3541.0305
[1500/10000] Loss:3528.0815
[2000/10000] Loss:3523.0469
[2500/10000] Loss:3519.6436
[3000/10000] Loss:3515.8635
[3500/10000] Loss:3512.4138
[4000/10000] Loss:3507.6292
[4500/10000] Loss:3501.8550
[5000/10000] Loss:3495.8770
[5500/10000] Loss:3485.3613
[6000/10000] Loss:3477.8274
[6500/10000] Loss:3470.7473
[7000/10000] Loss:3465.3237
[7500/10000] Loss:3460.7217
[8000/10000] Loss:3456.7749
[8500/10000] Loss:3455.3547
[9000/10000

0,1
training_loss,█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▄▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂

0,1
Training AUC,0.85316
Training PRE,0.87661
Validation AUC,0.85793
Validation PRE,0.88365
training_loss,3448.59326
val_loss,403.97058


In [17]:
wandb.init(project='Link_Prediction@MLG', entity="baron", group="NN")
wandb.config["learning_rate"] = 0.0001
wandb.config["MAX_ITERATION"] = 10000
wandb.config["Dataset"] = "dataset2"

data = ReadData(wandb.config["Dataset"])
training_array, target = data.get_training_data()
training_array = torch.tensor(training_array).float()
target = torch.Tensor(target.astype(int))
X_train, X_val, y_train, y_val = train_test_split(training_array, target, test_size=0.1)
nn_model2 = train(X_train, X_val, y_train, y_val)

wandb.finish()

rwr scores..: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3311/3311 [00:13<00:00, 245.08it/s]
rwr scores..: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3311/3311 [00:06<00:00, 516.38it/s]



testing dataframe: 
         id    to  from     node_pair  ComNei  PreAtt  Jaccard     AA    Rwr
3     E5670  1063  1101  (1063, 1101)       2     468    0.024  1.135  0.024
4     E5005  1067  1710  (1067, 1710)       0      14    0.000  0.000  0.009
0     E3064  1315   586   (1315, 586)       0      14    0.000  0.000  0.000
1254  E8203  1336    35    (1336, 35)       0       2    0.000  0.000  0.000
1266  E6154   661   485    (661, 485)       0       3    0.000  0.000  0.000
[0/10000] Loss:4714.1211
[500/10000] Loss:3325.3040
[1000/10000] Loss:3271.2559
[1500/10000] Loss:3211.3828
[2000/10000] Loss:3165.9941
[2500/10000] Loss:3153.4055
[3000/10000] Loss:3146.3865
[3500/10000] Loss:3142.2180
[4000/10000] Loss:3139.3093
[4500/10000] Loss:3137.2544
[5000/10000] Loss:3133.7568
[5500/10000] Loss:3129.4590
[6000/10000] Loss:3124.4360
[6500/10000] Loss:3119.7920
[7000/10000] Loss:3114.2019
[7500/10000] Loss:3106.8125
[8000/10000] Loss:3099.3953
[8500/10000] Loss:3093.5342
[9000/10000] Loss

0,1
training_loss,█▄▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▄▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Training AUC,0.83158
Training PRE,0.85155
Validation AUC,0.84855
Validation PRE,0.86695
training_loss,3087.03027
val_loss,341.44824


In [18]:
wandb.init(project='Link_Prediction@MLG', entity="baron", group="NN")
wandb.config["learning_rate"] = 0.0001
wandb.config["MAX_ITERATION"] = 10000
wandb.config["Dataset"] = "dataset3"

data = ReadData(wandb.config["Dataset"])
training_array, target = data.get_training_data()
training_array = torch.tensor(training_array).float()
target = torch.Tensor(target.astype(int))
X_train, X_val, y_train, y_val = train_test_split(training_array, target, test_size=0.1)
nn_model3 = train(X_train, X_val, y_train, y_val)

wandb.finish()

rwr scores..: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 876/876 [00:01<00:00, 520.30it/s]
rwr scores..: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 876/876 [00:00<00:00, 894.56it/s]



testing dataframe: 
        id   to  from   node_pair  ComNei  PreAtt  Jaccard     AA    Rwr
5    E1391  117   793  (117, 793)       1       4    0.333  0.721  0.016
4    E2161  466   199  (466, 199)       0       1    0.000  0.000  0.008
2    E3190  739   468  (739, 468)       0       8    0.000  0.000  0.001
0     E370   26   317   (26, 317)       0       6    0.000  0.000  0.000
434  E1511  522   773  (522, 773)       1       2    0.500  0.721  0.000
[0/10000] Loss:1607.4530
[500/10000] Loss:957.5345
[1000/10000] Loss:923.1216
[1500/10000] Loss:906.2383
[2000/10000] Loss:897.7651
[2500/10000] Loss:888.2858
[3000/10000] Loss:882.5832
[3500/10000] Loss:879.2104
[4000/10000] Loss:875.0524
[4500/10000] Loss:870.3646
[5000/10000] Loss:865.9105
[5500/10000] Loss:862.6511
[6000/10000] Loss:860.5424
[6500/10000] Loss:857.6019
[7000/10000] Loss:854.8506
[7500/10000] Loss:853.2886
[8000/10000] Loss:852.1823
[8500/10000] Loss:851.5554
[9000/10000] Loss:850.5000
[9500/10000] Loss:850.5607



0,1
training_loss,█▄▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▅▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆

0,1
Training AUC,0.90124
Training PRE,0.91267
Validation AUC,0.88124
Validation PRE,0.89188
training_loss,849.4801
val_loss,148.82861


## 2. Random Forest

In [3]:
from sklearn.ensemble import RandomForestRegressor
import joblib

def RandomForest(dataset, config=False):
    training_array, target = dataset.get_training_data()
    reg = RandomForestRegressor(n_estimators=int(config["n_estimators"]),
                               min_samples_leaf=int(config["min_samples_leaf"]),
                               min_samples_split=int(config["min_samples_split"]),
                               oob_score=bool(config["oob_score"]))

    X_train, X_val, y_train, y_val = train_test_split(training_array, target, test_size=0.2)
    y_train = y_train.astype('int')
    y_val = y_val.astype('int')

    reg.fit(X_train, y_train)
    y_train_hat = np.where(reg.predict(X_train)>0.5, 1, 0)
    y_val_hat = np.where(reg.predict(X_val)>0.5, 1, 0)

    training_auc, training_pre, val_auc, val_pre = evaluation(y_train, y_val, y_train_hat, y_val_hat)
    wandb.summary["feature_number"] = X_train.shape[1]
    wandb.log({
        "Training AUC":training_auc,
        "Training PRE": training_pre,
        "Validation AUC": val_auc,
        "Validation PRE": val_pre
    })

    joblib.dump(reg, os.path.join(wandb.run.dir, 'rf_model.h5'))
    
    return reg

### Training

In [4]:
import wandb 
from IPython.display import clear_output
sweep_config = {
    "method": "bayes",
    "metric":{
        "name": "Validation PRE",
        "goal": "maximize"
    },
    
    "parameters":{
        "n_estimators":{
            'min': 10,
            'max': 150
        },
        "min_samples_leaf":{
            'min': 1,
            'max': 50,
        },
        "min_samples_split":{
            'min': 2,
            'max': 5
        },
        "oob_score":{
            'values': [True, False]
        }
    }
    
}

In [17]:
def train_dataset1_rf():
    with wandb.init(group="rf") as run:
        config = wandb.config
        model = RandomForest(dataset1, config)
        
def train_dataset2_rf():
    with wandb.init(project='Link_Prediction@MLG', entity="baron", group="rf") as run:
        config = wandb.config
        model = RandomForest(dataset2, config)

def train_dataset3_rf():
    with wandb.init(project='Link_Prediction@MLG', entity="baron", group="rf") as run:
        config = wandb.config
        model = RandomForest(dataset3, config)

        
count = 100 # number of runs to execute
# training dataset 1
dataset1 = ReadData("dataset1")
sweep_id = wandb.sweep(sweep_config, project='Link_Prediction@MLG', entity="baron")
wandb.agent(sweep_id, function=train_dataset1_rf, count=count)
clear_output()
# training dataset 2
dataset2 = ReadData("dataset2")
sweep_id = wandb.sweep(sweep_config, project='Link_Prediction@MLG', entity="baron")
wandb.agent(sweep_id, function=train_dataset2_rf, count=count)
clear_output()
# training dataset 3
dataset3 = ReadData("dataset3")
sweep_id = wandb.sweep(sweep_config, project='Link_Prediction@MLG', entity="baron")
wandb.agent(sweep_id, function=train_dataset3_rf, count=count)  
clear_output()

In [18]:
import tempfile

def dict_to_config(d):
    class Object(object):
        pass

    config = Object()
    for key, value in d.items():
        setattr(config, key, value)
    return config

def parse_wandb_models(path, model_name, numbers_models=None, metric=None):
    '''Parse wandb models with either run paths or a sweep path.

    Args:
        path: a list contains either run paths or a sweep path
        numbers_models: a integer or a list of numbers of models.
                        if None, treat path as run paths, otherwise treat it as a sweep path.
        metric: metric to sort by when parsing a sweep path
    '''
    api = wandb.Api()
    models, configs, model_paths = list(), list(), list()
    sweep_name = ''

    modeldir = tempfile.mkdtemp()

    if numbers_models is not None: # sweep
        numbers_models = max(numbers_models) if isinstance(numbers_models, list) else numbers_models

        sweep = api.sweep(path[0])
        sweep_name = sweep.config.get('name', '')
        # sort runs by metric
        runs = sorted(sweep.runs, key=lambda run: run.summary.get(metric, 0), 
                            reverse=True)
        runs = runs[:numbers_models]
        
    else:
        runs = [api.run(p) for p in path]

    for run in runs:
        run.file('{}.h5'.format(model_name)).download(replace=True, root=modeldir)

        # load_model =
        models.append(joblib.load(modeldir + '/{}.h5'.format(model_name)))

        configs.append(dict_to_config(run.config))
        model_paths.append(run.path)

    return models, configs, model_paths, sweep_name

def MeanAccuracy(dataset, models):
    sum_training_auc, sum_training_pre, sum_val_auc, sum_val_pre = 0, 0, 0, 0
    n = len(model_paths)
    training_array, target = dataset.get_training_data()
    X_train, X_val, y_train, y_val = train_test_split(training_array, target, test_size=0.2)
    for model in models:
        y_train_hat = np.where(model.predict(X_train)>0.5, 1, 0)
        y_val_hat = np.where(model.predict(X_val)>0.5, 1, 0)
        y_train = y_train.astype('int')
        y_val = y_val.astype('int')
        training_auc, training_pre, val_auc, val_pre = evaluation(y_train, y_val, y_train_hat, y_val_hat)
        sum_training_auc += training_auc
        sum_training_pre += training_pre
        sum_val_auc +=  val_auc
        sum_val_pre +=  val_pre
    print("training auc: {:.3f} pre: {:.3f}, validation auc: {:.3f} pre: {:.3f}".format(sum_training_auc/n, sum_training_pre/n, sum_val_auc/n, sum_val_pre/n))


print("Dataset1: ")
rf_models1, configs, model_paths, sweep_name = parse_wandb_models(path=["baron/Link_Prediction@MLG/4uf9wyjf"], model_name="rf_model", numbers_models=5, metric="Validation PRE")
MeanAccuracy(dataset1, rf_models1)
print("Dataset2: ")
rf_models2, configs, model_paths, sweep_name = parse_wandb_models(path=["baron/Link_Prediction@MLG/z8qpl1kf"], model_name="rf_model", numbers_models=5, metric="Validation PRE")
MeanAccuracy(dataset2, rf_models2)
print("Dataset3: ")
rf_models3, configs, model_paths, sweep_name = parse_wandb_models(path=["baron/Link_Prediction@MLG/r8n1nt9r"], model_name="rf_model", numbers_models=5, metric="Validation PRE")
MeanAccuracy(dataset3, rf_models3)

Dataset1: 
training auc: 0.779 pre: 0.732, validation auc: 0.778 pre: 0.743
Dataset2: 
training auc: 0.747 pre: 0.702, validation auc: 0.751 pre: 0.710
Dataset3: 
training auc: 0.816 pre: 0.781, validation auc: 0.821 pre: 0.764


## 3. GCNEncoder

In [1]:
import torch
from torch_geometric.data import Data
from torch_geometric.utils import train_test_split_edges

In [2]:
from torch_geometric.nn import GCNConv, GAE

class GCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_chaneels):
        super(GCNEncoder, self).__init__()
        self.input = GCNConv(in_channels, 512, cached=True)
        self.output = GCNConv(512, out_chaneels, cached=True)
        
    # def forward(self, x, edge_index):
    def encode(self, x, edge_index):
        x = self.input(x, edge_index).relu()
        return self.output(x, edge_index)
    
    def decode(self, z, pos_edge_index, neg_edge_index):
        edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1) # concatenate pos and neg edges
        logits = (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1)  # dot product 
        return logits
    
    def predict(self, z, node_pairs):
        logits = (z[node_pairs[0]] * z[node_pairs[1]]).sum(dim=-1)  # dot product 
        return logits

In [9]:
from torch_geometric.utils import negative_sampling
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, average_precision_score

def get_link_labels(pos_edge_index, neg_edge_index, device = torch.device('cpu')):
    # returns a tensor:
    # [1,1,1,1,...,0,0,0,0,0,..] with the number of ones is equel to the lenght of pos_edge_index
    # and the number of zeros is equal to the length of neg_edge_index
    E = pos_edge_index.size(1) + neg_edge_index.size(1)
    link_labels = torch.zeros(E, dtype=torch.float, device=device)
    link_labels[:pos_edge_index.size(1)] = 1.
    return link_labels

def train(pg_data, device = torch.device('cpu')):
    # training data
    x = pg_data.x.to(device)
    train_pos_edge_index = pg_data.train_pos_edge_index.to(device)
    
    # parameters
    out_channel = 128
    epochs = 200
    lr = 0.0001
    num_feature = x.shape[1]
    
    # init Model 
    model = GCNEncoder(num_feature, out_channel)
    
    # init the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    
    for epoch in range(1, epochs+1):
        # generate negative edge
        neg_edge_index = negative_sampling(
            edge_index=train_pos_edge_index, #positive edges
            num_nodes=pg_data.num_nodes, # number of nodes
            num_neg_samples=train_pos_edge_index.shape[1]) # number of neg_sample equal to number of pos_edges

        # encoder_model.train()
        optimizer.zero_grad()

        z = model.encode(x, train_pos_edge_index)
        link_logits = model.decode(z, train_pos_edge_index, neg_edge_index) # decode
        link_labels = get_link_labels(train_pos_edge_index, neg_edge_index)

        loss = F.binary_cross_entropy_with_logits(link_logits, link_labels)
        loss.backward()
        
        auc, ap = test(model, x)
        if epoch%100 ==0:
            print("[{}/{}] Val AUC: {:.4f}, Test AUC: {:.4f},  Val AP: {:.4f}, Test AP: {:.4f}".format(epoch, epochs, auc[0], ap[0], auc[1], ap[1]))
        
        optimizer.step()
    
    return model

def test(model, x):
    model.eval()
    with torch.no_grad():
        perfs = []
        perfs_pr = []
        for prefix in ["val", "test"]:
            pos_edge_index = pg_data[f'{prefix}_pos_edge_index']
            neg_edge_index = pg_data[f'{prefix}_neg_edge_index']

            z = model.encode(x, pos_edge_index) # encode train
            link_logits = model.decode(z, pos_edge_index, neg_edge_index) # decode test or val
            link_probs = link_logits.sigmoid() # apply sigmoid

            link_labels = get_link_labels(pos_edge_index, neg_edge_index) # get link

            perfs.append(roc_auc_score(link_labels.cpu(), link_probs.cpu())) #compute roc_auc score
            perfs_pr.append(average_precision_score(link_labels.cpu(), link_probs.cpu()))
        return perfs, perfs_pr
    # return encoder_model.test(z, pos_edge_index, neg_edge_index)

# for epoch in range(1, epochs+1):
#     loss = train(x, train_pos_edge_index)
#     auc, pre = test(encoder_model, x)
#     # auc, ap = test(pg_data.test_pos_edge_index, pg_data.test_neg_edge_index)
#     if epoch%50 ==0:
#         # print("Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}".format(epoch, auc, ap))
#         print("[{}/{}] Val AUC: {:.4f}, Test AUC: {:.4f},  Val PRE: {:.4f}, Test PRE: {:.4f}".format(epoch, epochs, auc[0], pre[0], auc[1], pre[1]))

### Train

In [10]:
print("To training dataset 1")
data = ReadData("dataset1")
edge_index = data.get_torch_edges()
x = data.get_nodes_x()
pg_data = Data(x=x, edge_index=edge_index)
pg_data = train_test_split_edges(pg_data)
# model
gcn_model1 = train(pg_data)

print("To training dataset 2")
data = ReadData("dataset2")
edge_index = data.get_torch_edges()
x = data.get_nodes_x()
pg_data = Data(x=x, edge_index=edge_index)
pg_data = train_test_split_edges(pg_data)
# model
gcn_model2 = train(pg_data)

print("To training dataset 3")
data = ReadData("dataset3")
edge_index = data.get_torch_edges()
x = data.get_nodes_x()
pg_data = Data(x=x, edge_index=edge_index)
pg_data = train_test_split_edges(pg_data)
# model
gcn_model3 = train(pg_data)

To training dataset 1




[100/200] Val AUC: 0.9231, Test AUC: 0.9270,  Val AP: 0.9302, Test AP: 0.9275
[200/200] Val AUC: 0.9389, Test AUC: 0.9430,  Val AP: 0.9400, Test AP: 0.9434
To training dataset 2




[100/200] Val AUC: 0.9434, Test AUC: 0.9527,  Val AP: 0.9498, Test AP: 0.9537
[200/200] Val AUC: 0.9321, Test AUC: 0.9443,  Val AP: 0.9471, Test AP: 0.9540
To training dataset 3




[100/200] Val AUC: 0.9503, Test AUC: 0.9515,  Val AP: 0.9516, Test AP: 0.9356
[200/200] Val AUC: 0.9258, Test AUC: 0.9391,  Val AP: 0.9506, Test AP: 0.9369


## 4.GCN + RF

In [13]:
from sklearn.ensemble import RandomForestRegressor
import joblib

def RandomForest(dataset, config=False, link_probs=False):
    training_array, target = dataset.get_training_data()
    if config:
        reg = RandomForestRegressor(n_estimators=int(config["n_estimators"]),
                                       min_samples_leaf=int(config["min_samples_leaf"]),
                                       min_samples_split=int(config["min_samples_split"]),
                                       oob_score=bool(config["oob_score"]))
    else:
        reg = RandomForestRegressor()
        
    if link_probs.shape:
        training_array = np.concatenate((training_array, link_probs.cpu().detach().numpy().reshape(-1, 1)), axis=1)

    X_train, X_val, y_train, y_val = train_test_split(training_array, target, test_size=0.2)
    y_train = y_train.astype('int')
    y_val = y_val.astype('int')

    reg.fit(X_train, y_train)
    y_train_hat = np.where(reg.predict(X_train)>0.5, 1, 0)
    y_val_hat = np.where(reg.predict(X_val)>0.5, 1, 0)

    training_auc, training_pre, val_auc, val_pre = evaluation(y_train, y_val, y_train_hat, y_val_hat)

    wandb.summary["feature_number"] = X_train.shape[1]
    wandb.log({
        "Training AUC":training_auc,
        "Training PRE": training_pre,
        "Validation AUC": val_auc,
        "Validation PRE": val_pre
    })

    joblib.dump(reg, os.path.join(wandb.run.dir, 'rf_model.h5'))
    
    return reg


### Training

In [14]:
import wandb 
from IPython.display import clear_output
sweep_config = {
    "method": "bayes",
    "metric":{
        "name": "Validation PRE",
        "goal": "maximize"
    },
    
    "parameters":{
        "n_estimators":{
            'min': 10,
            'max': 150
        },
        "min_samples_leaf":{
            'min': 1,
            'max': 50,
        },
        "min_samples_split":{
            'min': 2,
            'max': 5
        },
        "oob_score":{
            'values': [True, False]
        }
    }
    
}

In [19]:
def train_dataset1_rf():
    with wandb.init(project='Link_Prediction@MLG', entity="baron", group="rf") as run:
        config = wandb.config
        model = RandomForest(dataset1, config, link_probs=link_probs)
        
def train_dataset2_rf():
    with wandb.init(project='Link_Prediction@MLG', entity="baron", group="rf") as run:
        config = wandb.config
        model = RandomForest(dataset2, config, link_probs=link_probs)

def train_dataset3_rf():
    with wandb.init(project='Link_Prediction@MLG', entity="baron", group="rf") as run:
        config = wandb.config
        model = RandomForest(dataset3, config, link_probs=link_probs)

        
count = 100 # number of runs to execute
# training dataset 1
dataset1 = ReadData("dataset1")
x = dataset1.get_nodes_x()
node_pairs = dataset1.get_training_nodepairs()
link_logits = gcn_model1.predict(x, node_pairs)
link_probs = link_logits.sigmoid()

sweep_id = wandb.sweep(sweep_config, project='Link_Prediction@MLG', entity="baron")
wandb.agent(sweep_id, function=train_dataset1_rf, count=count)
clear_output()
# training dataset 2
dataset2 = ReadData("dataset2")
x = dataset2.get_nodes_x()
node_pairs = dataset2.get_training_nodepairs()
link_logits = gcn_model2.predict(x, node_pairs)
link_probs = link_logits.sigmoid()

sweep_id = wandb.sweep(sweep_config, project='Link_Prediction@MLG', entity="baron")
wandb.agent(sweep_id, function=train_dataset2_rf, count=count)
clear_output()
# training dataset 3
dataset3 = ReadData("dataset3")
x = dataset3.get_nodes_x()
node_pairs = dataset3.get_training_nodepairs()
link_logits = gcn_model3.predict(x, node_pairs)
link_probs = link_logits.sigmoid()

sweep_id = wandb.sweep(sweep_config, project='Link_Prediction@MLG', entity="baron")
wandb.agent(sweep_id, function=train_dataset3_rf, count=count)  
clear_output()

In [22]:
import tempfile

def dict_to_config(d):
    class Object(object):
        pass

    config = Object()
    for key, value in d.items():
        setattr(config, key, value)
    return config

def parse_wandb_models(path, model_name, numbers_models=None, metric=None):
    '''Parse wandb models with either run paths or a sweep path.

    Args:
        path: a list contains either run paths or a sweep path
        numbers_models: a integer or a list of numbers of models.
                        if None, treat path as run paths, otherwise treat it as a sweep path.
        metric: metric to sort by when parsing a sweep path
    '''
    api = wandb.Api()
    models, configs, model_paths = list(), list(), list()
    sweep_name = ''

    modeldir = tempfile.mkdtemp()

    if numbers_models is not None: # sweep
        numbers_models = max(numbers_models) if isinstance(numbers_models, list) else numbers_models

        sweep = api.sweep(path[0])
        sweep_name = sweep.config.get('name', '')
        # sort runs by metric
        runs = sorted(sweep.runs, key=lambda run: run.summary.get(metric, 0), 
                            reverse=True)
        runs = runs[:numbers_models]
        
    else:
        runs = [api.run(p) for p in path]

    for run in runs:
        run.file('{}.h5'.format(model_name)).download(replace=True, root=modeldir)

        # load_model =
        models.append(joblib.load(modeldir + '/{}.h5'.format(model_name)))

        configs.append(dict_to_config(run.config))
        model_paths.append(run.path)

    return models, configs, model_paths, sweep_name

def MeanAccuracy(dataset, models, link_probs):
    sum_training_auc, sum_training_pre, sum_val_auc, sum_val_pre = 0, 0, 0, 0
    n = len(model_paths)
    
    training_array, target = dataset.get_training_data()
    training_array = np.concatenate((training_array, link_probs.cpu().detach().numpy().reshape(-1, 1)), axis=1)
    
    X_train, X_val, y_train, y_val = train_test_split(training_array, target, test_size=0.2)
    for model in models:
        y_train_hat = np.where(model.predict(X_train)>0.5, 1, 0)
        y_val_hat = np.where(model.predict(X_val)>0.5, 1, 0)
        y_train = y_train.astype('int')
        y_val = y_val.astype('int')
        training_auc, training_pre, val_auc, val_pre = evaluation(y_train, y_val, y_train_hat, y_val_hat)
        sum_training_auc += training_auc
        sum_training_pre += training_pre
        sum_val_auc +=  val_auc
        sum_val_pre +=  val_pre
    print("training auc: {:.3f} pre: {:.3f}, validation auc: {:.3f} pre: {:.3f}".format(sum_training_auc/n, sum_training_pre/n, sum_val_auc/n, sum_val_pre/n))


print("Dataset1: ")
# gcn 
x = dataset1.get_nodes_x()
node_pairs = dataset1.get_training_nodepairs()
link_logits = gcn_model1.predict(x, node_pairs)
link_probs = link_logits.sigmoid()
gcn_rf_models1, configs, model_paths, sweep_name = parse_wandb_models(path=["baron/Link_Prediction@MLG/p9u07rhq"], model_name="rf_model", numbers_models=5, metric="Validation PRE")
MeanAccuracy(dataset1, gcn_rf_models1, link_probs)

print("Dataset2: ")
# gcn 
x = dataset2.get_nodes_x()
node_pairs = dataset2.get_training_nodepairs()
link_logits = gcn_model2.predict(x, node_pairs)
link_probs = link_logits.sigmoid()
gcn_rf_models2, configs, model_paths, sweep_name = parse_wandb_models(path=["baron/Link_Prediction@MLG/9k3h2wdu"], model_name="rf_model", numbers_models=5, metric="Validation PRE")
MeanAccuracy(dataset2, gcn_rf_models2, link_probs)

print("Dataset3: ")
# gcn 
x = dataset3.get_nodes_x()
node_pairs = dataset3.get_training_nodepairs()
link_logits = gcn_model3.predict(x, node_pairs)
link_probs = link_logits.sigmoid()
gcn_rf_models3, configs, model_paths, sweep_name = parse_wandb_models(path=["baron/Link_Prediction@MLG/zcjubt87"], model_name="rf_model", numbers_models=5, metric="Validation PRE")
MeanAccuracy(dataset3, gcn_rf_models3, link_probs)

Dataset1: 
training auc: 0.848 pre: 0.807, validation auc: 0.858 pre: 0.819
Dataset2: 
training auc: 0.876 pre: 0.837, validation auc: 0.886 pre: 0.852
Dataset3: 
training auc: 0.815 pre: 0.774, validation auc: 0.826 pre: 0.780


## 5. GCN+NN

In [354]:
# training
wandb.init(project='Link_Prediction@MLG', entity="baron", group="GCN")
wandb.config["learning_rate"] = 0.0001
wandb.config["MAX_ITERATION"] = 10000
wandb.config["Dataset"] = "dataset1"

dataset1 = ReadData(wandb.config["Dataset"])
x = dataset1.get_nodes_x()
node_pairs = dataset1.get_training_nodepairs()
link_probs = gcn_model1.predict(x, node_pairs)
# link_probs = link_logits.sigmoid()
# neighbor feature
training_array, target = dataset1.get_training_data()
training_data = torch.cat((torch.tensor(training_array, dtype=torch.float), link_probs.reshape(-1, 1)), 1)
target = torch.Tensor(target.astype(int))
X_train, X_val, y_train, y_val = train_test_split(training_data, target, test_size=0.1)
gcn_nn_model1 = train(X_train, X_val, y_train, y_val)

wandb.finish()



[0/10000] Loss:5429.8076
[500/10000] Loss:2842.8120
[1000/10000] Loss:2757.0232
[1500/10000] Loss:2724.2178
[2000/10000] Loss:2709.4746
[2500/10000] Loss:2699.7729
[3000/10000] Loss:2689.2524
[3500/10000] Loss:2678.4617
[4000/10000] Loss:2668.6633
[4500/10000] Loss:2658.2334
[5000/10000] Loss:2647.7224
[5500/10000] Loss:2638.2883
[6000/10000] Loss:2630.0439
[6500/10000] Loss:2623.0024
[7000/10000] Loss:2616.2830
[7500/10000] Loss:2610.8594
[8000/10000] Loss:2604.5078
[8500/10000] Loss:2598.6121
[9000/10000] Loss:2594.2192
[9500/10000] Loss:2590.9426



0,1
training_loss,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▃▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄

0,1
Training AUC,0.92548
Training PRE,0.93709
Validation AUC,0.91825
Validation PRE,0.93374
training_loss,2588.13379
val_loss,411.42435


In [355]:
# training
wandb.init(project='Link_Prediction@MLG', entity="baron", group="GCN")
wandb.config["learning_rate"] = 0.0001
wandb.config["MAX_ITERATION"] = 10000
wandb.config["Dataset"] = "dataset2"

dataset2 = ReadData(wandb.config["Dataset"])
x = dataset2.get_nodes_x()
node_pairs = dataset2.get_training_nodepairs()
link_probs = gcn_model2.predict(x, node_pairs)
# link_probs = link_logits.sigmoid()
# neighbor feature
training_array, target = dataset2.get_training_data()
training_data = torch.cat((torch.tensor(training_array, dtype=torch.float), link_probs.reshape(-1, 1)), 1)
target = torch.Tensor(target.astype(int))
X_train, X_val, y_train, y_val = train_test_split(training_data, target, test_size=0.1)
gcn_nn_model2 = train(X_train, X_val, y_train, y_val)

wandb.finish()



[0/10000] Loss:4719.3384
[500/10000] Loss:2065.1960
[1000/10000] Loss:1987.9291
[1500/10000] Loss:1963.9651
[2000/10000] Loss:1947.6254
[2500/10000] Loss:1934.4635
[3000/10000] Loss:1923.9811
[3500/10000] Loss:1913.7996
[4000/10000] Loss:1900.3619
[4500/10000] Loss:1890.3495
[5000/10000] Loss:1883.0386
[5500/10000] Loss:1873.4359
[6000/10000] Loss:1868.3459
[6500/10000] Loss:1863.8324
[7000/10000] Loss:1859.8173
[7500/10000] Loss:1856.1672
[8000/10000] Loss:1853.9807
[8500/10000] Loss:1851.3228
[9000/10000] Loss:1847.1819
[9500/10000] Loss:1845.3076



0,1
training_loss,█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Training AUC,0.95214
Training PRE,0.95763
Validation AUC,0.94712
Validation PRE,0.94995
training_loss,1843.21777
val_loss,222.42969


In [356]:
# training
wandb.init(project='Link_Prediction@MLG', entity="baron", group="GCN")
wandb.config["learning_rate"] = 0.0001
wandb.config["MAX_ITERATION"] = 20000
wandb.config["Dataset"] = "dataset3"

dataset = ReadData(wandb.config["Dataset"])
x = dataset.get_nodes_x()
node_pairs = dataset.get_training_nodepairs()
link_probs = gcn_model3.predict(x, node_pairs)
# link_probs = link_logits.sigmoid()
# neighbor feature
training_array, target = dataset.get_training_data()
training_data = torch.cat((torch.tensor(training_array, dtype=torch.float), link_probs.reshape(-1, 1)), 1)
target = torch.Tensor(target.astype(int))
X_train, X_val, y_train, y_val = train_test_split(training_data, target, test_size=0.1)
gcn_nn_model3 = train(X_train, X_val, y_train, y_val)

wandb.finish()



[0/20000] Loss:1609.2957
[500/20000] Loss:940.3589
[1000/20000] Loss:907.5302
[1500/20000] Loss:894.5874
[2000/20000] Loss:883.3215
[2500/20000] Loss:868.7643
[3000/20000] Loss:860.5479
[3500/20000] Loss:857.1821
[4000/20000] Loss:853.3588
[4500/20000] Loss:851.9291
[5000/20000] Loss:848.0945
[5500/20000] Loss:845.7708
[6000/20000] Loss:842.5715
[6500/20000] Loss:839.7527
[7000/20000] Loss:836.7341
[7500/20000] Loss:834.3440
[8000/20000] Loss:832.6584
[8500/20000] Loss:831.9530
[9000/20000] Loss:831.7958
[9500/20000] Loss:828.9706
[10000/20000] Loss:827.7080
[10500/20000] Loss:827.2259
[11000/20000] Loss:825.5567
[11500/20000] Loss:824.3710
[12000/20000] Loss:824.4882
[12500/20000] Loss:822.8684
[13000/20000] Loss:818.4312
[13500/20000] Loss:816.9916
[14000/20000] Loss:815.8712
[14500/20000] Loss:814.6362
[15000/20000] Loss:814.4839
[15500/20000] Loss:813.1956
[16000/20000] Loss:814.0078
[16500/20000] Loss:811.6947
[17000/20000] Loss:810.9167
[17500/20000] Loss:809.9041
[18000/20000] L

0,1
training_loss,█▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂▁▁▂▁▂▁▁▁▁▁▁▁▁▁
val_loss,█▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▂▁▁▂▁▂▂▂▃▂▂▂▂▂▃▃▃▃

0,1
Training AUC,0.91629
Training PRE,0.92828
Validation AUC,0.90553
Validation PRE,0.9159
training_loss,810.92249
val_loss,100.25244


## Evaluation

In [17]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [18]:
def evaluation(y_train, y_val, y_train_hat, y_val_hat):
    training_auc, val_auc = roc_auc_score(y_train, y_train_hat), roc_auc_score(y_val, y_val_hat)
    training_pre, val_pre = average_precision_score(y_train, y_train_hat), average_precision_score(y_val, y_val_hat)
    
    return training_auc, training_pre, val_auc, val_pre

## Output

### Simple NN

In [19]:
upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset1/upload.csv")
dataset1 = ReadData("dataset1")
X_testing = dataset1.get_testing_data()
X_testing = torch.tensor(X_testing).float()
upload_csv["prob"] = F.sigmoid(nn_model1(X_testing)).detach().numpy()
print("dataset1:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_nn1_rwr.csv')

upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset2/upload.csv")
dataset2 = ReadData("dataset2")
X_testing = dataset2.get_testing_data()
X_testing = torch.tensor(X_testing).float()
upload_csv["prob"] = F.sigmoid(nn_model2(X_testing)).detach().numpy()
print("\ndataset2:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_nn2_rwr.csv')

upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset3/upload.csv")
dataset3 = ReadData("dataset3")
X_testing = dataset3.get_testing_data()
X_testing = torch.tensor(X_testing).float()
upload_csv["prob"] = F.sigmoid(nn_model3(X_testing)).detach().numpy()
print("\ndataset3:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_nn3_rwr.csv')

rwr scores..: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2708/2708 [00:10<00:00, 257.52it/s]
rwr scores..: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2708/2708 [00:06<00:00, 440.61it/s]



testing dataframe: 
          id    to  from     node_pair  ComNei  PreAtt  Jaccard     AA    Rwr
2      E3964  2405  1765  (2405, 1765)       0       6    0.000  0.000  0.026
1      E4849    81  1634    (81, 1634)       0       6    0.000  0.000  0.011
1451   E5663  2104   593   (2104, 593)       0       0    0.000  0.000  0.000
1445  E10013  2033    63    (2033, 63)       1       8    0.125  0.417  0.000
1446   E3714  1130  2283  (1130, 2283)       0       6    0.000  0.000  0.000
dataset1:
       id          prob
0  E10559  2.959970e-01
1   E4849  1.000000e+00
2   E3964  9.265349e-35
3    E542  5.134942e-01
4    E331  8.329121e-01


rwr scores..: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3311/3311 [00:13<00:00, 247.55it/s]
rwr scores..: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3311/3311 [00:06<00:00, 522.60it/s]



testing dataframe: 
         id    to  from     node_pair  ComNei  PreAtt  Jaccard     AA    Rwr
3     E5670  1063  1101  (1063, 1101)       2     468    0.024  1.135  0.024
4     E5005  1067  1710  (1067, 1710)       0      14    0.000  0.000  0.009
0     E3064  1315   586   (1315, 586)       0      14    0.000  0.000  0.000
1254  E8203  1336    35    (1336, 35)       0       2    0.000  0.000  0.000
1266  E6154   661   485    (661, 485)       0       3    0.000  0.000  0.000

dataset2:
      id      prob
0  E3064  0.607592
1   E298  0.587606
2  E3512  1.000000
3  E5670  1.000000
4  E5005  0.997793


rwr scores..: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 876/876 [00:01<00:00, 520.16it/s]
rwr scores..: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 876/876 [00:00<00:00, 899.27it/s]



testing dataframe: 
        id   to  from   node_pair  ComNei  PreAtt  Jaccard     AA    Rwr
5    E1391  117   793  (117, 793)       1       4    0.333  0.721  0.016
4    E2161  466   199  (466, 199)       0       1    0.000  0.000  0.008
2    E3190  739   468  (739, 468)       0       8    0.000  0.000  0.001
0     E370   26   317   (26, 317)       0       6    0.000  0.000  0.000
434  E1511  522   773  (522, 773)       1       2    0.500  0.721  0.000

dataset3:
      id          prob
0   E370  5.037491e-01
1   E667  5.964093e-01
2  E3190  2.294454e-13
3   E848  4.648177e-01
4  E2161  1.000000e+00




### Random forest

In [19]:
upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset1/upload.csv")
X_testing = dataset1.get_testing_data()
sum_value = 0
for model in rf_models1:
    predict_value = model.predict(X_testing)
    sum_value += predict_value
upload_csv["prob"] = sum_value/len(rf_models1)
print("dataset1:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_rf1_rwr.csv')

upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset2/upload.csv")
X_testing = dataset2.get_testing_data()
sum_value = 0
for model in rf_models2:
    predict_value = model.predict(X_testing)
    sum_value += predict_value
upload_csv["prob"] = sum_value/len(rf_models2)
print("\ndataset2:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_rf2_rwr.csv')

upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset3/upload.csv")
X_testing = dataset3.get_testing_data()
sum_value = 0
for model in rf_models3:
    predict_value = model.predict(X_testing)
    sum_value += predict_value
upload_csv["prob"] = sum_value/len(rf_models3)
print("\ndataset3:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_rf3_rwr.csv')

dataset1:
       id      prob
0  E10559  0.517625
1   E4849  0.426587
2   E3964  0.426587
3    E542  0.693317
4    E331  0.920115

dataset2:
      id      prob
0  E3064  0.572921
1   E298  0.653814
2  E3512  0.999594
3  E5670  0.995861
4  E5005  0.572921

dataset3:
      id      prob
0   E370  0.477373
1   E667  0.445013
2  E3190  0.473666
3   E848  0.520315
4  E2161  0.413861


### GCN

In [370]:
upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset1/upload.csv")
dataset1 = ReadData("dataset1")
x = dataset1.get_nodes_x()
test_node_pairs = dataset1.get_test_nodepairs()
link_logits = gcn_model1.predict(x, test_node_pairs)
link_probs = link_logits.sigmoid()
upload_csv["prob"] = link_probs.detach().numpy()
print("dataset1:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_gcn1_1000.csv')

upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset2/upload.csv")
dataset2 = ReadData("dataset2")
x = dataset2.get_nodes_x()
test_node_pairs = dataset2.get_test_nodepairs()
link_logits = gcn_model2.predict(x, test_node_pairs)
link_probs = link_logits.sigmoid()
upload_csv["prob"] = link_probs.detach().numpy()
print("\ndataset2:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_gcn2_1000.csv')

upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset3/upload.csv")
dataset3 = ReadData("dataset3")
x = dataset3.get_nodes_x()
test_node_pairs = dataset3.get_test_nodepairs()
link_logits = gcn_model3.predict(x, test_node_pairs)
link_probs = link_logits.sigmoid()
upload_csv["prob"] = link_probs.detach().numpy()
print("\ndataset3:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_gcn3_1000.csv')

dataset1:
       id      prob
0  E10559  0.880797
1   E4849  0.500000
2   E3964  0.982014
3    E542  0.731059
4    E331  0.731059

dataset2:
      id      prob
0  E3064  0.993307
1   E298  0.999089
2  E3512  0.952574
3  E5670  0.999089
4  E5005  0.731059

dataset3:
      id  prob
0   E370   1.0
1   E667   1.0
2  E3190   1.0
3   E848   1.0
4  E2161   1.0


### GCN + Random forest

In [23]:
upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset1/upload.csv")
x = dataset1.get_nodes_x()
test_node_pairs = dataset1.get_test_nodepairs()
link_logits = gcn_model1.predict(x, test_node_pairs)
link_probs = link_logits.sigmoid()

X_testing = dataset1.get_testing_data()
X_testing = np.concatenate((X_testing, link_probs.cpu().detach().numpy().reshape(-1, 1)), axis=1)
sum_value = 0
for model in gcn_rf_models1:
    predict_value = model.predict(X_testing)
    sum_value += predict_value
upload_csv["prob"] = sum_value/len(gcn_rf_models1)
print("dataset1:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_gcn_rf1.csv')

upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset2/upload.csv")
x = dataset2.get_nodes_x()
test_node_pairs = dataset2.get_test_nodepairs()
link_logits = gcn_model2.predict(x, test_node_pairs)
link_probs = link_logits.sigmoid()

X_testing = dataset2.get_testing_data()
X_testing = np.concatenate((X_testing, link_probs.cpu().detach().numpy().reshape(-1, 1)), axis=1)
sum_value = 0
for model in gcn_rf_models2:
    predict_value = model.predict(X_testing)
    sum_value += predict_value
upload_csv["prob"] = sum_value/len(gcn_rf_models2)
print("\ndataset2:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_gcn_rf2.csv')

upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset3/upload.csv")
x = dataset3.get_nodes_x()
test_node_pairs = dataset3.get_test_nodepairs()
link_logits = gcn_model3.predict(x, test_node_pairs)
link_probs = link_logits.sigmoid()

X_testing = dataset3.get_testing_data()
X_testing = np.concatenate((X_testing, link_probs.cpu().detach().numpy().reshape(-1, 1)), axis=1)
sum_value = 0
for model in gcn_rf_models3:
    predict_value = model.predict(X_testing)
    sum_value += predict_value
upload_csv["prob"] = sum_value/len(gcn_rf_models3)
print("\ndataset3:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_gcn_rf3.csv')

dataset1:
       id      prob
0  E10559  0.534469
1   E4849  0.305609
2   E3964  0.649014
3    E542  0.568984
4    E331  0.912752

dataset2:
      id      prob
0  E3064  0.852497
1   E298  0.897005
2  E3512  0.988851
3  E5670  0.997816
4  E5005  0.373779

dataset3:
      id      prob
0   E370  0.509860
1   E667  0.501181
2  E3190  0.496838
3   E848  0.532785
4  E2161  0.454266


### GCN + NN

In [357]:
upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset1/upload.csv")
dataset1 = ReadData("dataset1")
x = dataset1.get_nodes_x()
test_node_pairs = dataset1.get_test_nodepairs()
link_probs = gcn_model1.predict(x, test_node_pairs)
# link_probs = link_logits.sigmoid()
# neighbor
X_testing = dataset1.get_testing_data()
X_testing = torch.cat((torch.tensor(X_testing, dtype=torch.float), link_probs.reshape(-1, 1)), 1)
upload_csv["prob"] = F.sigmoid(gcn_nn_model1(X_testing)).detach().numpy()
print("dataset1:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_nn1_gcn_nosig.csv')

upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset2/upload.csv")
dataset2 = ReadData("dataset2")
x = dataset2.get_nodes_x()
test_node_pairs = dataset2.get_test_nodepairs()
link_probs = gcn_model2.predict(x, test_node_pairs)
# link_probs = link_logits.sigmoid()
# neighbor
X_testing = dataset2.get_testing_data()
X_testing = torch.cat((torch.tensor(X_testing, dtype=torch.float), link_probs.reshape(-1, 1)), 1)
upload_csv["prob"] = F.sigmoid(gcn_nn_model2(X_testing)).detach().numpy()
print("\ndataset2:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_nn2_gcn_nosig.csv')

upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset3/upload.csv")
dataset3 = ReadData("dataset3")
x = dataset3.get_nodes_x()
test_node_pairs = dataset3.get_test_nodepairs()
link_probs = gcn_model3.predict(x, test_node_pairs)
# link_probs = link_logits.sigmoid()
# neighbor
X_testing = dataset3.get_testing_data()
X_testing = torch.cat((torch.tensor(X_testing, dtype=torch.float), link_probs.reshape(-1, 1)), 1)
upload_csv["prob"] = F.sigmoid(gcn_nn_model3(X_testing)).detach().numpy()
print("\ndataset3:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_nn3_gcn_nosig.csv')



dataset1:
       id      prob
0  E10559  0.426127
1   E4849  0.033132
2   E3964  0.650964
3    E542  0.370743
4    E331  0.958347





dataset2:
      id      prob
0  E3064  0.838953
1   E298  0.929688
2  E3512  1.000000
3  E5670  1.000000
4  E5005  0.393786

dataset3:
      id      prob
0   E370  0.422646
1   E667  0.772561
2  E3190  0.501515
3   E848  0.530014
4  E2161  0.505471


