In [3]:
import os
import pickle
import random

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.optim import Adam

from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.utils import train_test_split_edges,negative_sampling
from torch_geometric.utils import add_self_loops, remove_self_loops
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import train_test_split

from model_MVGE_three_loss_gcn_no_concat_para import MVGE1,MVGE2
# NOTE(review): names such as read_data, process_data, get_all_cv_score and nx
# are not imported explicitly anywhere in the visible cells; presumably they
# come from this star import (or from an earlier cell) - confirm.
from input_data import *
from dataset import CustomDataset

SEED = 12345


def seed_everything(seed):
    """Seed every random number generator the notebook draws from.

    Seeding torch alone is not enough here: torch_geometric's negative
    sampling goes through Python's `random` module, and pandas `.sample` /
    scikit-learn `train_test_split` fall back to NumPy's global generator
    when no `random_state` is passed.

    Parameters
    ----------
    seed : int
        Value used for the `random`, NumPy and torch generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


seed_everything(SEED)
# torch.cuda.set_device(0)
# Run on the first GPU when one is available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

import warnings
# Silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

### Node classification

In [4]:
# Node classification: train MVGE1 several times on one dataset, score the
# embeddings with cross-validated logistic regression and report the average.
dataset_name = 'cora'
link_pred = True #whether link prediction task
multi_view = True

# Per-dataset loss weights [a, b] that are passed to model.loss below.
weight = {'cora':[0.1,0.7],'citeseer':[0.1,0.1],'pubmed':[0.5,0.4],'cora_full':[0.1,0.1],'chamelemon':[0.3,0],'cornell':[0.7,1.0],'texas':[0.7,1.0],'wisconsin':[0.9,1.0]}
if dataset_name not in weight:
    # Fail with a readable message instead of "'NoneType' is not subscriptable".
    raise KeyError("no loss weights configured for dataset {!r}; add an entry to `weight`".format(dataset_name))
a, b = weight[dataset_name]

if 'syn' in dataset_name:
    assor = "h0.10-r1" # the global homophily of the synthetic dataset
    # NOTE(review): the link-prediction and pairwise cells load synthetic data
    # from "./input/datasets/{}" while this cell uses "./input/{}" - confirm
    # which root is correct.
    dataset = CustomDataset(root="./input/{}".format(dataset_name), name=assor, setting="gcn", seed=15)
    G = nx.DiGraph(dataset.adj)
    G_label = dataset.labels
    G_attr = pd.DataFrame(dataset.features.toarray())
    G_attr['nodes'] = G_attr.index
else:
    iG,G,G_label,G_attr = read_data(dataset_name)

data = process_data(G,G_attr,link_pred,multi_view)
data = data.to(device)

# Experiment settings. They are defined once so the loop bounds and the
# averaging divisor cannot drift apart.
n_runs = 10
n_epochs = 200
out_channels = 128
lr = 0.001
ratios = [0.1, 0.3, 0.5, 0.7, 0.9]
os.makedirs("datasets", exist_ok=True)

# Running sum of the per-run score tables (one row per ratio, four columns).
score_sum = pd.DataFrame(np.zeros((len(ratios), 4)))
for run in range(n_runs):
    model = MVGE1(data.x.shape[1],data.x_neighbor.shape[1],out_channels).to(device)
    optimizer = Adam(model.parameters(), lr=lr)

    #start training
    for _ in range(n_epochs):
        model.train()
        optimizer.zero_grad()
        loss = model.loss(data.x,data.x_neighbor, data.edge_index, data.edge_index,a,b)

        loss.backward()
        optimizer.step()

    #start evaluating
    # NOTE(review): the model is still in train mode here; if MVGE1 contains
    # dropout or batch-norm layers, model.eval() may be wanted - confirm
    # against the model file.
    z,z1,z2 = model.embed(data.x,data.x_neighbor,data.edge_index)
    node_embedding = pd.DataFrame(z.cpu().detach().numpy())

    cv_scores = get_all_cv_score(node_embedding, G, G_label, [LogisticRegression(n_jobs=10)])
    run_table = pd.DataFrame(cv_scores).T
    run_table.columns = ['ratio {}'.format(j) for j in ratios]
    run_table.index = ['train-micro', 'micro-std','train-macro','macro-std']
    print(run_table)
    score_sum = score_sum + pd.DataFrame(cv_scores)
    del model

# Average over the runs. The value after "±" is the mean of the per-run std
# columns, not the standard deviation across runs.
mean_scores = score_sum / n_runs
summary_rows = []
for i in range(len(ratios)):
    micro = "%0.4f±%0.4f" % (mean_scores.iloc[i, 0], mean_scores.iloc[i, 1])
    macro = "%0.4f±%0.4f" % (mean_scores.iloc[i, 2], mean_scores.iloc[i, 3])
    summary_rows.append([micro, macro])

summary = pd.DataFrame(summary_rows).T
summary.columns = ['ratio {}'.format(j) for j in ratios]
summary.index = ['train-micro', 'train-macro']
display(summary)

cora Have 2708 Nodes, 10556 Edges, 1433 Attribute, 7 Classes
-----------start aggregate neighbor-----------


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2708.0), HTML(value='')))


-----------process data completed-----------
LogisticRegression(n_jobs=10)


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))


             ratio 0.1  ratio 0.3  ratio 0.5  ratio 0.7  ratio 0.9
train-micro   0.844444   0.863311   0.867066   0.864380   0.867461
micro-std     0.050239   0.033287   0.043725   0.038287   0.030641
train-macro   0.822508   0.860374   0.850393   0.851723   0.855697
macro-std     0.072973   0.040244   0.055275   0.040346   0.036217


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-f36bbdd0d60d>", line 38, in <module>
    loss = model.loss(data.x,data.x_neighbor, data.edge_index, data.edge_index,a,b)
  File "/notebooks/MVGE-main/model_MVGE_three_loss_gcn_no_concat_para.py", line 180, in loss
    neg_edge_index = negative_sampling(all_edge_index_tmp, z_self.size(0), pos_edge_index.size(1))
  File "/usr/local/lib/python3.6/dist-packages/torch_geometric/utils/negative_sampling.py", line 76, in negative_sampling
    perm = sample(size, int(alpha * num_neg_samples))
  File "/usr/local/lib/python3.6/dist-packages/torch_geometric/utils/negative_sampling.py", line 12, in sample
    return torch.tensor(random.sample(range(high), size), device=device)
  File "/usr/lib/python3.6/random.py", line 340, in sample
    result[i] = population[j]
KeyboardInterrupt



KeyboardInterrupt: 

### Link prediction

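The link prediction task does not need its own loss weights: the cell below does not set them and passes whatever `a` and `b` the node-classification cell defined.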

In [10]:
# Link prediction: train MVGE2 several times, score every candidate edge by
# the dot product of its endpoint embeddings and fit a logistic regression on
# that single feature.
# NOTE(review): `weight` is not read anywhere in this cell, and assigning it
# replaces the per-dataset dict of the node-classification cell.
weight = [0,0.2,0.4,0.6,0.8,1]
dataset_name = 'cora'
link_pred = True #whether link prediction task
multi_view = True


if 'syn' in dataset_name:
    assor = "h0.10-r1" # the global homophily of the synthetic dataset
    dataset = CustomDataset(root="./input/datasets/{}".format(dataset_name), name=assor, setting="gcn", seed=15)
    G = nx.DiGraph(dataset.adj)
    G_label = dataset.labels
    G_attr = pd.DataFrame(dataset.features.toarray())
    G_attr['nodes'] = G_attr.index
else:
    iG,G,G_label,G_attr = read_data(dataset_name)

data = process_data(G,G_attr,link_pred,multi_view)
data = data.to(device)

# Negative training edges: sampled once and shared by every run. They are
# drawn against data.edge_index with self loops added, and there are as many
# of them as there are positive training edges.
all_edge_index_tmp, _ = remove_self_loops(data.edge_index)
all_edge_index_tmp, _ = add_self_loops(all_edge_index_tmp)
neg_edge_index = negative_sampling(all_edge_index_tmp, G_attr.shape[0], data.train_pos_edge_index.size(1))


out_channels = 128
lr = 0.001
n_runs = 10
# Kept separate from the loop variable: reusing one name for both made every
# run after the first train for one epoch fewer than the run before it.
n_epochs = 200
aucs, aps = [], []
os.makedirs("datasets", exist_ok=True)


def _edge_scores(embeddings, edge_index):
    """Return an (n_edges, 1) numpy array of endpoint dot products."""
    products = embeddings[edge_index[0], :] * embeddings[edge_index[1], :]
    return np.array(products.cpu().detach().numpy().sum(axis=1)).reshape(-1, 1)


# `a` and `b` are not defined in this cell; they are the values left in the
# kernel by the node-classification cell.
for run in range(n_runs):
    model = MVGE2(data.x.shape[1],data.x_neighbor.shape[1],out_channels).to(device)
    optimizer = Adam(model.parameters(), lr=lr)

    #start training
    for _ in range(n_epochs):
        model.train()
        optimizer.zero_grad()
        loss = model.loss(data.x,data.x_neighbor, data.train_pos_edge_index, data.edge_index,a,b)

        loss.backward()
        optimizer.step()

    #start evaluating
    z,z1,z2 = model.embed(data.x,data.x_neighbor,data.edge_index)
    del model

    # Positives first, negatives second - the label vectors follow that order.
    train = torch.cat([data.train_pos_edge_index, neg_edge_index], dim=1)
    test = torch.cat([data.test_pos_edge_index, data.test_neg_edge_index], dim=1)

    pred_train = _edge_scores(z, train)
    label_train = np.concatenate((
        np.ones([data.train_pos_edge_index.shape[1],]),
        np.zeros([neg_edge_index.shape[1],]),
    ))

    pred_test = _edge_scores(z, test)
    label_test = np.concatenate((
        np.ones([data.test_pos_edge_index.shape[1],]),
        np.zeros([data.test_neg_edge_index.shape[1],]),
    ))

    clf = LogisticRegressionCV(Cs=10,max_iter=100,n_jobs=10,verbose=1,scoring='roc_auc')
    clf.fit(pred_train,label_train)
    # Probability of the positive class, computed once for both metrics.
    test_proba = clf.predict_proba(pred_test)[:,1]
    roc_auc = roc_auc_score(label_test, test_proba)
    ap = average_precision_score(label_test, test_proba)
    print('roc_auc:{:.4f}, ap:{:.4f}'.format(roc_auc,ap))

    aucs.append(roc_auc)
    aps.append(ap)

print('End of Tainning {}! weight of loss:'.format(run),a,b)
print('AUC ROC :{:.4f}±{:.4f}'.format(np.mean(aucs),np.std(aucs)))
print('AP: {:.4f}±{:.4f}'.format(np.mean(aps),np.std(aps)))

### Pairwise node classification 

In [12]:
def get_pair_wise_score(node_embedding,name,test_size):
    """Sample a fresh pairwise train/test split and score embeddings on it.

    Convenience wrapper: builds the split with `get_pair_wise` and evaluates
    it with `get_score`, instead of carrying a second copy of both bodies.

    Parameters
    ----------
    node_embedding : pandas.DataFrame
        One row per node; rows are looked up by label with `.loc`.
    name : str
        Dataset name, used to load the graph and the pairwise pickle files.
    test_size : float or int
        Forwarded to `train_test_split` for both positive and negative pairs.

    Returns
    -------
    tuple
        `(auc, ap)` measured on the held-out pairs.

    Notes
    -----
    Every call reloads the dataset and draws a new random split, so scores
    from different calls are not measured on the same pairs.
    """
    train, test, train_label, test_label = get_pair_wise(name, test_size)
    return get_score(node_embedding, train, test, train_label, test_label)

def get_pair_wise(name,test_size,random_state=None):
    """Build a train/test split of positive and negative node pairs.

    Loads the graph for `name` together with the two pairwise pickle files,
    draws as many positive and as many negative pairs as the graph has edges,
    splits each set with `train_test_split`, and stacks positives on top of
    negatives.

    Parameters
    ----------
    name : str
        Dataset name passed to `read_data` and used in the pickle file names.
    test_size : float or int
        Forwarded to `train_test_split`.
    random_state : int or None, optional
        Seed for the sampling and the splits. With the default `None` the
        global NumPy generator is used, exactly as before this parameter
        existed.

    Returns
    -------
    tuple
        `(train, test, train_label, test_label)`. `train` and `test` are
        DataFrames with positives first and negatives second; the label
        lists hold 1 for each positive row followed by 0 for each negative
        row.
    """
    # One shared stream, so the four random draws below differ from each
    # other but are reproducible as a whole.
    rng = None if random_state is None else np.random.RandomState(random_state)

    _, G, _, _ = read_data(name)
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    mask_link_positive = pd.read_pickle("./input/pairwise_node/{}_mask_link_positive.pickle".format(name))
    mask_link_negtive = pd.read_pickle("./input/pairwise_node/{}_mask_link_negtive.pickle".format(name))

    num = len(G.edges())
    # The positive table is transposed before sampling and the negative one
    # is not; presumably they are stored with different orientations - verify.
    positive_pairs = mask_link_positive.T.sample(n=num, random_state=rng).reset_index(drop=True)
    negative_pairs = mask_link_negtive.sample(n=num, random_state=rng).reset_index(drop=True)

    positive_train, positive_test = train_test_split(positive_pairs, test_size=test_size, random_state=rng)
    negative_train, negative_test = train_test_split(negative_pairs, test_size=test_size, random_state=rng)
    train = pd.concat([positive_train, negative_train], axis=0)
    test = pd.concat([positive_test, negative_test], axis=0)

    # Labels follow the real split sizes rather than assuming an even halving.
    train_label = [1] * len(positive_train) + [0] * len(negative_train)
    test_label = [1] * len(positive_test) + [0] * len(negative_test)
    return train,test,train_label,test_label

def get_score(node_embedding,train,test,train_label,test_label):
    """Fit a logistic regression on pair features and score it on test pairs.

    Each pair is represented by the element-wise product of the embeddings
    of its two nodes. A 5-fold `LogisticRegressionCV` tuned on ROC-AUC is
    fitted on the training pairs and evaluated on the test pairs.

    Parameters
    ----------
    node_embedding : pandas.DataFrame
        One row per node; rows are selected by label with `.loc`.
    train, test : pandas.DataFrame
        Node pairs; columns `0` and `1` hold the two node labels.
    train_label, test_label : sequence
        Targets aligned row by row with `train` and `test`.

    Returns
    -------
    tuple
        `(auc, ap)`: ROC-AUC and average precision on the test pairs.
    """
    def _pair_features(pairs):
        # Reset both indexes so the product lines up by position and not by
        # node label.
        first = node_embedding.loc[pairs[0]].reset_index(drop=True)
        second = node_embedding.loc[pairs[1]].reset_index(drop=True)
        return first * second

    train_pred = _pair_features(train)
    test_pred = _pair_features(test)

    clf = LogisticRegressionCV(cv=5,Cs=10,max_iter=100,n_jobs=20,verbose=1,scoring='roc_auc')
    clf.fit(train_pred,train_label)

    # Positive-class probabilities, computed once for both metrics.
    test_proba = clf.predict_proba(test_pred)[:,1]
    auc = roc_auc_score(test_label, test_proba)
    ap = average_precision_score(test_label, test_proba)
    return auc,ap

In [15]:
# Pairwise node classification: train MVGE2 several times on the full edge
# set and score each run's embeddings on one fixed split of node pairs.
# NOTE(review): `weight` is not read anywhere in this cell, and assigning it
# replaces the per-dataset dict of the node-classification cell.
weight = [0,0.2,0.4,0.6,0.8,1]
dataset_name = 'cora'
link_pred = True #whether link prediction task
multi_view = True


if 'syn' in dataset_name:
    assor = "h0.10-r1" # the global homophily of the synthetic dataset
    dataset = CustomDataset(root="./input/datasets/{}".format(dataset_name), name=assor, setting="gcn", seed=15)
    G = nx.DiGraph(dataset.adj)
    G_label = dataset.labels
    G_attr = pd.DataFrame(dataset.features.toarray())
    G_attr['nodes'] = G_attr.index
else:
    iG,G,G_label,G_attr = read_data(dataset_name)

data = process_data(G,G_attr,link_pred,multi_view)
data = data.to(device)


# The split is drawn once, so every run is scored on the same pairs.
train,test,train_label,test_label = get_pair_wise(dataset_name,test_size=0.15)
out_channels = 128
lr = 0.001
n_runs = 5
# Kept separate from the loop variable: reusing one name for both made every
# run after the first train for one epoch fewer than the run before it.
n_epochs = 200
aucs, aps = [], []
os.makedirs("datasets", exist_ok=True)

# `a` and `b` are not defined in this cell; they are the values left in the
# kernel by the node-classification cell.
for run in range(n_runs):
    model = MVGE2(data.x.shape[1],data.x_neighbor.shape[1],out_channels).to(device)
    optimizer = Adam(model.parameters(), lr=lr)

    #start training
    for _ in range(n_epochs):
        model.train()
        optimizer.zero_grad()
        loss = model.loss(data.x,data.x_neighbor, data.edge_index, data.edge_index,a,b)

        loss.backward()
        optimizer.step()

    #start evaluating
    z,z1,z2 = model.embed(data.x,data.x_neighbor,data.edge_index)
    node_embedding = pd.DataFrame(z.cpu().detach().numpy())
    auc,ap = get_score(node_embedding,train,test,train_label,test_label)
    aucs.append(auc)
    aps.append(ap)
    del model

# Report the number of runs actually performed (the message used to say 10).
print('End of Tainning of {} times!'.format(n_runs))
print('AUC ROC :{:.4f}±{:.4f}'.format(np.mean(aucs),np.std(aucs)))
print('AP: {:.4f}±{:.4f}'.format(np.mean(aps),np.std(aps)))

cora Have 2708 Nodes, 10556 Edges, 1433 Attribute, 7 Classes
-----------start aggregate neighbor-----------


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2708.0), HTML(value='')))


-----------process data completed-----------
cora Have 2708 Nodes, 10556 Edges, 1433 Attribute, 7 Classes


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of   5 | elapsed:    4.1s remaining:    6.1s
[Parallel(n_jobs=20)]: Done   5 out of   5 | elapsed:    4.5s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of   5 | elapsed:    3.1s remaining:    4.6s
[Parallel(n_jobs=20)]: Done   5 out of   5 | elapsed:    3.1s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of   5 | elapsed:    2.6s remaining:    3.9s
[Parallel(n_jobs=20)]: Done   5 out of   5 | elapsed:    3.0s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of   5 | elapsed:    3.3s remaining:    4.9s
[Parallel(n_jobs=20)]: Done   5 out of   5 | elapsed:    3.5s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parall

End of Tainning of 10 times!
AUC ROC :0.8333±0.0097
AP: 0.8345±0.0075
