In [1]:
import os.path as osp
import pandas as pd
import networkx as nx
import numpy as np
import scipy.sparse as sp

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch_geometric.datasets import Planetoid, Coauthor, Amazon
from torch_geometric.utils import train_test_split_edges, dense_to_sparse
from torch_geometric.nn import GAE, VGAE, APPNP,GCNConv,InnerProductDecoder
import torch_geometric.transforms as T
from torch_geometric.data import Data

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the Cora dataset
dataset = Planetoid(root='/tmp/cora', name='Cora')
data = dataset[0]
data = T.NormalizeFeatures()(data)

In [3]:
path = "hw2_data/"
dss = ['dataset1','dataset2','dataset3'] #datasets
datasets = dict()
for ds in dss:
    datasets[ds] = dict()
    datasets[ds]['content'] = pd.read_csv(path+ds+"/content.csv",delimiter = '\t',header = None)
    datasets[ds]['train'] = pd.read_csv(path+ds+"/train.csv",delimiter = ',')
    datasets[ds]['test'] = pd.read_csv(path+ds+"/test.csv",delimiter = ',')
    datasets[ds]['upload'] = pd.read_csv(path+ds+"/upload.csv",delimiter = ',')
datasets[dss[2]]['test'].head()

Unnamed: 0,id,to,from
0,E370,26,317
1,E667,196,323
2,E3190,739,468
3,E848,576,156
4,E2161,466,199


In [4]:
for dss_num in range(3):
    Content = datasets[dss[dss_num]]['content']
    Train = datasets[dss[dss_num]]['train']
    Test = datasets[dss[dss_num]]['test']
    Upload = datasets[dss[dss_num]]['upload']
    print(dss[dss_num])
    print(f'Nodes {Content.shape[0]}')
    print(f'Attributes for each node:{Content.shape[1]-1}')
    pos = Train[Train['label']==1].shape[0]
    print(f'Positive edges given: {pos}')

dataset1
Nodes 2708
Attributes for each node:1433
Positive edges given: 4324
dataset2
Nodes 3312
Attributes for each node:3703
Positive edges given: 3736
dataset3
Nodes 877
Attributes for each node:1703
Positive edges given: 1273


In [282]:
#For example:
dss_num = 0
Content = datasets[dss[dss_num]]['content']
Train = datasets[dss[dss_num]]['train']
Test = datasets[dss[dss_num]]['test']
Upload = datasets[dss[dss_num]]['upload']

In [283]:
def load_data(content,train, test):
    G = nx.Graph()
    # for easier split the edges, create 2 graph, 1 with positive edge, the other with given negative edges
    G_pos = nx.Graph()
    G_neg = nx.Graph()
    for i in range(len(content)):
        G.add_node(int(content.iloc[i,0]),features = np.array(content.iloc[i,1:]))
        # pos and neg
        G_pos.add_node(int(content.iloc[i,0]),features = np.array(content.iloc[i,1:]))
        G_neg.add_node(int(content.iloc[i,0]),features = np.array(content.iloc[i,1:]))
        
    for i in range(len(train)):
        # Adding nodes into G
        '''
        if train.loc[i,'from'] not in G:
            G.add_node(train.loc[i,'from'],features = graph_node_features_dict[train.loc[i,'from']])
        if train.loc[i,'to'] not in G:
            G.add_node(train.loc[i,'to'],features = graph_node_features_dict[train.loc[i,'to']])
        ''' 
        # Adding edges
        G.add_edge(train.loc[i,'from'],train.loc[i,'to'],label = train.loc[i,'label'])
        
        # pos and neg
        if train.loc[i,'label'] == 0: 
            G_neg.add_edge(train.loc[i,'from'],train.loc[i,'to'],label = train.loc[i,'label'])
        else:
            G_pos.add_edge(train.loc[i,'from'],train.loc[i,'to'],label = train.loc[i,'label'])

    adj = nx.adjacency_matrix(G,sorted(G.nodes()))
    adj_pos = nx.adjacency_matrix(G_pos,sorted(G_pos.nodes()))
    adj_neg = nx.adjacency_matrix(G_neg,sorted(G_neg.nodes()))
    
    features = np.array(
        [features for _, features in sorted(G.nodes(data='features'), key=lambda x: x[0])])
    #features[:,:1433] = preprocess_features(features[:,:1433])
    #features[:,1433:] = preprocess_features(features[:,1433:])
    # Skip train,valid,and test mask
    
    num_features = features.shape[1]
    features = torch.FloatTensor(features)
    return adj, adj_pos, adj_neg, features, num_features

In [284]:
def preprocess_features(features):
    # Row-normalize feature matrix and convert to tuple representation
    rowsum = np.array(features.sum(1),dtype = np.float32)
    rowsum = (rowsum==0)*1+rowsum
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    features = r_mat_inv.dot(features)
    return features

In [285]:
def increase_features(content,train):
    G = nx.Graph()
    for i in range(len(content)):
        G.add_node(int(content.iloc[i,0]),features = np.array(content.iloc[i,1:]))
        
    # Adding edges
    for i in range(len(train)):
        # pos and neg
        if train.loc[i,'label'] == 1: 
            G.add_edge(train.loc[i,'from'],train.loc[i,'to'],label = train.loc[i,'label'])
            
    df = content.copy()
    df['pagerank'] = nx.pagerank(G,alpha=0.9).values()
    df['pagerank'] = df['pagerank']/np.linalg.norm(df['pagerank'])
    
    hubs,aut = nx.hits(G)
    df['hubs'] = hubs.values()
    df['hubs'] = df['hubs']/np.linalg.norm(df['hubs'])
    df['aut'] = aut.values()
    df['aut'] = df['hubs']/np.linalg.norm(df['aut'])
    
    
    features_name = ['degree','in_degree','out_degree','eigenvec','katz','closeness','info',
                     'betweenness','rwbetween','combetween','load','subgraph','harmonic','local_reach',
                    'global_reach', 'percolation','order2','trophic','laplacian']
    
    df[features_name[0]] = nx.degree_centrality(G).values()
    #df[features_name[1]] = nx.in_degree_centrality(G).values()
    #df[features_name[2]] = nx.out_degree_centrality(G).values()
    df[features_name[3]] = nx.eigenvector_centrality(G).values()
    #df[features_name[4]] = nx.katz_centrality(G).values()
    df[features_name[5]] = nx.closeness_centrality(G).values()
    #df[features_name[6]] = nx.information_centrality(G).values()
    #df[features_name[7]] = nx.betweenness_centrality(G).values()
    #print('finish feature 7')
    #df[features_name[8]] = nx.current_flow_betweenness_centrality(G).values()
    #df[features_name[9]] = nx.communicability_betweenness_centrality(G).values()
    #print('finish feature 9')
    df[features_name[10]] = nx.load_centrality(G).values()
    print('finish feature 10')
    df[features_name[11]] = nx.subgraph_centrality(G).values()
    print('finish feature 11')
    df[features_name[12]] = nx.harmonic_centrality(G).values()
    print('finish feature 12')
    #df[features_name[13]] = nx.local_reaching_centrality(G).values()
    #print('finish feature 13')
    #df[features_name[14]] = nx.global_reaching_centrality(G).values()
    #print('finish feature 14')
    #df[features_name[15]] = nx.percolation_centrality(G).values()
    #print('finish feature 15')
    #df[features_name[16]] = nx.second_order_centrality(G).values()
    #print('finish feature 16')
    #df[features_name[17]] = nx.trophic_levels(G).values()
    #print('finish feature 17')
    #df[features_name[18]] = nx.laplacian_centrality(G).values()
    #print('finish feature 18')
    
    for fea_num in [0,3,5,10,11,12]:
        df[features_name[fea_num]] /= np.linalg.norm(df[features_name[fea_num]])
    
    return df

In [286]:
Content_mod = increase_features(Content,Train)
g, pos, neg, features, num_features = load_data(Content_mod,Train,Test)
A = g.toarray()
edge_index,_ = dense_to_sparse(torch.tensor(A))
Apos = pos.toarray()
edge_index_pos,_ = dense_to_sparse(torch.tensor(Apos))
Aneg = neg.toarray()
edge_index_neg,_ = dense_to_sparse(torch.tensor(Aneg))
data = Data(edge_index=edge_index_pos,x=features.to(torch.float),test = torch.tensor([Test['from'],Test['to']]))
data

finish feature 10
finish feature 11
finish feature 12


Data(x=[2708, 1442], edge_index=[2, 8472], test=[2, 2172])

In [287]:
Content_mod

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1433,pagerank,hubs,aut,degree,eigenvec,closeness,load,subgraph,harmonic
0,351,0,0,0,0,0,0,0,0,0,...,0,0.010754,1.213282e-06,1.402619e-05,0.010923,1.225641e-06,0.016812,2.736266e-07,0.000068,0.015284
1,1357,0,0,0,0,0,0,0,0,0,...,0,0.004605,1.842516e-05,2.130047e-04,0.003641,1.862686e-05,0.020120,0.000000e+00,0.000043,0.000055
2,272,0,0,0,0,0,0,0,0,0,...,0,0.006926,7.418108e-04,8.575730e-03,0.007282,7.646430e-04,0.023116,2.009531e-05,0.000215,0.000000
3,583,0,0,0,0,0,0,0,0,0,...,0,0.010008,3.558364e-04,4.113659e-03,0.010923,3.666547e-04,0.022610,1.294164e-04,0.000067,0.022150
4,391,0,0,0,0,0,0,0,0,0,...,0,0.013578,2.751365e-20,3.180725e-19,0.003641,1.250241e-31,0.000060,0.000000e+00,0.000013,0.023648
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,1081,0,0,0,0,0,0,0,0,0,...,0,0.011211,5.801834e-03,6.707231e-02,0.010923,5.801995e-03,0.021930,2.185307e-04,0.000106,0.024321
2704,743,0,0,0,0,0,0,0,0,0,...,0,0.008698,5.426008e-04,6.272755e-03,0.007282,5.426795e-04,0.018976,9.576333e-05,0.000021,0.000055
2705,1006,0,0,0,0,0,0,0,0,0,...,0,0.009364,2.794108e-21,3.230139e-20,0.003641,1.467326e-27,0.000090,0.000000e+00,0.000013,0.012533
2706,1826,0,0,0,0,1,0,0,0,0,...,0,0.013002,9.188410e-07,1.062229e-05,0.010923,9.226634e-07,0.016094,1.887064e-03,0.000032,0.021058


In [288]:
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
channels = 256

In [289]:
train_rate = 0.9
val_ratio = (1-train_rate)#/3
test_ratio = 0#(1-train_rate) / 3 * 2
data = train_test_split_edges(data.to(dev), val_ratio=val_ratio, test_ratio=0)
               #,train_pos_edge_index = edge_index_pos,neg = edge_index_neg,test = edge_index_test)
data = T.NormalizeFeatures()(data)



In [290]:
model_used = 'Hybrid'
scaling_factor = 1.8
class Encoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels, edge_index,model_used = 'GNAE'):
        super(Encoder, self).__init__()
        self.linear1 = nn.Linear(in_channels, out_channels)
        self.linear2 = nn.Linear(in_channels, out_channels)
        self.propagate = APPNP(K=2, alpha=0)
        self.model_used = model_used

    def forward(self, x, edge_index,not_prop=0):
        if self.model_used  == 'GNAE':
            x = self.linear1(x)
            x = F.normalize(x,p=2,dim=1)  * scaling_factor
            x = self.propagate(x, edge_index)
            return x

        if self.model_used  == 'VGNAE':
            x_ = self.linear1(x)
            x_ = self.propagate(x_, edge_index)

            x = self.linear2(x)
            x = F.normalize(x,p=2,dim=1) * scaling_factor
            x = self.propagate(x, edge_index)
            return x, x_

        return x

In [291]:
def train(model,optimizer):
    model.train()
    optimizer.zero_grad()
    z  = model.encode(x, train_pos_edge_index)
    loss = model.recon_loss(z, train_pos_edge_index)
    if model_used  in ['VGNAE']:
        loss = loss + (1 / data.num_nodes) * model.kl_loss()
    loss.backward()
    optimizer.step()
    return loss

In [292]:
def test(pos_edge_index, neg_edge_index,model):
    model.eval()
    with torch.no_grad():
        z = model.encode(x, train_pos_edge_index)
    return model.test(z, pos_edge_index, neg_edge_index)

In [293]:
if model_used  == 'GNAE':   
    model = GAE(Encoder(data.x.size()[1], channels, data.train_pos_edge_index,model_used = 'GNAE')).to(dev)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
if model_used  == 'VGNAE':
    model = VGAE(Encoder(data.x.size()[1],channels, data.train_pos_edge_index,model_used = 'VGNAE')).to(dev)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
if model_used == 'Hybrid':
    model1 = GAE(Encoder(data.x.size()[1], channels, data.train_pos_edge_index,model_used = 'GNAE')).to(dev)
    model2 = VGAE(Encoder(data.x.size()[1],channels, data.train_pos_edge_index,model_used = 'VGNAE')).to(dev)
    optimizer1 = torch.optim.Adam(model1.parameters(), lr=0.005)
    optimizer2 = torch.optim.Adam(model2.parameters(), lr=0.005)
#model = VGAE(VariationalGCNEncoder(data.x.size()[1],channels)).to(dev)

In [294]:
data.train_mask = data.val_mask = data.test_mask = data.y = None
x, train_pos_edge_index = data.x.to(dev), data.train_pos_edge_index.to(dev)

In [295]:
test(data.val_pos_edge_index, data.val_neg_edge_index,model1)

(0.670025540856988, 0.6961988499634456)

In [296]:
epochs = 400
tolerance = 30
best_auc = float('-inf')
best_ap = float('-inf')
num_tol = 0
for epoch in range(1,epochs):
    loss = train(model1,optimizer1)
    loss = float(loss)
    
    with torch.no_grad():
        #valid_pos, valid_neg = data.val_pos_edge_index, data.val_neg_edge_index
        auc, ap = test(data.val_pos_edge_index, data.val_neg_edge_index,model1)
        if auc<best_auc and ap<best_ap:
            tol += 1
            if tol>=tolerance:
                break
        else:
            best_auc = auc
            best_ap = ap
            best_model = model1.state_dict()
            tol = 0
        print('Epoch: {:03d}, LOSS: {:.4f}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, loss, auc, ap))
model1_path = './model/VGAE_best_model_GNAE_newfea_'+str(dss_num+1)+'.pt'
torch.save(best_model, model1_path)

Epoch: 001, LOSS: 2.8423, AUC: 0.6924, AP: 0.7151
Epoch: 002, LOSS: 2.4256, AUC: 0.8074, AP: 0.8170
Epoch: 003, LOSS: 1.4762, AUC: 0.8718, AP: 0.8748
Epoch: 004, LOSS: 1.3009, AUC: 0.8573, AP: 0.8594
Epoch: 005, LOSS: 1.3874, AUC: 0.8832, AP: 0.8856
Epoch: 006, LOSS: 1.3106, AUC: 0.9173, AP: 0.9213
Epoch: 007, LOSS: 1.1866, AUC: 0.9330, AP: 0.9363
Epoch: 008, LOSS: 1.1143, AUC: 0.9362, AP: 0.9389
Epoch: 009, LOSS: 1.0866, AUC: 0.9373, AP: 0.9394
Epoch: 010, LOSS: 1.0687, AUC: 0.9381, AP: 0.9398
Epoch: 011, LOSS: 1.0535, AUC: 0.9391, AP: 0.9406
Epoch: 012, LOSS: 1.0415, AUC: 0.9397, AP: 0.9409
Epoch: 013, LOSS: 1.0287, AUC: 0.9402, AP: 0.9413
Epoch: 014, LOSS: 1.0166, AUC: 0.9401, AP: 0.9410
Epoch: 015, LOSS: 1.0060, AUC: 0.9399, AP: 0.9407
Epoch: 016, LOSS: 0.9894, AUC: 0.9395, AP: 0.9400
Epoch: 017, LOSS: 0.9727, AUC: 0.9387, AP: 0.9393
Epoch: 018, LOSS: 0.9549, AUC: 0.9378, AP: 0.9385
Epoch: 019, LOSS: 0.9494, AUC: 0.9374, AP: 0.9379
Epoch: 020, LOSS: 0.9329, AUC: 0.9375, AP: 0.9381


In [297]:
model1.load_state_dict(torch.load(model1_path))
test(data.val_pos_edge_index, data.val_neg_edge_index,model1)

(0.9496280647631183, 0.9499042983371123)

In [298]:
z = model1.encode(x, data.train_pos_edge_index)
Upload['prob'] = model1.decoder(z,data.test,sigmoid = True).detach().cpu().numpy()
Upload

Unnamed: 0,id,prob
0,E10559,0.553812
1,E4849,0.508235
2,E3964,0.847608
3,E542,0.560758
4,E331,0.560399
...,...,...
2167,E2524,0.376282
2168,E4324,0.447571
2169,E1384,0.930544
2170,E7582,0.955629


In [299]:
Upload.to_csv('output/GNAE_undirect_newfea_best_'+str(dss_num+1)+'.csv',index=False)

In [300]:
test(data.val_pos_edge_index, data.val_neg_edge_index,model2)

(0.6709476943368597, 0.6967165931209114)

In [301]:
epochs = 400
tolerance = 30
best_auc = float('-inf')
best_ap = float('-inf')
num_tol = 0
for epoch in range(1,epochs):
    loss = train(model2,optimizer2)
    loss = float(loss)
    
    with torch.no_grad():
        #valid_pos, valid_neg = data.val_pos_edge_index, data.val_neg_edge_index
        auc, ap = test(data.val_pos_edge_index, data.val_neg_edge_index,model2)
        if auc<best_auc and ap<best_ap:
            tol += 1
            if tol>=tolerance:
                break
        else:
            best_auc = auc
            best_ap = ap
            best_model = model2.state_dict()
            tol = 0
        print('Epoch: {:03d}, LOSS: {:.4f}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, loss, auc, ap))
model2_path = './model/VGAE_best_model_VGAE_newfea_'+str(dss_num+1)+'.pt'
torch.save(best_model, model2_path)

Epoch: 001, LOSS: 14.1596, AUC: 0.6721, AP: 0.6977
Epoch: 002, LOSS: 14.7162, AUC: 0.6736, AP: 0.6989
Epoch: 003, LOSS: 14.1092, AUC: 0.6751, AP: 0.6999
Epoch: 004, LOSS: 13.7000, AUC: 0.6762, AP: 0.7008
Epoch: 005, LOSS: 13.2037, AUC: 0.6771, AP: 0.7015
Epoch: 006, LOSS: 13.0075, AUC: 0.6780, AP: 0.7021
Epoch: 007, LOSS: 13.3985, AUC: 0.6788, AP: 0.7027
Epoch: 008, LOSS: 12.8324, AUC: 0.6795, AP: 0.7032
Epoch: 009, LOSS: 12.9536, AUC: 0.6802, AP: 0.7038
Epoch: 010, LOSS: 12.5981, AUC: 0.6808, AP: 0.7042
Epoch: 011, LOSS: 12.4974, AUC: 0.6814, AP: 0.7045
Epoch: 012, LOSS: 12.2342, AUC: 0.6820, AP: 0.7050
Epoch: 013, LOSS: 12.2805, AUC: 0.6827, AP: 0.7055
Epoch: 014, LOSS: 11.8244, AUC: 0.6833, AP: 0.7058
Epoch: 015, LOSS: 11.9071, AUC: 0.6839, AP: 0.7063
Epoch: 016, LOSS: 11.6832, AUC: 0.6846, AP: 0.7068
Epoch: 017, LOSS: 11.4435, AUC: 0.6854, AP: 0.7075
Epoch: 018, LOSS: 11.3773, AUC: 0.6862, AP: 0.7082
Epoch: 019, LOSS: 11.0265, AUC: 0.6869, AP: 0.7088
Epoch: 020, LOSS: 11.3098, AUC:

In [302]:
model2.load_state_dict(torch.load(model2_path))
test(data.val_pos_edge_index, data.val_neg_edge_index,model2)

(0.9370197117292334, 0.935928376024234)

In [303]:
z = model2.encode(x, data.train_pos_edge_index)
Upload['prob'] = model2.decoder(z,data.test,sigmoid = True).detach().cpu().numpy()
Upload

Unnamed: 0,id,prob
0,E10559,0.599202
1,E4849,0.769632
2,E3964,0.830444
3,E542,0.528973
4,E331,0.842820
...,...,...
2167,E2524,0.260114
2168,E4324,0.527408
2169,E1384,0.926734
2170,E7582,0.965313


In [304]:
Upload.to_csv('output/VGAE_undirect_newfea_best_'+str(dss_num+1)+'.csv',index=False)

In [305]:
z1 = model1.encode(x, data.train_pos_edge_index)
z2 = model2.encode(x, data.train_pos_edge_index)
Upload['prob'] = (2*model1.decoder(z1,data.test,sigmoid = True).detach().cpu().numpy()+model2.decoder(z2,data.test,sigmoid = True).detach().cpu().numpy())/3
Upload

Unnamed: 0,id,prob
0,E10559,0.568942
1,E4849,0.595368
2,E3964,0.841887
3,E542,0.550163
4,E331,0.654539
...,...,...
2167,E2524,0.337560
2168,E4324,0.474184
2169,E1384,0.929274
2170,E7582,0.958857


In [306]:
Upload.to_csv('output/Hybrid_undirect_imb_newfea_best_'+str(dss_num+1)+'.csv',index=False)