In [1]:
import os.path as osp
import pandas as pd
import networkx as nx
import numpy as np
import scipy.sparse as sp

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch_geometric.datasets import Planetoid, Coauthor, Amazon
from torch_geometric.utils import train_test_split_edges, dense_to_sparse
from torch_geometric.nn import GAE, VGAE, APPNP,GCNConv,InnerProductDecoder
import torch_geometric.transforms as T
from torch_geometric.data import Data

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the Cora dataset
dataset = Planetoid(root='/tmp/cora', name='Cora')
data = dataset[0]
data = T.NormalizeFeatures()(data)

In [3]:
path = "hw2_data/"
dss = ['dataset1','dataset2','dataset3'] #datasets
datasets = dict()
for ds in dss:
    datasets[ds] = dict()
    datasets[ds]['content'] = pd.read_csv(path+ds+"/content.csv",delimiter = '\t',header = None)
    datasets[ds]['train'] = pd.read_csv(path+ds+"/train.csv",delimiter = ',')
    datasets[ds]['test'] = pd.read_csv(path+ds+"/test.csv",delimiter = ',')
    datasets[ds]['upload'] = pd.read_csv(path+ds+"/upload.csv",delimiter = ',')
datasets[dss[2]]['test'].head()

Unnamed: 0,id,to,from
0,E370,26,317
1,E667,196,323
2,E3190,739,468
3,E848,576,156
4,E2161,466,199


In [4]:
for dss_num in range(3):
    Content = datasets[dss[dss_num]]['content']
    Train = datasets[dss[dss_num]]['train']
    Test = datasets[dss[dss_num]]['test']
    Upload = datasets[dss[dss_num]]['upload']
    print(dss[dss_num])
    print(f'Nodes {Content.shape[0]}')
    print(f'Attributes for each node:{Content.shape[1]-1}')
    pos = Train[Train['label']==1].shape[0]
    print(f'Positive edges given: {pos}')

dataset1
Nodes 2708
Attributes for each node:1433
Positive edges given: 4324
dataset2
Nodes 3312
Attributes for each node:3703
Positive edges given: 3736
dataset3
Nodes 877
Attributes for each node:1703
Positive edges given: 1273


In [7]:
#For example:
dss_num = 0
Content = datasets[dss[dss_num]]['content']
Train = datasets[dss[dss_num]]['train']
Test = datasets[dss[dss_num]]['test']
Upload = datasets[dss[dss_num]]['upload']

In [8]:
def load_data(content,train, test):
    G = nx.Graph()
    # for easier split the edges, create 2 graph, 1 with positive edge, the other with given negative edges
    G_pos = nx.Graph()
    G_neg = nx.Graph()
    for i in range(len(content)):
        G.add_node(int(content.iloc[i,0]),features = np.array(content.iloc[i,1:]))
        # pos and neg
        G_pos.add_node(int(content.iloc[i,0]),features = np.array(content.iloc[i,1:]))
        G_neg.add_node(int(content.iloc[i,0]),features = np.array(content.iloc[i,1:]))
        
    for i in range(len(train)):
        # Adding nodes into G
        '''
        if train.loc[i,'from'] not in G:
            G.add_node(train.loc[i,'from'],features = graph_node_features_dict[train.loc[i,'from']])
        if train.loc[i,'to'] not in G:
            G.add_node(train.loc[i,'to'],features = graph_node_features_dict[train.loc[i,'to']])
        ''' 
        # Adding edges
        G.add_edge(train.loc[i,'from'],train.loc[i,'to'],label = train.loc[i,'label'])
        
        # pos and neg
        if train.loc[i,'label'] == 0: 
            G_neg.add_edge(train.loc[i,'from'],train.loc[i,'to'],label = train.loc[i,'label'])
        else:
            G_pos.add_edge(train.loc[i,'from'],train.loc[i,'to'],label = train.loc[i,'label'])

    adj = nx.adjacency_matrix(G,sorted(G.nodes()))
    adj_pos = nx.adjacency_matrix(G_pos,sorted(G_pos.nodes()))
    adj_neg = nx.adjacency_matrix(G_neg,sorted(G_neg.nodes()))
    
    features = np.array(
        [features for _, features in sorted(G.nodes(data='features'), key=lambda x: x[0])])
    #features[:,:1433] = preprocess_features(features[:,:1433])
    #features[:,1433:] = preprocess_features(features[:,1433:])
    # Skip train,valid,and test mask
    
    num_features = features.shape[1]
    features = torch.FloatTensor(features)
    return adj, adj_pos, adj_neg, features, num_features

In [9]:
def preprocess_features(features):
    # Row-normalize feature matrix and convert to tuple representation
    rowsum = np.array(features.sum(1),dtype = np.float32)
    rowsum = (rowsum==0)*1+rowsum
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    features = r_mat_inv.dot(features)
    return features

In [10]:
def increase_features(content,train):
    G = nx.Graph()
    for i in range(len(content)):
        G.add_node(int(content.iloc[i,0]),features = np.array(content.iloc[i,1:]))
        
    # Adding edges
    for i in range(len(train)):
        # pos and neg
        if train.loc[i,'label'] == 1: 
            G.add_edge(train.loc[i,'from'],train.loc[i,'to'],label = train.loc[i,'label'])
            
    df = content.copy()
    df['pagerank'] = nx.pagerank(G,alpha=0.9).values()
    df['pagerank'] = df['pagerank']/np.linalg.norm(df['pagerank'])
    
    hubs,aut = nx.hits(G)
    df['hubs'] = hubs.values()
    df['hubs'] = df['hubs']/np.linalg.norm(df['hubs'])
    df['aut'] = aut.values()
    df['aut'] = df['hubs']/np.linalg.norm(df['aut'])
    
    
    features_name = ['degree','in_degree','out_degree','eigenvec','katz','closeness','info',
                     'betweenness','rwbetween','combetween','load','subgraph','harmonic','local_reach',
                    'global_reach', 'percolation','order2','trophic','laplacian']
    
    df[features_name[0]] = nx.degree_centrality(G).values()
    #df[features_name[1]] = nx.in_degree_centrality(G).values()
    #df[features_name[2]] = nx.out_degree_centrality(G).values()
    df[features_name[3]] = nx.eigenvector_centrality(G).values()
    #df[features_name[4]] = nx.katz_centrality(G).values()
    df[features_name[5]] = nx.closeness_centrality(G).values()
    #df[features_name[6]] = nx.information_centrality(G).values()
    #df[features_name[7]] = nx.betweenness_centrality(G).values()
    #print('finish feature 7')
    #df[features_name[8]] = nx.current_flow_betweenness_centrality(G).values()
    #df[features_name[9]] = nx.communicability_betweenness_centrality(G).values()
    #print('finish feature 9')
    df[features_name[10]] = nx.load_centrality(G).values()
    print('finish feature 10')
    df[features_name[11]] = nx.subgraph_centrality(G).values()
    print('finish feature 11')
    df[features_name[12]] = nx.harmonic_centrality(G).values()
    print('finish feature 12')
    #df[features_name[13]] = nx.local_reaching_centrality(G).values()
    #print('finish feature 13')
    #df[features_name[14]] = nx.global_reaching_centrality(G).values()
    #print('finish feature 14')
    #df[features_name[15]] = nx.percolation_centrality(G).values()
    #print('finish feature 15')
    #df[features_name[16]] = nx.second_order_centrality(G).values()
    #print('finish feature 16')
    #df[features_name[17]] = nx.trophic_levels(G).values()
    #print('finish feature 17')
    #df[features_name[18]] = nx.laplacian_centrality(G).values()
    #print('finish feature 18')
    
    for fea_num in [0,3,5,10,11,12]:
        df[features_name[fea_num]] /= np.linalg.norm(df[features_name[fea_num]])
    
    return df

In [23]:
Content_mod = increase_features(Content,Train)
g, pos, neg, features, num_features = load_data(Content_mod,Train,Test)
A = g.toarray()
edge_index,_ = dense_to_sparse(torch.tensor(A))
Apos = pos.toarray()
edge_index_pos,_ = dense_to_sparse(torch.tensor(Apos))
Aneg = neg.toarray()
edge_index_neg,_ = dense_to_sparse(torch.tensor(Aneg))
data = Data(edge_index=edge_index_pos,x=features.to(torch.float),test = torch.tensor([Test['from'],Test['to']]))
data

finish feature 10
finish feature 11
finish feature 12


Data(x=[2708, 1442], edge_index=[2, 8472], test=[2, 2172])

In [24]:
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
channels = 256

In [25]:
train_rate = 0.9
val_ratio = (1-train_rate)#/3
test_ratio = 0#(1-train_rate) / 3 * 2
data = train_test_split_edges(data.to(dev), val_ratio=val_ratio, test_ratio=0)
               #,train_pos_edge_index = edge_index_pos,neg = edge_index_neg,test = edge_index_test)
#data = T.NormalizeFeatures()(data) # Should not do this normalization for new features

In [26]:
data.x[0]

tensor([0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 1.3954e-05,
        1.9786e-02], device='cuda:0')

In [27]:
model_used = 'Hybrid'
scaling_factor = 1.8
class Encoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels, edge_index,model_used = 'GNAE'):
        super(Encoder, self).__init__()
        self.linear1 = nn.Linear(in_channels, out_channels)
        self.linear2 = nn.Linear(in_channels, out_channels)
        self.propagate = APPNP(K=2, alpha=0)
        self.model_used = model_used

    def forward(self, x, edge_index,not_prop=0):
        if self.model_used  == 'GNAE':
            x = self.linear1(x)
            x = F.normalize(x,p=2,dim=1)  * scaling_factor
            x = self.propagate(x, edge_index)
            return x

        if self.model_used  == 'VGNAE':
            x_ = self.linear1(x)
            x_ = self.propagate(x_, edge_index)

            x = self.linear2(x)
            x = F.normalize(x,p=2,dim=1) * scaling_factor
            x = self.propagate(x, edge_index)
            return x, x_

        return x

In [28]:
def train(model,optimizer):
    model.train()
    optimizer.zero_grad()
    z  = model.encode(x, train_pos_edge_index)
    loss = model.recon_loss(z, train_pos_edge_index)
    if model_used  in ['VGNAE']:
        loss = loss + (1 / data.num_nodes) * model.kl_loss()
    loss.backward()
    optimizer.step()
    return loss

In [29]:
def test(pos_edge_index, neg_edge_index,model):
    model.eval()
    with torch.no_grad():
        z = model.encode(x, train_pos_edge_index)
    return model.test(z, pos_edge_index, neg_edge_index)

In [30]:
if model_used  == 'GNAE':   
    model = GAE(Encoder(data.x.size()[1], channels, data.train_pos_edge_index,model_used = 'GNAE')).to(dev)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
if model_used  == 'VGNAE':
    model = VGAE(Encoder(data.x.size()[1],channels, data.train_pos_edge_index,model_used = 'VGNAE')).to(dev)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
if model_used == 'Hybrid':
    model1 = GAE(Encoder(data.x.size()[1], channels, data.train_pos_edge_index,model_used = 'GNAE')).to(dev)
    model2 = VGAE(Encoder(data.x.size()[1],channels, data.train_pos_edge_index,model_used = 'VGNAE')).to(dev)
    optimizer1 = torch.optim.Adam(model1.parameters(), lr=0.005)
    optimizer2 = torch.optim.Adam(model2.parameters(), lr=0.005)
#model = VGAE(VariationalGCNEncoder(data.x.size()[1],channels)).to(dev)

In [31]:
data.train_mask = data.val_mask = data.test_mask = data.y = None
x, train_pos_edge_index = data.x.to(dev), data.train_pos_edge_index.to(dev)

In [32]:
test(data.val_pos_edge_index, data.val_neg_edge_index,model1)

(0.9005806772518709, 0.8997826508562685)

In [33]:
epochs = 400
tolerance = 30
best_auc = float('-inf')
best_ap = float('-inf')
num_tol = 0
for epoch in range(1,epochs):
    loss = train(model1,optimizer1)
    loss = float(loss)
    
    with torch.no_grad():
        #valid_pos, valid_neg = data.val_pos_edge_index, data.val_neg_edge_index
        auc, ap = test(data.val_pos_edge_index, data.val_neg_edge_index,model1)
        if auc<best_auc and ap<best_ap:
            tol += 1
            if tol>=tolerance:
                break
        else:
            best_auc = auc
            best_ap = ap
            best_model = model1.state_dict()
            tol = 0
        print('Epoch: {:03d}, LOSS: {:.4f}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, loss, auc, ap))
model1_path = './model/VGAE_best_model_GNAE_newfea_nonorm_'+str(dss_num+1)+'.pt'
torch.save(best_model, model1_path)

Epoch: 001, LOSS: 1.1936, AUC: 0.7952, AP: 0.8021
Epoch: 002, LOSS: 1.6208, AUC: 0.9281, AP: 0.9253
Epoch: 003, LOSS: 1.1339, AUC: 0.9494, AP: 0.9424
Epoch: 004, LOSS: 1.0863, AUC: 0.9434, AP: 0.9334
Epoch: 005, LOSS: 1.1593, AUC: 0.9499, AP: 0.9418
Epoch: 006, LOSS: 1.0624, AUC: 0.9509, AP: 0.9469
Epoch: 007, LOSS: 0.9655, AUC: 0.9500, AP: 0.9468
Epoch: 008, LOSS: 0.9483, AUC: 0.9490, AP: 0.9456
Epoch: 009, LOSS: 0.9718, AUC: 0.9481, AP: 0.9447
Epoch: 010, LOSS: 0.9757, AUC: 0.9476, AP: 0.9442
Epoch: 011, LOSS: 0.9643, AUC: 0.9476, AP: 0.9445
Epoch: 012, LOSS: 0.9434, AUC: 0.9470, AP: 0.9441
Epoch: 013, LOSS: 0.9097, AUC: 0.9468, AP: 0.9443
Epoch: 014, LOSS: 0.8982, AUC: 0.9473, AP: 0.9444
Epoch: 015, LOSS: 0.8926, AUC: 0.9480, AP: 0.9451
Epoch: 016, LOSS: 0.9036, AUC: 0.9488, AP: 0.9458
Epoch: 017, LOSS: 0.9061, AUC: 0.9494, AP: 0.9465
Epoch: 018, LOSS: 0.8999, AUC: 0.9498, AP: 0.9471
Epoch: 019, LOSS: 0.8905, AUC: 0.9501, AP: 0.9477
Epoch: 020, LOSS: 0.8908, AUC: 0.9505, AP: 0.9482


In [34]:
model1.load_state_dict(torch.load(model1_path))
test(data.val_pos_edge_index, data.val_neg_edge_index,model1)

(0.9583521955636034, 0.9591264237720224)

In [35]:
z = model1.encode(x, data.train_pos_edge_index)
Upload['prob'] = model1.decoder(z,data.test,sigmoid = True).detach().cpu().numpy()
Upload

Unnamed: 0,id,prob
0,E10559,0.518071
1,E4849,0.625961
2,E3964,0.811212
3,E542,0.544081
4,E331,0.735138
...,...,...
2167,E2524,0.415291
2168,E4324,0.426177
2169,E1384,0.928310
2170,E7582,0.944584


In [36]:
Upload.to_csv('output/GNAE_undirect_best_newfea_nonorm_'+str(dss_num+1)+'.csv',index=False)

In [37]:
test(data.val_pos_edge_index, data.val_neg_edge_index,model2)

(0.8789016872614277, 0.8801290321343768)

In [38]:
epochs = 400
tolerance = 30
best_auc = float('-inf')
best_ap = float('-inf')
num_tol = 0
for epoch in range(1,epochs):
    loss = train(model2,optimizer2)
    loss = float(loss)
    
    with torch.no_grad():
        #valid_pos, valid_neg = data.val_pos_edge_index, data.val_neg_edge_index
        auc, ap = test(data.val_pos_edge_index, data.val_neg_edge_index,model2)
        if auc<best_auc and ap<best_ap:
            tol += 1
            if tol>=tolerance:
                break
        else:
            best_auc = auc
            best_ap = ap
            best_model = model2.state_dict()
            tol = 0
        print('Epoch: {:03d}, LOSS: {:.4f}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, loss, auc, ap))
model2_path = './model/VGAE_best_model_VGAE_newfea_nonorm_'+str(dss_num+1)+'.pt'
torch.save(best_model, model2_path)

Epoch: 001, LOSS: 14.2811, AUC: 0.8308, AP: 0.8273
Epoch: 002, LOSS: 12.4117, AUC: 0.8075, AP: 0.8064
Epoch: 003, LOSS: 10.6135, AUC: 0.8047, AP: 0.8042
Epoch: 004, LOSS: 9.2931, AUC: 0.8128, AP: 0.8109
Epoch: 005, LOSS: 8.2341, AUC: 0.8276, AP: 0.8236
Epoch: 006, LOSS: 6.9023, AUC: 0.8483, AP: 0.8415
Epoch: 007, LOSS: 5.8603, AUC: 0.8734, AP: 0.8641
Epoch: 008, LOSS: 5.1023, AUC: 0.9013, AP: 0.8901
Epoch: 009, LOSS: 4.4143, AUC: 0.9260, AP: 0.9146
Epoch: 010, LOSS: 3.7225, AUC: 0.9442, AP: 0.9342
Epoch: 011, LOSS: 3.2615, AUC: 0.9534, AP: 0.9460
Epoch: 012, LOSS: 2.8611, AUC: 0.9557, AP: 0.9505
Epoch: 013, LOSS: 2.5508, AUC: 0.9541, AP: 0.9504
Epoch: 014, LOSS: 2.3428, AUC: 0.9511, AP: 0.9479
Epoch: 015, LOSS: 2.1475, AUC: 0.9482, AP: 0.9452
Epoch: 016, LOSS: 2.0105, AUC: 0.9463, AP: 0.9435
Epoch: 017, LOSS: 1.8918, AUC: 0.9452, AP: 0.9425
Epoch: 018, LOSS: 1.8127, AUC: 0.9451, AP: 0.9426
Epoch: 019, LOSS: 1.7050, AUC: 0.9456, AP: 0.9432
Epoch: 020, LOSS: 1.6420, AUC: 0.9464, AP: 0.94

In [39]:
model2.load_state_dict(torch.load(model2_path))
test(data.val_pos_edge_index, data.val_neg_edge_index,model2)

(0.9504496196815496, 0.9487248128664978)

In [40]:
z = model2.encode(x, data.train_pos_edge_index)
Upload['prob'] = model2.decoder(z,data.test,sigmoid = True).detach().cpu().numpy()
Upload

Unnamed: 0,id,prob
0,E10559,0.563326
1,E4849,0.694361
2,E3964,0.829851
3,E542,0.533911
4,E331,0.839077
...,...,...
2167,E2524,0.308287
2168,E4324,0.387113
2169,E1384,0.916480
2170,E7582,0.938650


In [41]:
Upload.to_csv('output/VGAE_undirect_best_newfea_nonorm_'+str(dss_num+1)+'.csv',index=False)

In [42]:
z1 = model1.encode(x, data.train_pos_edge_index)
z2 = model2.encode(x, data.train_pos_edge_index)
Upload['prob'] = (2*model1.decoder(z1,data.test,sigmoid = True).detach().cpu().numpy()+model2.decoder(z2,data.test,sigmoid = True).detach().cpu().numpy())/3
Upload

Unnamed: 0,id,prob
0,E10559,0.533156
1,E4849,0.648761
2,E3964,0.817425
3,E542,0.540691
4,E331,0.769784
...,...,...
2167,E2524,0.379623
2168,E4324,0.413156
2169,E1384,0.924367
2170,E7582,0.942606


In [43]:
Upload.to_csv('output/Hybrid_undirect_imb_best_newfea_nonorm_'+str(dss_num+1)+'.csv',index=False)