In [36]:
import os.path as osp
import pandas as pd
import networkx as nx
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch_geometric.datasets import Planetoid, Coauthor, Amazon
from torch_geometric.utils import train_test_split_edges, dense_to_sparse
from torch_geometric.nn import GAE, VGAE, APPNP
import torch_geometric.transforms as T
from torch_geometric.data import Data

In [37]:
# Load the Cora dataset
dataset = Planetoid(root='/tmp/cora', name='Cora')

In [38]:
data = dataset[0]
data = T.NormalizeFeatures()(data)

In [39]:
path = "hw2_data/"
dss = ['dataset1','dataset2','dataset3'] #datasets
datasets = dict()
for ds in dss:
    datasets[ds] = dict()
    datasets[ds]['content'] = pd.read_csv(path+ds+"/content.csv",delimiter = '\t',header = None)
    datasets[ds]['train'] = pd.read_csv(path+ds+"/train.csv",delimiter = ',')
    datasets[ds]['test'] = pd.read_csv(path+ds+"/test.csv",delimiter = ',')
    datasets[ds]['upload'] = pd.read_csv(path+ds+"/upload.csv",delimiter = ',')
datasets[dss[2]]['test'].head()

Unnamed: 0,id,to,from
0,E370,26,317
1,E667,196,323
2,E3190,739,468
3,E848,576,156
4,E2161,466,199


In [72]:
#For example:
dss_num = 2
Content = datasets[dss[dss_num]]['content']
Train = datasets[dss[dss_num]]['train']
Test = datasets[dss[dss_num]]['test']
Upload = datasets[dss[dss_num]]['upload']

In [73]:
def load_data(content,train, test):
    G = nx.DiGraph()
    # for easier split the edges, create 2 graph, 1 with positive edge, the other with given negative edges
    G_pos = nx.DiGraph()
    G_neg = nx.DiGraph()
    graph_node_features_dict = dict()
    
    for i in range(len(content)):
        #graph_node_features_dict[content.iloc[i,0]] = np.array(content.iloc[i,1:])
        G.add_node(int(content.iloc[i,0]),features = np.array(content.iloc[i,1:]))
        # pos and neg
        G_pos.add_node(int(content.iloc[i,0]),features = np.array(content.iloc[i,1:]))
        G_neg.add_node(int(content.iloc[i,0]),features = np.array(content.iloc[i,1:]))
        
    for i in range(len(train)):
        # Adding nodes into G
        '''
        if train.loc[i,'from'] not in G:
            G.add_node(train.loc[i,'from'],features = graph_node_features_dict[train.loc[i,'from']])
        if train.loc[i,'to'] not in G:
            G.add_node(train.loc[i,'to'],features = graph_node_features_dict[train.loc[i,'to']])
        ''' 
        # Adding edges
        G.add_edge(train.loc[i,'from'],train.loc[i,'to'],label = train.loc[i,'label'])
        
        # pos and neg
        if train.loc[i,'label'] == 0: 
            G_neg.add_edge(train.loc[i,'from'],train.loc[i,'to'],label = train.loc[i,'label'])
        else:
            G_pos.add_edge(train.loc[i,'from'],train.loc[i,'to'],label = train.loc[i,'label'])

    adj = nx.adjacency_matrix(G,sorted(G.nodes()))
    adj_pos = nx.adjacency_matrix(G_pos,sorted(G_pos.nodes()))
    adj_neg = nx.adjacency_matrix(G_neg,sorted(G_neg.nodes()))
    features = np.array(
        [features for _, features in sorted(G.nodes(data='features'), key=lambda x: x[0])])
    
    # Skip train,valid,and test mask
    
    num_features = features.shape[1]
    features = torch.FloatTensor(features)
    return adj, adj_pos, adj_neg, features, num_features

In [74]:
g, pos, neg, features, num_features = load_data(Content,Train,Test)
A = g.toarray()
edge_index,_ = dense_to_sparse(torch.tensor(A))
Apos = pos.toarray()
edge_index_pos,_ = dense_to_sparse(torch.tensor(Apos))
Aneg = neg.toarray()
edge_index_neg,_ = dense_to_sparse(torch.tensor(Aneg))
data = Data(edge_index=edge_index_pos,x=features.to(torch.float),test = torch.tensor([Test['from'],Test['to']]))
data

Data(x=[877, 1703], edge_index=[2, 1273], test=[2, 644])

In [75]:
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
channels = 512

In [76]:
train_rate = 0.95
val_ratio = (1-train_rate)#/3
test_ratio = 0#(1-train_rate) / 3 * 2
data = train_test_split_edges(data.to(dev), val_ratio=val_ratio, test_ratio=0)
               #,train_pos_edge_index = edge_index_pos,neg = edge_index_neg,test = edge_index_test)
data = T.NormalizeFeatures()(data)
data



Data(x=[877, 1703], test=[2, 644], val_pos_edge_index=[2, 55], test_pos_edge_index=[2, 0], train_pos_edge_index=[2, 1006], train_neg_adj_mask=[877, 877], val_neg_edge_index=[2, 55], test_neg_edge_index=[2, 0])

In [77]:
model_used = 'VGNAE'
scaling_factor = 1.8
class Encoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels, edge_index):
        super(Encoder, self).__init__()
        self.linear1 = nn.Linear(in_channels, out_channels)
        self.linear2 = nn.Linear(in_channels, out_channels)
        self.propagate = APPNP(K=1, alpha=0)

    def forward(self, x, edge_index,not_prop=0):
        if model_used  == 'GNAE':
            x = self.linear1(x)
            x = F.normalize(x,p=2,dim=1)  * scaling_factor
            x = self.propagate(x, edge_index)
            return x

        if model_used  == 'VGNAE':
            x_ = self.linear1(x)
            x_ = self.propagate(x_, edge_index)

            x = self.linear2(x)
            x = F.normalize(x,p=2,dim=1) * scaling_factor
            x = self.propagate(x, edge_index)
            return x, x_

        return x

In [78]:
def train():
    model.train()
    optimizer.zero_grad()
    z  = model.encode(x, train_pos_edge_index)
    loss = model.recon_loss(z, train_pos_edge_index)
    if model_used  in ['VGNAE']:
        loss = loss + (1 / data.num_nodes) * model.kl_loss()
    loss.backward()
    optimizer.step()
    return loss

In [79]:
def test(pos_edge_index, neg_edge_index):
    model.eval()
    with torch.no_grad():
        z = model.encode(x, train_pos_edge_index)
    return model.test(z, pos_edge_index, neg_edge_index)

In [80]:
N = int(data.x.size()[0])
if model_used  == 'GNAE':   
    model = GAE(Encoder(data.x.size()[1], channels, data.train_pos_edge_index)).to(dev)
if model_used  == 'VGNAE':
    model = VGAE(Encoder(data.x.size()[1], channels, data.train_pos_edge_index)).to(dev)

In [81]:
data.train_mask = data.val_mask = data.test_mask = data.y = None
x, train_pos_edge_index = data.x.to(dev), data.train_pos_edge_index.to(dev)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [82]:
test(data.val_pos_edge_index, data.val_neg_edge_index)

(0.7282644628099174, 0.7760739585717037)

In [83]:
epochs = 300
for epoch in range(1,epochs):
    loss = train()
    loss = float(loss)
    
    with torch.no_grad():
        #valid_pos, valid_neg = data.val_pos_edge_index, data.val_neg_edge_index
        auc, ap = test(data.val_pos_edge_index, data.val_neg_edge_index)
        print('Epoch: {:03d}, LOSS: {:.4f}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, loss, auc, ap))

Epoch: 001, LOSS: 14.3380, AUC: 0.7210, AP: 0.7716
Epoch: 002, LOSS: 14.2239, AUC: 0.7193, AP: 0.7704
Epoch: 003, LOSS: 13.0865, AUC: 0.7203, AP: 0.7715
Epoch: 004, LOSS: 13.9011, AUC: 0.7200, AP: 0.7709
Epoch: 005, LOSS: 14.1405, AUC: 0.7210, AP: 0.7721
Epoch: 006, LOSS: 12.2154, AUC: 0.7213, AP: 0.7724
Epoch: 007, LOSS: 13.0210, AUC: 0.7223, AP: 0.7731
Epoch: 008, LOSS: 13.7097, AUC: 0.7217, AP: 0.7719
Epoch: 009, LOSS: 13.6891, AUC: 0.7213, AP: 0.7716
Epoch: 010, LOSS: 13.8397, AUC: 0.7217, AP: 0.7723
Epoch: 011, LOSS: 13.0581, AUC: 0.7217, AP: 0.7723
Epoch: 012, LOSS: 12.9522, AUC: 0.7220, AP: 0.7725
Epoch: 013, LOSS: 12.4356, AUC: 0.7220, AP: 0.7724
Epoch: 014, LOSS: 12.9357, AUC: 0.7226, AP: 0.7727
Epoch: 015, LOSS: 12.4387, AUC: 0.7226, AP: 0.7727
Epoch: 016, LOSS: 12.4696, AUC: 0.7233, AP: 0.7730
Epoch: 017, LOSS: 12.2219, AUC: 0.7236, AP: 0.7733
Epoch: 018, LOSS: 12.2491, AUC: 0.7236, AP: 0.7733
Epoch: 019, LOSS: 12.9630, AUC: 0.7240, AP: 0.7742
Epoch: 020, LOSS: 11.7637, AUC:

Epoch: 169, LOSS: 2.9675, AUC: 0.8959, AP: 0.8963
Epoch: 170, LOSS: 3.0992, AUC: 0.8979, AP: 0.8982
Epoch: 171, LOSS: 3.0374, AUC: 0.8985, AP: 0.9001
Epoch: 172, LOSS: 2.6964, AUC: 0.8995, AP: 0.9012
Epoch: 173, LOSS: 2.9711, AUC: 0.9012, AP: 0.9030
Epoch: 174, LOSS: 3.0357, AUC: 0.9028, AP: 0.9047
Epoch: 175, LOSS: 2.9239, AUC: 0.9025, AP: 0.9049
Epoch: 176, LOSS: 2.7794, AUC: 0.9038, AP: 0.9060
Epoch: 177, LOSS: 2.7279, AUC: 0.9048, AP: 0.9072
Epoch: 178, LOSS: 2.8323, AUC: 0.9045, AP: 0.9071
Epoch: 179, LOSS: 2.6761, AUC: 0.9051, AP: 0.9073
Epoch: 180, LOSS: 2.6304, AUC: 0.9058, AP: 0.9078
Epoch: 181, LOSS: 2.6583, AUC: 0.9081, AP: 0.9092
Epoch: 182, LOSS: 2.8543, AUC: 0.9071, AP: 0.9070
Epoch: 183, LOSS: 2.8164, AUC: 0.9078, AP: 0.9075
Epoch: 184, LOSS: 2.7431, AUC: 0.9081, AP: 0.9082
Epoch: 185, LOSS: 2.5536, AUC: 0.9088, AP: 0.9092
Epoch: 186, LOSS: 2.5534, AUC: 0.9084, AP: 0.9086
Epoch: 187, LOSS: 2.6790, AUC: 0.9084, AP: 0.9088
Epoch: 188, LOSS: 2.5535, AUC: 0.9084, AP: 0.9089


In [84]:
test(data.val_pos_edge_index, data.val_neg_edge_index)

(0.9289256198347108, 0.925048641648224)

In [85]:
z = model.encode(x, data.train_pos_edge_index)
Upload['prob'] = model.decoder(z,data.test,sigmoid = True).detach().cpu().numpy()
Upload

Unnamed: 0,id,prob
0,E370,0.402166
1,E667,0.942945
2,E3190,0.906596
3,E848,0.865893
4,E2161,0.787198
...,...,...
639,E492,0.302000
640,E3055,0.897925
641,E1271,0.800694
642,E2199,0.339961


In [86]:
Upload.to_csv('output/VGNAE_change_test_encode_'+str(dss_num+1)+'.csv',index=False)