# 1. Setup

In [1]:
import networkx as nx
import os
import numpy as np
import math
import torch
from torch import nn
import torch.optim as optim
import random

  from .autonotebook import tqdm as notebook_tqdm


In [85]:
import time

def timer(func):
    def wrapper(*args, **kwargs):
        start = time.time()
        print(f"Running {func.__name__} ...", end='\r')
        result = func(*args, **kwargs)
        end = time.time()
        print(f"{func.__name__} Done in {end - start:.2f} seconds")
        return result
    return wrapper

# 2. Data Preprocessing  
Data Structure:
1. **gList** <Dict>: containing total 31 graphs, which 30 from Synthetic and 1 from youtube,using filename as key  
2. element of gList <Dict>: 'graph':nx.Graph();'score': <Dict> with 'node' and 'score'

In [2]:
# Input data
dpath = ".\\data\\"
gList = dict()

for root, dirs, files in os.walk(dpath):
    for file in files:
        file_path = os.path.join(root, file)
        if 'score' not in file:
            # Process nodes and edges
            gList[file] = dict()
            gList[file]['graph']=nx.Graph()
            with open(file_path,'r') as f:
                content = f.readlines()
                edges = []
                for line in content:
                    if 'com' not in file:
                        nodes = line[:-1].split('\t')
                    else:
                        continue # after finish all code run code with com
                        nodes = line[:-1].split(" ")
                    # Create edge tuple and append
                    edges.append((int(nodes[0]),int(nodes[1])))
                gList[file]['graph'].add_edges_from(edges)
                print("{} has {} nodes, {} edges".format(file,gList[file]['graph'].number_of_nodes(),gList[file]['graph'].number_of_edges()))
            
            # Process scores
            scorefile = file.replace(".txt","_score.txt")
            gList[file]['score'] = dict()
            score_file_path = os.path.join(root,scorefile) 
            with open(score_file_path,'r') as f:
                content = f.readlines()
                for line in content:
                    if 'com' not in file:
                        node_score = line[:-1].split('\t')
                    else:
                        continue # after finish all code run code with com
                        node_score = line[:-1].split(" ")
                    gList[file]['score'][int(node_score[0])] = float(node_score[1])

0.txt has 5000 nodes, 19982 edges
1.txt has 5000 nodes, 19981 edges
10.txt has 5000 nodes, 19980 edges
11.txt has 5000 nodes, 19983 edges
12.txt has 5000 nodes, 19983 edges
13.txt has 5000 nodes, 19984 edges
14.txt has 5000 nodes, 19982 edges
15.txt has 5000 nodes, 19984 edges
16.txt has 5000 nodes, 19982 edges
17.txt has 5000 nodes, 19981 edges
18.txt has 5000 nodes, 19984 edges
19.txt has 5000 nodes, 19981 edges
2.txt has 5000 nodes, 19980 edges
20.txt has 5000 nodes, 19983 edges
21.txt has 5000 nodes, 19982 edges
22.txt has 5000 nodes, 19982 edges
23.txt has 5000 nodes, 19981 edges
24.txt has 5000 nodes, 19984 edges
25.txt has 5000 nodes, 19982 edges
26.txt has 5000 nodes, 19984 edges
27.txt has 5000 nodes, 19983 edges
28.txt has 5000 nodes, 19982 edges
29.txt has 5000 nodes, 19983 edges
3.txt has 5000 nodes, 19982 edges
4.txt has 5000 nodes, 19984 edges
5.txt has 5000 nodes, 19981 edges
6.txt has 5000 nodes, 19984 edges
7.txt has 5000 nodes, 19983 edges
8.txt has 5000 nodes, 19983 

# 3. DrBC

In [3]:
g = gList['0.txt']['graph']
y = torch.tensor(list(gList['0.txt']['score'].values()))
y

tensor([9.4175e-02, 5.3971e-02, 4.4344e-02,  ..., 1.9542e-05, 6.0578e-05,
        1.0908e-04])

In [4]:
# Prepare nodes initial feature X [dv,1,1]
def gen_nodes_feature(G):
    deg = np.array(list(dict(sorted(dict(G.degree()).items())).values()))
    X = np.ones((3,len(deg)))
    X[0,:]=deg
    norms = np.linalg.norm(X,axis = 1,keepdims=True)
    X = torch.FloatTensor(X.T)
    return X

In [5]:
X=gen_nodes_feature(g)
norms = np.linalg.norm(X,axis = 1,keepdims=True)
print(norms.shape)
X_norm = X/norms
print(X_norm)

(5000, 1)
tensor([[1.0000, 0.0042, 0.0042],
        [1.0000, 0.0056, 0.0056],
        [1.0000, 0.0067, 0.0067],
        ...,
        [0.9428, 0.2357, 0.2357],
        [0.9428, 0.2357, 0.2357],
        [0.9428, 0.2357, 0.2357]])


## 3a. DrBC encoder function

In [6]:
class DrBCEncoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers,G):
        super(DrBCEncoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.layer1 = nn.Linear(input_size,hidden_size)
        self.relu = nn.ReLU()
        self.norm1 = nn.BatchNorm1d(hidden_size)
        self.gru_cell = nn.GRUCell(hidden_size, hidden_size,bias = False)
        self.norm2 = nn.BatchNorm1d(hidden_size)
        self.G = G
        self.deg = dict(self.G.degree())
        
    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.norm1(x)
        output = [x]
        for i in range(self.num_layers-1):
            hn = self.calHn(x)
            x = self.gru_cell(x,hn)
            x = self.norm2(x)
            output.append(x)
        output, _ = torch.max(torch.stack(output), dim=0)
        return output

    def calHn(self,x):
        hn = torch.zeros(x.shape)
        for node in self.G.nodes():
            degv = self.deg[node]
            for neigh in list(self.G.adj[node]):
                denominator = 1/(math.sqrt(degv+1)*math.sqrt(self.deg[neigh]+1))
                hn[node,:] += (denominator*x[neigh])
        return hn
    
'''
# Define the model
input_size = 3
hidden_size = 32
num_layers = 5
encoder = DrBCEncoder(input_size, hidden_size, num_layers,g)
X = gen_nodes_feature(g)
out = encoder(X)
print(out.shape)
print(out)
'''

'\n# Define the model\ninput_size = 3\nhidden_size = 32\nnum_layers = 5\nencoder = DrBCEncoder(input_size, hidden_size, num_layers,g)\nX = gen_nodes_feature(g)\nout = encoder(X)\nprint(out.shape)\nprint(out)\n'

## 3b. Decoder: 2-layer MLP

In [37]:
class DrBCDecoder(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(DrBCDecoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        # Define the layers of the decoder
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.norm1 = nn.BatchNorm1d(hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, output_size)
        self.norm2 = nn.BatchNorm1d(output_size)
        
    def forward(self, x):
        # Pass the input through the layers of the decoder
        x = self.layer1(x)
        x = self.norm1(x)
        x = self.relu(x)
        x = self.layer2(x)
        x = self.norm2(x)
        return x

## 3c. Training

In [8]:
def sampling(minimum,maximum,qty):
    pairs = []
    for i in range(qty):
        a = random.randint(minimum,maximum)
        b = random.randint(minimum,maximum)
        pairs.append((a,b))
    return pairs

In [9]:
def bc_pairs(pairs,pred,gt):
    pred_dif = []
    gt_dif = []
    g = nn.Sigmoid()
    for pair in pairs:
        pred_dif.append(g(pred[pair[0]]-pred[pair[1]]))
        gt_dif.append(g(gt[pair[0]]-gt[pair[1]]))
    return torch.tensor(pred_dif),torch.tensor(gt_dif)

In [10]:
G = gList['0.txt']['graph']
y = torch.tensor([list(gList['0.txt']['score'].values())])
y = torch.transpose(y,0,1)

In [38]:
# Define the models
input_size = 3
hidden_size = 128
output_size = 1
num_layers = 5
encoder = DrBCEncoder(input_size, hidden_size, num_layers,G)
decoder = DrBCDecoder(hidden_size,hidden_size,output_size)

n = G.number_of_nodes()
num_episodes = 20
lr = 0.001
sample_qty = 5*n

# Define the loss and optimizer
criterion = nn.BCELoss(reduction = 'sum')
optimizer = optim.Adam(list(encoder.parameters())+list(decoder.parameters()), lr=lr)

# Get the inputs
inputs = gen_nodes_feature(G)

# Train the model
for episode in range(num_episodes):
    # model
    outputs = encoder(inputs)
    outputs = decoder(outputs)
    
    pairs = sampling(0,n-1,sample_qty)
    pred,gt = bc_pairs(pairs,outputs,y)
    loss = criterion(pred,gt)

    if ~loss.requires_grad:
        loss.requires_grad_()
        
    # Zero the parameter gradients
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print statistics
    print('[%d] loss: %.4f' %(episode + 1, loss.item()))

[1] loss: 22054.4902
[2] loss: 22153.8184
[3] loss: 22127.1387
[4] loss: 22099.7559
[5] loss: 22178.6367
[6] loss: 22075.3633
[7] loss: 22136.5566
[8] loss: 22211.8926
[9] loss: 22174.9141
[10] loss: 22220.7363
[11] loss: 22123.0059
[12] loss: 22151.0078
[13] loss: 22178.3594
[14] loss: 22041.4180
[15] loss: 22231.3164
[16] loss: 22082.9863
[17] loss: 22159.6211
[18] loss: 22125.8262
[19] loss: 22155.9180
[20] loss: 22121.5527


# 4. Evaluation Metric

## 4a. Top-N% accuracy

In [74]:
def topN(n,pred,gt):
    k = math.ceil(pred.size()[0]*n/100)
    _,pred_top = torch.topk(pred.view(-1),k=k)
    _,gt_top = torch.topk(gt.view(-1),k=k)
    intersect = torch.unique(torch.cat((pred_top,gt_top),0))
    print((2*k-len(intersect))/k)

In [75]:
topN(1,outputs,y)

0.78


## 4b. Kendall tau distance

In [83]:
def kendall(pred,gt):
    pred_ind = torch.argsort(pred)
    gt_ind = torch.argsort(gt)
    con = 0 # number of concordant pairs
    dcor = 0 # number of discordant pairs
    n = len(pred_ind)
    for i in range(n):
        if pred_ind[i] == gt_ind[i]:
            con += 1
        else:
            dcor += 1
    return 2*(con-dcor)/(n*(n-1))

In [84]:
kendall(outputs,y)

0.00040008001600320064