In [1]:
# Import the NetworkX package
import networkx as nx

import torch
print("PyTorch has version {}".format(torch.__version__))

PyTorch has version 1.10.2


In [2]:
!pwd

/Users/wangshihao/Research/SocialNetwork/HW1


## 1. Read File

In [765]:
class readData():
    def __init__(self, data):
        self.__bc_value = list()
        self.__text = list()
        self.__edges = list()
        
        
        if data == "youtube":
            path1 = f'../data/hw1_data/youtube/com-youtube_score.txt'
            path2 = f'../data/hw1_data/youtube/com-youtube.txt'
        else:
            path1 = f'../data/hw1_data/Synthetic/5000/{data}_score.txt'
            path2 = f'../data/hw1_data/Synthetic/5000/{data}.txt'
        
        # create BC dict
        f = open(path1)
        for line in f:
            line = line.replace("\n", "")
            n, bc = line.split(":") if data == "youtube" else line.split("\t")
            self.__bc_value.append([int(n), math.log(float(bc)+1e-8)])
        f.close()
        # read node pair
        f = open(path2)
        a_list, b_list, Gtext = [], [], []
        for line in f:
            line = line.replace("\n", "")
            a, b = line.split(" ") if data == "youtube" else line.split("\t")
            a_list.append(int(a))
            b_list.append(int(b))
            Gtext.append((int(a), int(b)))
        self.__edges = [a_list+b_list, b_list+a_list]
        f.close()
        
        # Create an undirected graph G
        G = nx.Graph(Gtext)
        self.__nodes_list = [[val, 1, 1] for (node, val) in G.degree()]
        
        
    def get_nodes(self):
        return torch.Tensor(self.__nodes_list)

    def get_edges(self):
        return torch.tensor(self.__edges,dtype=torch.long)

    def get_bc_value(self):
        return torch.Tensor(self.__bc_value)

data = readData("youtube")
print(data.get_nodes().shape)
print(data.get_edges().shape)
print(data.get_bc_value())

torch.Size([1134890, 3])
torch.Size([2, 5975248])
tensor([[ 0.0000e+00,  1.7194e+01],
        [ 1.0000e+00,  1.9914e+01],
        [ 2.0000e+00,  1.4682e+01],
        ...,
        [ 1.1349e+06, -1.8421e+01],
        [ 1.1349e+06, -1.8421e+01],
        [ 1.1349e+06, -1.8421e+01]])


## 2. Pytorch Geometric

In [776]:
import torch
from torch_geometric.utils.convert import from_networkx
from torch.nn import GRU
from torch.nn import Linear
from torch_geometric.nn import GCNConv

L = 5
    
class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        torch.manual_seed(12345)
        self.input = Linear(pyg_graph.num_features, 128)
        self.relu = torch.nn.LeakyReLU()
        self.conv = GCNConv(128, 128)
        self.gru = GRU(128, 128, bias=False)
        self.linear = Linear(128, 64)
        self.output = Linear(64, 1)

    def forward(self, x, edge_index):
        # h1
        h1 = self.input(x)
        h1 = self.relu(h1)
        hi_1 = torch.nn.functional.normalize(h1, p=2, dim=1)
        # h2 ~ L
        stack_h = hi_1.reshape(1, -1, 128)
        for epoch in range(L):
            # Caculate Neighbor
            h_n = self.conv(hi_1, edge_index)
            # GRU Cell
            hv = hi_1
            hv, _ = self.gru(h_n.reshape(1, *h_n.shape), hv.reshape(1, *hv.shape)) # (input, hidden)
            # normalize new hi_1
            hi_1 = hv.reshape(-1, 128)
            hi_1 = torch.nn.functional.normalize(h1, p=2, dim=1)
            # store hidden result
            h_cat = torch.cat([stack_h, hi_1.reshape(1, -1, 128)], dim=0)
        # z is maximum from all hidden values
        z, _ = torch.max(h_cat, 0)
        
        # decorder
        h = self.linear(z)
        h = self.relu(h)
        out = self.output(h)
        
        return out

model = Model()
print(model)

Model(
  (input): Linear(in_features=3, out_features=128, bias=True)
  (relu): LeakyReLU(negative_slope=0.01)
  (conv): GCNConv(128, 128)
  (gru): GRU(128, 128, bias=False)
  (linear): Linear(in_features=128, out_features=64, bias=True)
  (output): Linear(in_features=64, out_features=1, bias=True)
)


In [689]:
import numpy as np
import networkx as nx
import random 
import math

class Graph():
    def __init__(self,batch_size, start=150, end=200, scale=None):
        self.graph_list=[]
        self.__nodes_list = []
        self.__edge_index = []
        self.__bc_value = []
        self.__sample_pairs = []
        
        for x in range(batch_size):
            if scale is not None:
                g = nx.powerlaw_cluster_graph(n=int(scale//batch_size) , m=4, p=0.05, seed=None)
            else:
                g = nx.powerlaw_cluster_graph(n=random.randint(start,end) , m=4, p=0.05, seed=None)
            self.graph_list.append(g)   
            
        s_list,t_list,en = [],[],0
        for g in self.graph_list:
            sample_1, sample_2 = [], []
            # BC Value
            self.__bc_value += list(nx.betweenness_centrality(g).values())
            # set nodes
            for x in range(g.number_of_nodes()):
                self.__nodes_list.append([g.degree[x],1,1])
                # sample pair
                sample_1 += [x+en for _ in range(5)]
                sample_2 += [x+en for _ in range(5)]
            random.shuffle(sample_1)
            random.shuffle(sample_2)
            self.__sample_pairs.extend([[i, j] for i,j in zip(sample_1,sample_2)])

            # set edges
            for e in g.edges():
                s,t = e
                s_list.append(s+en)
                t_list.append(t+en)
            en += g.number_of_nodes()
        self.__edge_index=[s_list+t_list,t_list+s_list]
        
        # log(BC value)
        for i, x in enumerate(self.__bc_value):
            self.__bc_value[i] = math.log(x+1e-8)
            
        
        
    def get_nodes(self):
        return torch.Tensor(self.__nodes_list)

    def get_edges(self):
        return torch.tensor(self.__edge_index,dtype=torch.long)

    def get_bc_value(self):
        return torch.Tensor(self.__bc_value)
        
    
    def get_pair_index(self):
        return torch.tensor(self.__sample_pairs, dtype=torch.long)
        


g = Graph(16, scale=5000)
print(g.get_nodes().shape)
print(g.get_edges().shape)
print(g.get_bc_value())
print(g.get_pair_index().shape)

torch.Size([4992, 3])
torch.Size([2, 39398])
tensor([-2.5067, -2.5306, -3.4108,  ..., -7.2286, -7.6920, -7.3794])
torch.Size([24960, 2])


In [733]:
# training
import torch.nn.functional as F
from datetime import datetime
import wandb


def train(node_start=150, node_end=200, scale=None):
    # setting
    lr = 0.0001
    MAX_ITERATION = 10000
    
    model = Model()
    criterion = torch.nn.CrossEntropyLoss(reduction='sum')  # Define loss criterion.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    time_list = []
    G = Graph(16, start=node_start, end=node_end, scale=scale)
    val_G = Graph(16, scale=100)
    bc = G.get_bc_value()
    wandb.summary["scale"] = G.get_nodes().shape[0]
    for epoch in range(MAX_ITERATION):
        start = datetime.now()
        optimizer.zero_grad() # clear existing gradients
        # compute BC ranking score
        out= model(G.get_nodes(), G.get_edges())
        # compute loss
        node_pair = G.get_pair_index()
        y_hat = out[node_pair[:, 0]] - out[node_pair[:, 1]]
        grad = torch.sigmoid((bc[node_pair[:, 0]] - bc[node_pair[:, 1]]))
        #yij, bij = sample_node(y_hat, edges)
        loss = F.binary_cross_entropy_with_logits(y_hat, grad.reshape(-1, 1), reduction="sum")
        if epoch % 500 == 0:
            print("[{}/{}] Loss:{:.4f}".format(epoch, MAX_ITERATION, loss.item()))
        loss.backward()
        # validation
        val_loss = validation(model, val_G)
        
        wandb.log({"training_loss": loss}, step=epoch)
        wandb.log({"val_loss": val_loss}, step=epoch)
        optimizer.step()
        end = datetime.now() - start
        time_list.append(end)
    if scale is not None:
        torch.save(model.state_dict(), "./models/weight_{}.pth".format(scale))
    else:
        torch.save(model.state_dict(), "./models/weight_{}_{}.pth".format(node_start, node_end))
    print("Minimum time:{}".format(min(time_list)))
    print("Maximum time:{}".format(max(time_list)))

    return model

def validation(model, val_G):
    model.eval()
    
    with torch.no_grad():
        out = model(val_G.get_nodes(), val_G.get_edges())
        node_pair = val_G.get_edges().reshape(-1, 2)
        y_hat = out[node_pair[:, 0]] - out[node_pair[:, 1]]
        bc = val_G.get_bc_value()
        grad = torch.sigmoid((bc[node_pair[:, 0]] - bc[node_pair[:, 1]]))
        loss = F.binary_cross_entropy_with_logits(y_hat, grad.reshape(-1, 1), reduction="sum")
    
    return loss.item()
        
#model = train()

In [741]:
# Top-N % accuracy
data = readData(0)
y_hat = model(data.get_nodes(), data.get_edges())
bc_value = data.get_bc_value()
predict_values =[[i, *j] for i, j in enumerate(y_hat.tolist())]

def topN_accuracy(predict_values ,bc_value, N):
    bc_value = sorted(bc_value, key = lambda e:e[1], reverse=True)
    predict_values = sorted(predict_values, key = lambda e:e[1], reverse=True)
    top_bc, top_predict = [], []
    for x in range(len(predict_values)*N//100):
        top_bc.append(int(bc_value[x][0]))
        top_predict.append(predict_values[x][0])
    
    return(len(set(top_bc)&set(top_predict)) / len(top_bc))


In [693]:
import scipy.stats as stats
# Kendall tau
data = readData(0)
y_hat = model(data.get_nodes(), data.get_edges())
bc_value = [ float(j) for i, j in data.get_bc_value()]
predict_values =[j[0] for i, j in enumerate(y_hat.tolist())]

# compute tau
tau, _ = stats.kendalltau(predict_values, bc_value)
print("Kendall tau: {}".format(tau))
wandb.summary["Kendall tau"] = tau

Kendall tau: 0.5074497908521457


In [766]:
def test(model):
    for i in range(30):
        data = readData(i)
        y_hat = model(data.get_nodes(), data.get_edges())
        bc_value = data.get_bc_value()
        predict_values =[[i, *j] for i, j in enumerate(y_hat.tolist())]
        K_bc = [ float(j) for i, j in data.get_bc_value()]
        K_predict =[j[0] for i, j in enumerate(y_hat.tolist())]

        acc_1, acc_5, acc_10, tau_list = [], [], [], []
        acc_1.append(topN_accuracy(predict_values, bc_value, N=1))
        acc_5.append(topN_accuracy(predict_values, bc_value, N=5))
        acc_10.append(topN_accuracy(predict_values, bc_value, N=10))
        tau, _ = stats.kendalltau(K_predict, K_bc)
        tau_list.append(tau)
    
    print("Kendall tau: {}".format(sum(tau_list)/len(tau_list)))
    print("Top 1 accuracy: {}".format(sum(acc_1)/len(acc_1)))
    print("Top 5 accuracy: {}".format(sum(acc_5)/len(acc_5)))
    print("Top 10 accuracy: {}".format(sum(acc_10)/len(acc_10)))
    wandb.summary["Top 1 accuracy"]= sum(acc_1)/len(acc_1)
    wandb.summary["Top 5 accuracy"]= sum(acc_5)/len(acc_5)
    wandb.summary["Top 10 accuracy"]= sum(acc_10)/len(acc_10)
    wandb.summary["Kendall tau"] = sum(tau_list)/len(tau_list)

def test_scale(model, scale):
    data = Graph(16, scale=scale)
    y_hat = model(data.get_nodes(), data.get_edges())
    bc_value = [[i, j] for i, j in enumerate(data.get_bc_value().tolist())]
    predict_values =[[i, *j] for i, j in enumerate(y_hat.tolist())]
    K_bc = data.get_bc_value().tolist()
    K_predict =[j[0] for i, j in enumerate(y_hat.tolist())]
    
    acc_1 = topN_accuracy(predict_values, bc_value, N=1)
    tau, _ = stats.kendalltau(K_predict, K_bc)
    print("[{}]Kendall tau: {}".format(scale, tau))
    print("[{}]Top 1 accuracy: {}".format(scale, acc_1))
    wandb.summary["({}) Top 1 accuracy".format(scale)]= acc_1
    wandb.summary["({}) Kendall tau".format(scale)] = tau

def test_youtube(model):
    data = readData("youtube")
    y_hat = model(data.get_nodes(), data.get_edges())
    bc_value = data.get_bc_value()
    predict_values =[[i, *j] for i, j in enumerate(y_hat.tolist())]
    K_bc = [ float(j) for i, j in data.get_bc_value()]
    K_predict =[j[0] for i, j in enumerate(y_hat.tolist())]

    acc_1 = topN_accuracy(predict_values, bc_value, N=1)
    acc_5 = topN_accuracy(predict_values, bc_value, N=5)
    acc_10 = topN_accuracy(predict_values, bc_value, N=10)
    tau, _ = stats.kendalltau(K_predict, K_bc)
    
    print("Kendall tau: {}".format(tau))
    print("Top 1 accuracy: {}".format(acc_1))
    print("Top 5 accuracy: {}".format(acc_5))
    print("Top 10 accuracy: {}".format(acc_10))

# test(model)
# test_scale(model, 10000)

In [742]:
# Experiment
from IPython.display import clear_output
    
# represent table 6, 7
run_list = ["7_13", "13_19", "63_76", "126_188", "251_313"]
scale_list = [5000, 10000, 20000, 50000, 100000]
for i in run_list:
    clear_output()
    s, e = i.split("_")
    wandb.init(project='DrBC', entity="baron")
    model = train(node_start=int(s), node_end=int(e))
    for s in scale_list:
        test_scale(model, scale=s)
    wandb.finish()
    
    
# represent table 3,4,  
run_list = [5000, 10000, 20000, 50000, 100000]

for i in run_list:
    clear_output()
    wandb.init(project='DrBC', entity="baron")
    model = train(scale=i)
    test(model)
    wandb.finish()

In [747]:
scale_list = [5000, 10000, 20000, 50000, 100000]
device = torch.device('cpu')
model = Model()
model.load_state_dict(torch.load("models/weight_13_19.pth", map_location=device))

wandb.init(project='DrBC', entity="baron")
#model = train(node_start=13, node_end=19)
for s in scale_list:
    test_scale(model, scale=s)
wandb.finish()

[5000]Kendall tau: 0.8309785489908402
[5000]Top 1 accuracy: 0.9591836734693877
[10000]Kendall tau: 0.8098825854322089
[10000]Top 1 accuracy: 0.95
[20000]Kendall tau: 0.7842089091090176
[20000]Top 1 accuracy: 0.955
[50000]Kendall tau: 0.7390498242951619
[50000]Top 1 accuracy: 0.944
[100000]Kendall tau: 0.706479276031052
[100000]Top 1 accuracy: 0.922



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
(10000) Kendall tau,0.80988
(10000) Top 1 accuracy,0.95
(100000) Kendall tau,0.70648
(100000) Top 1 accuracy,0.922
(20000) Kendall tau,0.78421
(20000) Top 1 accuracy,0.955
(5000) Kendall tau,0.83098
(5000) Top 1 accuracy,0.95918
(50000) Kendall tau,0.73905
(50000) Top 1 accuracy,0.944


## com-youtube

### Scale 5000

In [771]:
device = torch.device('cpu')
model = Model()
model.load_state_dict(torch.load("models/weight_5000.pth", map_location=device))
test_youtube(model)

Kendall tau: 0.3330251625212078
Top 1 accuracy: 0.47144871342967926
Top 5 accuracy: 0.5682362892993091
Top 10 accuracy: 0.6246508472186731


  (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))


### Scale 10000

In [772]:
device = torch.device('cpu')
model = Model()
model.load_state_dict(torch.load("models/weight_10000.pth", map_location=device))
test_youtube(model)

Kendall tau: 0.33569026812103525
Top 1 accuracy: 0.5116320056397603
Top 5 accuracy: 0.5210418722684337
Top 10 accuracy: 0.5611116495871846


### Scale 20000

In [773]:
device = torch.device('cpu')
model = Model()
model.load_state_dict(torch.load("models/weight_20000.pth", map_location=device))
test_youtube(model)

Kendall tau: 0.3117399946789241
Top 1 accuracy: 0.5685583362707085
Top 5 accuracy: 0.5850838855209362
Top 10 accuracy: 0.5071152270264079


### Scale 50000

In [774]:
device = torch.device('cpu')
model = Model()
model.load_state_dict(torch.load("models/weight_50000.pth", map_location=device))
test_youtube(model)

Kendall tau: 0.2901170617790284
Top 1 accuracy: 0.597726471624956
Top 5 accuracy: 0.4899372620893839
Top 10 accuracy: 0.4287640211826697


### Scale 100000

In [777]:
device = torch.device('cpu')
model = Model()
model.load_state_dict(torch.load("models/weight_100000.pth", map_location=device))
test_youtube(model)

Kendall tau: 0.16824677458710227
Top 1 accuracy: 0.14125837151921045
Top 5 accuracy: 0.19203792471450726
Top 10 accuracy: 0.24560089524094847
