In [54]:
import pandas as pd
import os
import math
import numpy as np
from sklearn.model_selection import train_test_split
# Import the NetworkX package
import networkx as nx
import torch
print("PyTorch has version {}".format(torch.__version__))
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

PyTorch has version 1.10.2+cu102


In [2]:
train_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset1/train.csv")
test_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset1/test.csv")
upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset1/upload.csv")
content_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset1/content.csv", header=None)

In [3]:
print("training data:", len(train_csv))
print(train_csv.head())
print("testing data:", len(test_csv))
print(test_csv.head())
print("upload data:", len(upload_csv))
print(upload_csv.head())
print("content data:", len(content_csv))

training data: 8686
       id    to  from  label
0  E10311  2399  2339      0
1  E10255  2397  1144      1
2  E10667   854  1726      0
3   E9395   872   702      0
4   E5926  2450  1312      1
testing data: 2172
       id    to  from
0  E10559  2323  2673
1   E4849    81  1634
2   E3964  2405  1765
3    E542  2114   498
4    E331  1013   849
upload data: 2172
       id  prob
0  E10559   0.5
1   E4849   0.5
2   E3964   0.5
3    E542   0.5
4    E331   0.5
content data: 2708


## Create Graph
1. 如果沒有邊的話要如何放進graph裡做特徵計算呢？

In [4]:
train_csv[train_csv['to'] ==816]

Unnamed: 0,id,to,from,label
24,E9183,816,1769,0
3942,E5420,816,2461,0


In [5]:
train_csv['node_pair'] = train_csv[['to', 'from']].apply(tuple, axis=1)
train_csv['node_pair'].values[0]

(2399, 2339)

In [6]:
edge_csv = train_csv[train_csv["label"]==1]
print("edge_csv data:", len(edge_csv))

edge_csv data: 4324


In [295]:
class ReadData:
    def __init__(self, dataset):
        self.__edges = list()
        self.__feature = dict()
        
        train_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/{}/train.csv".format(dataset))
        test_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/{}/test.csv".format(dataset))
        content_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/{}/content.csv".format(dataset), header=None)
        edge_csv = train_csv[train_csv["label"]==1]
        train_csv['node_pair'] = train_csv[['to', 'from']].apply(tuple, axis=1)
        test_csv['node_pair'] = test_csv[['to', 'from']].apply(tuple, axis=1)
        
        # get feature
        for line in content_csv.to_numpy():
            l = line[0].split("\t")
            self.__feature[int(l[0])] = list(map(int,l[1:]))
        
        # Create an undirected graph G
        _, Gtext = self.get_edges(edge_csv.to_numpy())
        self.__Graph = nx.Graph(Gtext)
        self.__nodes_list = [self.__feature[node] for node in self.__Graph.nodes]
        
        # get common neighbors
        _, common_neighbors = self.get_common_neighbors(train_csv['node_pair'].values)
        _, test_common_neighbors = self.get_common_neighbors(test_csv['node_pair'].values)
        train_csv["ComNei"] = common_neighbors
        test_csv["ComNei"] = test_common_neighbors
        
        # get Preferential Attachment
        _, preatt = self.get_preatt(train_csv['node_pair'].values)
        _, test_preatt = self.get_preatt(test_csv['node_pair'].values)
        train_csv["PreAtt"] = preatt
        test_csv["PreAtt"] = test_preatt
        
        # get Jaccard's Coefficient
        _, jaccard = self.get_jaccard(train_csv['node_pair'].values)
        _, test_jaccard = self.get_jaccard(test_csv['node_pair'].values)
        train_csv["Jaccard"] = jaccard
        test_csv["Jaccard"] = test_jaccard
        
        # get Adamic/Adar
        _, aa = self.get_aa(train_csv['node_pair'].values)
        _, test_aa = self.get_aa(test_csv['node_pair'].values)
        train_csv["AA"] = aa
        test_csv["AA"] = test_aa
        
        # get Salton index
        _, salton = self.get_salton(train_csv['node_pair'].values)
        _, test_salton = self.get_salton(test_csv['node_pair'].values)
        train_csv["Salton"] = salton
        test_csv["Salton"] = test_salton
        
        # get SΦrensen index
        _, sphi = self.get_sphi(train_csv['node_pair'].values)
        _, test_sphi = self.get_sphi(test_csv['node_pair'].values)
        train_csv["Sphi"] = sphi
        test_csv["Sphi"] = test_sphi
        
        # get hub promoted index
        _, hub = self.get_hubpro(train_csv['node_pair'].values)
        _, test_hub = self.get_hubpro(test_csv['node_pair'].values)
        train_csv["HubPro"] = hub
        test_csv["HubPro"] = test_hub
        
        # get hub depressed index
        _, hub = self.get_hubdep(train_csv['node_pair'].values)
        _, test_hub = self.get_hubdep(test_csv['node_pair'].values)
        train_csv["HubDep"] = hub
        test_csv["HubDep"] = test_hub
        
        # get Leicht-Holme-Newman Index
        _, leicht = self.get_leicht(train_csv['node_pair'].values)
        _, test_leicht = self.get_leicht(test_csv['node_pair'].values)
        train_csv["Leicht"] = leicht
        test_csv["Leicht"] = test_leicht
        
        # get Resource allocation
        _, res = self.get_resource(train_csv['node_pair'].values)
        _, test_res = self.get_resource(test_csv['node_pair'].values)
        train_csv["Res"] = res
        test_csv["Res"] = test_res
        # print("training dataframe: ")
        # print(train_csv.sort_values(by="ComNei", ascending=False).head())
        # print("\ntesting dataframe: ")
        # print(test_csv.sort_values(by="ComNei", ascending=False).head())
        self.__data_array = train_csv.to_numpy()
        self.__test_array = test_csv.to_numpy()
    
    def get_nodes(self):
        return torch.Tensor(self.__nodes_list)

    def get_edges(self, edges_data):
        # read node pair
        a_list, b_list, Gtext = [], [], []
        for line in edges_data:
            a_list.append(line[1])
            b_list.append(line[2])
            Gtext.append((line[1], line[2]))
        #self.__edges = [a_list+b_list, b_list+a_list]
        self.__edges = [a_list, b_list]
        
        return torch.tensor(self.__edges,dtype=torch.long), Gtext
    
    def get_feature(self):
        return self.__feature
    
    def get_common_neighbors(self, node_pairs):
        common_neighbors = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                common_neighbors.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            common_neighbors.append(len(set(to_nlist)&set(from_nlist)))
            
        return torch.Tensor(common_neighbors), common_neighbors
    
    # Preferential Attachment
    def get_preatt(self, node_pairs):
        att = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                att.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            att.append(len(to_nlist) * len(from_nlist))
            
        return  torch.Tensor(att), att
    
    # Jaccard's Coeffieient
    def get_jaccard(self, node_pairs):
        jaccard = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                jaccard.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            jaccard.append(round(len(set(to_nlist)&set(from_nlist)) / len(set(to_nlist)|set(from_nlist)), 3))
            
        return torch.Tensor(jaccard), jaccard
    
    # Adamic/Adar
    def get_aa(self, node_pairs):
        aa = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                aa.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            neighbors = set(to_nlist)&set(from_nlist)
            aa_score = 0
            if len(neighbors) != 0:
                for neighbor in neighbors:
                    neighbor_friends = [i for i in self.__Graph.neighbors(neighbor)]
                    if len(neighbor_friends)==1:
                        continue
                    aa_score += 1/ math.log(len(neighbor_friends))
            aa.append(round(aa_score, 3))
            
        return torch.Tensor(aa), aa
    
    # Salton
    def get_salton(self, node_pairs):
        salton = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                salton.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            salton.append(round(len(set(to_nlist)&set(from_nlist)) / (len(set(to_nlist))*len(set(from_nlist))) ** 0.5, 3))
            
        return torch.Tensor(salton), salton 
    
    # SΦrensen index
    def get_sphi(self, node_pairs):
        sphi = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                sphi.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            sphi.append(round(len(set(to_nlist)&set(from_nlist)) * 2 / (len(set(to_nlist)) + len(set(from_nlist))), 3))
            
        return torch.Tensor(sphi), sphi 
    
    # Hub Promoted
    def get_hubpro(self, node_pairs):
        hub = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                hub.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            hub.append(round(len(set(to_nlist)&set(from_nlist)) * 2 / min(len(set(to_nlist)), len(set(from_nlist))), 3))
            
        return torch.Tensor(hub), hub 
    
    # Hub Depressed
    def get_hubdep(self, node_pairs):
        hub = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                hub.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            hub.append(round(len(set(to_nlist)&set(from_nlist)) * 2 / max(len(set(to_nlist)), len(set(from_nlist))), 3))
            
        return torch.Tensor(hub), hub 
    
    # Leicht-Holme-Newman
    def get_leicht(self, node_pairs):
        leicht = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                leicht.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            leicht.append(round(len(set(to_nlist)&set(from_nlist)) / (len(set(to_nlist))*len(set(from_nlist))), 3))
            
        return torch.Tensor(leicht), leicht 
    
    # Resource allocation
    def get_resource(self, node_pairs):
        res = list()
        for i, j in node_pairs:
            if i not in self.__Graph.nodes or j not in self.__Graph.nodes:
                res.append(0)
                continue
            to_nlist = [ n for n in self.__Graph.neighbors(i)]
            from_nlist = [ n for n in self.__Graph.neighbors(j)]
            neighbors = set(to_nlist)&set(from_nlist)
            res_score = 0
            if len(neighbors) != 0:
                for neighbor in neighbors:
                    neighbor_friends = [i for i in self.__Graph.neighbors(neighbor)]
                    if len(neighbor_friends)==1:
                        continue
                    res_score += 1/ len(neighbor_friends)
            res.append(round(res_score, 3))
            
        return torch.Tensor(res), res
    
    def get_training_data(self):
        target = self.__data_array[:, 3]
        training_array = self.__data_array[:, 5:].astype(float)
        training_array = self.normalize(training_array)
        
        return training_array, target
    
    def get_testing_data(self):
        testing_array = self.__test_array[:, 4:].astype(float)
        testing_array = self.normalize(testing_array)
        
        return testing_array
    
    def normalize(self, dataset):
        mean = np.mean(dataset, axis=0)
        std  = np.std(dataset, axis=0)
        
        return (dataset - mean)/std
        

data = ReadData("dataset1")
#print(data.get_edges().shape)
print(data.get_nodes().shape)
#print(data.get_common_neighbors())
print(len(data.get_feature()[351]))

torch.Size([2590, 1433])
1433


## 1. Simple NN

In [316]:
import torch
from torchsummary import summary
from torch.nn import Linear, Sequential, Sigmoid

In [319]:
class NN(torch.nn.Module):
    def __init__(self, feature):
        super(NN, self).__init__()
        torch.manual_seed(12345)
        self.nn = Sequential(
            Linear(feature, 32),
            torch.nn.LeakyReLU(),
            Linear(32, 64), 
            torch.nn.LeakyReLU(),
            Linear(64, 128),
            torch.nn.LeakyReLU(),
            Linear(128, 64),
            torch.nn.LeakyReLU(),
            Linear(64, 32),
            torch.nn.LeakyReLU(),
            Linear(32, 16),
            torch.nn.LeakyReLU(),
            Linear(16, 1)
        )
        
    def forward(self, x):
        out = self.nn(x)
        
        return out
    
training_array, target = data.get_training_data()
training_array = torch.tensor(training_array).float()
target = torch.Tensor(target.astype(int))
X_train, X_val, y_train, y_val = train_test_split(training_array, target, test_size=0.1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
nn_model = NN(X_train.shape[1]).to("cpu")
summary(nn_model, input_size = X_train.shape, device="cpu")
#print(nn_model)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1             [-1, 7817, 32]             352
         LeakyReLU-2             [-1, 7817, 32]               0
            Linear-3             [-1, 7817, 64]           2,112
         LeakyReLU-4             [-1, 7817, 64]               0
            Linear-5            [-1, 7817, 128]           8,320
         LeakyReLU-6            [-1, 7817, 128]               0
            Linear-7             [-1, 7817, 64]           8,256
         LeakyReLU-8             [-1, 7817, 64]               0
            Linear-9             [-1, 7817, 32]           2,080
        LeakyReLU-10             [-1, 7817, 32]               0
           Linear-11             [-1, 7817, 16]             528
        LeakyReLU-12             [-1, 7817, 16]               0
           Linear-13              [-1, 7817, 1]              17
Total params: 21,665
Trainable params: 

In [320]:
import wandb
import torch.nn.functional as F

def train(X_train, X_val, y_train, y_val):
    # setting
    lr = wandb.config["learning_rate"]
    MAX_ITERATION = wandb.config["MAX_ITERATION"]
    
    model = NN(X_train.shape[1])
    criterion = torch.nn.CrossEntropyLoss(reduction='sum')  # Define loss criterion.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(MAX_ITERATION):
        optimizer.zero_grad() # clear existing gradients
        # compute BC ranking score
        out= model(X_train)
        # compute loss
        #yij, bij = sample_node(y_hat, edges)
        loss = F.binary_cross_entropy_with_logits(out, y_train.reshape(-1, 1), reduction="sum")
        if epoch % 500 == 0:
            print("[{}/{}] Loss:{:.4f}".format(epoch, MAX_ITERATION, loss.item()))
        loss.backward()
        # validation
        val_loss = validation(model, X_val, y_val)
        
        wandb.log({"training_loss": loss}, step=epoch)
        wandb.log({"val_loss": val_loss}, step=epoch)
        optimizer.step()
    
    #evaluation 
    y_train_hat = model(X_train)
    y_val_hat = model(X_val)
    training_auc, training_pre, val_auc, val_pre = evaluation(y_train.detach().numpy(),
                                                              y_val.detach().numpy(),
                                                              y_train_hat.detach().numpy(),
                                                              y_val_hat.detach().numpy())
    wandb.summary["Training AUC"] = training_auc
    wandb.summary["Training PRE"] = training_pre
    wandb.summary["Validation AUC"] = val_auc
    wandb.summary["Validation PRE"] = val_pre
    return model
        
def validation(model, X_val, y_val):
    model.eval()
    
    with torch.no_grad():
        out = model(X_val)
        loss = F.binary_cross_entropy_with_logits(out, y_val.reshape(-1, 1), reduction="sum")
    
    return loss.item()



### Training

In [321]:
wandb.init(project='Link_Prediction@MLG', entity="baron")
wandb.config["learning_rate"] = 0.0001
wandb.config["MAX_ITERATION"] = 10000
wandb.config["Dataset"] = "dataset1"

data = ReadData(wandb.config["Dataset"])
training_array, target = data.get_training_data()
training_array = torch.tensor(training_array).float()
target = torch.Tensor(target.astype(int))
X_train, X_val, y_train, y_val = train_test_split(training_array, target, test_size=0.1)
nn_model1 = train(X_train, X_val, y_train, y_val)

wandb.finish()



[0/10000] Loss:5444.0708
[500/10000] Loss:3613.0115
[1000/10000] Loss:3517.5767
[1500/10000] Loss:3507.4897
[2000/10000] Loss:3502.2700
[2500/10000] Loss:3496.4570
[3000/10000] Loss:3490.4246
[3500/10000] Loss:3485.2556
[4000/10000] Loss:3476.8940
[4500/10000] Loss:3468.6230
[5000/10000] Loss:3461.6006
[5500/10000] Loss:3453.0957
[6000/10000] Loss:3446.4187
[6500/10000] Loss:3441.2019
[7000/10000] Loss:3436.4324
[7500/10000] Loss:3432.9287
[8000/10000] Loss:3437.5508
[8500/10000] Loss:3430.0747
[9000/10000] Loss:3423.9395
[9500/10000] Loss:3419.9929



0,1
training_loss,█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Training AUC,0.85565
Training PRE,0.87875
Validation AUC,0.84946
Validation PRE,0.87619
training_loss,3415.75635
val_loss,394.60452


In [322]:
wandb.init(project='Link_Prediction@MLG', entity="baron")
wandb.config["learning_rate"] = 0.0001
wandb.config["MAX_ITERATION"] = 10000
wandb.config["Dataset"] = "dataset2"

data = ReadData(wandb.config["Dataset"])
training_array, target = data.get_training_data()
training_array = torch.tensor(training_array).float()
target = torch.Tensor(target.astype(int))
X_train, X_val, y_train, y_val = train_test_split(training_array, target, test_size=0.1)
nn_model2 = train(X_train, X_val, y_train, y_val)

wandb.finish()



[0/10000] Loss:4726.4482
[500/10000] Loss:3322.5095
[1000/10000] Loss:3273.3677
[1500/10000] Loss:3214.0251
[2000/10000] Loss:3175.0457
[2500/10000] Loss:3155.9573
[3000/10000] Loss:3150.3879
[3500/10000] Loss:3142.2351
[4000/10000] Loss:3136.0608
[4500/10000] Loss:3131.4285
[5000/10000] Loss:3129.3289
[5500/10000] Loss:3127.7649
[6000/10000] Loss:3125.8442
[6500/10000] Loss:3122.9280
[7000/10000] Loss:3119.7134
[7500/10000] Loss:3116.3647
[8000/10000] Loss:3113.2212
[8500/10000] Loss:3110.6143
[9000/10000] Loss:3106.6086
[9500/10000] Loss:3104.7983



0,1
training_loss,█▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂
val_loss,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Training AUC,0.82841
Training PRE,0.84948
Validation AUC,0.85751
Validation PRE,0.87269
training_loss,3224.85254
val_loss,337.2561


In [323]:
wandb.init(project='Link_Prediction@MLG', entity="baron")
wandb.config["learning_rate"] = 0.0001
wandb.config["MAX_ITERATION"] = 10000
wandb.config["Dataset"] = "dataset3"

data = ReadData(wandb.config["Dataset"])
training_array, target = data.get_training_data()
training_array = torch.tensor(training_array).float()
target = torch.Tensor(target.astype(int))
X_train, X_val, y_train, y_val = train_test_split(training_array, target, test_size=0.1)
nn_model3 = train(X_train, X_val, y_train, y_val)

wandb.finish()



[0/10000] Loss:1610.0048
[500/10000] Loss:970.8749
[1000/10000] Loss:924.8173
[1500/10000] Loss:902.5377
[2000/10000] Loss:886.8524
[2500/10000] Loss:868.8429
[3000/10000] Loss:865.0078
[3500/10000] Loss:862.3101
[4000/10000] Loss:860.3144
[4500/10000] Loss:858.5636
[5000/10000] Loss:857.3154
[5500/10000] Loss:855.5689
[6000/10000] Loss:854.1609
[6500/10000] Loss:852.7396
[7000/10000] Loss:851.4821
[7500/10000] Loss:849.4568
[8000/10000] Loss:847.1223
[8500/10000] Loss:845.8907
[9000/10000] Loss:844.6510
[9500/10000] Loss:843.5234



0,1
training_loss,█▄▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Training AUC,0.90259
Training PRE,0.91428
Validation AUC,0.88654
Validation PRE,0.90982
training_loss,842.65417
val_loss,102.33006


## 2. Random Forest

In [307]:
from sklearn.ensemble import RandomForestRegressor
import joblib

def RandomForest(dataset, config=False):
    training_array, target = dataset.get_training_data()
    reg = RandomForestRegressor(n_estimators=int(config["n_estimators"]),
                               min_samples_leaf=int(config["min_samples_leaf"]),
                               min_samples_split=int(config["min_samples_split"]),
                               oob_score=bool(config["oob_score"]))

    X_train, X_val, y_train, y_val = train_test_split(training_array, target, test_size=0.2)
    y_train = y_train.astype('int')
    y_val = y_val.astype('int')

    reg.fit(X_train, y_train)
    y_train_hat = np.where(reg.predict(X_train)>0.5, 1, 0)
    y_val_hat = np.where(reg.predict(X_val)>0.5, 1, 0)

    training_auc, training_pre, val_auc, val_pre = evaluation(y_train, y_val, y_train_hat, y_val_hat)
    wandb.summary["feature_number"] = X_train.shape[1]
    wandb.log({
        "Training AUC":training_auc,
        "Training PRE": training_pre,
        "Validation AUC": val_auc,
        "Validation PRE": val_pre
    })

    joblib.dump(reg, os.path.join(wandb.run.dir, 'rf_model.h5'))
    
    return reg

### Training

In [308]:
import wandb 
from IPython.display import clear_output
sweep_config = {
    "method": "bayes",
    "metric":{
        "name": "Validation PRE",
        "goal": "maximize"
    },
    
    "parameters":{
        "n_estimators":{
            'min': 10,
            'max': 150
        },
        "min_samples_leaf":{
            'min': 1,
            'max': 50,
        },
        "min_samples_split":{
            'min': 2,
            'max': 5
        },
        "oob_score":{
            'values': [True, False]
        }
    }
    
}

In [309]:
def train_dataset1_rf():
    with wandb.init(project='Link_Prediction@MLG', entity="baron", group="rf") as run:
        config = wandb.config
        dataset = ReadData("dataset1")
        model = RandomForest(dataset, config)
        
def train_dataset2_rf():
    with wandb.init(project='Link_Prediction@MLG', entity="baron", group="rf") as run:
        config = wandb.config
        dataset = ReadData("dataset2")
        model = RandomForest(dataset, config)

def train_dataset3_rf():
    with wandb.init(project='Link_Prediction@MLG', entity="baron", group="rf") as run:
        config = wandb.config
        dataset = ReadData("dataset3")
        model = RandomForest(dataset, config)

        
count = 100 # number of runs to execute
# training dataset 1
dataset1 = ReadData("dataset1")
sweep_id = wandb.sweep(sweep_config)
wandb.agent(sweep_id, function=train_dataset2_rf, count=count)
clear_output()
# training dataset 2
dataset2 = ReadData("dataset2")
sweep_id = wandb.sweep(sweep_config)
wandb.agent(sweep_id, function=train_dataset2_rf, count=count)
clear_output()
# training dataset 3
dataset3 = ReadData("dataset3")
sweep_id = wandb.sweep(sweep_config)
wandb.agent(sweep_id, function=train_dataset3_rf, count=count)  
clear_output()

Create sweep with ID: 1srz43lz
Sweep URL: https://wandb.ai/baron/Link_Prediction%40MLG/sweeps/1srz43lz


wandb: Agent Starting Run: u24iffmd with config:
wandb: 	min_samples_leaf: 34
wandb: 	min_samples_split: 4
wandb: 	n_estimators: 79
wandb: 	oob_score: False





0,1
Training AUC,▁
Training PRE,▁
Validation AUC,▁
Validation PRE,▁

0,1
Training AUC,0.8101
Training PRE,0.75493
Validation AUC,0.79164
Validation PRE,0.73425
feature_number,10.0


wandb: Agent Starting Run: bxtnoojg with config:
wandb: 	min_samples_leaf: 21
wandb: 	min_samples_split: 4
wandb: 	n_estimators: 137
wandb: 	oob_score: True





0,1
Training AUC,▁
Training PRE,▁
Validation AUC,▁
Validation PRE,▁

0,1
Training AUC,0.80604
Training PRE,0.7726
Validation AUC,0.8192
Validation PRE,0.79053
feature_number,10.0


wandb: Sweep Agent: Waiting for job.
wandb: Job received.
wandb: Agent Starting Run: xd1t757a with config:
wandb: 	min_samples_leaf: 17
wandb: 	min_samples_split: 4
wandb: 	n_estimators: 120
wandb: 	oob_score: True





0,1
Training AUC,▁
Training PRE,▁
Validation AUC,▁
Validation PRE,▁

0,1
Training AUC,0.8215
Training PRE,0.78049
Validation AUC,0.76921
Validation PRE,0.73341
feature_number,10.0


wandb: Agent Starting Run: rj3eet2c with config:
wandb: 	min_samples_leaf: 32
wandb: 	min_samples_split: 2
wandb: 	n_estimators: 46
wandb: 	oob_score: True





0,1
Training AUC,▁
Training PRE,▁
Validation AUC,▁
Validation PRE,▁

0,1
Training AUC,0.80246
Training PRE,0.76452
Validation AUC,0.81043
Validation PRE,0.77702
feature_number,10.0


wandb: Agent Starting Run: opufc0j9 with config:
wandb: 	min_samples_leaf: 6
wandb: 	min_samples_split: 2
wandb: 	n_estimators: 43
wandb: 	oob_score: True





0,1
Training AUC,▁
Training PRE,▁
Validation AUC,▁
Validation PRE,▁

0,1
Training AUC,0.8194
Training PRE,0.79098
Validation AUC,0.82236
Validation PRE,0.78745
feature_number,10.0


wandb: Agent Starting Run: ey2f5igf with config:
wandb: 	min_samples_leaf: 34
wandb: 	min_samples_split: 5
wandb: 	n_estimators: 32
wandb: 	oob_score: True





0,1
Training AUC,▁
Training PRE,▁
Validation AUC,▁
Validation PRE,▁

0,1
Training AUC,0.81186
Training PRE,0.75219
Validation AUC,0.78595
Validation PRE,0.74429
feature_number,10.0


wandb: Agent Starting Run: zfv5bzuz with config:
wandb: 	min_samples_leaf: 16
wandb: 	min_samples_split: 5
wandb: 	n_estimators: 123
wandb: 	oob_score: True





0,1
Training AUC,▁
Training PRE,▁
Validation AUC,▁
Validation PRE,▁

0,1
Training AUC,0.80865
Training PRE,0.76848
Validation AUC,0.80599
Validation PRE,0.76641
feature_number,10.0


wandb: Agent Starting Run: bs48ug5h with config:
wandb: 	min_samples_leaf: 14
wandb: 	min_samples_split: 3
wandb: 	n_estimators: 17
wandb: 	oob_score: True





  warn(


0,1
Training AUC,▁
Training PRE,▁
Validation AUC,▁
Validation PRE,▁

0,1
Training AUC,0.81561
Training PRE,0.77618
Validation AUC,0.82503
Validation PRE,0.79715
feature_number,10.0


wandb: Agent Starting Run: zwoda40q with config:
wandb: 	min_samples_leaf: 3
wandb: 	min_samples_split: 5
wandb: 	n_estimators: 102
wandb: 	oob_score: False





0,1
Training AUC,▁
Training PRE,▁
Validation AUC,▁
Validation PRE,▁

0,1
Training AUC,0.81931
Training PRE,0.77387
Validation AUC,0.82704
Validation PRE,0.78912
feature_number,10.0


wandb: Agent Starting Run: hbuu6nwa with config:
wandb: 	min_samples_leaf: 34
wandb: 	min_samples_split: 5
wandb: 	n_estimators: 141
wandb: 	oob_score: True





0,1
Training AUC,▁
Training PRE,▁
Validation AUC,▁
Validation PRE,▁

0,1
Training AUC,0.81106
Training PRE,0.76901
Validation AUC,0.79232
Validation PRE,0.75611
feature_number,10.0


wandb: Agent Starting Run: 3m31m7hq with config:
wandb: 	min_samples_leaf: 26
wandb: 	min_samples_split: 3
wandb: 	n_estimators: 105
wandb: 	oob_score: True





0,1
Training AUC,▁
Training PRE,▁
Validation AUC,▁
Validation PRE,▁

0,1
Training AUC,0.81038
Training PRE,0.7634
Validation AUC,0.79864
Validation PRE,0.73643
feature_number,10.0


wandb: Agent Starting Run: sln942e2 with config:
wandb: 	min_samples_leaf: 33
wandb: 	min_samples_split: 3
wandb: 	n_estimators: 149
wandb: 	oob_score: False





0,1
Training AUC,▁
Training PRE,▁
Validation AUC,▁
Validation PRE,▁

0,1
Training AUC,0.80959
Training PRE,0.75627
Validation AUC,0.81504
Validation PRE,0.75704
feature_number,10.0


wandb: Agent Starting Run: wwcriamj with config:
wandb: 	min_samples_leaf: 13
wandb: 	min_samples_split: 2
wandb: 	n_estimators: 45
wandb: 	oob_score: True





0,1
Training AUC,▁
Training PRE,▁
Validation AUC,▁
Validation PRE,▁

0,1
Training AUC,0.81795
Training PRE,0.7768
Validation AUC,0.81629
Validation PRE,0.79443
feature_number,10.0


wandb: Agent Starting Run: zo708sxs with config:
wandb: 	min_samples_leaf: 3
wandb: 	min_samples_split: 5
wandb: 	n_estimators: 14
wandb: 	oob_score: False





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [314]:
import tempfile

def dict_to_config(d):
    class Object(object):
        pass

    config = Object()
    for key, value in d.items():
        setattr(config, key, value)
    return config

def parse_wandb_models(path, model_name, numbers_models=None, metric=None):
    '''Parse wandb models with either run paths or a sweep path.

    Args:
        path: a list contains either run paths or a sweep path
        numbers_models: a integer or a list of numbers of models.
                        if None, treat path as run paths, otherwise treat it as a sweep path.
        metric: metric to sort by when parsing a sweep path
    '''
    api = wandb.Api()
    models, configs, model_paths = list(), list(), list()
    sweep_name = ''

    modeldir = tempfile.mkdtemp()

    if numbers_models is not None: # sweep
        numbers_models = max(numbers_models) if isinstance(numbers_models, list) else numbers_models

        sweep = api.sweep(path[0])
        sweep_name = sweep.config.get('name', '')
        # sort runs by metric
        runs = sorted(sweep.runs, key=lambda run: run.summary.get(metric, 0), 
                            reverse=True)
        runs = runs[:numbers_models]
        
    else:
        runs = [api.run(p) for p in path]

    for run in runs:
        run.file('{}.h5'.format(model_name)).download(replace=True, root=modeldir)

        # load_model =
        models.append(joblib.load(modeldir + '/{}.h5'.format(model_name)))

        configs.append(dict_to_config(run.config))
        model_paths.append(run.path)

    return models, configs, model_paths, sweep_name

def MeanAccuracy(dataset, models):
    sum_training_auc, sum_training_pre, sum_val_auc, sum_val_pre = 0, 0, 0, 0
    n = len(model_paths)
    training_array, target = dataset.get_training_data()
    X_train, X_val, y_train, y_val = train_test_split(training_array, target, test_size=0.2)
    for model in models:
        y_train_hat = np.where(model.predict(X_train)>0.5, 1, 0)
        y_val_hat = np.where(model.predict(X_val)>0.5, 1, 0)
        y_train = y_train.astype('int')
        y_val = y_val.astype('int')
        training_auc, training_pre, val_auc, val_pre = evaluation(y_train, y_val, y_train_hat, y_val_hat)
        sum_training_auc += training_auc
        sum_training_pre += training_pre
        sum_val_auc +=  val_auc
        sum_val_pre +=  val_pre
    print("training auc: {:.3f} pre: {:.3f}, validation auc: {:.3f} pre: {:.3f}".format(sum_training_auc/n, sum_training_pre/n, sum_val_auc/n, sum_val_pre/n))


print("Dataset1: ")
rf_models1, configs, model_paths, sweep_name = parse_wandb_models(path=["baron/Link_Prediction@MLG/u13o8xd1"], model_name="rf_model", numbers_models=5, metric="Validation PRE")
MeanAccuracy(dataset1, rf_models1)
print("Dataset2: ")
rf_models2, configs, model_paths, sweep_name = parse_wandb_models(path=["baron/Link_Prediction@MLG/hjl5err9"], model_name="rf_model", numbers_models=5, metric="Validation PRE")
MeanAccuracy(dataset2, rf_models2)
print("Dataset3: ")
rf_models3, configs, model_paths, sweep_name = parse_wandb_models(path=["baron/Link_Prediction@MLG/1srz43lz"], model_name="rf_model", numbers_models=5, metric="Validation PRE")
MeanAccuracy(dataset3, rf_models3)

Dataset1: 
training auc: 0.765 pre: 0.702, validation auc: 0.753 pre: 0.694
Dataset2: 
training auc: 0.747 pre: 0.702, validation auc: 0.748 pre: 0.699
Dataset3: 
training auc: 0.820 pre: 0.790, validation auc: 0.803 pre: 0.759


## 3. Randomwalk with restart

In [2]:
! pwd

/home/baron/HW/link_prediction


In [11]:
import os
import sys
sys.path.append(os.path.abspath("./pyrwr"))
print(sys.path)
from pyrwr.rwr import RWR

['/home/baron/HW/link_prediction', '/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/home/baron/.local/lib/python3.8/site-packages', '/usr/local/lib/python3.8/dist-packages', '/usr/lib/python3/dist-packages', '/home/baron/HW/link_prediction/pyrwr', '/home/baron/HW/link_prediction/pyrwr']


ModuleNotFoundError: No module named 'pyrwr.rwr'

## Evaluation

In [98]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [131]:
def evaluation(y_train, y_val, y_train_hat, y_val_hat):
    training_auc, val_auc = roc_auc_score(y_train, y_train_hat), roc_auc_score(y_val, y_val_hat)
    training_pre, val_pre = average_precision_score(y_train, y_train_hat), average_precision_score(y_val, y_val_hat)
    
    return training_auc, training_pre, val_auc, val_pre

## Output

### Simple NN

In [324]:
upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset1/upload.csv")
X_testing = dataset1.get_testing_data()
X_testing = torch.tensor(X_testing).float()
upload_csv["prob"] = F.sigmoid(nn_model1(X_testing)).detach().numpy()
print("dataset1:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_nn1_moref.csv')

upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset2/upload.csv")
X_testing = dataset2.get_testing_data()
X_testing = torch.tensor(X_testing).float()
upload_csv["prob"] = F.sigmoid(nn_model2(X_testing)).detach().numpy()
print("\ndataset2:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_nn2_moref.csv')

upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset3/upload.csv")
X_testing = dataset3.get_testing_data()
X_testing = torch.tensor(X_testing).float()
upload_csv["prob"] = F.sigmoid(nn_model3(X_testing)).detach().numpy()
print("\ndataset3:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_nn3_moref.csv')

dataset1:
       id      prob
0  E10559  0.365705
1   E4849  0.250038
2   E3964  0.250038
3    E542  0.571623
4    E331  0.885949

dataset2:
      id      prob
0  E3064  0.599758
1   E298  0.575320
2  E3512  1.000000
3  E5670  1.000000
4  E5005  0.599758

dataset3:
      id      prob
0   E370  0.465278
1   E667  0.570752
2  E3190  0.515711
3   E848  0.382854
4  E2161  0.331145




### Random forest

In [315]:
upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset1/upload.csv")
X_testing = dataset1.get_testing_data()
sum_value = 0
for model in rf_models1:
    predict_value = model.predict(X_testing)
    sum_value += predict_value
upload_csv["prob"] = sum_value/len(rf_models1)
print("dataset1:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_rf1_moref.csv')

upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset2/upload.csv")
X_testing = dataset2.get_testing_data()
sum_value = 0
for model in rf_models2:
    predict_value = model.predict(X_testing)
    sum_value += predict_value
upload_csv["prob"] = sum_value/len(rf_models2)
print("\ndataset2:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_rf2_moref.csv')

upload_csv = pd.read_csv("../../data/SocialNetowrk/hw2_data/dataset3/upload.csv")
X_testing = dataset3.get_testing_data()
sum_value = 0
for model in rf_models3:
    predict_value = model.predict(X_testing)
    sum_value += predict_value
upload_csv["prob"] = sum_value/len(rf_models3)
print("\ndataset3:")
print(upload_csv.head())
upload_csv.to_csv('upload/upload_rf3_moref.csv')

dataset1:
       id      prob
0  E10559  0.489356
1   E4849  0.332700
2   E3964  0.332700
3    E542  0.555823
4    E331  0.890325

dataset2:
      id      prob
0  E3064  0.604605
1   E298  0.607758
2  E3512  0.999690
3  E5670  0.999310
4  E5005  0.604605

dataset3:
      id      prob
0   E370  0.478342
1   E667  0.463736
2  E3190  0.489414
3   E848  0.518443
4  E2161  0.401615
