In [1]:
import networkx as nx
import numpy as np
import pickle as pkl
import scipy.sparse as sp
import sys
import zipfile as zf
import pandas as pd
import csv
import math

def CommonNeighbors(g, u, v):
    """Return how many nodes of ``g`` are adjacent to both ``u`` and ``v``.

    ``g`` only needs a ``neighbors(node)`` method returning an iterable of
    the node's neighbours.
    """
    shared = set(g.neighbors(u)) & set(g.neighbors(v))
    return len(shared)

In [2]:
# Read the whitespace-separated edge list (one "source destination" pair per
# line, no header row) and build the graph used by every feature below.
# The separator is a raw string: '\s' in a plain literal is an invalid escape
# sequence that Python warns about and will eventually reject.
train_csv = pd.read_csv(
    './data/individual_links.txt',
    names=['source', 'destination'],
    sep=r'\s+',
)
g = nx.from_pandas_edgelist(train_csv, source='source', target='destination')
    

In [3]:
# Load the positive and negative edge samples as NumPy arrays
# (one row per sample, in file order).
edges_positive, edges_negative = (
    pd.read_csv(sample_path).to_numpy()
    for sample_path in ('./edges_pos_50k.csv', './edges_neg_50k.csv')
)

In [4]:
def generate_features(sample_list, test=False, graph=None, progress_every=10000):
    """Build one row of link-prediction features per candidate edge.

    Every returned row has exactly six entries, in the column order used by
    the CSV writers: ``[CN, RA, JC, AA, PA, label]`` -- the common-neighbour
    count followed by the networkx resource-allocation, Jaccard, Adamic-Adar
    and preferential-attachment scores, then the label.

    Parameters
    ----------
    sample_list : iterable
        Samples indexable as ``sample[0]`` (source node) and ``sample[1]``
        (target node); when ``test`` is false ``sample[2]`` is read as the
        label.
    test : bool, optional
        When true no label is read from the sample and ``-1`` is used as a
        placeholder label instead.
    graph : optional
        Graph to score against. Defaults to the notebook-level ``g`` so
        existing calls keep working.
    progress_every : int, optional
        Print the running sample count every this many samples. ``0`` or
        ``None`` disables progress output.

    Returns
    -------
    list of list
        One six-element row per input sample, in input order.

    Notes
    -----
    If scoring a pair raises (for example because a node is not in the
    graph), the error is printed and the scores that could not be computed
    are filled with ``0``. The row is still emitted with its label, so the
    output stays rectangular and aligned one-to-one with ``sample_list``.
    """
    if graph is None:
        # Fall back to the graph built earlier in the notebook.
        graph = g

    # networkx scorers, in the column order that follows the CN count.
    scorers = (
        nx.resource_allocation_index,
        nx.jaccard_coefficient,
        nx.adamic_adar_index,
        nx.preferential_attachment,
    )
    n_scores = 1 + len(scorers)

    features = []
    for count, sample in enumerate(sample_list, start=1):
        source, target = sample[0], sample[1]
        label = -1 if test else sample[2]

        feature = []
        try:
            feature.append(CommonNeighbors(graph, source, target))
            for scorer in scorers:
                # Each scorer yields (u, v, score) for the single pair passed in.
                for _, _, score in scorer(graph, [(source, target)]):
                    feature.append(score)
        except Exception as e:
            # Best effort: report the failing pair and keep going.
            print("(" + str(source) + ", " + str(target) + "): " + str(e))

        # Pad whatever could not be computed so every row has the same width,
        # then always attach the label (the original dropped it on failure).
        feature.extend([0] * (n_scores - len(feature)))
        feature.append(label)
        features.append(feature)

        if progress_every and count % progress_every == 0:
            print(count)

    print("features: "+str(len(features)))
    return features

In [None]:
# Build labelled feature rows for the positive and the negative edge samples
# (generate_features reads the label from index 2 of each sample).
features_pos = generate_features(edges_positive)
features_neg = generate_features(edges_negative)

In [None]:
# Training set: all positive rows followed by all negative rows.
features = [*features_pos, *features_neg]

In [None]:
def write_train_to_csv(features, path="train_50k.csv"):
    """Write training feature rows to a CSV file with a header line.

    Parameters
    ----------
    features : iterable of sequence
        Rows to write, each ordered as CN, RA, JC, AA, PA, Label.
    path : str, optional
        Destination file; defaults to ``"train_50k.csv"``, the location the
        notebook has always written to. An existing file is overwritten.
    """
    header = ["CN","RA","JC","AA","PA","Label"]
    # newline="" lets the csv module control line endings itself.
    with open(path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        writer.writerows(features)
        
# Persist the combined training rows to train_50k.csv.
write_train_to_csv(features)

In [None]:
def get_test_samples(path='./data/test-public.txt'):
    """Read candidate edges from a whitespace-separated test file.

    Each line is split on whitespace; the second and third fields are parsed
    as integers and returned as a ``(source, target)`` tuple. Lines that do
    not have three fields, or whose second/third field is not an integer
    (such as a header line or a blank line), are skipped.

    Parameters
    ----------
    path : str, optional
        File to read; defaults to ``'./data/test-public.txt'``.

    Returns
    -------
    list of tuple of int
        The parsed pairs, in file order.
    """
    test_edges = []
    with open(path) as test:
        for line in test:
            edge_list = line.split()
            try:
                test_edges.append((int(edge_list[1]), int(edge_list[2])))
            except (ValueError, IndexError):
                # Only parse failures are ignored; anything else (including
                # KeyboardInterrupt) propagates instead of being swallowed.
                continue
    return test_edges
# Parse the test file into a list of integer (source, target) pairs.
test_samples = get_test_samples()

In [None]:
# Compute features for the test pairs. With test=True no label is read from
# the samples; every row gets the placeholder label -1 instead.
test_features = generate_features(test_samples, test = True)

In [None]:
def write_test_to_csv(test_features, path="test_50k.csv"):
    """Write test feature rows to a CSV file with a header line.

    Parameters
    ----------
    test_features : iterable of sequence
        Rows to write, each ordered as CN, RA, JC, AA, PA, Label.
    path : str, optional
        Destination file; defaults to ``"test_50k.csv"``, the location the
        notebook has always written to. An existing file is overwritten.
    """
    header = ["CN","RA","JC","AA","PA","Label"]
    # newline="" is required by the csv module: without it, platforms that
    # translate "\n" on write turn the writer's "\r\n" into "\r\r\n", which
    # shows up as a blank line after every row.
    with open(path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        writer.writerows(test_features)



In [None]:
# Persist the test feature rows to test_50k.csv.
write_test_to_csv(test_features)