In [1]:
# Inject a CSS rule that sets the `.container` element's width to 100%.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Use the following algorithms
1. [jaccard](https://networkx.github.io/documentation/networkx-1.9/reference/generated/networkx.algorithms.link_prediction.jaccard_coefficient.html)
2. [adamic adar](https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.link_prediction.adamic_adar_index.html)
3. [preferential attachment](https://networkx.github.io/documentation/stable/reference/algorithms/generated/networkx.algorithms.link_prediction.preferential_attachment.html#networkx.algorithms.link_prediction.preferential_attachment)
4. [katz](https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.centrality.katz_centrality.html) (and try different beta, maybe max_power) 

    4.1 beta (float): with a small beta, Katz behaves more like the CommonNeighbor algorithm, which only considers direct neighbors and ignores nodes farther away  
    4.2 max_power (int): the maximum path length that is taken into account

We want this for output
* P is whether the Katz value exceeds the threshold

                Katz,   P,    TP,    normalized betweenness of the whole graph
        A, B    0.05,   1,    1,     
        C, D    0.07,   1,    0,     
        E, F    0.10,   0,    0,     
        
# How we find the best threshold
To find the best sensitivity and specificity, we use 
[YoudenIndex](https://en.wikipedia.org/wiki/Youden%27s_J_statistic?fbclid=IwAR3OICm3oJQQnopIKeflrpd1K-DcxahLE9IerxPfb8k6uISWt30D4DexuCA)    

    J = sensitivity + specificity - 1
    sensitivity = tp / (tp + fn)   (denominator: ALL YES, i.e. all actual positives)
    specificity = tn / (fp + tn)   (denominator: ALL NO, i.e. all actual negatives)
    
## sensitivity and specificity
[wikipedia](https://en.wikipedia.org/wiki/Sensitivity_and_specificity)

In [26]:
import os
import pandas as pd
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Ask which dataset to work on; each prompt text lists the expected answers.
journal = input('scientometrics / jasist')
entity = input('co-author / co-occurrence')

# Data paths in this cell are built relative to the current working directory.
path = os.getcwd()
evaluation_result_path = path+"\\data_transformation_output\\{}\\netfiles\\{}\\".format(journal, entity)

# Collect every .txt file found in that directory and, in addition, file each
# one under a per-algorithm list chosen by a substring of its name.
all_eva_files = []
a1_eva_files, a2_eva_files, a3_eva_files = [], [], []
for f in os.listdir(evaluation_result_path):
    if not f.endswith(".txt"):
        continue
    eva_file = evaluation_result_path+f
    all_eva_files.append(eva_file)
    # The first matching marker wins, so a file lands in at most one list.
    for marker, bucket in (('Jaccard', a1_eva_files),
                           ('Katz_beta001', a2_eva_files),
                           ('AdamicAdar', a3_eva_files)):
        if marker in f:
            bucket.append(eva_file)
            break
all_a_eva_files = [a1_eva_files, a2_eva_files, a3_eva_files]

scientometrics / jasistscientometrics
co-author / co-occurrenceco-occurrence


### netfiles

In [27]:
# Collect the full paths of the .net network files for the chosen journal/entity.
netfiles_path = path+"\\data_transformation_output\\{}\\netfiles\\{}\\".format(journal, entity)
# os.listdir() returns entries in arbitrary order, but the training and testing
# networks are later picked by position (netfiles[0], netfiles[1]); sorting by
# file name makes that choice deterministic.
netfiles = sorted(
    netfiles_path+f for f in os.listdir(netfiles_path) if f.endswith('.net')
)

netfiles

['C:\\Users\\Liser\\Desktop\\linchengwei_link_prediction\\data_transformation_output\\scientometrics\\netfiles\\co-occurrence\\2010-15.net',
 'C:\\Users\\Liser\\Desktop\\linchengwei_link_prediction\\data_transformation_output\\scientometrics\\netfiles\\co-occurrence\\2016-16.net',
 'C:\\Users\\Liser\\Desktop\\linchengwei_link_prediction\\data_transformation_output\\scientometrics\\netfiles\\co-occurrence\\2016-17.net',
 'C:\\Users\\Liser\\Desktop\\linchengwei_link_prediction\\data_transformation_output\\scientometrics\\netfiles\\co-occurrence\\2016-18.net',
 'C:\\Users\\Liser\\Desktop\\linchengwei_link_prediction\\data_transformation_output\\scientometrics\\netfiles\\co-occurrence\\2016-19.net']

### start

In [28]:
import linkpred
# from matplotlib import pyplot as plt
# The first two entries of `netfiles` act as the training and the testing network.
training_file, testing_file = netfiles[0], netfiles[1]

# Load the training network (G) and the testing network (test).
G = linkpred.read_network(training_file)
test = linkpred.read_network(testing_file)

# `training` is a plain copy of G; removing the test edges from it is
# currently disabled (the call below is commented out).
training = G.copy()
# training.remove_edges_from(test.edges())

# Earlier scratch version of the Jaccard prediction/evaluation, kept for reference:
# jaccard = linkpred.predictors.Jaccard(training, excluded=training.edges()) #, excluded=training.edges()
# jaccard_results = jaccard.predict()
# test_set = set(linkpred.evaluation.Pair(u, v) for u, v in test.edges())
# evaluation = linkpred.evaluation.EvaluationSheet(jaccard_results, test_set)

Network contains multiple edges. These will be ignored.
Network contains multiple edges. These will be ignored.


# Computing Edge Betweenness in full testing network

In [11]:
import networkx as nx
# Edge betweenness centrality for every edge of `test`. This is meant to run
# while `test` is still the full testing network, i.e. before the cell that
# rebinds `test` to the subset of nodes shared with the training network.
edge_betweenness_result = nx.edge_betweenness_centrality(test)

In [12]:
# Number of entries (one per edge) in the betweenness result.
len(edge_betweenness_result)

3772

In [13]:
# Print one "<edge> <score>" line per entry of the betweenness result.
for kw in edge_betweenness_result:
    score = edge_betweenness_result[kw]
    print(kw, score)

('"All-in-one" indicator', 'Altmetric score') 1.3382348414793919e-06
('"All-in-one" indicator', 'Altmetrics') 0.0013435877808453095
('1 Nano enabled drug delivery', 'Bibliography analysis') 1.3382348414793919e-06
('1 Nano enabled drug delivery', 'Graphene') 6.691174207396959e-06
('1 Nano enabled drug delivery', 'Specialization score') 1.3382348414793919e-06
('1 Nano enabled drug delivery', 'Interdisciplinarity') 0.0013342201369549538
('1 Nano enabled drug delivery', 'Innovation trajectory') 1.3382348414793919e-06
('AIC', 'Discretised lognormal') 0.0013395730763208712
('AIC', 'Citation counts') 1.3382348414793919e-06
('AIC', 'Stopped sum models') 1.3382348414793919e-06
('AIC', 'Negative binomial') 1.3382348414793919e-06
('AIC', 'Standard error') 1.3382348414793919e-06
('ANVUR', 'Research evaluation') 0.00023240714763335496
('ANVUR', 'Impact factor') 0.0002493787365156624
('ANVUR', 'Bibliometrics') 0.000863140131537771
('ARWU', 'Global university rankings') 1.3382348414793919e-06
('ARWU'

('Cluster analysis', 'Subject-classification') 0.0006850188422054075
('Cluster of Excellence', 'Excellence Initiative') 7.544298918840069e-05
('Cluster of Excellence', 'Highly cited papers') 1.3382348414793919e-06
('Clustering', 'HCV and Egypt') 3.1126705158124234e-05
('Clustering', 'PubMed') 3.1126705158124234e-05
('Clustering', 'Machine learning') 0.00026327099012655287
('Clustering', 'Data base') 3.1126705158124234e-05
('Clustering', 'Core-periphery structure') 0.00030638092268096785
('Clustering', 'Regression analysis') 3.1126705158124234e-05
('Clustering', 'Data mining') 0.0002381660671958601
('Clustering', 'Collaboration network') 0.0006030523204008948
('Co-author', 'Collaboration') 0.00025686576367853496
('Co-author', 'Scientometrics') 0.0010867220171667787
('Co-author', 'Evolving network') 1.3382348414793919e-06
('Co-author network metrics', 'Indicators') 0.0013409113111623506
('Co-author network metrics', 'Transformative research') 1.3382348414793919e-06
('Co-authorship', 'Lin

('Intellectual base', 'Massive gathering') 0.00014611689947887995
('Intellectual base', 'Multidisciplinary research field') 0.00014611689947887995
('Intellectual structure', 'Topic hierarchy') 0.0013368966066379124
('Intellectual structure', 'Multivariate scaling') 1.0705878731835135e-05
('Intellectual structure', 'Strategic knowledge management') 1.0705878731835135e-05
('Intellectual structure', 'Strategic behavior') 1.0705878731835135e-05
('Intellectual structure', 'Keyword network') 0.0053475864265516496
('Intellectual structure', 'K-core decomposition') 0.0013368966066379124
('Intellectual structure', 'Strategic entrepreneurship') 1.0705878731835135e-05
('Intellectual structure', 'Strategic management') 1.0705878731835135e-05
('Intellectual structures and dynamics', 'Scientometric mapping') 1.3382348414793919e-06
('Intellectual structures and dynamics', 'Transport geography') 1.3382348414793919e-06
('Intellectual theft', 'Scientific misconduct') 1.3382348414793919e-06
('Intellectua

('Theories', 'Web') 1.3382348414793919e-06
('Theories', 'Webometrics') 8.400928531058488e-05
("Thomson Reuters' Incites", 'VOS viewer map') 1.3382348414793919e-06
('Trustworthiness', 'Usability') 1.3382348414793919e-06
('Twenty-first century', 'h-Core scores') 1.3382348414793919e-06
('Twenty-first century', 'h-Index') 0.00028558190687736485
('Twitter Index', 'Twitter percentiles') 1.3382348414793919e-06
('Twitter Index', 'Twitter counts') 1.3382348414793919e-06
('Twitter counts', 'Twitter percentiles') 1.3382348414793919e-06
('URAP', 'World-class university') 1.3382348414793919e-06
('URAP', 'USNWR') 1.3382348414793919e-06
('USNWR', 'World-class university') 1.3382348414793919e-06
('University', 'beta-Convergence') 0.0010412627011355954
('University R&D and innovation', 'University ranking') 0.00036530663520451445
('University commercialization', 'University engagement') 1.3382348414793919e-06
('University commercialization', 'University-industry collaboration') 0.0013409113111623504
('

In [37]:
# Write the edge betweenness scores as tab-separated lines: node1, node2, score.
# The `with` block closes (and flushes) the file even if a write raises
# part-way through, which a bare open()/close() pair does not guarantee.
with open(path+'\\output prediction\\{}\\{}\\edge_betweenness.txt'.format(journal, entity), 'w', encoding='utf-8') as f:
    for kw, score in edge_betweenness_result.items():
        f.write("{}\t{}\t{}\n".format(kw[0], kw[1], score))

## Keep only the testing-network nodes that also appear in the training network

In [17]:
import networkx as nx

In [55]:
def remove_nodes_not_in_training(test, training_graph=None):
    """Return a copy of ``test`` without the nodes that are missing from the training network.

    The node and edge counts of the copy are printed before and after the
    removal.

    Parameters
    ----------
    test : graph
        Testing network. It is copied first; the object passed in is not
        modified.
    training_graph : graph, optional
        Network whose nodes are kept. When omitted, the notebook-level
        ``training`` graph is used (looked up at call time), which keeps
        existing one-argument calls working.

    Returns
    -------
    graph
        The filtered copy of ``test``.
    """
    if training_graph is None:
        # Backward-compatible default: fall back to the notebook global.
        training_graph = training

    x = test.copy()
    print("nodes: {}\nedges: {}".format(x.number_of_nodes(), x.number_of_edges()))

    # Nodes that occur in the testing network but not in the training network.
    testing_nodes_not_in_training = set(test.nodes) - set(training_graph.nodes)

    x.remove_nodes_from(testing_nodes_not_in_training)
    print("nodes: {}\nedges: {}".format(x.number_of_nodes(), x.number_of_edges()))

    return x

# Rebinds `test` to the filtered copy; from here on `test` no longer refers to the full testing network.
test = remove_nodes_not_in_training(test)

nodes: 1223
edges: 3772
nodes: 461
edges: 1164


#### jaccard

In [59]:
# Score node pairs of G with the Jaccard predictor.
# NOTE(review): `excluded=G.edges()` is not passed, so edges already present in
# G are presumably scored as well — confirm this is intended.
jaccard = linkpred.predictors.Jaccard(G)
jaccard_results = jaccard.predict()

# Ground truth for the evaluation: the edges of `test`, wrapped as linkpred Pair objects.
jaccard_test_set = {linkpred.evaluation.Pair(u, v) for u, v in test.edges()}
jaccard_evaluation = linkpred.evaluation.EvaluationSheet(jaccard_results, jaccard_test_set)

# Write the raw scores and the evaluation sheet to their output files.
jaccard_result_file = path+'\\output prediction\\{}\\{}\\result\\jaccard_t1.txt'.format(journal, entity)
jaccard_evaluation_file = path+'\\output prediction\\{}\\{}\\evaluation\\jaccard_t1.txt'.format(journal, entity)
jaccard_results.to_file(jaccard_result_file, delimiter='\t', encoding='utf-8')
jaccard_evaluation.to_file(jaccard_evaluation_file)

#### AdamicAdar

In [60]:
# Score node pairs of G with the Adamic/Adar predictor.
AdamicAdar = linkpred.predictors.AdamicAdar(G)
AdamicAdar_results = AdamicAdar.predict()

# Ground truth for the evaluation: the edges of `test`, wrapped as linkpred Pair objects.
AdamicAdar_test_set = {linkpred.evaluation.Pair(u, v) for u, v in test.edges()}
AdamicAdar_evaluation = linkpred.evaluation.EvaluationSheet(AdamicAdar_results, AdamicAdar_test_set)

# Write the raw scores and the evaluation sheet to their output files.
AdamicAdar_result_file = path+'\\output prediction\\{}\\{}\\result\\AdamicAdar_t1.txt'.format(journal, entity)
AdamicAdar_evaluation_file = path+'\\output prediction\\{}\\{}\\evaluation\\AdamicAdar_t1.txt'.format(journal, entity)
AdamicAdar_results.to_file(AdamicAdar_result_file, delimiter='\t', encoding='utf-8')
AdamicAdar_evaluation.to_file(AdamicAdar_evaluation_file)

#### Katz_beta001

In [61]:
# Score node pairs of G with the Katz predictor, using beta=0.001 and max_power=5.
katz = linkpred.predictors.Katz(G)
katz_beta001_results = katz.predict(beta=0.001, max_power=5)

# Ground truth for the evaluation: the edges of `test`, wrapped as linkpred Pair objects.
katz_beta001_test_set = {linkpred.evaluation.Pair(u, v) for u, v in test.edges()}
katz_beta001_evaluation = linkpred.evaluation.EvaluationSheet(katz_beta001_results, katz_beta001_test_set)

# Write the raw scores and the evaluation sheet to their output files.
katz_beta001_result_file = path+'\\output prediction\\{}\\{}\\result\\katz_beta001_t1.txt'.format(journal, entity)
katz_beta001_evaluation_file = path+'\\output prediction\\{}\\{}\\evaluation\\katz_beta001_t1.txt'.format(journal, entity)
katz_beta001_results.to_file(katz_beta001_result_file, delimiter='\t', encoding='utf-8')
katz_beta001_evaluation.to_file(katz_beta001_evaluation_file)

Computing matrix powers: [############################################################] 5/5


#### Katz_beta01

In [62]:
# Score node pairs of G with the Katz predictor, using beta=0.01 and max_power=5.
katz = linkpred.predictors.Katz(G)
katz_beta01_results = katz.predict(beta=0.01, max_power=5)

# Ground truth for the evaluation: the edges of `test`, wrapped as linkpred Pair objects.
katz_beta01_test_set = {linkpred.evaluation.Pair(u, v) for u, v in test.edges()}
katz_beta01_evaluation = linkpred.evaluation.EvaluationSheet(katz_beta01_results, katz_beta01_test_set)

# Write the raw scores and the evaluation sheet to their output files.
katz_beta01_result_file = path+'\\output prediction\\{}\\{}\\result\\katz_beta01_t1.txt'.format(journal, entity)
katz_beta01_evaluation_file = path+'\\output prediction\\{}\\{}\\evaluation\\katz_beta01_t1.txt'.format(journal, entity)
katz_beta01_results.to_file(katz_beta01_result_file, delimiter='\t', encoding='utf-8')
katz_beta01_evaluation.to_file(katz_beta01_evaluation_file)

Computing matrix powers: [############################################################] 5/5


## Full example
https://github.com/rafguns/linkpred/issues/12

#### tn value is -1, something's wrong
    Solved: once the numbers of tp, fp and fn are correct, tn can be derived from them
    tp + fn = num_of_link in testing network
    tp + fp = num_of_predict_link
    tn = all_possible_link - tp - fp - fn

In [144]:
# NOTE(review): `test_set` is not assigned by any active code visible in this part of the notebook (only `jaccard_test_set` is) — confirm it exists in the kernel, or pass `jaccard_test_set` instead.
evaluation1 = linkpred.evaluation.StaticEvaluation(jaccard_results, test_set)