In [67]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` come from the public IPython.display module; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Use following algorithms
1. [jaccard](https://networkx.github.io/documentation/networkx-1.9/reference/generated/networkx.algorithms.link_prediction.jaccard_coefficient.html)
2. [adamic adar](https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.link_prediction.adamic_adar_index.html)
3. [preferential attachment](https://networkx.github.io/documentation/stable/reference/algorithms/generated/networkx.algorithms.link_prediction.preferential_attachment.html#networkx.algorithms.link_prediction.preferential_attachment)
4. [katz](https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.centrality.katz_centrality.html) (and try different beta, maybe max_power) 

    4.1 beta (float): with a small beta, Katz behaves much like the CommonNeighbours algorithm, which only considers the direct neighbours of a node pair  
    4.2 max_power (int): the maximum path length that is taken into account

We want this for output
* P indicates whether the Katz value exceeds the threshold

                Katz,   P,    TP,    normalized betweenness of the whole graph
        A, B    0.05,   1,    1,     
        C, D    0.07,   1,    0,     
        E, F    0.10,   0,    0,     
        
# How we find the best threshold
To find the best sensitivity and specificity, we use 
[YoudenIndex](https://en.wikipedia.org/wiki/Youden%27s_J_statistic?fbclid=IwAR3OICm3oJQQnopIKeflrpd1K-DcxahLE9IerxPfb8k6uISWt30D4DexuCA)    

    J = sensitivity + specificity - 1
    sensitivity = tp / (tp + fn)   (ALL YES)
    specificity = tn / (fp + tn)   (ALL NO )
    
## sensitivity and specificity
[wikipedia](https://en.wikipedia.org/wiki/Sensitivity_and_specificity)

In [68]:
import os
import pandas as pd
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import linkpred
import networkx as nx
import linkpred

# Ask which journal corpus and which network type to work on; both answers
# are used as folder names below.
journal = input('scientometrics / jasist')
entity = input('co-author / co-occurrence')

# Every folder is resolved relative to the current working directory.
path = os.getcwd()

# The trailing separator is kept on purpose: file names are appended to these
# folders by plain string concatenation.
netfiles_path = "{}\\data_transformation_output\\{}\\netfiles\\{}\\".format(path, journal, entity)
netfiles_lemma_path = netfiles_path + "lemma\\"
netfiles_stem_path = netfiles_path + "stem\\"

def get_netfiles(netfiles_path):
    """Collect the ``.net`` files of a folder together with their labels.

    Parameters
    ----------
    netfiles_path : str
        Folder to scan (with or without a trailing path separator).

    Returns
    -------
    (list of str, list of str)
        ``netfiles`` holds the full path of every ``.net`` file and ``times``
        the part of each file name before its first dot. Both lists are in
        ascending file-name order and line up index by index.
    """
    netfiles = []
    times = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # position of each file (index 0 in particular) deterministic.
    for f in sorted(os.listdir(netfiles_path)):
        if f.endswith('.net'):
            # os.path.join also works when the folder has no trailing separator.
            netfiles.append(os.path.join(netfiles_path, f))
            times.append(f.split('.')[0])

    return netfiles, times

netfiles, times = get_netfiles(netfiles_stem_path)

scientometrics / jasistscientometrics
co-author / co-occurrenceco-occurrence


# main

In [50]:
# Choose the output folders for the selected text-normalisation variant.
user_input = input('origin / stem / lemma ?')

# 'stem' and 'lemma' get their own sub-folder; any other answer (e.g. 'origin')
# writes straight into the category folder.
variant_dir = '{}\\'.format(user_input) if user_input in ('stem', 'lemma') else ''

# Root shared by all output categories. It is bound to `output_path` because a
# later cell displays that name and nothing else in the visible notebook
# assigns it.
output_path = path + '\\output prediction\\{}\\{}\\'.format(journal, entity)

# Trailing separators are kept: file names are appended by string concatenation.
result_output_path = output_path + 'result\\' + variant_dir
evaluation_output_path = output_path + 'evaluation\\' + variant_dir
betweenness_output_path = output_path + 'edge_betweenness\\' + variant_dir

origin / stem / lemma ?stem


In [51]:
betweenness_output_path

'C:\\Users\\Liser\\Desktop\\linchengwei_link_prediction\\sync\\output prediction\\scientometrics\\co-occurrence\\edge_betweenness\\stem\\'

In [43]:
def generate_output_dir(output_path):
    """Create one sub-folder per label under ``output_path`` and list them.

    A folder is created for every entry of the module-level ``times`` list
    except the first one. Missing parent folders are created as well.

    Parameters
    ----------
    output_path : str
        Folder prefix, expected to end with a path separator because the
        label is appended by string concatenation.

    Returns
    -------
    list of str
        ``output_path + name`` for every entry of ``output_path`` whose name
        starts with ``'20'``, in ascending order.
        NOTE(review): the ``'20'`` filter presumably selects year-based
        labels such as ``2016-2017`` -- confirm.
    """
    for label in times[1:]:
        try:
            # makedirs (unlike mkdir) also creates missing parent folders.
            os.makedirs(output_path + label)
        except FileExistsError:
            print("File Existed!!")

    # Sorted so the result does not depend on os.listdir()'s arbitrary order.
    return sorted(output_path + name for name in os.listdir(output_path) if name.startswith('20'))
generate_output_dir(result_output_path)
generate_output_dir(evaluation_output_path)

File Existed!!
File Existed!!
File Existed!!
File Existed!!


['C:\\Users\\Liser\\Desktop\\linchengwei_link_prediction\\sync\\output prediction\\scientometrics\\co-occurrence\\evaluation\\stem\\2016-2016',
 'C:\\Users\\Liser\\Desktop\\linchengwei_link_prediction\\sync\\output prediction\\scientometrics\\co-occurrence\\evaluation\\stem\\2016-2017',
 'C:\\Users\\Liser\\Desktop\\linchengwei_link_prediction\\sync\\output prediction\\scientometrics\\co-occurrence\\evaluation\\stem\\2016-2018',
 'C:\\Users\\Liser\\Desktop\\linchengwei_link_prediction\\sync\\output prediction\\scientometrics\\co-occurrence\\evaluation\\stem\\2016-2019']

In [52]:
betweenness_output_path

'C:\\Users\\Liser\\Desktop\\linchengwei_link_prediction\\sync\\output prediction\\scientometrics\\co-occurrence\\edge_betweenness\\stem\\'

In [48]:
output_path

'C:\\Users\\Liser\\Desktop\\linchengwei_link_prediction\\sync\\output prediction\\scientometrics\\co-occurrence\\'

In [53]:
result_output_path

'C:\\Users\\Liser\\Desktop\\linchengwei_link_prediction\\sync\\output prediction\\scientometrics\\co-occurrence\\result\\stem\\'

In [65]:
def save_edge_betweenness(edge_betweenness_result, t, betweenness_output_path):
    """Write edge-betweenness scores to ``<betweenness_output_path><t>.txt``.

    Parameters
    ----------
    edge_betweenness_result : dict
        Maps an edge, indexable as ``edge[0]`` / ``edge[1]``, to its score.
    t : str
        Label used as the file name (without extension).
    betweenness_output_path : str
        Folder prefix; the file name is appended by string concatenation, so
        it should end with a path separator.

    Each line of the UTF-8 output file is ``node_a<TAB>node_b<TAB>score``.
    """
    target = betweenness_output_path + '{}.txt'.format(t)

    # Create the destination folder if it does not exist yet, so the first
    # run does not fail on a missing directory.
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # The context manager closes the file even if a write fails part-way.
    with open(target, 'w', encoding='utf-8') as f:
        for kw, score in edge_betweenness_result.items():
            f.write("{}\t{}\t{}\n".format(kw[0], kw[1], score))
    print('Save edge betweenness!')
    
    
def remove_nodes_not_in_training(training, test):
    """Return a copy of ``test`` restricted to nodes that also occur in ``training``.

    ``test`` itself is left untouched. The node and edge counts of the copy
    are printed before and after the removal.
    """
    summary = "nodes: {}\nedges: {}"

    pruned = test.copy()
    print(summary.format(pruned.number_of_nodes(), pruned.number_of_edges()))

    # Nodes that appear in the test network but not in the training network.
    unseen_nodes = set(test.nodes) - set(training.nodes)
    pruned.remove_nodes_from(unseen_nodes)
    print(summary.format(pruned.number_of_nodes(), pruned.number_of_edges()))

    return pruned

def predict_jaccard(G, test, t):
    """Run linkpred's Jaccard predictor on ``G``, evaluate it against ``test`` and save both.

    The predictions go to ``<result_output_path><t>\\jaccard.txt`` and the
    evaluation sheet to ``<evaluation_output_path><t>\\jaccard.txt``; both
    folder prefixes are module-level variables.
    """
    scores = linkpred.predictors.Jaccard(G).predict()  # , excluded=G.edges()

    # Ground truth: every edge of the test network wrapped as a linkpred Pair.
    truth = {linkpred.evaluation.Pair(u, v) for u, v in test.edges()}
    sheet = linkpred.evaluation.EvaluationSheet(scores, truth)

    scores_file = result_output_path + '{}\\jaccard.txt'.format(t)
    scores.to_file(scores_file, delimiter='\t', encoding='utf-8')

    sheet_file = evaluation_output_path + '{}\\jaccard.txt'.format(t)
    sheet.to_file(sheet_file)
    print('Saved Jaccard!')
    
def predict_AdamicAdar(G, test, t):
    """Run linkpred's AdamicAdar predictor on ``G``, evaluate it against ``test`` and save both.

    The predictions go to ``<result_output_path><t>\\AdamicAdar.txt`` and the
    evaluation sheet to ``<evaluation_output_path><t>\\AdamicAdar.txt``; both
    folder prefixes are module-level variables.

    This function used to be defined twice. The second copy shadowed the
    first and built the result path with the invalid escape ``'\\{}'``, which
    put a doubled backslash in the path. Only one definition is kept.
    """
    AdamicAdar = linkpred.predictors.AdamicAdar(G)
    AdamicAdar_results = AdamicAdar.predict()

    # Ground truth: every edge of the test network wrapped as a linkpred Pair.
    AdamicAdar_test_set = set(linkpred.evaluation.Pair(u, v) for u, v in test.edges())
    AdamicAdar_evaluation = linkpred.evaluation.EvaluationSheet(AdamicAdar_results, AdamicAdar_test_set)

    AdamicAdar_results.to_file(result_output_path+'{}\\AdamicAdar.txt'.format(t), delimiter='\t', encoding='utf-8')

    AdamicAdar_evaluation.to_file(evaluation_output_path+'{}\\AdamicAdar.txt'.format(t))
    print('Saved AdamicAdar!')
    
def predict_katz(G, test, t, beta=0.001, max_power=5):
    """Run linkpred's Katz predictor on ``G``, evaluate it against ``test`` and save both.

    ``beta`` and ``max_power`` are forwarded to ``Katz.predict``. The output
    files are named ``katz_beta-<beta>.txt`` inside the ``<t>`` sub-folder of
    the module-level result and evaluation folders; ``max_power`` is not part
    of the file name.
    """
    scores = linkpred.predictors.Katz(G).predict(beta=beta, max_power=max_power)

    # Ground truth: every edge of the test network wrapped as a linkpred Pair.
    truth = {linkpred.evaluation.Pair(u, v) for u, v in test.edges()}
    sheet = linkpred.evaluation.EvaluationSheet(scores, truth)

    # The same relative file name is used under both output folders.
    relative_name = '{}\\katz_{}.txt'.format(t, "beta-{}".format(str(beta)))

    scores.to_file(result_output_path + relative_name, delimiter='\t', encoding='utf-8')

    sheet.to_file(evaluation_output_path + relative_name)
    print('Saved katz!'+" beta-{}".format(str(beta)))

def predict_CommonNeighbours(G, test, t):
    """Run linkpred's CommonNeighbours predictor on ``G``, evaluate it against ``test`` and save both.

    The predictions go to ``<result_output_path><t>\\CommonNeighbours.txt``
    and the evaluation sheet to
    ``<evaluation_output_path><t>\\CommonNeighbours.txt``; both folder
    prefixes are module-level variables.
    """
    scores = linkpred.predictors.CommonNeighbours(G).predict()

    # Ground truth: every edge of the test network wrapped as a linkpred Pair.
    truth = {linkpred.evaluation.Pair(u, v) for u, v in test.edges()}
    sheet = linkpred.evaluation.EvaluationSheet(scores, truth)

    scores_file = result_output_path + '{}\\CommonNeighbours.txt'.format(t)
    scores.to_file(scores_file, delimiter='\t', encoding='utf-8')

    sheet_file = evaluation_output_path + '{}\\CommonNeighbours.txt'.format(t)
    sheet.to_file(sheet_file)
    print('Saved CommonNeighbours!')

def predict_PreferentialAttatchment(G, test, t):
    """Run a preferential-attachment predictor on ``G``, evaluate it against ``test`` and save both.

    The previous version handed the raw iterator returned by
    ``nx.preferential_attachment`` to ``EvaluationSheet`` and wrote nothing to
    disk. This version uses linkpred's ``DegreeProduct`` predictor so that the
    result can be evaluated and saved exactly like the other predictors.
    NOTE(review): ``DegreeProduct`` is taken to be linkpred's
    preferential-attachment score (product of the two node degrees) --
    confirm against the installed linkpred version.

    The predictions go to
    ``<result_output_path><t>\\PreferentialAttachment.txt`` and the evaluation
    sheet to ``<evaluation_output_path><t>\\PreferentialAttachment.txt``.
    """
    PreferentialAttatchment = linkpred.predictors.DegreeProduct(G)
    PreferentialAttatchment_results = PreferentialAttatchment.predict()

    # Ground truth: every edge of the test network wrapped as a linkpred Pair.
    PreferentialAttatchment_test_set = set(linkpred.evaluation.Pair(u, v) for u, v in test.edges())
    PreferentialAttatchment_evaluation = linkpred.evaluation.EvaluationSheet(PreferentialAttatchment_results, PreferentialAttatchment_test_set)

    PreferentialAttatchment_results.to_file(result_output_path+'{}\\PreferentialAttachment.txt'.format(t), delimiter='\t', encoding='utf-8')

    PreferentialAttatchment_evaluation.to_file(evaluation_output_path+'{}\\PreferentialAttachment.txt'.format(t))
    print('Saved PreferentialAttachment!')
    
    
def main(testing_file, t, training_file=None):
    """Run every predictor for one training/testing network pair.

    Parameters
    ----------
    testing_file : str
        Path of the network file used as ground truth.
    t : str
        Label of the testing period; used for output folder and file names.
    training_file : str, optional
        Path of the network file the predictors are run on. Defaults to
        ``netfiles[0]``, looked up when the function is called rather than
        when it is defined.

    Steps: read both networks and drop their self-loops, compute and save the
    edge betweenness of the full testing network, restrict the testing
    network to nodes that exist in the training network, then run and save
    Jaccard, AdamicAdar, Katz (beta 0.001 and 0.01) and CommonNeighbours.
    """
    if training_file is None:
        training_file = netfiles[0]

    # read train
    G = linkpred.read_network(training_file)

    # Self-loops sometimes make the predictors raise an AssertionError, so
    # drop them. The edges are collected into a list first so the graph is
    # not modified while its own edge iterator is still being consumed.
    G.remove_edges_from(list(nx.selfloop_edges(G)))

    # read test
    test = linkpred.read_network(testing_file)
    test.remove_edges_from(list(nx.selfloop_edges(test)))

    # Edge betweenness is computed on the full testing network, i.e. before
    # nodes unknown to the training network are removed.
    edge_betweenness_result = nx.algorithms.centrality.edge_betweenness_centrality(test)
    print("edge_betweenness_result: {}".format(len(edge_betweenness_result)))
    save_edge_betweenness(edge_betweenness_result, t, betweenness_output_path)

    # remove nodes not in training
    test = remove_nodes_not_in_training(G, test)
    print('Removed nodes not in training')

    # 1
    print('-'*10, 'jaccard')
    predict_jaccard(G, test, t)

    # 2
    print('-'*10, 'AdamicAdar')
    predict_AdamicAdar(G, test, t)

    # 3 and 4: Katz with two damping factors
    for beta in (0.001, 0.01):
        print('-'*10, 'katz')
        predict_katz(G, test, t, beta=beta, max_power=5)

    # 5
    print('-'*10, 'CommonNeighbours')
    predict_CommonNeighbours(G, test, t)

    # 6
    # TODO: preferential attachment is not wired in yet.



In [66]:
# Evaluate every network after the first one against the first (training) network.
for idx in range(1, len(times)):
    testing_file = netfiles[idx]
    print(times[idx], testing_file)
    main(testing_file, t=times[idx], training_file=netfiles[0])

2016-2016 C:\Users\Liser\Desktop\linchengwei_link_prediction\sync\data_transformation_output\scientometrics\netfiles\co-occurrence\stem\2016-2016.net


Network contains multiple edges. These will be ignored.
Network contains multiple edges. These will be ignored.


edge_betweenness_result: 3761
Save edge betweenness!
nodes: 1178
edges: 3761
nodes: 479
edges: 1310
Removed nodes not in training
---------- jaccard
Saved Jaccard!
---------- AdamicAdar
Saved AdamicAdar!
---------- katz
Computing matrix powers: [############################################################] 5/5
Saved katz! beta-0.001
---------- katz
Computing matrix powers: [############################################################] 5/5
Saved katz! beta-0.01
---------- CommonNeighbours
Saved CommonNeighbours!
2016-2017 C:\Users\Liser\Desktop\linchengwei_link_prediction\sync\data_transformation_output\scientometrics\netfiles\co-occurrence\stem\2016-2017.net


Network contains multiple edges. These will be ignored.
Network contains multiple edges. These will be ignored.


edge_betweenness_result: 7785
Save edge betweenness!
nodes: 2221
edges: 7785
nodes: 748
edges: 2505
Removed nodes not in training
---------- jaccard
Saved Jaccard!
---------- AdamicAdar
Saved AdamicAdar!
---------- katz
Computing matrix powers: [############################################################] 5/5
Saved katz! beta-0.001
---------- katz
Computing matrix powers: [############################################################] 5/5
Saved katz! beta-0.01
---------- CommonNeighbours
Saved CommonNeighbours!
2016-2018 C:\Users\Liser\Desktop\linchengwei_link_prediction\sync\data_transformation_output\scientometrics\netfiles\co-occurrence\stem\2016-2018.net


Network contains multiple edges. These will be ignored.
Network contains multiple edges. These will be ignored.


edge_betweenness_result: 11929
Save edge betweenness!
nodes: 3148
edges: 11929
nodes: 913
edges: 3527
Removed nodes not in training
---------- jaccard
Saved Jaccard!
---------- AdamicAdar
Saved AdamicAdar!
---------- katz
Computing matrix powers: [############################################################] 5/5
Saved katz! beta-0.001
---------- katz
Computing matrix powers: [############################################################] 5/5
Saved katz! beta-0.01
---------- CommonNeighbours
Saved CommonNeighbours!
2016-2019 C:\Users\Liser\Desktop\linchengwei_link_prediction\sync\data_transformation_output\scientometrics\netfiles\co-occurrence\stem\2016-2019.net


Network contains multiple edges. These will be ignored.
Network contains multiple edges. These will be ignored.


edge_betweenness_result: 15510
Save edge betweenness!
nodes: 3884
edges: 15510
nodes: 1027
edges: 4413
Removed nodes not in training
---------- jaccard
Saved Jaccard!
---------- AdamicAdar
Saved AdamicAdar!
---------- katz
Computing matrix powers: [############################################################] 5/5
Saved katz! beta-0.001
---------- katz
Computing matrix powers: [############################################################] 5/5
Saved katz! beta-0.01
---------- CommonNeighbours
Saved CommonNeighbours!


In [64]:
# NOTE(review): this is the same loop as the cell above, left over from an
# earlier run (its execution count, 64, is lower than the 66 of the cell
# above) -- consider deleting this cell.
# Evaluate every network after the first one against the first (training) network.
for idx in range(1, len(times)):
    testing_file = netfiles[idx]
    print(times[idx], testing_file)
    main(testing_file, t=times[idx], training_file=netfiles[0])

2016-2016 C:\Users\Liser\Desktop\linchengwei_link_prediction\sync\data_transformation_output\scientometrics\netfiles\co-occurrence\stem\2016-2016.net


Network contains multiple edges. These will be ignored.
Network contains multiple edges. These will be ignored.


nodes: 1178
edges: 3761
nodes: 479
edges: 1310
Removed nodes not in training
---------- jaccard
---------- AdamicAdar
---------- katz
---------- katz
---------- CommonNeighbours
Saved CommonNeighbours!
2016-2017 C:\Users\Liser\Desktop\linchengwei_link_prediction\sync\data_transformation_output\scientometrics\netfiles\co-occurrence\stem\2016-2017.net


Network contains multiple edges. These will be ignored.
Network contains multiple edges. These will be ignored.


nodes: 2221
edges: 7785
nodes: 748
edges: 2505
Removed nodes not in training
---------- jaccard
---------- AdamicAdar
---------- katz
---------- katz
---------- CommonNeighbours
Saved CommonNeighbours!
2016-2018 C:\Users\Liser\Desktop\linchengwei_link_prediction\sync\data_transformation_output\scientometrics\netfiles\co-occurrence\stem\2016-2018.net


Network contains multiple edges. These will be ignored.
Network contains multiple edges. These will be ignored.


nodes: 3148
edges: 11929
nodes: 913
edges: 3527
Removed nodes not in training
---------- jaccard
---------- AdamicAdar
---------- katz
---------- katz
---------- CommonNeighbours
Saved CommonNeighbours!
2016-2019 C:\Users\Liser\Desktop\linchengwei_link_prediction\sync\data_transformation_output\scientometrics\netfiles\co-occurrence\stem\2016-2019.net


Network contains multiple edges. These will be ignored.
Network contains multiple edges. These will be ignored.


nodes: 3884
edges: 15510
nodes: 1027
edges: 4413
Removed nodes not in training
---------- jaccard
---------- AdamicAdar
---------- katz
---------- katz
---------- CommonNeighbours
Saved CommonNeighbours!


## try preferential attachment

In [42]:
# NOTE(review): `G` is not assigned at notebook level in the visible cells
# (it only exists as a local inside main()), so this scratch cell presumably
# relied on leftover kernel state -- confirm, then define G here or remove it.
jaccard = linkpred.predictors.Jaccard(G)
jaccard_results = jaccard.predict()

In [51]:
# NOTE(review): networkx is already imported in the setup cell, and `G` is not
# assigned at notebook level in the visible cells -- this scratch cell
# presumably relied on leftover kernel state; confirm or remove.
import networkx as nx
PreferentialAttatchment = nx.preferential_attachment(G)

('Competency', 'Internet and web learning', 24)

## Full example
https://github.com/rafguns/linkpred/issues/12

#### tn value is -1, something's wrong
    Solved: once tp, fp and fn are counted correctly, tn can be calculated from them:
    tp + fn = num_of_link in testing network
    tp + fp = num_of_predict_link
    tn = all_possible_link - tp - fp - fn