In [46]:
import numpy as np
import pandas as pd
import networkx as nx
import os
import json
import scipy
from scipy.stats import spearmanr
from networkx.algorithms import bipartite

### Read Graph

In [47]:
# repo1 = os.listdir('../repo_info')
# repo2 = os.listdir('../repo_info2')
# repos = repo1 + repo2
# repos.remove('00_repo_names.json')
# repos.remove('00_repo_names.json')
# user_path = '../new_starer_info'
# users_file = os.listdir(user_path)

In [48]:
# edges = []
# users = set()
# for fileName in users_file:
#     with open(user_path + '/' + fileName, 'r') as f:
#         data = json.load(f)
#     for user in data:
#         id = user['username']
#         edges.append((id, fileName))
#         users.add(id)

In [49]:
# G = nx.Graph()

# G.add_nodes_from(users, bipartite=1)
# G.add_nodes_from(repos, bipartite=0)

# G.add_edges_from(edges)

In [50]:
# hubs = {n for n, d in G.nodes(data=True) if d['bipartite'] == 1}
# authorities = {n for n, d in G.nodes(data=True) if d['bipartite'] == 0}
# print("Number of users:", len(hubs))
# print("Number of repos:", len(authorities))

### Read Graph from Adjacency Matrix

In [51]:
adjacency_matrix = {}
with open('../adjacency_matrix.json') as f:
    adjacency_matrix = json.load(f)
adjacency = scipy.sparse.csr_matrix(adjacency_matrix)

In [52]:
B = bipartite.from_biadjacency_matrix(adjacency)

In [53]:
# assign names to the nodes
with open ('../repos.json', 'r') as repo_list:
    set_0_names = json.load(repo_list)
with open ('../users.json', 'r') as user_list:
    set_1_names = json.load(user_list)

# Assign names to the nodes
mapping = {i: name for i, name in enumerate(set_0_names + set_1_names)}
B = nx.relabel_nodes(B, mapping)

In [54]:
hubs = {n for n, d in B.nodes(data=True) if d['bipartite'] == 1}
authorities = {n for n, d in B.nodes(data=True) if d['bipartite'] == 0}
print("Number of users:", len(hubs))
print("Number of repos:", len(authorities))

Number of users: 412266
Number of repos: 100


### HITS Algorithm

In [55]:
def normalize(score):
    total = sum(score.values())
    return {k: v / total for k, v in score.items()}

In [56]:
def hits(G, h_init=None, a_init=None, max_iter=100, threshold=1.0e-8, normalized=True):
    # check if proper graph
    if isinstance(G, nx.MultiGraph | nx.MultiDiGraph):
        raise Exception("hits() not defined for graphs with multiedges.")
    if len(G) == 0:
        return {}, {}
    
    # create dictionary for hubs and authorities
    hubs = {n for n, d in G.nodes(data=True) if d['bipartite'] == 1}
    authorities = {n for n, d in G.nodes(data=True) if d['bipartite'] == 0}
    
    # initialize score for each node, else equal
    if h_init is None:
        hub_score = dict.fromkeys(hubs, 1.0 / len(hubs))
    else:
        hub_score = h_init
        hub_score = normalize(hub_score)
    if a_init is None:
        authority_score = dict.fromkeys(authorities, 1.0 / len(authorities))
    else:
        authority_score = a_init
        authority_score = normalize(authority_score)
        
    # calculate score iteratively
    for _ in range(max_iter):  # power iteration: make up to max_iter iterations
        hlast = hub_score
        alast = authority_score
        
        authority_score = dict.fromkeys(authorities, 0)
        for h in hubs:
            score = hub_score[h]
            for nbr in G[h]:
                authority_score[nbr] += score
        authority_score = normalize(authority_score)

        hub_score = dict.fromkeys(hubs, 0)
        for a in authorities:
            score = authority_score[a]
            for nbr in G[a]:
                hub_score[nbr] += score
        hub_score = normalize(hub_score)
        
        err = sum(abs(hub_score[n] - hlast[n]) for n in hubs) + sum(abs(authority_score[n] - alast[n]) for n in authorities)
        if err < threshold:
            break
        
    return hub_score, authority_score

In [57]:
def hub_avg(G, h_init=None, a_init=None, max_iter=100, threshold=1.0e-8, normalized=True):
    # check if proper graph
    if isinstance(G, nx.MultiGraph | nx.MultiDiGraph):
        raise Exception("hits() not defined for graphs with multiedges.")
    if len(G) == 0:
        return {}, {}
    
    # create dictionary for hubs and authorities
    hubs = {n for n, d in G.nodes(data=True) if d['bipartite'] == 1}
    authorities = {n for n, d in G.nodes(data=True) if d['bipartite'] == 0}
    
    # initialize score for each node, else equal
    if h_init is None:
        hub_score = dict.fromkeys(hubs, 1.0 / len(hubs))
    else:
        hub_score = h_init
        hub_score = normalize(hub_score)
    if a_init is None:
        authority_score = dict.fromkeys(authorities, 1.0 / len(authorities))
    else:
        authority_score = a_init
        authority_score = normalize(authority_score)
        
    # calculate score iteratively
    for _ in range(max_iter):  # power iteration: make up to max_iter iterations
        hlast = hub_score
        alast = authority_score
        
        authority_score = dict.fromkeys(authorities, 0)
        for h in hubs:
            for nbr in G[h]:
                authority_score[nbr] += hub_score[h]
        authority_score = normalize(authority_score)

        hub_score = dict.fromkeys(hubs, 0)
        for a in authorities:
            score = authority_score[a] / len(G[a])
            for nbr in G[a]:
                hub_score[nbr] += score
        hub_score = normalize(hub_score)
        
        err = sum(abs(hub_score[n] - hlast[n]) for n in hubs) + sum(abs(authority_score[n] - alast[n]) for n in authorities)
        if err < threshold:
            break
        
    return hub_score, authority_score

In [58]:
def max_k(G, k=1, h_init=None, a_init=None, max_iter=100, threshold=1.0e-8, normalized=True):
    # check if proper graph
    if isinstance(G, nx.MultiGraph | nx.MultiDiGraph):
        raise Exception("hits() not defined for graphs with multiedges.")
    if len(G) == 0:
        return {}, {}
    
    # create dictionary for hubs and authorities
    hubs = {n for n, d in G.nodes(data=True) if d['bipartite'] == 1}
    authorities = {n for n, d in G.nodes(data=True) if d['bipartite'] == 0}
    
    # initialize score for each node, else equal
    if h_init is None:
        hub_score = dict.fromkeys(hubs, 1.0 / len(hubs))
    else:
        hub_score = h_init
        hub_score = normalize(hub_score)
    if a_init is None:
        authority_score = dict.fromkeys(authorities, 1.0 / len(authorities))
    else:
        authority_score = a_init
        authority_score = normalize(authority_score)
        
    # calculate score iteratively
    for _ in range(max_iter):  # power iteration: make up to max_iter iterations
        hlast = hub_score
        alast = authority_score
        
        authority_score = dict.fromkeys(authorities, 0)
        for a in authorities:
            authority_score[a] = sum([hub_score[nbr] for nbr in G[a]])
        authority_score = normalize(authority_score)
            
        hub_score = dict.fromkeys(hubs, 0)
        for h in hubs:
            nbr_score = [authority_score[nbr] for nbr in G[h]]
            hub_score[h] = sum(sorted(nbr_score, reverse=True)[:k])
        hub_score = normalize(hub_score)
       
        err = sum(abs(hub_score[n] - hlast[n]) for n in hubs) + sum(abs(authority_score[n] - alast[n]) for n in authorities)
        if err < threshold:
            break
        
    return hub_score, authority_score

### Comparison

In [59]:
def dict_sort(d):
    return dict(sorted(d.items(), key=lambda item: item[1], reverse=True))

In [60]:
hits_h, hits_a = hits(B)
hub_avg_h, hub_avg_a = hub_avg(B)
max_k_h, max_k_a = max_k(B)

KeyboardInterrupt: 

In [None]:
df_hits_h = pd.DataFrame(list(dict_sort(hits_h).items()), columns=["hits_user", "score"])
df_hub_avg_h = pd.DataFrame(list(dict_sort(hub_avg_h).items()), columns=["hub_avg_user", "score"])
df_max_k_h = pd.DataFrame(list(dict_sort(max_k_h).items()), columns=["max_k_user", "score"])
df_h = pd.concat([df_hits_h, df_hub_avg_h, df_max_k_h], axis = 1)
df_h.head(20)

Unnamed: 0,hits_user,score,hub_avg_user,score.1,max_k_user,score.2
0,chaoqun13,4.7e-05,chaoqun13,5.3e-05,xuedingesmiao,4e-06
1,Dilid,4.6e-05,Dilid,5.2e-05,Devorein,4e-06
2,Vincent--Li,4.6e-05,Vincent--Li,5.1e-05,jifffffy,4e-06
3,akulagrawal,4.5e-05,rougsig,5e-05,ggzjg,4e-06
4,rougsig,4.4e-05,akulagrawal,4.9e-05,Ayarxy,4e-06
5,slavapeshkin,4.4e-05,karanjude,4.9e-05,rahmatnazali,4e-06
6,arturBermondTorres,4.3e-05,arturBermondTorres,4.8e-05,basnijholt,4e-06
7,wangxu-scu,4.3e-05,chuizi000,4.8e-05,n3tr,4e-06
8,sashankaryal,4.2e-05,slavapeshkin,4.8e-05,chenbao-cn,4e-06
9,MasonStone,4.2e-05,sashankaryal,4.8e-05,jsonchi,4e-06


In [None]:
df_hits_a = pd.DataFrame(list(dict_sort(hits_a).items()), columns=["hits_repo", "score"])
df_hub_avg_a = pd.DataFrame(list(dict_sort(hub_avg_a).items()), columns=["hub_avg_repo", "score"])
df_max_k_a = pd.DataFrame(list(dict_sort(max_k_a).items()), columns=["max_k_repo", "score"])
df_a = pd.concat([df_hits_a, df_hub_avg_a, df_max_k_a], axis = 1)
df_a.head(20)

Unnamed: 0,hits_repo,score,hub_avg_repo,score.1,max_k_repo,score.2
0,pytorch,0.019312,pytorch,0.017803,pytorch,0.023848
1,transformers,0.01851,transformers,0.017552,keras,0.021759
2,keras,0.017817,keras,0.016367,transformers,0.021455
3,TensorFlow-Examples,0.017153,Deep-Learning-Papers-Reading-Roadmap,0.016194,TensorFlow-Examples,0.020961
4,Deep-Learning-Papers-Reading-Roadmap,0.017028,TensorFlow-Examples,0.016012,Deep-Learning-Papers-Reading-Roadmap,0.019151
5,DeepSpeed,0.016226,DeepSpeed,0.015818,d2l-zh,0.017347
6,Made-With-ML,0.015863,Made-With-ML,0.015565,tensorflow,0.017079
7,ColossalAI,0.015661,ray,0.015343,faceswap,0.01622
8,ray,0.015601,ColossalAI,0.015118,caffe,0.016069
9,pytorch-lightning,0.014912,pytorch-lightning,0.014898,100-Days-Of-ML-Code,0.015669


### Original Graph

In [None]:
hits_h, hits_a = hits(G)
hub_avg_h, hub_avg_a = hub_avg(G)
max_k_h, max_k_a = max_k(G)

In [None]:
df_hits_h = pd.DataFrame(list(dict_sort(hits_h).items()), columns=["hits_user", "hits_score"])
df_hub_avg_h = pd.DataFrame(list(dict_sort(hub_avg_h).items()), columns=["hub_avg_user", "hub_avg_score"])
df_max_k_h = pd.DataFrame(list(dict_sort(max_k_h).items()), columns=["max_k_user", "max_k_score"])
df_h = pd.concat([df_hits_h, df_hub_avg_h, df_max_k_h], axis = 1)
df_h.head(20)

In [None]:
df_hits_a = pd.DataFrame(list(dict_sort(hits_a).items()), columns=["hits_repo", "hits_score"])
df_hub_avg_a = pd.DataFrame(list(dict_sort(hub_avg_a).items()), columns=["hub_avg_repo", "hub_avg_score"])
df_max_k_a = pd.DataFrame(list(dict_sort(max_k_a).items()), columns=["max_k_repo", "max_k_score"])
df_orig = pd.DataFrame(list(dict_sort({a: len(G[a]) for a in authorities}).items()), columns=["star_repo", "star_num"])
df_a = pd.concat([df_hits_a, df_hub_avg_a, df_max_k_a, df_orig], axis = 1)
df_a.head(20)

### With Initialization

In [None]:
with open ("../follower_count_final.json", "r") as f:
    follower_cnt = json.load(f)
h_init = {data["username"]: data["followers_count"] for data in follower_cnt}

In [None]:
for key in h_init:
    if h_init[key] == -1:
        h_init[key] = 5

In [None]:
hits_h_init, hits_a_init = hits(B, h_init)
hub_avg_h_init, hub_avg_a_init = hub_avg(B, h_init)
max_k_h_init, max_k_a_init = max_k(B, 1, h_init)

In [None]:
df_hits_h_init = pd.DataFrame(list(dict_sort(hits_h_init).items()), columns=["hits_user_init", "score"])
df_hub_avg_h_init = pd.DataFrame(list(dict_sort(hub_avg_h_init).items()), columns=["hub_avg_user_init", "score"])
df_max_k_h_init = pd.DataFrame(list(dict_sort(max_k_h_init).items()), columns=["max_k_user_init", "score"])
df_h_init = pd.concat([df_hits_h_init, df_hub_avg_h_init, df_max_k_h_init], axis = 1)
df_h_init.head(20)

Unnamed: 0,hits_user_init,score,hub_avg_user_init,score.1,max_k_user_init,score.2
0,chaoqun13,4.7e-05,chaoqun13,5.3e-05,xuedingesmiao,4e-06
1,Dilid,4.6e-05,Dilid,5.2e-05,Devorein,4e-06
2,Vincent--Li,4.6e-05,Vincent--Li,5.1e-05,jifffffy,4e-06
3,akulagrawal,4.5e-05,rougsig,5e-05,ggzjg,4e-06
4,rougsig,4.4e-05,akulagrawal,4.9e-05,Ayarxy,4e-06
5,slavapeshkin,4.4e-05,karanjude,4.9e-05,rahmatnazali,4e-06
6,arturBermondTorres,4.3e-05,arturBermondTorres,4.8e-05,basnijholt,4e-06
7,wangxu-scu,4.3e-05,chuizi000,4.8e-05,n3tr,4e-06
8,sashankaryal,4.2e-05,slavapeshkin,4.8e-05,chenbao-cn,4e-06
9,MasonStone,4.2e-05,sashankaryal,4.8e-05,jsonchi,4e-06


In [None]:
df_hits_a_init = pd.DataFrame(list(dict_sort(hits_a_init).items()), columns=["hits_repo_init", "score"])
df_hub_avg_a_init = pd.DataFrame(list(dict_sort(hub_avg_a_init).items()), columns=["hub_avg_repo_init", "score"])
df_max_k_a_init = pd.DataFrame(list(dict_sort(max_k_a_init).items()), columns=["max_k_repo_init", "score"])
df_orig = pd.DataFrame(list(dict_sort({a: len(B[a]) for a in authorities}).items()), columns=["star_num_repo", "score"])
df_a_init = pd.concat([df_hits_a_init, df_hub_avg_a_init, df_max_k_a_init, df_orig], axis = 1)
df_a_init.head(20)

Unnamed: 0,hits_repo_init,score,hub_avg_repo_init,score.1,max_k_repo_init,score.2,star_num_repo,score.3
0,pytorch,0.019312,pytorch,0.017803,pytorch,0.023848,pytorch,33804
1,transformers,0.01851,transformers,0.017552,keras,0.021759,transformers,32883
2,keras,0.017817,keras,0.016367,transformers,0.021455,keras,32841
3,TensorFlow-Examples,0.017153,Deep-Learning-Papers-Reading-Roadmap,0.016194,TensorFlow-Examples,0.020961,TensorFlow-Examples,32619
4,Deep-Learning-Papers-Reading-Roadmap,0.017028,TensorFlow-Examples,0.016012,Deep-Learning-Papers-Reading-Roadmap,0.019151,Deep-Learning-Papers-Reading-Roadmap,31409
5,DeepSpeed,0.016226,DeepSpeed,0.015818,d2l-zh,0.017347,d2l-zh,30929
6,Made-With-ML,0.015863,Made-With-ML,0.015565,tensorflow,0.017079,ColossalAI,29883
7,ColossalAI,0.015661,ray,0.015343,faceswap,0.01622,faceswap,29570
8,ray,0.015601,ColossalAI,0.015118,caffe,0.016069,Real-Time-Voice-Cloning,29382
9,pytorch-lightning,0.014912,pytorch-lightning,0.014898,100-Days-Of-ML-Code,0.015669,tensorflow,29011


### Evaluation

In [None]:
df_h_all = pd.DataFrame({"hits": hits_h, "hub_avg": hub_avg_h, "max_k": max_k_h, "hits_init": hits_h_init, "hub_avg_init": hub_avg_h_init, "max_k_init": max_k_h_init})

ranked_df = df_h_all.rank()

# Calculate Spearman's rank correlation matrix
corr, _ = spearmanr(ranked_df)

# Convert the correlation matrix to a DataFrame for better readability
corr_df = pd.DataFrame(corr, index=df_h_all.columns, columns=df_h_all.columns)
display(corr_df)

Unnamed: 0,hits,hub_avg,max_k,hits_init,hub_avg_init,max_k_init
hits,1.0,0.903617,0.671772,1.0,0.903617,0.671772
hub_avg,0.903617,1.0,0.421437,0.903617,1.0,0.421437
max_k,0.671772,0.421437,1.0,0.671772,0.421437,1.0
hits_init,1.0,0.903617,0.671772,1.0,0.903617,0.671772
hub_avg_init,0.903617,1.0,0.421437,0.903617,1.0,0.421437
max_k_init,0.671772,0.421437,1.0,0.671772,0.421437,1.0


In [None]:
df_a_all = pd.DataFrame({"hits": hits_a, "hub_avg": hub_avg_a, "max_k": max_k_a, "hits_init": hits_a_init, 
                         "hub_avg_init": hub_avg_a_init, "max_k_init": max_k_a_init, "original": {a: len(B[a]) for a in authorities}})

ranked_df = df_a_all.rank()

# Calculate Spearman's rank correlation matrix
corr, _ = spearmanr(ranked_df)

# Convert the correlation matrix to a DataFrame for better readability
corr_df = pd.DataFrame(corr, index=df_a_all.columns, columns=df_a_all.columns)
display(corr_df)

Unnamed: 0,hits,hub_avg,max_k,hits_init,hub_avg_init,max_k_init,original
hits,1.0,0.996808,0.939802,1.0,0.996808,0.939802,0.897581
hub_avg,0.996808,1.0,0.920972,0.996808,1.0,0.920972,0.878937
max_k,0.939802,0.920972,1.0,0.939802,0.920972,1.0,0.981596
hits_init,1.0,0.996808,0.939802,1.0,0.996808,0.939802,0.897581
hub_avg_init,0.996808,1.0,0.920972,0.996808,1.0,0.920972,0.878937
max_k_init,0.939802,0.920972,1.0,0.939802,0.920972,1.0,0.981596
original,0.897581,0.878937,0.981596,0.897581,0.878937,0.981596,1.0


### Different k

In [None]:
hub_res = []
auth_res = []
for i in range(1, 10):
    hub_score, auth_score = max_k(B, i)
    hub_res.append(hub_score)
    auth_res.append(auth_score)
    print(i)

1
2
3
4
5
6
7
8
9


In [76]:
df = pd.DataFrame()
for i in range(10):
    df = pd.concat([df, pd.DataFrame(list(dict_sort(hub_res[i]).items()), columns=[f"user_{i+1}", f"score_{i+1}"])], axis=1)
df.head(20)

Unnamed: 0,user_1,score_1,user_2,score_2,user_3,score_3,user_4,score_4,user_5,score_5,user_6,score_6,user_7,score_7,user_8,score_8,user_9,score_9,user_10,score_10
0,xuedingesmiao,4e-06,Ayarxy,4e-06,tjulyz,5e-06,TonyzBi,5e-06,TonyzBi,6e-06,najlepsiwebdesigner,7e-06,najlepsiwebdesigner,7e-06,najlepsiwebdesigner,8e-06,najlepsiwebdesigner,9e-06,najlepsiwebdesigner,9e-06
1,Devorein,4e-06,chenbao-cn,4e-06,TonyzBi,5e-06,michca07,5e-06,michca07,6e-06,zgjstudy,7e-06,zgjstudy,7e-06,ray-ng,8e-06,ray-ng,9e-06,ray-ng,9e-06
2,jifffffy,4e-06,ehmtang,4e-06,0luck0,5e-06,mibdennis,5e-06,najlepsiwebdesigner,6e-06,ray-ng,7e-06,ray-ng,7e-06,volutail,8e-06,volutail,9e-06,volutail,9e-06
3,ggzjg,4e-06,tjulyz,4e-06,whubao,5e-06,KangweiiLiu,5e-06,w675881684,6e-06,volutail,7e-06,volutail,7e-06,msmilevski,8e-06,msmilevski,9e-06,msmilevski,9e-06
4,Ayarxy,4e-06,TonyzBi,4e-06,ishotjr,5e-06,najlepsiwebdesigner,5e-06,bjbluejita,6e-06,lurrybryant,7e-06,lurrybryant,7e-06,James-DBA-Anderson,8e-06,James-DBA-Anderson,9e-06,alexopoulos7,9e-06
5,rahmatnazali,4e-06,0luck0,4e-06,kxg916361108,5e-06,w675881684,5e-06,sadolintw,6e-06,Moerzelmann,7e-06,songw,7e-06,alexopoulos7,8e-06,alexopoulos7,9e-06,ashwin2802,9e-06
6,basnijholt,4e-06,colodenn,4e-06,michca07,5e-06,bjbluejita,5e-06,zgjstudy,6e-06,songw,7e-06,msmilevski,7e-06,Root-9527,8e-06,ashwin2802,9e-06,ozeranjo,9e-06
7,n3tr,4e-06,whubao,4e-06,blairdrummond,5e-06,sadolintw,5e-06,asaketsu,6e-06,msmilevski,7e-06,James-DBA-Anderson,7e-06,lioo717,8e-06,ozeranjo,9e-06,LS-King,9e-06
8,chenbao-cn,4e-06,jdhjhj,4e-06,mcfatealan,5e-06,zgjstudy,5e-06,solomonno,6e-06,James-DBA-Anderson,7e-06,alexopoulos7,7e-06,ashwin2802,8e-06,LS-King,9e-06,wkqscut,9e-06
9,jsonchi,4e-06,ishotjr,4e-06,mibdennis,5e-06,nyangoto,5e-06,palincho,6e-06,alexopoulos7,7e-06,Root-9527,7e-06,ozeranjo,8e-06,wkqscut,9e-06,XingJinming-real,9e-06


In [77]:
df = pd.DataFrame()
for i in range(10):
    df = pd.concat([df, pd.DataFrame(list(dict_sort(auth_res[i]).items()), columns=[f"user_{i+1}", f"score_{i+1}"])], axis=1)
df.head(20)

Unnamed: 0,user_1,score_1,user_2,score_2,user_3,score_3,user_4,score_4,user_5,score_5,user_6,score_6,user_7,score_7,user_8,score_8,user_9,score_9,user_10,score_10
0,pytorch,0.023848,pytorch,0.02253,pytorch,0.022098,pytorch,0.021807,pytorch,0.021597,pytorch,0.021432,pytorch,0.021298,pytorch,0.021181,pytorch,0.021075,pytorch,0.020977
1,keras,0.021759,keras,0.021649,keras,0.021254,keras,0.020863,keras,0.020548,keras,0.020299,keras,0.020092,keras,0.019917,keras,0.019766,keras,0.01963
2,transformers,0.021455,TensorFlow-Examples,0.020644,TensorFlow-Examples,0.020255,transformers,0.020033,transformers,0.019924,transformers,0.019839,transformers,0.01977,transformers,0.019709,transformers,0.01965,transformers,0.019594
3,TensorFlow-Examples,0.020961,transformers,0.020455,transformers,0.020186,TensorFlow-Examples,0.019901,TensorFlow-Examples,0.019606,TensorFlow-Examples,0.019361,TensorFlow-Examples,0.019158,TensorFlow-Examples,0.018988,TensorFlow-Examples,0.018839,TensorFlow-Examples,0.01871
4,Deep-Learning-Papers-Reading-Roadmap,0.019151,Deep-Learning-Papers-Reading-Roadmap,0.018912,Deep-Learning-Papers-Reading-Roadmap,0.018799,Deep-Learning-Papers-Reading-Roadmap,0.018659,Deep-Learning-Papers-Reading-Roadmap,0.018517,Deep-Learning-Papers-Reading-Roadmap,0.018393,Deep-Learning-Papers-Reading-Roadmap,0.01828,Deep-Learning-Papers-Reading-Roadmap,0.018184,Deep-Learning-Papers-Reading-Roadmap,0.018099,Deep-Learning-Papers-Reading-Roadmap,0.018022
5,d2l-zh,0.017347,tensorflow,0.017182,tensorflow,0.01673,ColossalAI,0.016337,ColossalAI,0.01639,ColossalAI,0.016406,ColossalAI,0.016398,ColossalAI,0.016382,ColossalAI,0.016359,ColossalAI,0.016332
6,tensorflow,0.017079,d2l-zh,0.01687,d2l-zh,0.01645,tensorflow,0.016268,caffe,0.016003,DeepSpeed,0.016038,DeepSpeed,0.016136,DeepSpeed,0.016206,DeepSpeed,0.016257,DeepSpeed,0.016296
7,faceswap,0.01622,faceswap,0.016699,faceswap,0.01644,faceswap,0.016168,faceswap,0.01594,caffe,0.015886,caffe,0.015784,Made-With-ML,0.01575,Made-With-ML,0.015768,Made-With-ML,0.015785
8,caffe,0.016069,caffe,0.016294,caffe,0.016252,caffe,0.016131,DeepSpeed,0.015899,faceswap,0.015761,Made-With-ML,0.015729,caffe,0.015697,caffe,0.015618,caffe,0.015546
9,100-Days-Of-ML-Code,0.015669,ColossalAI,0.016036,ColossalAI,0.01623,d2l-zh,0.01611,tensorflow,0.015876,Made-With-ML,0.0157,faceswap,0.015613,faceswap,0.015492,faceswap,0.015392,ray,0.01538


In [109]:
df_k_all = pd.DataFrame({f"k={i+1}": hub_res[i] for i in range(10)})

ranked_df = df_k_all.rank()

# Calculate Spearman's rank correlation matrix
corr, _ = spearmanr(ranked_df)

# Convert the correlation matrix to a DataFrame for better readability
corr_df = pd.DataFrame(corr, index=df_k_all.columns, columns=df_k_all.columns)
display(corr_df)

Unnamed: 0,k=1,k=2,k=3,k=4,k=5,k=6,k=7,k=8,k=9,k=10
k=1,1.0,0.919903,0.805629,0.75452,0.732529,0.721108,0.714135,0.709334,0.705731,0.702795
k=2,0.919903,1.0,0.932145,0.890172,0.871642,0.862262,0.856706,0.853005,0.850247,0.848025
k=3,0.805629,0.932145,1.0,0.987195,0.977079,0.971584,0.9684,0.966356,0.964887,0.96373
k=4,0.75452,0.890172,0.987195,1.0,0.997,0.993988,0.992008,0.990697,0.989751,0.989008
k=5,0.732529,0.871642,0.977079,0.997,1.0,0.999073,0.997956,0.997101,0.996449,0.995922
k=6,0.721108,0.862262,0.971584,0.993988,0.999073,1.0,0.999649,0.999158,0.998722,0.998346
k=7,0.714135,0.856706,0.9684,0.992008,0.997956,0.999649,1.0,0.999843,0.999592,0.999338
k=8,0.709334,0.853005,0.966356,0.990697,0.997101,0.999158,0.999843,1.0,0.999918,0.999771
k=9,0.705731,0.850247,0.964887,0.989751,0.996449,0.998722,0.999592,0.999918,1.0,0.999951
k=10,0.702795,0.848025,0.96373,0.989008,0.995922,0.998346,0.999338,0.999771,0.999951,1.0


In [85]:
df_k_all = pd.DataFrame({f"k={i+1}": auth_res[i] for i in range(10)})
df_a_all = pd.DataFrame({"hits": hits_a, "hub_avg": hub_avg_a, "max_k": max_k_a, "original": {a: len(B[a]) for a in authorities}})
df_res = pd.concat([df_a_all, df_k_all], axis=1)
ranked_df = df_res.rank()

# Calculate Spearman's rank correlation matrix
corr, _ = spearmanr(ranked_df)

# Convert the correlation matrix to a DataFrame for better readability
corr_df = pd.DataFrame(corr, index=df_res.columns, columns=df_res.columns)
display(corr_df)

Unnamed: 0,hits,hub_avg,max_k,original,k=1,k=2,k=3,k=4,k=5,k=6,k=7,k=8,k=9,k=10
hits,1.0,0.996808,0.939802,0.897581,0.939802,0.945359,0.953975,0.962268,0.969193,0.974233,0.97997,0.98285,0.984674,0.986751
hub_avg,0.996808,1.0,0.920972,0.878937,0.920972,0.927477,0.937162,0.946475,0.954527,0.960372,0.967657,0.971353,0.973537,0.976322
max_k,0.939802,0.920972,1.0,0.981596,1.0,0.998884,0.997492,0.994923,0.990711,0.988455,0.98447,0.981674,0.979598,0.976982
original,0.897581,0.878937,0.981596,1.0,0.981596,0.98183,0.977834,0.973147,0.965586,0.961278,0.955733,0.951839,0.949079,0.9448
k=1,0.939802,0.920972,1.0,0.981596,1.0,0.998884,0.997492,0.994923,0.990711,0.988455,0.98447,0.981674,0.979598,0.976982
k=2,0.945359,0.927477,0.998884,0.98183,0.998884,1.0,0.998992,0.996844,0.993291,0.991083,0.987519,0.984926,0.983198,0.980762
k=3,0.953975,0.937162,0.997492,0.977834,0.997492,0.998992,1.0,0.998896,0.99646,0.994731,0.991755,0.989571,0.988071,0.985995
k=4,0.962268,0.946475,0.994923,0.973147,0.994923,0.996844,0.998896,1.0,0.998548,0.997348,0.99508,0.993315,0.992067,0.990351
k=5,0.969193,0.954527,0.990711,0.965586,0.990711,0.993291,0.99646,0.998548,1.0,0.999424,0.99772,0.996508,0.995488,0.994191
k=6,0.974233,0.960372,0.988455,0.961278,0.988455,0.991083,0.994731,0.997348,0.999424,1.0,0.99904,0.998188,0.997444,0.996376


### Quality Assessment

In [95]:
df_test = pd.DataFrame(dict_sort(hits_a).items(), columns=["repo name", "hits score"])
df_test.head()

Unnamed: 0,repo name,hits score
0,pytorch,0.019312
1,transformers,0.01851
2,keras,0.017817
3,TensorFlow-Examples,0.017153
4,Deep-Learning-Papers-Reading-Roadmap,0.017028


In [99]:
df_test["degree"] = [len(B[repo]) for repo in df_test["repo name"]]
df_test["nbr_avg"] = df_test["hits score"] / df_test["degree"]
df_test.head(20)

Unnamed: 0,repo name,hits score,degree,nbr_avg
0,pytorch,0.019312,33804,5.712817e-07
1,transformers,0.01851,32883,5.628998e-07
2,keras,0.017817,32841,5.425092e-07
3,TensorFlow-Examples,0.017153,32619,5.258598e-07
4,Deep-Learning-Papers-Reading-Roadmap,0.017028,31409,5.421385e-07
5,DeepSpeed,0.016226,27686,5.860571e-07
6,Made-With-ML,0.015863,27985,5.668273e-07
7,ColossalAI,0.015661,29883,5.240646e-07
8,ray,0.015601,25642,6.084055e-07
9,pytorch-lightning,0.014912,23193,6.429653e-07


In [132]:
df_rank = pd.DataFrame({"hits": hits_a, "hub_avg": hub_avg_a, "max_k": max_k_a, "orig": {a: len(B[a]) for a in authorities}}).rank(ascending=False)
df_rank["hits_diff"] = df_rank["hits"] - df_rank["orig"]
df_rank["hub_avg_diff"] = df_rank["hub_avg"] - df_rank["orig"]
df_rank["max_k_diff"] = df_rank["max_k"] - df_rank["orig"]
df_rank.head()

Unnamed: 0,hits,hub_avg,max_k,orig,hits_diff,hub_avg_diff,max_k_diff
awesome-datascience,51.0,46.0,53.0,48.0,3.0,-2.0,5.0
data-science-ipython-notebooks,30.0,29.0,34.0,37.0,-7.0,-8.0,-3.0
horovod,47.0,45.0,69.0,82.0,-35.0,-37.0,-13.0
best-of-ml-python,81.0,79.0,89.0,88.0,-7.0,-9.0,1.0
annotated_deep_learning_paper_implementations,31.0,30.0,23.0,18.0,13.0,12.0,5.0


In [140]:
df_rank.describe()

Unnamed: 0,hits,hub_avg,max_k,orig,hits_diff,hub_avg_diff,max_k_diff
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,50.5,50.5,50.5,50.5,0.0,0.0,0.0
std,29.011492,29.011492,29.011492,29.011318,13.130272,14.275429,5.56595
min,1.0,1.0,1.0,1.0,-35.0,-37.0,-13.0
25%,25.75,25.75,25.75,25.75,-9.0,-9.25,-4.0
50%,50.5,50.5,50.5,50.5,-1.0,-2.0,0.0
75%,75.25,75.25,75.25,74.875,7.5,9.0,3.0
max,100.0,100.0,100.0,100.0,30.0,34.0,20.0


In [144]:
threshold = 5
print("rank acsend over 10:", df_rank[df_rank["hits_diff"] <= -threshold].shape[0])
print("rank descend over 10:", df_rank[df_rank["hits_diff"] >= threshold].shape[0])

rank acsend over 10: 41
rank descend over 10: 29


In [135]:
threshold = 5
print("rank acsend over 10:", df_rank[df_rank["hub_avg_diff"] <= -threshold].shape[0])
print("rank descend over 10:", df_rank[df_rank["hub_avg_diff"] >= threshold].shape[0])

rank acsend over 10: 43
rank descend over 10: 33


In [134]:
threshold = 5
print("rank acsend over 10:", df_rank[df_rank["max_k_diff"] <= -threshold].shape[0])
print("rank descend over 10:", df_rank[df_rank["max_k_diff"] >= threshold].shape[0])

rank acsend over 10: 19
rank descend over 10: 13
