In [89]:
import numpy as np
import pandas as pd
import networkx as nx
import os
import json
from networkx.algorithms import bipartite

### Read Graph

In [90]:
repo_path = '../repo_info'
user_path = '../starer_info'

# os.listdir() returns entries in arbitrary order, so sort both listings to keep node and
# edge insertion order identical from run to run and machine to machine.
# '00_repo_names.json' is excluded from the repo list (presumably an index of repo names
# rather than a repo record -- confirm); filtering instead of list.remove() means a
# missing index file does not abort the cell.
repos = sorted(name for name in os.listdir(repo_path) if name != '00_repo_names.json')
users_file = sorted(os.listdir(user_path))

In [91]:
# Build one (username, file name) edge per record found in each file under user_path.
# `users` collects the distinct usernames seen across all files.
edges = []
users = set()
for fileName in users_file:
    # Read as UTF-8 explicitly instead of relying on the platform's default encoding.
    with open(os.path.join(user_path, fileName), 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Each file is expected to hold an iterable of records carrying a 'username' key.
    for user in data:
        # Named `username` rather than `id`: at notebook top level, `id` would shadow the
        # builtin for every later cell.
        username = user['username']
        edges.append((username, fileName))
        users.add(username)

In [152]:
# Two-sided graph: usernames are tagged 'hub', repo file names are tagged 'authority'
# through the 'bipartite' node attribute that the later cells filter on.
B = nx.Graph()
for side, members in (('hub', users), ('authority', repos)):
    B.add_nodes_from(members, bipartite=side)

# NOTE(review): edge endpoints come from the file names under user_path while `repos`
# comes from repo_path; this assumes both directories use the same file names -- confirm.
B.add_edges_from(edges)

In [143]:
# Split the nodes by their 'bipartite' tag in one pass and report the size of each side.
hubs, authorities = set(), set()
for node, attrs in B.nodes(data=True):
    role = attrs['bipartite']
    if role == 'hub':
        hubs.add(node)
    elif role == 'authority':
        authorities.add(node)

print("Number of users:", len(hubs))
print("Number of repos:", len(authorities))

Number of users: 799030
Number of repos: 50


### Remove nodes with degree ≤ 1

In [94]:
# Collect the low-degree nodes first and delete them afterwards: the graph must not
# change while B.degree() is being iterated. The filter applies to every node, whichever
# side it is on, and this is a single pass over the current degrees.
nodes_to_remove = []
for node, degree in B.degree():
    if degree < 2:
        nodes_to_remove.append(node)
B.remove_nodes_from(nodes_to_remove)

In [95]:
# Recount both sides of the graph now that the low-degree nodes are gone.
node_roles = [(n, d['bipartite']) for n, d in B.nodes(data=True)]
hubs = {n for n, role in node_roles if role == 'hub'}
authorities = {n for n, role in node_roles if role == 'authority'}

print("Number of users:", len(hubs))
print("Number of repos:", len(authorities))

Number of users: 286178
Number of repos: 50


### HITS Algorithm

In [148]:
def normalize(score):
    """Rescale a score mapping so that its values sum to 1.

    Parameters
    ----------
    score : dict
        Maps each key to a numeric score.

    Returns
    -------
    dict
        A new dict with the same keys, each value divided by the sum of all
        values. When that sum is zero (an empty mapping, or scores that are
        all zero) there is nothing to rescale against, so an unchanged copy is
        returned instead of dividing by zero. The input mapping is not modified.
    """
    total = sum(score.values())
    if total == 0:
        # Guard against ZeroDivisionError on all-zero scores.
        return dict(score)
    return {k: v / total for k, v in score.items()}

In [149]:
def hits(G, h_init=None, a_init=None, max_iter=100, threshold=1.0e-8, normalized=True):
    """Compute HITS hub and authority scores by power iteration.

    Nodes are split by their 'bipartite' node attribute: nodes tagged 'hub'
    receive hub scores and nodes tagged 'authority' receive authority scores.
    Each round sets every authority's score to the sum of its neighbouring
    hubs' scores, then every hub's score to the sum of its neighbouring
    authorities' scores; each side is rescaled with normalize() right after
    it is updated.

    Parameters
    ----------
    G : networkx graph
        Must not be a multigraph. Every node needs a 'bipartite' attribute
        (a missing attribute raises KeyError). Every neighbour of a hub must
        be an authority and vice versa, otherwise the score lookup raises
        KeyError.
    h_init : dict, optional
        Initial hub scores keyed by node, passed through normalize() before
        use. When omitted, every hub starts at 1 / (number of hubs).
    a_init : dict, optional
        Initial authority scores keyed by node. They are overwritten by the
        first update before being read, so they only enter the convergence
        check of the first round.
    max_iter : int
        Upper bound on the number of rounds. Reaching it is not an error: the
        latest scores are returned.
    threshold : float
        Iteration stops once the summed absolute change of all hub scores plus
        all authority scores between two rounds drops below this value.
    normalized : bool
        Currently unused; kept so that existing callers passing it keep working.

    Returns
    -------
    (dict, dict)
        ``(hub_score, authority_score)``, keyed by node. Both are empty when G
        has no nodes.

    Raises
    ------
    Exception
        If G is a MultiGraph or MultiDiGraph.
    """
    # A tuple of classes works on every Python 3 release, whereas the `A | B`
    # form inside isinstance() requires Python 3.10+.
    if isinstance(G, (nx.MultiGraph, nx.MultiDiGraph)):
        raise Exception("hits() not defined for graphs with multiedges.")
    if len(G) == 0:
        return {}, {}

    # Split the nodes by side in a single pass over the node data.
    hubs, authorities = set(), set()
    for node, attrs in G.nodes(data=True):
        side = attrs['bipartite']
        if side == 'hub':
            hubs.add(node)
        elif side == 'authority':
            authorities.add(node)

    # Starting scores: caller-supplied (rescaled) or uniform over each side.
    if h_init is None:
        hub_score = dict.fromkeys(hubs, 1.0 / len(hubs))
    else:
        hub_score = normalize(h_init)
    if a_init is None:
        authority_score = dict.fromkeys(authorities, 1.0 / len(authorities))
    else:
        authority_score = normalize(a_init)

    for _ in range(max_iter):  # power iteration: at most max_iter rounds
        hlast, alast = hub_score, authority_score

        # Authority update: each hub adds its score to every neighbour.
        authority_score = dict.fromkeys(authorities, 0)
        for h in hubs:
            h_value = hub_score[h]  # invariant across the inner loop
            for nbr in G[h]:
                authority_score[nbr] += h_value
        authority_score = normalize(authority_score)

        # Hub update: each authority adds its fresh score to every neighbour.
        hub_score = dict.fromkeys(hubs, 0)
        for a in authorities:
            a_value = authority_score[a]
            for nbr in G[a]:
                hub_score[nbr] += a_value
        hub_score = normalize(hub_score)

        # Total absolute change across both sides since the previous round.
        hub_change = sum(abs(hub_score[n] - hlast[n]) for n in hubs)
        authority_change = sum(abs(authority_score[n] - alast[n]) for n in authorities)
        if hub_change + authority_change < threshold:
            break
    return hub_score, authority_score

In [153]:
# Run HITS on B with the defaults of hits(): uniform initial scores, at most 100 rounds,
# convergence threshold 1e-8.
# NOTE(review): the execution counts suggest B was rebuilt (In [152]) right before this
# call, after the pruning cells had run (In [94]). Confirm whether these scores describe
# the full graph or the pruned one, and re-run the notebook top to bottom before relying
# on them.
hub_score, authority_score = hits(B)