In [1]:
import json
import os
import numpy as np
import networkx as nx
from networkx.algorithms import bipartite
import sknetwork as skn
from collections import Counter
import scipy
from sknetwork.clustering import get_modularity

In [2]:
repos = []
with open('repos.json', 'r') as f:
    repos = json.load(f)

len(repos)

100

In [3]:
users = []
with open('users.json', 'r') as f:
    users = json.load(f)

len(users)

412266

In [4]:
adjacency_matrix = {}
with open('adjacency_matrix.json', 'r') as f:
    adjacency_matrix = json.load(f)
adjacency = scipy.sparse.csr_matrix(adjacency_matrix)
adjacency

<100x412266 sparse matrix of type '<class 'numpy.intc'>'
	with 1908925 stored elements in Compressed Sparse Row format>

In [5]:
B = bipartite.from_biadjacency_matrix(adjacency)

In [6]:
print(nx.is_connected(B))
bottom_nodes, top_nodes = bipartite.sets(B)  # if success, the graph is bipartite

True


## Louvain

In [7]:
from sknetwork.clustering import Louvain

louvain = Louvain(random_state=42)
louvain.fit(adjacency, force_bipartite=True)

repoLabels_Louvain = louvain.labels_row_
userLabels_Louvain = louvain.labels_col_

len(set(repoLabels_Louvain)), len(set(userLabels_Louvain))

(9, 9)

In [8]:
Counter(repoLabels_Louvain)

Counter({4: 10, 1: 14, 0: 25, 2: 15, 6: 8, 3: 15, 5: 10, 7: 2, 8: 1})

In [9]:
repoNameLabel_Louvain = {}

for repo, label in zip(repos, repoLabels_Louvain):
    repoNameLabel_Louvain[repo] = label

len(repoNameLabel_Louvain)

100

In [10]:
labelRepoName_Louvain = {}
for l in set(repoLabels_Louvain):
    labelRepoName_Louvain[l] = []

for repo, label in zip(repos, repoLabels_Louvain):
    labelRepoName_Louvain[label].append(repo)

len(labelRepoName_Louvain)

9

In [11]:
labelRepoName_Louvain

{0: ['AI-For-Beginners.json',
  'annotated_deep_learning_paper_implementations.json',
  'applied-ml.json',
  'ColossalAI.json',
  'DeepFaceLab.json',
  'DeepSpeed.json',
  'faceswap.json',
  'GFPGAN.json',
  'gradio.json',
  'MockingBird.json',
  'Prompt-Engineering-Guide.json',
  'Real-Time-Voice-Cloning.json',
  'so-vits-svc.json',
  'spleeter.json',
  'stable-diffusion-webui.json',
  'stanford_alpaca.json',
  'TTS.json',
  'diffusers.json',
  'first-order-model.json',
  'JARVIS.json',
  'jina.json',
  'lectures.json',
  'Qix.json',
  'stable-diffusion-webui-colab.json',
  'supervision.json'],
 1: ['AI-Expert-Roadmap.json',
  'awesome-datascience.json',
  'data-science-ipython-notebooks.json',
  'machine-learning-for-software-engineers.json',
  'Made-With-ML.json',
  'ML-From-Scratch.json',
  '500-AI-Machine-learning-Deep-learning-Computer-vision-NLP-Projects-with-code.json',
  'awesome-deep-learning.json',
  'awesome-nlp.json',
  'awesome-production-machine-learning.json',
  'best-o

In [12]:
get_modularity(input_matrix=adjacency, labels=repoLabels_Louvain, labels_col=userLabels_Louvain)

0.27616872049490504

## Leiden

In [13]:
from sknetwork.clustering import Leiden

leiden = Leiden(random_state=42)
leiden.fit(adjacency, force_bipartite=True)

repoLabels_Leiden = leiden.labels_row_
userLabels_Leiden = leiden.labels_col_

len(set(repoLabels_Leiden)), len(set(userLabels_Leiden))

(8, 8)

In [14]:
Counter(repoLabels_Leiden)

Counter({2: 11, 5: 9, 1: 22, 0: 27, 4: 14, 3: 12, 6: 4, 7: 1})

In [15]:
repoNameLabel_Leiden = {}

for repo, label in zip(repos, repoLabels_Leiden):
    repoNameLabel_Leiden[repo] = label

len(repoNameLabel_Leiden)

100

In [16]:
labelRepoName_Leiden = {}
for l in set(repoLabels_Leiden):
    labelRepoName_Leiden[l] = []

for repo, label in zip(repos, repoLabels_Leiden):
    labelRepoName_Leiden[label].append(repo)

len(labelRepoName_Leiden)

8

In [17]:
labelRepoName_Leiden

{0: ['awesome-datascience.json',
  'awesome-deep-learning-papers.json',
  'caffe.json',
  'cs-video-courses.json',
  'data-science-ipython-notebooks.json',
  'Deep-Learning-Papers-Reading-Roadmap.json',
  'DeepSpeech.json',
  'handson-ml.json',
  'keras.json',
  'machine-learning-for-software-engineers.json',
  'ML-From-Scratch.json',
  'opencv.json',
  'pytorch.json',
  'spaCy.json',
  'TensorFlow-Examples.json',
  'tensorflow.json',
  'tesseract.js.json',
  'awesome-deep-learning.json',
  'awesome-nlp.json',
  'cheatsheets-ai.json',
  'CNTK.json',
  'lectures.json',
  'Machine-Learning-Tutorials.json',
  'openface.json',
  'Qix.json',
  'Screenshot-to-code.json',
  'tfjs.json'],
 1: ['AI-For-Beginners.json',
  'annotated_deep_learning_paper_implementations.json',
  'ColossalAI.json',
  'DeepFaceLab.json',
  'DeepSpeed.json',
  'faceswap.json',
  'GFPGAN.json',
  'gradio.json',
  'MockingBird.json',
  'Prompt-Engineering-Guide.json',
  'Real-Time-Voice-Cloning.json',
  'so-vits-svc.js

In [18]:
get_modularity(input_matrix=adjacency, labels=repoLabels_Leiden, labels_col=userLabels_Leiden)

0.281482868211348

## K-centers

In [7]:
from sknetwork.clustering import KCenters  # 48s

# kcenters = KCenters(n_clusters=10)  # nember of clusters is predefined
# kcenters.fit(adjacency, force_bipartite=True)

# repoLabels_Kcenters = kcenters.labels_row_
# userLabels_Kcenters = kcenters.labels_col_

# len(set(userLabels_Kcenters)), len(set(userLabels_Kcenters))

best_k = -1
max_modularity = -1
best_repoLabels_Kcenters = None
best_userLabels_Kcenters = None
for k in range(2, 11):
    kcenters = KCenters(n_clusters=k)  # nember of clusters is predefined
    kcenters.fit(adjacency, force_bipartite=True)

    repoLabels_Kcenters = kcenters.labels_row_
    userLabels_Kcenters = kcenters.labels_col_

    cur_modularity = get_modularity(input_matrix=adjacency, labels=repoLabels_Kcenters, labels_col=userLabels_Kcenters)
    print(k, ":", cur_modularity)

    if cur_modularity > max_modularity:
        max_modularity = cur_modularity
        best_k = k
        best_repoLabels_Kcenters = repoLabels_Kcenters
        best_userLabels_Kcenters = userLabels_Kcenters

print("best:", best_k, ":", max_modularity)

4 0.2996274631525948


In [8]:
Counter(best_repoLabels_Kcenters)

Counter({2: 26, 1: 28, 3: 20, 0: 26})

In [9]:
repoNameLabel_Kcenters = {}

for repo, label in zip(repos, best_repoLabels_Kcenters):
    repoNameLabel_Kcenters[repo] = label

len(repoNameLabel_Kcenters)

100

In [11]:
labelRepoName_Kcenters = {}
for l in set(best_repoLabels_Kcenters):
    labelRepoName_Kcenters[l] = []

for repo, label in zip(repos, best_repoLabels_Kcenters):
    labelRepoName_Kcenters[label].append(repo)

len(labelRepoName_Kcenters)

4

In [12]:
labelRepoName_Kcenters

{0: ['caffe.json',
  'DeepSpeech.json',
  'fastai.json',
  'keras.json',
  'netron.json',
  'opencv.json',
  'openpose.json',
  'pytorch.json',
  'ray.json',
  'spaCy.json',
  'streamlit.json',
  'TensorFlow-Examples.json',
  'tensorflow.json',
  'tesseract.js.json',
  'CNTK.json',
  'horovod.json',
  'imgaug.json',
  'labelImg.json',
  'lectures.json',
  'ml-agents.json',
  'onnx.json',
  'openface.json',
  'Screenshot-to-code.json',
  'shap.json',
  'tensor2tensor.json',
  'tfjs.json'],
 1: ['AI-For-Beginners.json',
  'ColossalAI.json',
  'DeepFaceLab.json',
  'DeepSpeed.json',
  'faceswap.json',
  'GFPGAN.json',
  'gradio.json',
  'mediapipe.json',
  'MockingBird.json',
  'Prompt-Engineering-Guide.json',
  'Real-Time-Voice-Cloning.json',
  'so-vits-svc.json',
  'spleeter.json',
  'stable-diffusion-webui.json',
  'stanford_alpaca.json',
  'TTS.json',
  'ultralytics.json',
  'yolov5.json',
  'CLIP.json',
  'diffusers.json',
  'EasyOCR.json',
  'first-order-model.json',
  'JARVIS.json'

In [24]:
# get_modularity(input_matrix=adjacency, labels=repoLabels_Kcenters, labels_col=userLabels_Kcenters)

0.2746972754913583

## Propogation

In [39]:
from sknetwork.clustering import PropagationClustering

propogation = PropagationClustering(n_iter=5)
propogation.fit(adjacency)

repoLabels_Propogation = propogation.labels_row_
userLabels_Propogation = propogation.labels_col_

len(set(userLabels_Propogation)), len(set(userLabels_Propogation))

(1, 1)

In [26]:
Counter(repoLabels_Propogation)

Counter({0: 100})

In [27]:
repoNameLabel_Propogation = {}

for repo, label in zip(repos, repoLabels_Propogation):
    repoNameLabel_Propogation[repo] = label

len(repoNameLabel_Propogation)

100

In [28]:
labelRepoName_Propogation = {}
for l in set(repoLabels_Propogation):
    labelRepoName_Propogation[l] = []

for repo, label in zip(repos, repoLabels_Propogation):
    labelRepoName_Propogation[label].append(repo)

len(labelRepoName_Propogation)

1

In [29]:
labelRepoName_Propogation

{0: ['100-Days-Of-ML-Code.json',
  'AI-Expert-Roadmap.json',
  'AI-For-Beginners.json',
  'annotated_deep_learning_paper_implementations.json',
  'applied-ml.json',
  'awesome-datascience.json',
  'awesome-deep-learning-papers.json',
  'caffe.json',
  'ColossalAI.json',
  'cs-video-courses.json',
  'd2l-zh.json',
  'data-science-ipython-notebooks.json',
  'Deep-Learning-Papers-Reading-Roadmap.json',
  'DeepFaceLab.json',
  'DeepSpeech.json',
  'DeepSpeed.json',
  'faceswap.json',
  'fastai.json',
  'GFPGAN.json',
  'gradio.json',
  'handson-ml.json',
  'keras.json',
  'machine-learning-for-software-engineers.json',
  'Made-With-ML.json',
  'mediapipe.json',
  'ML-From-Scratch.json',
  'MockingBird.json',
  'netron.json',
  'opencv.json',
  'openpose.json',
  'paper-reading.json',
  'Prompt-Engineering-Guide.json',
  'pytorch-lightning.json',
  'pytorch-tutorial.json',
  'pytorch.json',
  'ray.json',
  'Real-Time-Voice-Cloning.json',
  'so-vits-svc.json',
  'spaCy.json',
  'spleeter.jso

In [30]:
get_modularity(input_matrix=adjacency, labels=repoLabels_Propogation, labels_col=userLabels_Propogation)

-1.162980822755344e-11