In [1]:
import json
import os
import numpy as np
import networkx as nx
from networkx.algorithms import bipartite
import sknetwork as skn
from collections import Counter
import scipy
from sknetwork.clustering import get_modularity

In [7]:
# Load the repository names. Later cells pair this sequence positionally
# (via zip) with the per-row cluster labels, so its order matters.
# The encoding is pinned because the platform default is not UTF-8 everywhere,
# and no empty placeholder is pre-assigned: if the load fails, `repos` stays
# undefined instead of silently feeding an empty list to later cells.
with open('repos.json', 'r', encoding='utf-8') as f:
    repos = json.load(f)

len(repos)

100

In [8]:
# Load the contents of users.json and show how many entries it holds.
# The encoding is pinned because the platform default is not UTF-8 everywhere,
# and no empty placeholder is pre-assigned so a failed load cannot go unnoticed.
# NOTE(review): the recorded length (1,022,491) differs from the 412,266
# columns reported for the adjacency matrix, so `users` is presumably not
# index-aligned with the matrix columns — confirm before pairing the two.
with open('users.json', 'r', encoding='utf-8') as f:
    users = json.load(f)

len(users)

1022491

In [2]:
# Read the biadjacency data from JSON and convert it to CSR format.
# The parsed JSON is handed straight to csr_matrix, so it is expected to be a
# dense 2-D array-like (presumably one row per repository, one column per
# user, matching the 100-row shape shown in the output — confirm).
# The previous `{}` placeholder was dropped: it suggested a dict payload and
# would have been passed to csr_matrix unchanged if the load had failed.
with open('adjacency_matrix.json', 'r', encoding='utf-8') as f:
    adjacency_matrix = json.load(f)
adjacency = scipy.sparse.csr_matrix(adjacency_matrix)
adjacency

<100x412266 sparse matrix of type '<class 'numpy.intc'>'
	with 1908925 stored elements in Compressed Sparse Row format>

In [3]:
# Build a NetworkX bipartite graph from the sparse biadjacency matrix.
# NOTE(review): `B` is not referenced by any other cell visible here —
# confirm it is still needed.
B = bipartite.from_biadjacency_matrix(adjacency)

In [48]:
# Transpose of the biadjacency matrix, so the roles of rows and columns swap
# (presumably users on rows and repositories on columns — confirm).
userAdj = adjacency.transpose()

In [49]:
from sknetwork.clustering import Louvain

# Exploratory Louvain run on the TRANSPOSED matrix, with a fixed seed.
louvain = Louvain(random_state=42)
louvain.fit(userAdj, force_bipartite=True)

# `userAdj` is the transpose of `adjacency`, so its rows are users and its
# columns are repositories. The row/column labels are therefore swapped
# relative to a fit on `adjacency`, keeping `repoLabels_Louvain` aligned
# with `repos`.
repoLabels_Louvain = louvain.labels_col_
userLabels_Louvain = louvain.labels_row_

# Number of distinct clusters found on each side, as (repos, users).
len(set(repoLabels_Louvain)), len(set(userLabels_Louvain))

(9, 9)

## Louvain

In [50]:
from sknetwork.clustering import Louvain

# Cluster the biadjacency matrix with Louvain, using a fixed seed.
louvain = Louvain(random_state=42)
louvain.fit(adjacency, force_bipartite=True)

# Row labels go to the repo side, column labels to the user side.
repoLabels_Louvain, userLabels_Louvain = louvain.labels_row_, louvain.labels_col_

# Number of distinct clusters on each side, as (repos, users).
tuple(len(set(side)) for side in (repoLabels_Louvain, userLabels_Louvain))

(11, 11)

In [51]:
# Cluster sizes: how many row (repo) labels fall into each Louvain cluster.
Counter(repoLabels_Louvain)

Counter({8: 5,
         1: 12,
         0: 19,
         2: 12,
         5: 9,
         3: 15,
         9: 2,
         6: 8,
         4: 10,
         7: 7,
         10: 1})

In [41]:
# Map each repository name to its Louvain cluster id; names and labels are
# paired positionally.
repoNameLabel_Louvain = dict(zip(repos, repoLabels_Louvain))

len(repoNameLabel_Louvain)

100

In [44]:
# Invert the assignment: cluster id -> list of repository names.
# Every distinct label gets an empty bucket first, so each cluster id is
# present as a key before names are appended.
labelRepoName_Louvain = {cluster_id: [] for cluster_id in set(repoLabels_Louvain)}

for repo_name, cluster_id in zip(repos, repoLabels_Louvain):
    labelRepoName_Louvain[cluster_id].append(repo_name)

len(labelRepoName_Louvain)

11

In [45]:
# Display the cluster id -> repository names mapping for Louvain.
labelRepoName_Louvain

{0: ['AI-For-Beginners',
  'ColossalAI',
  'data-science-ipython-notebooks',
  'DeepSpeed',
  'GFPGAN',
  'MockingBird',
  'Prompt-Engineering-Guide',
  'so-vits-svc',
  'stable-diffusion-webui',
  'stanford_alpaca',
  'TTS',
  'awesome-production-machine-learning',
  'CLIP',
  'diffusers',
  'pytorch_geometric',
  'qlib',
  'stable-diffusion-webui-colab',
  'supervision',
  'tfjs'],
 1: ['AI-Expert-Roadmap',
  'applied-ml',
  'machine-learning-for-software-engineers',
  'Made-With-ML',
  'ML-From-Scratch',
  '500-AI-Machine-learning-Deep-learning-Computer-vision-NLP-Projects-with-code',
  'awesome-deep-learning',
  'best-of-ml-python',
  'd2l-en',
  'Machine-Learning-Tutorials',
  'ML-YouTube-Courses',
  'stanford-cs-229-machine-learning'],
 2: ['annotated_deep_learning_paper_implementations',
  'd2l-zh',
  'paper-reading',
  'ultralytics',
  'yolov5',
  'Awesome-pytorch-list',
  'CVPR2024-Papers-with-Code',
  'deep-learning-for-image-processing',
  'Dive-into-DL-PyTorch',
  'learnope

In [47]:
# Modularity score of the Louvain partition, given both the row (repo) labels
# and the column (user) labels of the biadjacency matrix.
get_modularity(input_matrix=adjacency, labels=repoLabels_Louvain, labels_col=userLabels_Louvain)

0.27046261697075363

## Leiden

In [72]:
from sknetwork.clustering import Leiden

# Cluster the biadjacency matrix with Leiden, using a fixed seed.
leiden = Leiden(random_state=42)
leiden.fit(adjacency, force_bipartite=True)

# Row labels go to the repo side, column labels to the user side.
repoLabels_Leiden, userLabels_Leiden = leiden.labels_row_, leiden.labels_col_

# Number of distinct clusters on each side, as (repos, users).
len({*repoLabels_Leiden}), len({*userLabels_Leiden})

(11, 11)

In [53]:
# Cluster sizes: how many row (repo) labels fall into each Leiden cluster.
Counter(repoLabels_Leiden)

Counter({7: 3, 3: 13, 0: 17, 2: 13, 1: 17, 4: 13, 9: 2, 5: 13, 6: 7, 8: 2})

In [54]:
# Map each repository name to its Leiden cluster id; names and labels are
# paired positionally.
repoNameLabel_Leiden = {
    repo_name: cluster_id
    for repo_name, cluster_id in zip(repos, repoLabels_Leiden)
}

len(repoNameLabel_Leiden)

100

In [55]:
# Invert the assignment: cluster id -> list of repository names.
# Buckets are created for every distinct label before any name is appended.
labelRepoName_Leiden = dict((cluster_id, []) for cluster_id in set(repoLabels_Leiden))

for repo_name, cluster_id in zip(repos, repoLabels_Leiden):
    labelRepoName_Leiden[cluster_id].append(repo_name)

len(labelRepoName_Leiden)

10

In [56]:
# Display the cluster id -> repository names mapping for Leiden.
labelRepoName_Leiden

{0: ['AI-For-Beginners',
  'ColossalAI',
  'data-science-ipython-notebooks',
  'DeepSpeed',
  'GFPGAN',
  'gradio',
  'MockingBird',
  'Prompt-Engineering-Guide',
  'so-vits-svc',
  'stable-diffusion-webui',
  'stanford_alpaca',
  'TTS',
  'CLIP',
  'diffusers',
  'JARVIS',
  'stable-diffusion-webui-colab',
  'supervision'],
 1: ['awesome-deep-learning-papers',
  'caffe',
  'cs-video-courses',
  'Deep-Learning-Papers-Reading-Roadmap',
  'handson-ml',
  'keras',
  'machine-learning-for-software-engineers',
  'opencv',
  'pytorch-tutorial',
  'pytorch',
  'TensorFlow-Examples',
  'tensorflow',
  'cheatsheets-ai',
  'CNTK',
  'lectures',
  'openface',
  'Qix'],
 2: ['annotated_deep_learning_paper_implementations',
  'd2l-zh',
  'paper-reading',
  'ultralytics',
  'yolov5',
  'Awesome-pytorch-list',
  'CVPR2024-Papers-with-Code',
  'deep-learning-for-image-processing',
  'Dive-into-DL-PyTorch',
  'labelImg',
  'learnopencv',
  'ML-NLP',
  'pytorch-handbook'],
 3: ['AI-Expert-Roadmap',
  'a

In [57]:
# Modularity score of the Leiden partition, given both the row (repo) labels
# and the column (user) labels of the biadjacency matrix.
get_modularity(input_matrix=adjacency, labels=repoLabels_Leiden, labels_col=userLabels_Leiden)

0.27873639825685986

## K-centers

In [59]:
from sknetwork.clustering import KCenters  # original note: 48s (presumably the run time — confirm)

# Unlike Louvain/Leiden, the number of clusters is fixed in advance.
kcenters = KCenters(n_clusters=10)  # number of clusters is predefined
kcenters.fit(adjacency, force_bipartite=True)

# Row labels go to the repo side, column labels to the user side.
repoLabels_Kcenters = kcenters.labels_row_
userLabels_Kcenters = kcenters.labels_col_

# Number of distinct clusters on each side, as (repos, users).
# The first element previously counted the user labels a second time.
len(set(repoLabels_Kcenters)), len(set(userLabels_Kcenters))

100

In [61]:
# Cluster sizes: how many row (repo) labels fall into each K-centers cluster.
Counter(repoLabels_Kcenters)

Counter({6: 4, 7: 7, 9: 6, 4: 12, 8: 13, 0: 19, 3: 12, 2: 9, 1: 13, 5: 5})

In [62]:
# Map each repository name to its K-centers cluster id; names and labels are
# paired positionally.
repoNameLabel_Kcenters = dict(zip(repos, repoLabels_Kcenters))

len(repoNameLabel_Kcenters)

100

In [63]:
# Invert the assignment: cluster id -> list of repository names.
# Each distinct label is given an empty list up front, then names are added.
labelRepoName_Kcenters = {cluster_id: [] for cluster_id in set(repoLabels_Kcenters)}

for repo_name, cluster_id in zip(repos, repoLabels_Kcenters):
    labelRepoName_Kcenters[cluster_id].append(repo_name)

len(labelRepoName_Kcenters)

10

In [64]:
# Display the cluster id -> repository names mapping for K-centers.
labelRepoName_Kcenters

{0: ['awesome-datascience',
  'awesome-deep-learning-papers',
  'data-science-ipython-notebooks',
  'Deep-Learning-Papers-Reading-Roadmap',
  'handson-ml',
  'keras',
  'machine-learning-for-software-engineers',
  'Made-With-ML',
  'ML-From-Scratch',
  'opencv',
  'TensorFlow-Examples',
  'tensorflow',
  'awesome-deep-learning',
  'awesome-nlp',
  'cheatsheets-ai',
  'CNTK',
  'lectures',
  'Machine-Learning-Tutorials',
  'stanford-cs-229-machine-learning'],
 1: ['DeepFaceLab',
  'DeepSpeech',
  'faceswap',
  'GFPGAN',
  'mediapipe',
  'MockingBird',
  'Real-Time-Voice-Cloning',
  'so-vits-svc',
  'spleeter',
  'tesseract.js',
  'first-order-model',
  'Screenshot-to-code',
  'tfjs'],
 2: ['ColossalAI',
  'DeepSpeed',
  'stable-diffusion-webui',
  'stanford_alpaca',
  'ultralytics',
  'CLIP',
  'diffusers',
  'JARVIS',
  'stable-diffusion-webui-colab'],
 3: ['caffe',
  'netron',
  'openpose',
  'pytorch',
  'yolov5',
  'imgaug',
  'labelImg',
  'learnopencv',
  'ml-agents',
  'ncnn',
  

In [65]:
# Modularity score of the K-centers partition, given both the row (repo)
# labels and the column (user) labels of the biadjacency matrix.
get_modularity(input_matrix=adjacency, labels=repoLabels_Kcenters, labels_col=userLabels_Kcenters)

0.26999034747144657

## Propagation

In [69]:
from sknetwork.clustering import PropagationClustering

# Cluster the biadjacency matrix by label propagation (default settings).
propogation = PropagationClustering()
propogation.fit(adjacency)

# Row labels go to the repo side, column labels to the user side.
repoLabels_Propogation = propogation.labels_row_
userLabels_Propogation = propogation.labels_col_

# Number of distinct clusters on each side, as (repos, users).
# The first element previously counted the user labels a second time.
len(set(repoLabels_Propogation)), len(set(userLabels_Propogation))

(1, 1)

In [70]:
# Cluster sizes: how many row (repo) labels fall into each propagation cluster.
Counter(repoLabels_Propogation)

Counter({0: 100})

In [None]:
# Map each repository name to its propagation cluster id; names and labels
# are paired positionally.
repoNameLabel_Propogation = {
    repo_name: cluster_id
    for repo_name, cluster_id in zip(repos, repoLabels_Propogation)
}

len(repoNameLabel_Propogation)

100

In [None]:
# Invert the assignment: cluster id -> list of repository names.
# Buckets are created for every distinct label before any name is appended.
labelRepoName_Propogation = dict((cluster_id, []) for cluster_id in set(repoLabels_Propogation))

for repo_name, cluster_id in zip(repos, repoLabels_Propogation):
    labelRepoName_Propogation[cluster_id].append(repo_name)

len(labelRepoName_Propogation)

10

In [None]:
# Display the cluster id -> repository names mapping for propagation.
# NOTE(review): this cell has no execution count and the output stored with
# it shows several clusters, while the label counts above report a single
# cluster — the stored output looks stale; re-run to confirm.
labelRepoName_Propogation

{0: ['awesome-datascience',
  'awesome-deep-learning-papers',
  'data-science-ipython-notebooks',
  'Deep-Learning-Papers-Reading-Roadmap',
  'handson-ml',
  'keras',
  'machine-learning-for-software-engineers',
  'Made-With-ML',
  'ML-From-Scratch',
  'opencv',
  'TensorFlow-Examples',
  'tensorflow',
  'awesome-deep-learning',
  'awesome-nlp',
  'cheatsheets-ai',
  'CNTK',
  'lectures',
  'Machine-Learning-Tutorials',
  'stanford-cs-229-machine-learning'],
 1: ['DeepFaceLab',
  'DeepSpeech',
  'faceswap',
  'GFPGAN',
  'mediapipe',
  'MockingBird',
  'Real-Time-Voice-Cloning',
  'so-vits-svc',
  'spleeter',
  'tesseract.js',
  'first-order-model',
  'Screenshot-to-code',
  'tfjs'],
 2: ['ColossalAI',
  'DeepSpeed',
  'stable-diffusion-webui',
  'stanford_alpaca',
  'ultralytics',
  'CLIP',
  'diffusers',
  'JARVIS',
  'stable-diffusion-webui-colab'],
 3: ['caffe',
  'netron',
  'openpose',
  'pytorch',
  'yolov5',
  'imgaug',
  'labelImg',
  'learnopencv',
  'ml-agents',
  'ncnn',
  

In [71]:
# Modularity score of the propagation partition, given both the row (repo)
# labels and the column (user) labels of the biadjacency matrix.
# NOTE(review): the label counts above show every repo in one cluster, which
# would explain the near-zero score recorded below — confirm.
get_modularity(input_matrix=adjacency, labels=repoLabels_Propogation, labels_col=userLabels_Propogation)

-1.162980822755344e-11