In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# get webpage with all packages
r = requests.get('https://conda-forge.org/feedstocks/')

# check the status code returned
r.status_code

200

In [3]:
# parse the response to get package names
soup = BeautifulSoup(r.content, "lxml")
table = soup.find("section", {"id": "feedstocks"})
links = [a['href'] for a in table.findAll('a')]

# inspect the links
links[:3]

['https://github.com/conda-forge/ablog-feedstock',
 'https://github.com/conda-forge/absl-py-feedstock',
 'https://github.com/conda-forge/ad3-cpp-feedstock']

In [4]:
# grab just the package names from the links
packages = [line.split('/')[-1].replace('-feedstock', '') for line in links]

# inspect the packages
packages[:3]

['ablog', 'absl-py', 'ad3-cpp']

In [5]:
# iterate over packages and get the grab meta.yaml
meta = {}
for pkg in packages:
    meta[pkg] = {}
    url = 'https://raw.githubusercontent.com/conda-forge/{}-feedstock/master/recipe/meta.yaml'.format(pkg)
    r = requests.get(url)
    if r.status_code == 200:
        meta[pkg]['response'] = r.text
    else:
        print("Error with {}".format(pkg))

Error with r-abind
Error with r-suppdists
Error with trilinos


In [6]:
# find where the maintainers line starts
for pkg in meta:

    if meta[pkg] == {}:
        del meta[pkg]
    
    else:
        text = meta[pkg]['response'].split('\n')
        for i, line in enumerate(text):
            if 'recipe-maintainers' in line:
                break

        # grab the text from the maintainers section
        maintainers = text[i+1: ]

        # clean up the maintainers text
        maintainers_clean = []
        for m in maintainers:
            line = m.strip()
            if line != '' and not line.startswith('#'):
                maintainers_clean.append(line.replace('-', '').replace(' ', ''))

        # store the maintainers in meta dict
        meta[pkg]['maintainers'] = maintainers_clean

RuntimeError: dictionary changed size during iteration

In [None]:
import networkx as nx
from networkx.algorithms import bipartite

# add packages and maintainers to a bipartite graph
B = nx.Graph()

# add packages
packages = set()
for pkg in meta.keys():
    if pkg != '':
        packages.add(pkg)
B.add_nodes_from(packages, bipartite=0)

# add authors
authors = set()
for pkg in meta.keys():
    for maintainer in meta[pkg]['maintainers']:
        if maintainer != '':
            authors.add(maintainer)
B.add_nodes_from(authors, bipartite=1)

# add edges
edges = []
for pkg in meta:
    for maintainer in meta[pkg]['maintainers']:
        edges.append((pkg, maintainer))
B.add_edges_from(edges)

In [None]:
# inspect the packages
list(packages)[:5]

In [None]:
# inspect the authors
list(authors)[:5]

In [None]:
# inspect the edges
edges[:5]

In [None]:
# check if the graph is connected
nx.is_connected(B)

In [None]:
# grab the largest component
nx.number_connected_components(B)

In [None]:
# project bipartitle graph
G = bipartite.weighted_projected_graph(B, authors)

In [None]:
# grab the largest component
G = max(nx.connected_component_subgraphs(G), key=len)

In [None]:
# inspect the edges
list(G.edges(data=True))[:5]

In [None]:
import pandas as pd

In [None]:
# put nodes into a DataFrame
nodes = pd.DataFrame([(node) for node in G.nodes], columns=['name'])

# create a copy of the index
nodes['index_copy'] = nodes.index

nodes.head()

In [None]:
# put edges into a DataFrame
edges = pd.DataFrame(list(G.edges), columns=['source', 'target'])

# replace name with id
edges1 = pd.merge(edges, nodes, left_on='source', right_on='name')
edges1 = edges1.rename({'index_copy': 'source_id'}, axis=1)
edges2 = pd.merge(edges1, nodes, left_on='target', right_on='name')
edges2 = edges2.rename({'index_copy': 'target_id'}, axis=1)

# subset to columns and rename
edges = edges2[['source_id', 'target_id']]
edges = edges.rename({'source_id': 'source', 'target_id': 'target'}, axis=1)

# drop the index_copy column
nodes.drop('index_copy', axis=1, inplace=True)

In [None]:
import holoviews as hv
hv.extension('bokeh')

In [None]:
%opts Nodes Graph [width=800 height=600 xaxis=None yaxis=None]

In [None]:
%%opts Graph [color_index='circle']
%%opts Graph (node_size=5 edge_line_width=0.25)
graph = hv.Graph.from_networkx(G, nx.spring_layout)
graph = graph.redim.range(x=(-0.05, 1.05), y=(-0.05, 1.05))

graph

In [None]:
from holoviews.operation.datashader import datashade, bundle_graph

bundled = bundle_graph(graph)
bundled