In [1]:
import pandas as pd
from downloaders import BaseDownloader
from ensmallen_graph import EnsmallenGraph
from downloaders.extractors.zip_extraction import ZipExtractor
from tqdm.auto import tqdm
import os
import shutil

In [2]:
url = "http://nrvis.com/download/data/{}/{}.zip"

In [3]:
path = "bindings/python/ensmallen_graph/network_repository/{}.json"

In [4]:
graphs = pd.read_html("http://networkrepository.com/networks.php")[0]
graphs["url"] = [
    url.format(row["Type"], row["Graph Name"])
    for _, row in graphs.iterrows()
]
graphs.drop(
    columns=["Download", "Size"],
    inplace=True
)

In [5]:
graphs

Unnamed: 0,Graph Name,Type,|V|,|E|,dmax,davg,r,|T|,Tavg,Tmax,κavg,κ,K,ωheu,url
0,bio-CE-CX,bio,15K,246K,375,32,0.34,7M,442,14K,0.21,0.29,79,43,http://nrvis.com/download/data/bio/bio-CE-CX.zip
1,bio-CE-GN,bio,2K,54K,242,48,0.07,686K,308,3K,0.18,0.14,49,16,http://nrvis.com/download/data/bio/bio-CE-GN.zip
2,bio-CE-GT,bio,924,3K,151,7,-0.18,12K,12,684,0.61,0.13,10,8,http://nrvis.com/download/data/bio/bio-CE-GT.zip
3,bio-CE-HT,bio,3K,3K,44,2,-0.30,87,-,4,0.01,0.01,4,4,http://nrvis.com/download/data/bio/bio-CE-HT.zip
4,bio-CE-LC,bio,1K,2K,131,2,-0.17,699,-,31,0.08,0.04,7,7,http://nrvis.com/download/data/bio/bio-CE-LC.zip
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5147,web-wikipedia-growth,web,-,-,-,-,-,-,-,-,-,-,-,-,http://nrvis.com/download/data/web/web-wikiped...
5148,web-wikipedia2009,web,2M,5M,3K,4,0.05,7M,3,12K,0.16,0.05,67,11,http://nrvis.com/download/data/web/web-wikiped...
5149,web-wikipedia-link-de,web,-,-,-,-,-,-,-,-,-,-,-,-,http://nrvis.com/download/data/web/web-wikiped...
5150,web-wikipedia-link-fr,web,-,-,-,-,-,-,-,-,-,-,-,-,http://nrvis.com/download/data/web/web-wikiped...


In [6]:
downloader = BaseDownloader(
    target_directory="graphs",
    process_number=6,
    verbose=2,
    crash_early=True
)

In [7]:
metadata = {}
for url in tqdm(graphs.url):
    report = downloader.download(url)
    if "extraction_success" not in report.columns:
        continue
    extraction = report.extraction_destination[0]
    graph_name = extraction.split(os.sep)[-1]
    has_nan_weights = False
    edge_path_pattern = "{}/{}".format(
        report.extraction_destination[0],
        graph_name
    )
    edge_path = None
    for ext in ("edges", "mtx"):
        edge_path = "{}.{}".format(edge_path_pattern, ext)
        if os.path.exists(edge_path):
            break
    if edge_path is None:
        raise ValueError("File not found in list {}", os.listdir(extraction))
    comment = None
    with open(edge_path, "r") as f:
        first_line = f.readline()
        for _ in range(100):
            middle_line = f.readline()
    for separator in ("\t", " ", ",", ";"):
        if separator in middle_line:
            edge_separator = separator
            
    for comment_symbol in ("%", "#"):
        if first_line.startswith(comment_symbol):
            comment = comment_symbol
    
    data = pd.read_csv(edge_path, sep=edge_separator, header=None, comment=comment)
    if data.isna().values.any():
        continue
    graph = EnsmallenGraph.from_unsorted_csv(
        edge_path=edge_path,
        directed=False,
        edge_separator=edge_separator,
        sources_column_number=0,
        destinations_column_number=1,
        **(dict(weights_column_number=2) if len(data.columns) > 2 else {}),
        **(dict(default_weight=1) if has_nan_weights else {}),
        **(dict(edge_file_comment_symbol=comment) if comment is None else {})
    )
    metadata[graph_name] = {
        "urls": [url],
        "arguments": {
            "edge_path": "{graph_name}/{graph_name}.{ext}".format(
                graph_name=graph_name,
                ext=ext
            ),
            "edge_separator": edge_separator,
            "sources_column_number": 0,
            "destinations_column_number": 1,
            **({
                "weights_column_number": 2,
            } if len(data.columns) > 2 else {}),
            **({
                "edge_file_comment_symbol": comment
            } if comment is not None else {}),
            "edge_header": False,
            "numeric_edge_node_ids": True,
            "has_nan_weights": has_nan_weights
        }
    }
    os.remove(report.destination[0])
    shutil.rmtree(report.extraction_destination[0])

HBox(children=(IntProgress(value=0, max=5152), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-CE-CX.zip', layout=Layout(flex='2')…

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-CE-GN.zip', layout=Layout(flex='2')…

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-CE-GT.zip', layout=Layout(flex='2')…

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-CE-HT.zip', layout=Layout(flex='2')…

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-CE-LC.zip', layout=Layout(flex='2')…

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-CE-PG.zip', layout=Layout(flex='2')…

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-DM-CX.zip', layout=Layout(flex='2')…

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-DM-HT.zip', layout=Layout(flex='2')…

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-DM-LC.zip', layout=Layout(flex='2')…

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-DR-CX.zip', layout=Layout(flex='2')…

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-HS-CX.zip', layout=Layout(flex='2')…

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-HS-HT.zip', layout=Layout(flex='2')…

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-HS-LC.zip', layout=Layout(flex='2')…

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-SC-CC.zip', layout=Layout(flex='2')…

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-SC-GT.zip', layout=Layout(flex='2')…

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-SC-HT.zip', layout=Layout(flex='2')…

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-SC-LC.zip', layout=Layout(flex='2')…

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-SC-TS.zip', layout=Layout(flex='2')…

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-WormN...-v3-benchmark.zip', layout=…

HBox(children=(IntProgress(value=0, description='Downloading to graphs/bio-celegans-dir.zip', layout=Layout(fl…

ValueError: The destinations column number passed was 1 but the first parsable line has 1 values.

In [None]:
graph_name

In [None]:
os.listdir("graphs/bio-grid-yeast")

In [None]:
pd.read_csv("graphs/bio-grid-fission-yeast/bio-grid-fission-yeast.edges", sep=",", comment="%")