In [1]:
# from neo4j import GraphDatabase, Neo4jDriver
#
#
# class GraphDB:
#     driver: Neo4jDriver
#
#     def __init__(self, uri, user, password):
#         self.driver = GraphDatabase.driver(uri, auth=(user, password))
#
#     def __enter__(self):
#         return self
#
#     def __exit__(self, exc_type, exc_val, exc_tb):
#         self.driver.close()
#
#     def add_nodes(self, packages: list[dict[str, dict]]):
#         with self.driver.session() as session:
#             for package in packages:
#                 for version_name, version_data in package['versions'].items():
#                     session.run("CREATE (n:NameVersion {name: $name, version: $version, timestamp: $timestamp})",
#                                 name=package['name'], version=version_name, timestamp=version_data['timestamp'])
#             # greeting = session.write_transaction(self._create_and_return_greeting, message)
#             # print(greeting)


In [2]:
# import json
#
# data: list[dict[str, dict]]
# with open('../../data/output/pypi-bq-dependencies420k-old.json', 'r') as file:
#     data = json.load(file)
# with GraphDB("bolt://localhost:7687", "neo4j", "softwareThatMatters") as db:
#     db.add_nodes(data)

In [3]:
import os

from py2neo import Graph

# Connection settings may be overridden through environment variables so that
# credentials do not have to live in the notebook itself. The fallbacks are
# the values that were previously hard-coded here, so running the cell with no
# variables set connects exactly as before.
NEO4J_URI = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
NEO4J_USER = os.environ.get('NEO4J_USER', 'neo4j')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD', 'softwareThatMatters')

# Shared py2neo handle used by the bulk-loading cells below.
graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [4]:
from py2neo.bulk import create_nodes
import json

# Read the package data from disk. Each value of the mapping is a package dict
# with a 'name' entry and a 'versions' mapping of version name -> version data
# (which carries at least a 'timestamp').
with open('../../data/output/neo4j_data.json', 'r') as json_file:
    input_data: dict[str, dict[str, dict]] = json.loads(json_file.read())

# Property names for each node record, in the same order as the values
# appended to `nodes` below.
keys = ['name', 'version', 'timestamp']

# Flatten the nested package -> versions structure into one
# [name, version, timestamp] record per package version.
nodes: list[list] = []
for package in input_data.values():
    nodes.extend(
        [package['name'], release_name, release_data['timestamp']]
        for release_name, release_data in package['versions'].items()
    )

In [7]:
import numpy as np

# Maximum number of node records sent to Neo4j per create_nodes call.
# Lower this if a single call is too large for the server to handle.
batch_size = 1_000_000

# Cut `nodes` into consecutive chunks of at most `batch_size` records.
# np.array_split(nodes, batch_size) must not be used for this: its second
# argument is the NUMBER of sections, not the length of each one, so it would
# produce `batch_size` tiny (or empty) batches. It would also convert the rows
# to a NumPy array, forcing all values to a common dtype.
batches = [nodes[start:start + batch_size] for start in range(0, len(nodes), batch_size)]

for batch in batches:
    create_nodes(graph.auto(), batch, labels={"NameVersion"}, keys=keys)

In [8]:
from py2neo.bulk import create_relationships


def _write_depends_on(relationship_data: list) -> None:
    """Write the buffered relationships to Neo4j as DEPENDS_ON edges and empty the buffer."""
    node_key = ('NameVersion', 'name', 'version', 'timestamp')
    create_relationships(graph.auto(), relationship_data, "DEPENDS_ON",
                         start_node_key=node_key, end_node_key=node_key)
    relationship_data.clear()


def create_relationship_data(pack: dict[str, dict], batch_size: int = 100_000):
    """Create DEPENDS_ON relationships for every version of one package.

    For each version in ``pack['versions']`` and each entry of that version's
    ``'dependencies'`` mapping, the constraint string is parsed with
    ``SimpleSpec``. Every version of the dependency listed in the module-level
    ``input_data`` whose coerced ``Version`` satisfies the constraint produces
    one relationship from the dependent (name, version, timestamp) to the
    dependency (name, version, timestamp).

    The function relies on these module-level names being defined before it is
    called: ``input_data``, ``graph``, ``create_relationships``, ``SimpleSpec``
    and ``Version``.

    Args:
        pack: One package dict with a ``'name'`` entry and a ``'versions'``
            mapping of version name -> version data (``'timestamp'`` and
            ``'dependencies'``).
        batch_size: Number of buffered relationships that triggers an
            intermediate write to Neo4j. Whatever remains in the buffer is
            always written once all versions have been processed.

    Returns:
        None. The relationships are written to the database as a side effect.
    """
    relationship_data = []
    version_count = len(pack['versions'])

    for index, (version_name, version_data) in enumerate(pack['versions'].items()):
        # Use the `pack` argument here, not a same-looking global left over
        # from another cell; otherwise every edge starts at the wrong package.
        dependent_info = (pack['name'], version_name, version_data['timestamp'])

        for dependency_name, dependency_version_constraint in version_data['dependencies'].items():
            try:
                spec = SimpleSpec(dependency_version_constraint)
            except ValueError:
                # Ignore dependencies with non-standard formats
                continue

            if dependency_name not in input_data:
                # The dependency is not part of the data set, so there is no node to link to.
                continue

            for dependency_version, dependency_data in input_data[dependency_name]['versions'].items():
                try:
                    semver_version = Version.coerce(dependency_version)
                except ValueError:
                    # Version string cannot be coerced to a semantic version; skip it.
                    continue

                if spec.match(semver_version):
                    dependency_info = (dependency_name, dependency_version, dependency_data['timestamp'])
                    relationship_data.append((dependent_info, {}, dependency_info))

        # Intermediate flush keeps the buffer bounded for packages that
        # generate a very large number of edges.
        if relationship_data and len(relationship_data) >= batch_size:
            _write_depends_on(relationship_data)
            print(((index + 1) / version_count) * 100, "% done")

    # Final flush: without it, whatever is still buffered when the loop ends
    # (for small packages, everything) would never reach the database.
    if relationship_data:
        _write_depends_on(relationship_data)
    # return relationship_data

In [None]:
import multiprocessing
from multiprocessing import Pool
from semantic_version import SimpleSpec, Version



# Fan the per-package relationship creation out over a pool of worker
# processes, one per CPU reported by multiprocessing.cpu_count().
# Sequential equivalent:
# for package in input_data.values(): create_relationship_data(package)
# NOTE(review): every worker uses the module-level `graph` handle; confirm
# that sharing it across processes is safe for the connection in use.

worker_count = multiprocessing.cpu_count()
with Pool(processes=worker_count) as pool:
    # create_relationship_data returns nothing, so the mapped results are discarded.
    # results = pool.map(create_relationship_data, input_data.values())
    pool.map(create_relationship_data, input_data.values())

# flat_results = sum(results, [])
            # compatible_dependency_versions = spec.filter((Version.coerce(version) for version in input_data[dependency_name]['versions'].keys()))
            # for compatible_dependency_version in compatible_dependency_versions:
            #     compatible_dependency_version_str = str(compatible_dependency_version)
            #     dependency_info = (dependency_name, compatible_dependency_version_str, input_data[dependency_name]['versions'][compatible_dependency_version_str]['timestamp'])
            #     relationship_data.append((dependent_info, {}, dependency_info))
# flat_results[:100]