## Python Dependency Network 

In [1]:
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
import pyarrow.parquet as pq

table = pq.read_table("C:/Users/PC/Desktop/311025.parquet")
df = table.to_pandas()
df.head()


Unnamed: 0,name,version,summary,requires_dist,requires_python,upload_time
0,logsteplib,0.0.6,Package containing a standard format for the l...,"[pyspark, twine; extra == ""dev"", build; extra ...",>=3.10,2025-10-31 17:58:15.170974
1,birdbrain-python-library-2,0.9.40,Rewritten Python Library for Birdbrain Technol...,"[black; extra == ""dev"", flake8; extra == ""dev""...",>=3.8,2025-10-31 17:58:09.177850
2,spearmint-framework,0.2.0,An experiment framework for easily testing mul...,"[mlflow>=3.5.1, pydantic<3.0.0,>=2.0.0, pyyaml...",>=3.10,2025-10-31 17:57:28.573838
3,vaapi,0.7.3,Python utils for adding logs to our Visual Ana...,"[protobuf==3.20.3, numpy<2.0.0,>=1.23.5, httpx...",>=3.10.9,2025-10-31 17:57:13.438910
4,strands-mlx,0.1.0,Use MLX in Strands Agents,"[strands-agents, mlx, mlx-lm, strands-agents-t...","<3.14,>=3.10",2025-10-31 17:57:06.760503


In [3]:
import ast  # para convertir texto de lista a lista real si hace falta

datadict = {"package": [], "requirement": [], "version": []}

for _, row in df.iterrows():
    pkg = row["name"]
    ver = row["version"]
    reqs = row["requires_dist"]

    # Convertir a lista si es texto
    if isinstance(reqs, str):
        try:
            reqs = ast.literal_eval(reqs)
        except Exception:
            # Si no es lista, tratamos el string como una sola dependencia
            reqs = [reqs]
    elif isinstance(reqs, float) or reqs is None:
        reqs = []

    # Asegurar que sea una lista
    if not isinstance(reqs, (list, tuple, np.ndarray)):
        reqs = [reqs]

    # Si está vacía o tiene solo NaN
    if len(reqs) == 0 or all(pd.isna(x) for x in reqs):       
        datadict["package"].append(pkg)
        datadict["requirement"].append(np.nan)
        datadict["version"].append(ver)
        continue

    # Procesar dependencias
    for req in reqs:
        if not isinstance(req, str) or req.strip() == "":
            continue
        dep_name = (
            req.split(";")[0]
               .split("[")[0]
               .split("==")[0]
               .split(">=")[0]
               .split("<=")[0]
               .split(">")[0]
               .split("<")[0]
               .strip()
        )
        datadict["package"].append(pkg)
        datadict["requirement"].append(dep_name)
        datadict["version"].append(ver)

# Convertir a DataFrame
df_exploded = pd.DataFrame(datadict)
print(df_exploded.head())

                      package requirement version
0                  logsteplib     pyspark   0.0.6
1                  logsteplib       twine   0.0.6
2                  logsteplib       build   0.0.6
3  birdbrain-python-library-2       black  0.9.40
4  birdbrain-python-library-2      flake8  0.9.40


In [4]:
print(df_exploded.shape)
print(df_exploded["package"].nunique())



(2717308, 3)
820998


In [5]:
# Grafo

G = nx.DiGraph()

for _, row in df_exploded.iterrows():
    pkg = row["package"]
    dep = row["requirement"]
    if pd.notna(dep) and dep != "":
        G.add_edge(pkg, dep)
G.remove_nodes_from(['.', 'nan', np.nan])

print("Nodos:", G.number_of_nodes())
print("Aristas:", G.number_of_edges())

Nodos: 438257
Aristas: 2104347
