In [0]:
!pip install networkx
!pip insatall plotly

You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-8c936a28-b024-4e44-93e0-ce9abaa4716a/bin/python -m pip install --upgrade pip' command.[0m
ERROR: unknown command "insatall" - maybe you meant "install"


# Generate Graph

In [0]:
import networkx as nx
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    ArrayType,
)
import random


def random_node():
    return ".".join(str(random.randint(0, 255)) for _ in range(4))


def random_edge_ab():
    return random.randint(1, 65535)


def random_edge_c():
    return random.choice(["A", "B"])


num_graphs = spark.sparkContext.defaultParallelism
num_nodes = 1_000  # ADJUST AS NEEDED, total_nodes = num_nodes * num_graphs
edges_per_node = 2
seed = 42

sc = spark.sparkContext
seed_bc = sc.broadcast(seed)


def generate_ba_graph(partition_id, num_nodes, edges_per_node, seed):
    random.seed(
        seed + partition_id
    )  # make sure each partition is unique but deterministic
    g = nx.barabasi_albert_graph(num_nodes, edges_per_node, seed + partition_id)

    node_map = {node: random_node() for node in g.nodes()}
    edges = [
        (
            node_map[src],
            node_map[dst],
            random_edge_ab(),
            random_edge_ab(),
            random_edge_c(),
        )
        for src, dst in g.edges()
    ]
    return edges


generate_ba_graph_udf = udf(
    lambda partition_id: generate_ba_graph(
        partition_id, num_nodes, edges_per_node, seed
    ),
    ArrayType(
        StructType(
            [
                StructField("src", StringType(), False),
                StructField("dst", StringType(), False),
                StructField("edge_a", IntegerType(), False),
                StructField("edge_b", IntegerType(), False),
                StructField("edge_c", StringType(), False),
            ]
        )
    ),
)

rdd = spark.range(num_graphs).withColumn(
    "edges", generate_ba_graph_udf("id")
)

all_edges = rdd.select("edges").rdd.flatMap(lambda row: row.edges).distinct()
edge_df = all_edges.toDF(["src", "dst", "edge_a", "edge_b", "edge_c"])

edge_df.show(10, truncate=False)

average_degree = edge_df.groupBy("src").count().agg({"count": "avg"}).first()[0]

edge_count = edge_df.count()
unique_nodes_df = edge_df.select("src").union(edge_df.select("dst")).distinct()
node_count = unique_nodes_df.count()

print(f"Total edges in graph: {edge_count}")
print(f"Total nodes in graph: {node_count}")
print(f"Average Degree: {average_degree}")

+-------------+---------------+------+------+------+
|src          |dst            |edge_a|edge_b|edge_c|
+-------------+---------------+------+------+------+
|57.12.140.125|150.185.98.35  |10535 |25501 |A     |
|57.12.140.125|161.108.255.202|60994 |64517 |B     |
|57.12.140.125|20.41.214.161  |60240 |5910  |B     |
|57.12.140.125|133.91.135.19  |36611 |4744  |B     |
|57.12.140.125|169.238.212.31 |49642 |22261 |B     |
|57.12.140.125|159.166.43.26  |32703 |48308 |A     |
|57.12.140.125|188.136.42.217 |7857  |60306 |A     |
|57.12.140.125|228.109.146.251|44291 |62638 |B     |
|57.12.140.125|66.248.175.24  |20687 |45169 |A     |
|114.71.52.44 |127.38.226.50  |3935  |14121 |B     |
+-------------+---------------+------+------+------+
only showing top 10 rows

Total edges in graph: 15968
Total nodes in graph: 8000
Average Degree: 4.00702634880803


# Visualize

In [0]:
sampled_edges_df = edge_df.sample(withReplacement=False, fraction=0.01, seed=42)

In [0]:
import plotly.graph_objects as go
import networkx as nx
import pandas as pd

# Convert to Pandas
edges_pd = sampled_edges_df.select("src", "dst").toPandas()

# Build NetworkX graph
G = nx.Graph()
G.add_edges_from(edges_pd.itertuples(index=False, name=None))

# Compute layout positions
pos = nx.spring_layout(G, seed=42, threshold=1.1)

# Extract edge positions
edge_x = []
edge_y = []
for src, dst in G.edges():
    x0, y0 = pos[src]
    x1, y1 = pos[dst]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

# Edge trace
edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines'
)

# Extract node positions
node_x = []
node_y = []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)

# Node trace with hover labels
node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    hoverinfo='text',
    text=[node for node in G.nodes()],
    textposition="top center",
    marker=dict(
        showscale=False,
        color='#00bfff',
        size=10,
        line_width=1
    )
)

# Build figure
fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title='Top 100 Nodes by Degree',
                    titlefont_size=16,
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20, l=5, r=5, t=40),
                    xaxis=dict(showgrid=False, zeroline=False),
                    yaxis=dict(showgrid=False, zeroline=False)
                ))

fig.show()
