In [0]:
!pip install networkx

In [0]:
import numpy as np
import matplotlib.pyplot as plt

def create_synthetic_distribution(params, plot=True):
    """
    Build a power-law decay table mapping each degree to ``A * degree ** slope``.

    The scale ``A`` is chosen so that the value at ``min_degree`` equals
    ``max_prob``. No normalisation is applied, so the values are not
    guaranteed to sum to 1.

    Parameters
    ----------
    params : dict
        Configuration mapping. Only these keys are read (defaults in
        parentheses): ``'slope'`` (-2), ``'min_degree'`` (1),
        ``'max_degree'`` (200_000) and ``'max_prob'`` (0.5). Any other keys
        are ignored.
    plot : bool, optional
        Accepted for interface compatibility; this function does not use it.

    Returns
    -------
    dict
        ``{degree: value}`` for every integer degree from ``min_degree`` to
        ``max_degree`` inclusive, with native ``int`` keys and ``float``
        values. Empty when ``max_degree < min_degree``.

    Raises
    ------
    ZeroDivisionError
        If ``min_degree`` is 0 and ``slope`` is negative.
    """
    slope = params.get('slope', -2)
    min_degree = params.get('min_degree', 1)
    max_degree = params.get('max_degree', 200_000)
    max_prob = params.get('max_prob', 0.5)

    # Create an array of degrees from min_degree to max_degree as floats
    degrees = np.arange(min_degree, max_degree + 1, dtype=float)

    # Scaling factor A so that the value at min_degree equals max_prob.
    # The base is converted to float because NumPy integer scalars refuse
    # negative integer exponents ("Integers to negative integer powers are
    # not allowed"); for plain Python ints the result is unchanged.
    A = max_prob / (float(min_degree) ** slope)

    # Compute the power-law decay values
    y_values = A * degrees ** slope

    # .tolist() yields native int keys / float values instead of NumPy
    # scalars, which keeps the dict cheap to pickle and easy to serialise.
    return dict(zip(degrees.astype(int).tolist(), y_values.tolist()))

# Settings for the synthetic target degree distribution.
# create_synthetic_distribution reads only 'slope', 'min_degree',
# 'max_degree' and 'max_prob'; the remaining keys are left in place unchanged
# in case other cells rely on them.
params = {
    'slope': -2,
    'intercept': 0.8,
    'r_squared': 0.98,
    'max_degree': 200_000,
    'min_degree': 1,
    'max_prob': 0.5,
    'degree_range': list(np.arange(1, 200_000))
}

# The function's second positional parameter is `plot`, not a degree bound,
# so no extra argument is passed: the upper bound comes from
# params['max_degree'].
target_distribution = create_synthetic_distribution(params)

In [0]:
import networkx as nx
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
import random

# spark = SparkSession.builder.master("local[*]").appName("NetworkFlowGraph").getOrCreate()

def random_node():
    """
    Draw a pseudo-random node identifier from NumPy's global random state.

    Returns
    -------
    int
        A native Python ``int`` in ``[1_000_000, 10_000_000_000)``.
    """
    # Alternative identifier format (dotted quad), kept for reference:
    # return '.'.join(map(str, np.random.randint(0, 256, size=4).tolist()))

    # The upper bound does not fit in 32 bits, so the dtype is requested
    # explicitly; otherwise randint raises on platforms whose default integer
    # type is 32-bit.
    return int(np.random.randint(1_000_000, 10_000_000_000, dtype=np.int64))

def random_feature():
    """
    Draw one pseudo-random feature value from NumPy's global random state.

    Returns
    -------
    int
        A native Python ``int`` in ``[1, 70000)``.
    """
    drawn = np.random.randint(low=1, high=70000)
    # Convert to a built-in int so no NumPy scalar leaves this function.
    return int(drawn)

def random_col_e():
    """
    Pick one of the two ``col_e`` labels using NumPy's global random state.

    Returns
    -------
    str
        Either ``'col_e_A'`` or ``'col_e_B'``, as a native Python ``str``.
    """
    labels = ['col_e_A', 'col_e_B']
    picked = np.random.choice(labels)
    # np.random.choice returns a NumPy string scalar; hand back a plain str.
    return str(picked)

# --- Generation settings -----------------------------------------------------
# Base seed; each graph derives its own seed from it (see generate_custom_graph).
seed = 42
# Number of nodes requested for every generated graph.
num_nodes_per_graph = 1_000
# One graph per unit of Spark's default parallelism (described in the
# original notebook as the number of cores available).
num_graphs = spark.sparkContext.defaultParallelism

# Make the base seed available to executors through a broadcast variable.
seed_bc = spark.sparkContext.broadcast(seed)

def configuration_model_with_distribution(n, degree_distribution, seed=None):
    """
    Generate a graph with a specific degree distribution

    A degree sequence of length ``n`` is built by walking the distribution in
    ascending degree order and assigning ``round(n * prob)`` nodes to each
    degree until ``n`` nodes are used. Any shortfall is padded with the
    smallest degree in the distribution. The sequence is then fed to
    ``nx.configuration_model`` and collapsed to a simple ``nx.Graph``
    (parallel edges merged).

    Parameters
    ----------
    n : int
        Number of nodes to place in the degree sequence.
    degree_distribution : dict
        Mapping ``degree -> fraction of nodes`` with that degree.
    seed : int, random.Random or None, optional
        Random state forwarded to NetworkX. With ``None`` (default) NetworkX
        uses the global ``random`` module state, so a preceding
        ``random.seed(...)`` makes the result reproducible and lets different
        callers obtain different graphs. Pass an explicit value to pin the
        topology.

    Returns
    -------
    networkx.Graph
        The configuration-model graph, or a Barabasi-Albert graph (``m=2``)
        if generation fails or produces no edges.
    """

    degrees = []
    remaining_nodes = n

    for degree, prob in sorted(degree_distribution.items()):
        if remaining_nodes <= 0:
            break
        # Round to nearest, but never hand out more nodes than are left.
        count = min(int(n * prob + 0.5), remaining_nodes)
        if count > 0:
            degrees.extend([int(degree)] * count)
            remaining_nodes -= count

    if remaining_nodes > 0:
        # Pad with the smallest degree; an empty distribution falls back to
        # degree 1. The cast keeps NumPy integer keys out of the sequence.
        min_degree = int(min(degree_distribution, default=1))
        degrees.extend([min_degree] * remaining_nodes)

    if len(degrees) < 2:
        degrees = [1, 1]

    # The configuration model needs an even degree sum (every edge uses two stubs).
    if sum(degrees) % 2 != 0:
        degrees[0] += 1

    try:
        g = nx.configuration_model(degrees, seed=seed)
        # Collapse the multigraph into a simple graph (drops parallel edges).
        g = nx.Graph(g)

        if g.number_of_edges() == 0:
            raise nx.NetworkXError("Generated graph has no edges")

        return g
    except Exception as e:
        # Best-effort fallback so one bad degree sequence does not abort the run.
        print(f"Error generating graph: {e}")
        # barabasi_albert_graph requires m < n, hence at least 3 nodes for m=2.
        return nx.barabasi_albert_graph(max(n, 3), 2, seed=seed)
    
def generate_custom_graph(partition_id, num_nodes, degree_distribution, seed):
    """
    Build one synthetic graph and return its edges as attribute-carrying tuples.

    Both NumPy's and Python's global random generators are seeded with
    ``seed + partition_id`` before anything is drawn.

    Parameters
    ----------
    partition_id : int
        Offset added to ``seed`` so each call can use its own seed.
    num_nodes : int
        Node count passed to ``configuration_model_with_distribution``.
    degree_distribution : dict
        Degree distribution passed to ``configuration_model_with_distribution``.
    seed : int
        Base seed.

    Returns
    -------
    list of tuple
        One ``(node_id_a, node_id_b, feature, feature, col_e)`` tuple per
        edge, with node ids from ``random_node()``, features from
        ``random_feature()`` and the label from ``random_col_e()``.
    """
    partition_seed = seed + partition_id
    np.random.seed(partition_seed)
    random.seed(partition_seed)

    graph = configuration_model_with_distribution(num_nodes, degree_distribution)

    # Assign every graph node a random identifier first; the order of the
    # random draws (all node ids, then per-edge attributes) matters for
    # reproducibility.
    relabel = {}
    for vertex in graph.nodes():
        relabel[vertex] = random_node()

    edge_rows = []
    for endpoints in graph.edges():
        first, second = endpoints[0], endpoints[1]
        edge_rows.append(
            (relabel[first], relabel[second],
             random_feature(), random_feature(), random_col_e())
        )

    return edge_rows

# Share the target degree table with the executors through a broadcast variable.
target_distribution_bc = spark.sparkContext.broadcast(target_distribution)

# Spark UDF: maps a graph id to that graph's edge list, returned as an array
# of five-field structs (all fields declared non-nullable).
# NOTE(review): col_a / col_b are declared StringType while random_node()
# produces ints -- confirm that this conversion is intended.
generate_custom_graph_udf = udf(
    f=lambda partition_id: generate_custom_graph(
        partition_id,
        num_nodes_per_graph,
        target_distribution_bc.value,
        seed_bc.value,
    ),
    returnType=ArrayType(
        StructType([
            StructField(field_name, field_type(), False)
            for field_name, field_type in (
                ("col_a", StringType),
                ("col_b", StringType),
                ("col_c", IntegerType),
                ("col_d", IntegerType),
                ("col_e", StringType),
            )
        ])
    ),
)

# One row per graph id; the UDF expands each id into that graph's edge list.
# (Despite its name, `rdd` is a DataFrame.)
rdd = spark.range(num_graphs).withColumn("edges", generate_custom_graph_udf("id"))

# Flatten the per-graph edge arrays into one de-duplicated collection of edges.
all_edges = rdd.select("edges").rdd.flatMap(lambda row: row.edges).distinct()

# Cache the edge table: several actions below reuse it, and without caching
# each of them would re-run the graph-generating UDF from scratch.
edge_df = all_edges.toDF(["col_a", "col_b", "col_c", "col_d", "col_e"]).cache()

edge_count = edge_df.count()

# A node is any identifier that appears on either side of an edge.
nodes = edge_df.select("col_a").union(edge_df.select("col_b")).distinct()
node_count = nodes.count()

# Every edge row adds one to the degree of each of its two endpoints, so the
# mean node degree is 2 * E / N. Grouping on col_a alone would only average
# the number of edges per distinct col_a value.
average_degree = 2 * edge_count / node_count if node_count else 0.0

print(f"Total edges in graph: {edge_count}")
print(f"Total nodes in graph: {node_count}")
print(f"Average degree: {average_degree}")

def print_graph_stats(edge_df, display=False):
    """
    Compute each node's total degree and optionally print summary statistics.

    A node's total degree is the number of edge rows in which it appears in
    ``col_a`` plus the number in which it appears in ``col_b``.

    Parameters
    ----------
    edge_df : pyspark.sql.DataFrame
        Edge table with at least the columns ``col_a`` and ``col_b``.
    display : bool, optional
        When True, print the node count, edge count, average degree and
        maximum degree. The Spark jobs needed for these numbers run only in
        that case.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``merge_col`` (node identifier) and ``total_degree``.
    """

    # Edges counted per col_a value (the "source" side).
    src_degrees = edge_df.groupBy("col_a").agg({"col_a": "count"}) \
        .withColumnRenamed("count(col_a)", "out_degree") \
        .withColumnRenamed("col_a", "merge_col")

    # Edges counted per col_b value (the "destination" side).
    dst_degrees = edge_df.groupBy("col_b").agg({"col_b": "count"}) \
        .withColumnRenamed("count(col_b)", "in_degree") \
        .withColumnRenamed("col_b", "merge_col")

    # Outer join keeps nodes that appear on one side only; their missing
    # count is filled with 0 before summing.
    total_degrees = src_degrees.join(dst_degrees, "merge_col", "outer") \
        .fillna(0) \
        .selectExpr(
            "merge_col",
            "cast(out_degree as int) + cast(in_degree as int) as total_degree"
        )

    if display:
        unique_nodes = edge_df.select("col_a").union(edge_df.select("col_b")).distinct()
        total_edges = edge_df.count()

        # Single pass for both aggregates instead of one job per statistic.
        summary = total_degrees.selectExpr(
            "avg(total_degree) as avg_degree",
            "max(total_degree) as max_degree"
        ).first()
        # Aggregates over an empty table are NULL; report zeros rather than
        # failing on the float format below.
        avg_degree = summary["avg_degree"] if summary["avg_degree"] is not None else 0.0
        max_degree = summary["max_degree"] if summary["max_degree"] is not None else 0

        print("\nGraph Statistics:")
        print(f"Number of unique Nodes: {unique_nodes.count()}")
        print(f"Number of edges: {total_edges}")
        print(f"Average degree: {avg_degree:.2f}")
        print(f"Maximum degree: {max_degree}")

    return total_degrees

def check_degree_distribution(edge_df):
    """
    Print the observed share of nodes at each degree next to the target value.

    Per-node degrees come from ``print_graph_stats(edge_df)``. For every
    distinct degree, in ascending order, the fraction of nodes having that
    degree is printed alongside the entry for that degree in the module-level
    ``target_distribution`` (0 when the degree is not listed there).

    Parameters
    ----------
    edge_df : pyspark.sql.DataFrame
        Edge table, passed through to ``print_graph_stats``.

    Returns
    -------
    None
    """
    per_node = print_graph_stats(edge_df)

    # Histogram: how many nodes have each total degree.
    histogram = (
        per_node.groupBy("total_degree")
        .count()
        .withColumnRenamed("count", "num_vertices")
    )

    node_total = per_node.count()

    print("\nResulting degree distribution:")
    ordered_rows = histogram.orderBy("total_degree").collect()
    for entry in ordered_rows:
        degree, vertices = entry['total_degree'], entry['num_vertices']
        observed = vertices / node_total
        expected = target_distribution.get(degree, 0)
        print(f"Degree {degree}: {observed:.4f} (Target: {expected:.4f})")

# Print the overall graph statistics, then compare the observed degree
# distribution with the target one.
print_graph_stats(edge_df=edge_df, display=True)
check_degree_distribution(edge_df=edge_df)

