In [None]:
import pandas as pd
import random
import string
from datetime import datetime, timedelta

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, LongType,
    FloatType, DoubleType, BooleanType, DateType, TimestampType, ArrayType
)

# Create (or reuse) the SparkSession used by every cell below.
spark = (
    SparkSession.builder
    .appName("MySparkApp")
    .master("spark://masternode")
    # Driver / executor sizing
    .config("spark.driver.memory", "32G")
    .config("spark.executor.cores", 12)
    .config("spark.executor.instances", 4)
    .config("spark.executor.memory", "16G")
    .config("spark.task.cpus", 1)
    # Partition sizing for file scans, shuffles and adaptive execution
    .config("spark.sql.adaptive.advisoryPartitionSizeInBytes", "160mb")
    .config("spark.sql.adaptive.coalescePartitions.minPartitionSize", "32mb")
    .config("spark.sql.files.maxPartitionBytes", "2gb")
    .config("spark.sql.shuffle.partitions", 200)
    # SQL compatibility switch
    .config("spark.sql.legacy.charVarcharAsString", True)
    .getOrCreate()
)

In [None]:
import numpy as np

def create_synthetic_distribution(params, plot=True):
    """Build a power-law degree distribution ``p(k) = A * k ** slope``.

    The scale ``A`` is chosen so that ``p(min_degree) == max_prob``. The
    values are returned as computed; they are not normalised to sum to 1.

    Args:
        params: dict of settings. The keys read are ``'slope'`` (default -2),
            ``'min_degree'`` (default 1), ``'max_degree'`` (default 200_000)
            and ``'max_prob'`` (default 0.5). Any other key is ignored.
        plot: accepted for interface compatibility; this function does not
            use it.

    Returns:
        dict mapping every integer degree in ``[min_degree, max_degree]`` to
        its value. Keys are built-in ``int`` and values built-in ``float``
        (not NumPy scalars), which keeps the mapping cheap to pickle and
        consistent with the native-type casts used by the other helpers in
        this notebook.
    """
    slope = params.get('slope', -2)
    min_degree = params.get('min_degree', 1)
    max_degree = params.get('max_degree', 200_000)
    max_prob = params.get('max_prob', 0.5)

    # Degrees as floats so that a negative exponent is evaluated in floating point.
    degrees = np.arange(min_degree, max_degree + 1, dtype=float)

    # Scaling factor that pins the value at min_degree to max_prob.
    scale = max_prob / (min_degree ** slope)

    # Power-law decay for every degree.
    probabilities = scale * degrees ** slope

    # .tolist() converts NumPy scalars to native Python int / float.
    degree_keys = degrees.astype(int).tolist()
    return dict(zip(degree_keys, probabilities.tolist()))

# Settings for the target power-law degree distribution.
# create_synthetic_distribution reads only 'slope', 'min_degree', 'max_degree'
# and 'max_prob'; the remaining keys are kept unchanged but are not consumed by it.
params = {
    'slope': -2,
    'intercept': 0.8,
    'r_squared': 0.98,
    'max_degree': 200_000,
    'min_degree': 1,
    'max_prob': 0.5,
    'degree_range': list(np.arange(1, 200_000))
}

# The second parameter of create_synthetic_distribution is the `plot` flag, not
# a size: the stray positional 200_000 is dropped. The flag's default (True) has
# the same truthiness, so the call behaves exactly as before.
target_distribution = create_synthetic_distribution(params)

In [None]:
# Number of synthetic graphs generated per iteration: the generation loop
# further down uses it as the row count of spark.range(...), i.e. one graph
# per row.
num_graphs = spark.sparkContext.defaultParallelism # number of cores available
num_graphs

# Create Graph Based on Degree Distribution

In [None]:
import networkx as nx
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, explode
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
import random

# spark = SparkSession.builder.master("local[*]").appName("NetworkFlowGraph").getOrCreate()

def random_node():
    """Return a random node identifier as a built-in ``int``.

    The value is drawn uniformly from ``[1_000_000, 10_000_000_000)`` using
    NumPy's global random state.
    """
    # The upper bound does not fit in 32 bits. Requesting int64 explicitly keeps
    # the call valid on platforms where NumPy's default integer is int32, where
    # it would otherwise raise "high is out of bounds".
    return int(np.random.randint(1_000_000, 10_000_000_000, dtype=np.int64))

def random_feature():
    """Draw a feature value uniformly from ``[1, 70000)`` as a built-in ``int``."""
    drawn = np.random.randint(low=1, high=70000)
    # Normalise to a plain Python int before handing the value to Spark.
    return int(drawn)

def random_col_e():
    """Pick one of the two ``col_e`` labels at random and return it as a built-in ``str``."""
    labels = ['col_e_A', 'col_e_B']
    picked = np.random.choice(labels)
    # Cast to a native str before handing the value to Spark.
    return str(picked)

# num_graphs = spark.sparkContext.defaultParallelism # number of cores available
# Node count requested for every generated graph; the UDF in the generation
# loop passes it to generate_custom_graph as `num_nodes`.
num_nodes_per_graph = 350_000

def configuration_model_with_distribution(n, degree_distribution,seed):
    """
    Generate a graph with a specific degree distribution.

    A degree sequence of length ``n`` is built from ``degree_distribution`` and
    fed to ``nx.configuration_model``:

    * degrees are visited in ascending order and each receives
      ``int(n * prob + 0.5)`` nodes, capped by the nodes still unassigned;
    * nodes left over afterwards receive the smallest degree in the mapping;
    * a sequence shorter than two entries is replaced by ``[1, 1]``;
    * if the degree sum is odd, the first entry is incremented, because the
      configuration model needs an even number of edge stubs.

    Args:
        n: number of nodes requested.
        degree_distribution: mapping ``degree -> probability``.
        seed: seed forwarded to the NetworkX generators.

    Returns:
        An ``nx.Graph``. Parallel edges produced by the configuration model are
        collapsed by the ``nx.Graph`` conversion. If generation fails or yields
        no edges, a seeded Barabasi-Albert graph (``m=2``) is returned instead.
    """

    degrees = []
    remaining_nodes = n

    for degree, prob in sorted(degree_distribution.items()):
        if remaining_nodes <= 0:
            break
        count = min(int(n * prob + 0.5), remaining_nodes)
        if count > 0:
            degrees.extend([int(degree)] * count)
            remaining_nodes -= count

    if remaining_nodes > 0:
        # Cast like the loop above so the sequence holds native ints only.
        min_degree = int(min(degree_distribution.keys()))
        degrees.extend([min_degree] * remaining_nodes)

    if len(degrees) < 2:
        degrees = [1, 1]

    if sum(degrees) % 2 != 0:
        degrees[0] += 1

    try:
        g = nx.configuration_model(degrees, seed=seed)
        g = nx.Graph(g)

        if g.number_of_edges() == 0:
            raise nx.NetworkXError("Generated graph has no edges")

        return g
    except Exception as e:
        print(f"Error generating graph: {e}")
        # Fixed: the generator is `barabasi_albert_graph`; the previous
        # misspelling raised AttributeError, so the fallback itself crashed.
        # The seed keeps the fallback reproducible. barabasi_albert_graph
        # requires m < n, hence the lower bound of 3 nodes for m=2.
        return nx.barabasi_albert_graph(max(n, 3), 2, seed=seed)
    
def generate_custom_graph(partition_id, num_nodes, degree_distribution, seed):
    """Generate one synthetic graph and return its edges as attribute tuples.

    Args:
        partition_id: index of the graph being generated; added to ``seed`` so
            each graph gets its own topology seed.
        num_nodes: number of nodes requested for the graph.
        degree_distribution: mapping ``degree -> probability``.
        seed: base seed for the graph topology.

    Returns:
        A list with one ``(node_a, node_b, feature, feature, label)`` tuple per edge.

    Note:
        Only the topology is seeded. Node ids and edge attributes come from
        NumPy's global random state, which this function does not seed.
    """
    graph = configuration_model_with_distribution(
        num_nodes, degree_distribution, seed + partition_id
    )

    # Give every graph node a random identifier.
    relabel = {}
    for original_id in graph.nodes():
        relabel[original_id] = random_node()

    # One row per edge: both endpoint ids, two random features and a random label.
    edge_rows = []
    for u, v in graph.edges():
        edge_rows.append(
            (relabel[u], relabel[v], random_feature(), random_feature(), random_col_e())
        )

    return edge_rows

import time  # required by the timing below; no visible cell imports it

# Broadcast the degree distribution; the UDF reads it through `.value`.
target_distribution_bc = spark.sparkContext.broadcast(target_distribution)

directory_path = "/path/to/synthetic_dataset"
seed = 1
remaining_iterations = 10
total_iterations = remaining_iterations  # kept for the timing report at the end

# Return type of the UDF: one struct per edge. It does not change between
# iterations, so it is built once instead of inside the loop.
edge_schema = ArrayType(StructType([
    StructField("col_a", LongType(), False),
    StructField("col_b", LongType(), False),
    StructField("col_c", IntegerType(), False),
    StructField("col_d", IntegerType(), False),
    StructField("col_e", StringType(), False),
]))

# Alternative, size-based stopping rule kept for reference:
# while get_directory_size(directory_path) < MAX_SIZE_BYTES:
start = time.time()
while remaining_iterations > 0:
    print(remaining_iterations)
    seed_bc = spark.sparkContext.broadcast(seed)
    generate_custom_graph_udf = udf(
        lambda partition_id: generate_custom_graph(
            partition_id, num_nodes_per_graph, target_distribution_bc.value, seed_bc.value
        ),
        edge_schema
    )

    # One row per graph -> list of edges -> one row per edge.
    edge_df = (
        spark.range(num_graphs)
             .withColumn("edges", generate_custom_graph_udf("id"))
             .select(explode("edges").alias("edge"))
             .select("edge.col_a", "edge.col_b", "edge.col_c", "edge.col_d", "edge.col_e")
             .distinct()  # if needed
    )

    edge_df.write.mode("append").parquet(directory_path)

    # Each graph is seeded with `seed + id` for id in [0, num_graphs). Advancing
    # by num_graphs keeps the seed ranges of consecutive iterations disjoint;
    # advancing by 1 reused all but one topology seed from the previous pass.
    seed += num_graphs
    remaining_iterations -= 1

end = time.time()
# Report the number of iterations actually run (the message used to say 100).
print(f"TTR {total_iterations} Iterations: {round(end-start, 2)}")

# Alternative Option for Scaling: Double Size of Dataset and Add Random Noise (maintains distribution)

In [None]:
from pyspark.sql.functions import col, floor, rand, lit

# Inclusive bounds of the integer noise added to each numeric column.
NOISE_MIN = -5
NOISE_MAX = 5

long_cols = ["col_a", "col_b"]
integer_cols = ["col_c", "col_d"]
string_cols = ["col_e"]

# Original DataFrame
# NOTE(review): `test` is not defined in any visible cell; confirm which
# DataFrame is meant before running this cell on a fresh kernel.
df_original = test

# Number of distinct integer offsets in [NOISE_MIN, NOISE_MAX].
noise_span = NOISE_MAX - NOISE_MIN + 1

# Numeric columns: add an independent random integer offset per column and row,
# then cast back to the column's declared type.
noisy_columns = [
    (col(name) + floor(rand() * noise_span) + NOISE_MIN).cast(spark_type).alias(name)
    for names, spark_type in ((long_cols, "long"), (integer_cols, "integer"))
    for name in names
]

# String columns are carried over untouched.
passthrough_columns = [col(name) for name in string_cols]

# Create the noisy copy
df_augmented = df_original.select(*noisy_columns, *passthrough_columns)

# Tag each half with its origin.
df_original = df_original.withColumn("source", lit("original"))
df_augmented = df_augmented.withColumn("source", lit("noisy"))

# Stack the original rows and their noisy copies.
df_doubled = df_original.unionByName(df_augmented)

(
    df_doubled
    .repartition(2000)
    .write
    .parquet('/path/to/synthetic_dataset_doubled', mode='overwrite')
)

# Get Stats

In [None]:
import networkx as nx
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
import random

# Run the statistics below on the doubled dataset (original rows plus noisy
# copies). This rebinds `edge_df`, which the generation loop also assigns.
edge_df = df_doubled

def print_graph_stats(edge_df, display=False):
    """Compute the total degree of every node of an edge list.

    A node's total degree is the number of rows in which it appears in
    ``col_a`` plus the number of rows in which it appears in ``col_b``.

    Args:
        edge_df: Spark DataFrame with one edge per row and endpoint columns
            ``col_a`` and ``col_b``.
        display: when True, also print the number of unique nodes, the number
            of edges and the average / maximum degree. The Spark jobs needed
            for these figures run only in that case.

    Returns:
        Spark DataFrame with columns ``merge_col`` (node id) and
        ``total_degree``.
    """
    # Appearances of each node as the first endpoint of an edge.
    col_a_counts = edge_df.groupBy("col_a").agg({"col_a": "count"}) \
        .withColumnRenamed("count(col_a)", "col_a_count") \
        .withColumnRenamed("col_a", "merge_col")

    # Appearances of each node as the second endpoint of an edge.
    col_b_counts = edge_df.groupBy("col_b").agg({"col_b": "count"}) \
        .withColumnRenamed("count(col_b)", "col_b_count") \
        .withColumnRenamed("col_b", "merge_col")

    # Outer join keeps nodes seen on one side only; their missing count becomes 0.
    total_degrees = col_a_counts.join(col_b_counts, "merge_col", "outer") \
        .fillna(0) \
        .selectExpr(
            "merge_col",
            "cast(col_b_count as int) + cast(col_a_count as int) as total_degree"
        )

    if display:
        unique_nodes = edge_df.select("col_a").union(edge_df.select("col_b")).distinct()
        total_edges = edge_df.count()

        # Average and maximum in a single aggregation instead of two jobs.
        summary = total_degrees.selectExpr(
            "avg(total_degree) as avg_degree",
            "max(total_degree) as max_degree"
        ).first()
        avg_degree = summary["avg_degree"]
        max_degree = summary["max_degree"]
        # An empty edge list aggregates to NULL, which cannot be formatted with :.2f.
        avg_text = "n/a" if avg_degree is None else f"{avg_degree:.2f}"

        print("\nGraph Statistics:")
        print(f"Number of unique Nodes: {unique_nodes.count()}")
        print(f"Number of edges: {total_edges}")
        print(f"Average degree: {avg_text}")
        print(f"Maximum degree: {max_degree}")

    return total_degrees

def check_degree_distribution(edge_df, plot=False):
    """Compare the empirical degree distribution of an edge list with the target.

    The per-degree node counts are collected to the driver, turned into
    fractions of all nodes and printed next to the value that the module-level
    ``target_distribution`` mapping holds for the same degree. Degrees missing
    from the mapping get ``1e-12`` so they can still be drawn on a log axis.

    Args:
        edge_df: Spark DataFrame with one edge per row (columns ``col_a`` and
            ``col_b``), as accepted by ``print_graph_stats``.
        plot: when True, also draw both distributions on a log-scaled y axis.

    Returns:
        None.
    """
    total_degrees = print_graph_stats(edge_df)

    degree_dist = total_degrees.groupBy("total_degree").count() \
        .withColumnRenamed("count", "num_vertices")

    # Collect results to driver for plotting
    dist_rows = degree_dist.collect()
    df = pd.DataFrame({
        "total_degree": [row["total_degree"] for row in dist_rows],
        "num_vertices": [row["num_vertices"] for row in dist_rows]
    })

    # Every node falls into exactly one degree bucket, so the bucket sizes add
    # up to the node count; this avoids a second Spark job for count().
    total_nodes = df["num_vertices"].sum()

    df["empirical"] = df["num_vertices"] / total_nodes
    df["target"] = df["total_degree"].map(target_distribution).fillna(1e-12)  # avoid log(0)

    # Sort by total_degree
    df = df.sort_values(by="total_degree")

    print("\nResulting degree distribution:")
    for row in df.itertuples():
        print(f"Degree {row.total_degree}: {row.empirical:.4f} (Target: {row.target:.4f})")

    if plot:
        # `plt` is used here but no visible cell imports it; import locally so
        # the plotting branch works on a fresh kernel.
        import matplotlib.pyplot as plt

        plt.figure(figsize=(10, 6))
        plt.plot(df['total_degree'], df['empirical'], marker='o', label='Empirical Distribution')
        plt.plot(df['total_degree'], df['target'], marker='x', linestyle='--', label='Target Distribution')
        plt.xlabel('Degree')
        plt.ylabel('Percentage of Nodes (log scale)')
        plt.yscale('log')
        plt.title('Degree Distribution: Empirical vs Target (Log Scale)')
        plt.legend()
        plt.grid(True, which="both", ls="--")
        plt.tight_layout()
        plt.show()


# Headline statistics first, then the empirical-vs-target comparison with the plot enabled.
print_graph_stats(edge_df, display=True)
check_degree_distribution(edge_df, plot=True)
