# Data Generation

In [None]:
!pip install openpyxl

### Start Spark Application

In [None]:
import pandas as pd
import random
import string
from datetime import datetime, timedelta

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, LongType,
    FloatType, DoubleType, BooleanType, DateType, TimestampType, ArrayType
)

# Build (or reuse) the Spark session shared by every cell in this notebook.
# The builder chain is wrapped in parentheses instead of backslash
# continuations: a comment placed after a trailing backslash is a SyntaxError,
# whereas parentheses allow the inline reminder on the master URL.
spark = (
    SparkSession.builder
    .appName("MySparkApp")
    .master("spark://masternode")  # MUST BE CHANGED TO YOUR MASTER NODE
    .config("spark.driver.memory", "32G")
    .config("spark.executor.cores", 12)
    .config("spark.executor.instances", 4)
    .config("spark.executor.memory", "16G")
    .config("spark.sql.adaptive.advisoryPartitionSizeInBytes", "160mb")
    .config("spark.sql.adaptive.coalescePartitions.minPartitionSize", "32mb")
    .config("spark.sql.files.maxPartitionBytes", "2gb")
    .config("spark.sql.shuffle.partitions", 200)
    .config("spark.task.cpus", 1)
    .config("spark.sql.legacy.charVarcharAsString", True)
    .getOrCreate()
)

In [None]:
import numpy as np

def create_synthetic_distribution(params, plot=True):
    """Build a power-law lookup table mapping every integer degree to a value.

    The value for degree ``d`` is ``A * d ** slope`` where ``A`` is chosen so
    that the value at ``min_degree`` equals ``max_prob``.

    Args:
        params: dict of settings. The keys read here are 'slope' (default -2),
            'min_degree' (default 1), 'max_degree' (default 200_000) and
            'max_prob' (default 0.5); any other key is ignored.
        plot: accepted for interface compatibility; not used by this function.

    Returns:
        dict keyed by degree (NumPy integers) for every degree in
        [min_degree, max_degree], with the power-law value as a NumPy float.
    """
    exponent = params.get('slope', -2)
    lowest = params.get('min_degree', 1)
    highest = params.get('max_degree', 200_000)
    peak = params.get('max_prob', 0.5)

    # Float axis so that raising to a negative exponent is well defined.
    degree_axis = np.arange(lowest, highest + 1, dtype=float)

    # Normalising constant: forces the curve through (lowest, peak).
    scale = peak / (lowest ** exponent)
    weights = scale * degree_axis ** exponent

    # Keys are converted back to integers for dictionary lookups by degree.
    return dict(zip(degree_axis.astype(int), weights))

# Settings for the target power-law degree distribution.
# create_synthetic_distribution reads only 'slope', 'min_degree', 'max_degree'
# and 'max_prob'; the remaining entries are kept in the dict but are not
# consumed by that function.
params = {
    'slope': -2,
    'intercept': 0.8,
    'r_squared': 0.98,
    'max_degree': 200_000,
    'min_degree': 1,
    'max_prob': 0.5,
    'degree_range': list(np.arange(1, 200_000))
}

# The second positional parameter of create_synthetic_distribution is `plot`,
# not a degree bound, so only `params` is passed; the upper bound is taken
# from params['max_degree'].
target_distribution = create_synthetic_distribution(params)

In [None]:
# Number of synthetic graphs generated per iteration: one per unit of Spark's
# default parallelism.
# NOTE(review): on a cluster this usually equals the total executor cores —
# confirm for your deployment.
num_graphs = spark.sparkContext.defaultParallelism # number of cores available
# Bare expression so the notebook displays the value.
num_graphs

### Create Graph Based on Degree Distribution

In [None]:
import networkx as nx
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, explode
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
import random

# spark = SparkSession.builder.master("local[*]").appName("NetworkFlowGraph").getOrCreate()

def random_node():
    """Return a random node identifier as a native Python int.

    The value is drawn from NumPy's global random state in the half-open
    range [1_000_000, 10_000_000_000).
    """
    lower_bound = 1_000_000
    upper_bound = 10_000_000_000
    drawn = np.random.randint(lower_bound, upper_bound)
    return int(drawn)

def random_feature():
    """Return a random integer feature in [1, 70000) as a native Python int."""
    drawn = np.random.randint(low=1, high=70000)
    # NumPy integer -> builtin int.
    return int(drawn)

def random_col_e():
    """Return one of the two col_e labels, chosen at random, as a builtin str."""
    labels = ['col_e_A', 'col_e_B']
    picked = np.random.choice(labels)
    # NumPy string -> builtin str.
    return str(picked)

# num_graphs = spark.sparkContext.defaultParallelism # number of cores available
# Node count requested for every synthetic graph (passed to generate_custom_graph).
num_nodes_per_graph = 350_000

def configuration_model_with_distribution(n, degree_distribution, seed):
    """
    Generate a graph with a specific degree distribution.

    A degree sequence is derived from ``degree_distribution`` and handed to
    NetworkX's configuration model; the resulting multigraph is converted with
    ``nx.Graph(...)``.

    Args:
        n: number of nodes requested.
        degree_distribution: mapping ``degree -> fraction of nodes`` that
            should have that degree.
        seed: seed forwarded to the NetworkX random graph generators.

    Returns:
        A ``networkx.Graph``. If the configuration model raises or produces a
        graph without edges, a Barabasi-Albert graph (m=2) is returned instead.
    """

    degrees = []
    remaining_nodes = n

    # Walk the degrees in ascending order and give each one round(n * prob)
    # nodes, never handing out more than the nodes still unassigned.
    for degree, prob in sorted(degree_distribution.items()):
        if remaining_nodes <= 0:
            break
        count = min(int(n * prob + 0.5), remaining_nodes)
        if count > 0:
            degrees.extend([int(degree)] * count)
            remaining_nodes -= count

    # Nodes left over after rounding get the smallest degree in the table
    # (degree 1 when the table is empty, instead of crashing on min()).
    if remaining_nodes > 0:
        min_degree = int(min(degree_distribution, default=1))
        degrees.extend([min_degree] * remaining_nodes)

    # Guarantee at least one possible edge.
    if len(degrees) < 2:
        degrees = [1, 1]

    # The configuration model requires an even degree sum.
    if sum(degrees) % 2 != 0:
        degrees[0] += 1

    try:
        g = nx.configuration_model(degrees, seed=seed)
        g = nx.Graph(g)

        if g.number_of_edges() == 0:
            raise nx.NetworkXError("Generated graph has no edges")

        return g
    except Exception as e:
        # Deliberate best-effort: report the problem and fall back to a
        # Barabasi-Albert graph. That generator needs more nodes than m=2,
        # hence the lower bound of 3; the seed is forwarded so the fallback
        # is reproducible as well.
        print(f"Error generating graph: {e}")
        return nx.barabasi_albert_graph(max(n, 3), 2, seed=seed)
    
def generate_custom_graph(partition_id, num_nodes, degree_distribution, seed):
    """Generate one synthetic graph and return its edges as plain tuples.

    The graph is built by configuration_model_with_distribution using
    ``seed + partition_id`` as its seed. Every node is then given a random
    identifier and every edge gets two random integer features and a random
    label.

    Returns:
        list of ``(src_id, dst_id, feature, feature, label)`` tuples, one per
        edge of the generated graph.
    """
    graph = configuration_model_with_distribution(
        num_nodes, degree_distribution, seed + partition_id
    )

    # One freshly drawn identifier per node of the generated graph.
    relabel = {}
    for vertex in graph.nodes():
        relabel[vertex] = random_node()

    records = []
    for endpoint_a, endpoint_b in graph.edges():
        first_feature = random_feature()
        second_feature = random_feature()
        label = random_col_e()
        records.append(
            (relabel[endpoint_a], relabel[endpoint_b],
             first_feature, second_feature, label)
        )

    return records

import time  # time.time() is used for the wall-clock measurement below

# Ship the degree lookup table to the executors once as a broadcast variable.
target_distribution_bc = spark.sparkContext.broadcast(target_distribution)

directory_path = "/path/to/synthetic_dataset"
seed = 1
remaining_iterations = 10

# Return type of the UDF: an array of edge records. It is identical for every
# iteration, so it is built once outside the loop.
edge_array_type = ArrayType(StructType([
    StructField("col_a", LongType(), False),
    StructField("col_b", LongType(), False),
    StructField("col_c", IntegerType(), False),
    StructField("col_d", IntegerType(), False),
    StructField("col_e", StringType(), False),
]))

# while get_directory_size(directory_path) < MAX_SIZE_BYTES:
start = time.time()
while remaining_iterations > 0:
    print(remaining_iterations)
    seed_bc = spark.sparkContext.broadcast(seed)
    # One UDF call per row of spark.range(num_graphs); the row id doubles as
    # the partition id that is added to the base seed.
    generate_custom_graph_udf = udf(
        lambda partition_id: generate_custom_graph(
            partition_id, num_nodes_per_graph, target_distribution_bc.value, seed_bc.value
        ),
        edge_array_type
    )

    edge_df = (
        spark.range(num_graphs)
             .withColumn("edges", generate_custom_graph_udf("id"))
             .select(explode("edges").alias("edge"))
             .select("edge.col_a", "edge.col_b", "edge.col_c", "edge.col_d", "edge.col_e")
             .distinct()  # if needed
    )

    edge_df.write.mode("append").parquet(directory_path)

    # An iteration uses graph seeds seed .. seed + num_graphs - 1 (base seed
    # plus row id). Advancing by num_graphs keeps the seed ranges of
    # successive iterations disjoint; advancing by 1 would regenerate almost
    # the same set of graph topologies every time.
    seed += num_graphs
    remaining_iterations -= 1

end = time.time()
print(f"TTR 10 Iterations: {round(end-start, 2)}")

### Alternative Option for Scaling: Double Size of Dataset and Add Random Noise (maintains distribution)

In [None]:
from pyspark.sql.functions import col, floor, rand, lit

# Inclusive bounds of the integer jitter added to every numeric column.
NOISE_MIN = -5
NOISE_MAX = 5

# Columns grouped by the type they are cast back to after the jitter is added.
long_cols = ["col_a", "col_b"]
integer_cols = ["col_c", "col_d"]
string_cols = ["col_e"]

# Source DataFrame.
# NOTE(review): `test` is not bound in the visible cells — confirm it holds the
# generated edge DataFrame before running this cell.
df_original = test

# Build the projection for the noisy copy: numeric columns first (long, then
# integer), each with its own independent rand() draw, then the untouched
# string columns.
noisy_columns = []
typed_numeric = [(c, "long") for c in long_cols] + [(c, "integer") for c in integer_cols]
for column_name, target_type in typed_numeric:
    jitter = floor(rand() * (NOISE_MAX - NOISE_MIN + 1))
    shifted = col(column_name) + jitter + NOISE_MIN
    noisy_columns.append(shifted.cast(target_type).alias(column_name))
noisy_columns.extend(col(c) for c in string_cols)

df_augmented = df_original.select(*noisy_columns)

# Tag each half with its origin.
df_original = df_original.withColumn("source", lit("original"))
df_augmented = df_augmented.withColumn("source", lit("noisy"))

# Stack the original rows on top of the noisy rows, matching columns by name.
df_doubled = df_original.unionByName(df_augmented)

(
    df_doubled
    .repartition(2000)
    .write
    .mode('overwrite')
    .parquet('/path/to/synthetic_dataset_doubled')
)

### Get Stats

In [None]:
import networkx as nx
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
import random

# The statistics below are computed on the doubled (original + noisy) dataset.
edge_df = df_doubled

def print_graph_stats(edge_df, display=False):
    """Compute the total degree of every node and optionally print a summary.

    Args:
        edge_df: Spark DataFrame with one row per edge whose endpoints are in
            the columns ``col_a`` and ``col_b``.
        display: when True, print the number of unique nodes, the number of
            edges, and the average and maximum total degree.

    Returns:
        Spark DataFrame with the columns ``merge_col`` (node id) and
        ``total_degree`` (number of edge rows in which the node appears as
        ``col_a`` plus the number in which it appears as ``col_b``).
    """
    # Occurrences of each node in col_a. The "in"/"out" names are only labels
    # for the two endpoint columns; both counts are summed below.
    src_degrees = edge_df.groupBy("col_a").agg({"col_a": "count"}) \
        .withColumnRenamed("count(col_a)", "in_degree") \
        .withColumnRenamed("col_a", "merge_col")

    # Occurrences of each node in col_b.
    dst_degrees = edge_df.groupBy("col_b").agg({"col_b": "count"}) \
        .withColumnRenamed("count(col_b)", "out_degree") \
        .withColumnRenamed("col_b", "merge_col")

    # Outer join keeps nodes that appear on one side only; their missing
    # count is filled with 0 before the two counts are added.
    total_degrees = src_degrees.join(dst_degrees, "merge_col", "outer") \
        .fillna(0) \
        .selectExpr(
            "merge_col",
            "cast(out_degree as int) + cast(in_degree as int) as total_degree"
        )

    if display:
        # Every Spark action lives in this branch, so callers that only need
        # the returned DataFrame (display=False) do not trigger any job here.
        unique_nodes = edge_df.select("col_a").union(edge_df.select("col_b")).distinct()
        total_edges = edge_df.count()

        # Average and maximum are computed in a single aggregation.
        summary = total_degrees.selectExpr(
            "avg(total_degree) AS avg_degree",
            "max(total_degree) AS max_degree",
        ).first()
        avg_degree = summary["avg_degree"]
        max_degree = summary["max_degree"]
        # avg() is NULL for an empty edge set; avoid formatting None as a float.
        avg_text = f"{avg_degree:.2f}" if avg_degree is not None else "n/a"

        print("\nGraph Statistics:")
        print(f"Number of unique Nodes: {unique_nodes.count()}")
        print(f"Number of edges: {total_edges}")
        print(f"Average degree: {avg_text}")
        print(f"Maximum degree: {max_degree}")

    return total_degrees

def check_degree_distribution(edge_df, plot=False):
    """Print (and optionally plot) the empirical degree distribution next to the target.

    Args:
        edge_df: Spark DataFrame of edges, as accepted by print_graph_stats.
        plot: when True, draw the empirical and target distributions on a
            log-scaled y axis with matplotlib.

    Returns:
        None. Results are printed; the target values are looked up in the
        module-level ``target_distribution`` mapping.
    """
    total_degrees = print_graph_stats(edge_df)

    degree_dist = total_degrees.groupBy("total_degree").count() \
        .withColumnRenamed("count", "num_vertices")

    # Collect results to driver for plotting
    dist_rows = degree_dist.collect()
    data = {
        "total_degree": [row["total_degree"] for row in dist_rows],
        "num_vertices": [row["num_vertices"] for row in dist_rows]
    }

    # Every node falls into exactly one degree bucket, so the bucket sizes add
    # up to the node count; no separate count() job is needed.
    total_nodes = sum(data["num_vertices"])

    df = pd.DataFrame(data)
    df["empirical"] = df["num_vertices"] / total_nodes
    df["target"] = df["total_degree"].map(target_distribution).fillna(1e-12)  # avoid log(0)

    # Sort by total_degree
    df = df.sort_values(by="total_degree")

    print("\nResulting degree distribution:")
    for row in df.itertuples():
        print(f"Degree {row.total_degree}: {row.empirical:.4f} (Target: {row.target:.4f})")

    if plot:
        # Imported here because `plt` is not imported in the visible cells and
        # is only needed when a plot is requested.
        import matplotlib.pyplot as plt

        plt.figure(figsize=(10, 6))
        plt.plot(df['total_degree'], df['empirical'], marker='o', label='Empirical Distribution')
        plt.plot(df['total_degree'], df['target'], marker='x', linestyle='--', label='Target Distribution')
        plt.xlabel('Degree')
        plt.ylabel('Percentage of Nodes (log scale)')
        plt.yscale('log')
        plt.title('Degree Distribution: Empirical vs Target (Log Scale)')
        plt.legend()
        plt.grid(True, which="both", ls="--")
        plt.tight_layout()
        plt.show()


# Print the summary statistics, then compare the empirical degree
# distribution with the target one (second argument True = also plot).
print_graph_stats(edge_df, display=True)
check_degree_distribution(edge_df, True)


### Generate Non-Graph Data

#### Get Workflow Schema Information

In [None]:
# -------------------------------------
# Step 1. Load every sheet of the Excel workbook.
# -------------------------------------
excel_path = "HPE_NVDA_datagen.xlsx" # update this path as necessary

# sheet_name=None makes pandas return {sheet name: DataFrame} for all sheets.
sheets = pd.read_excel(excel_path, sheet_name=None)
sheet_names = list(sheets)
print("Found sheets:", sheet_names)

# -------------------------------------
# Step 2. Read the tables overview from the first sheet.
# -------------------------------------
# The first sheet is expected to list each table id with an approximate row
# count; adjust the two column names below if the workbook differs.
tables_overview_df = sheets[sheet_names[0]]
table_names = tables_overview_df["masked_table_id"].tolist()
approx_row_counts = tables_overview_df["num_rows_approx"].tolist()

print("Tables and approximate row counts:")
for table_id, row_estimate in zip(table_names, approx_row_counts):
    print(f"  {table_id}: ~{row_estimate} rows")

# -------------------------------------
# Step 3. Collect the per-table metadata sheets (columns, types, etc.)
# -------------------------------------
# A table's metadata is looked up in the sheet carrying the table's own name.
table_metadata = {}
for table_id in table_names:
    if table_id not in sheets:
        print(f"Warning: No metadata sheet found for table '{table_id}'.")
        continue
    table_metadata[table_id] = sheets[table_id]
    print(f"Loaded metadata for table '{table_id}'.")

# -------------------------------------
# Step 4. Define a mapping from your Excel type names to Spark types.
# -------------------------------------
# Keys are the exact strings expected in the spreadsheet's type column.
# String, Integer and Boolean are accepted both with and without the trailing
# "()" (the Integer/Boolean entries were previously listed twice with "()",
# which only repeated the same key).
spark_type_mapping = {
    "StringType()": StringType(),
    "StringType": StringType(),
    "IntegerType()": IntegerType(),
    "IntegerType": IntegerType(),
    "LongType()": LongType(),
    "FloatType()": FloatType(),
    "DoubleType()": DoubleType(),
    "BooleanType()": BooleanType(),
    "BooleanType": BooleanType(),
    "DateType()": DateType(),
    "TimestampType()": TimestampType(),
    "ArrayType(IntegerType(), True)": ArrayType(IntegerType(), True),
    "ArrayType(StringType(), True)": ArrayType(StringType(), True),
}

def create_schema(meta_df):
    """
    Build a Spark StructType from one table's metadata DataFrame.

    Each metadata row yields one nullable field named after
    "masked_column_name". The "spark_data_type" string (whitespace-stripped)
    is looked up in spark_type_mapping; unknown or missing types print a
    warning and fall back to StringType. For Integer/Long/Float/Double fields,
    non-null "min" and "max" values are copied into the field metadata when
    both columns exist in meta_df.
    """
    numeric_types = (IntegerType, LongType, FloatType, DoubleType)
    range_available = "min" in meta_df.columns and "max" in meta_df.columns

    struct_fields = []
    for _, spec in meta_df.iterrows():
        name = spec["masked_column_name"]
        raw_type = spec["spark_data_type"]

        # A missing type becomes the placeholder key "string", which is not in
        # the mapping and therefore takes the StringType fallback below.
        if pd.notna(raw_type):
            type_key = str(raw_type).strip()
        else:
            type_key = "string"

        resolved = spark_type_mapping.get(type_key)
        if resolved is None:
            print(f"Warning: Unrecognized type '{raw_type}' for column '{name}'. Using StringType.")
            resolved = StringType()

        # Value range travels with the field as metadata (numeric types only).
        field_meta = {}
        if range_available and isinstance(resolved, numeric_types):
            if pd.notna(spec["min"]) and pd.notna(spec["max"]):
                field_meta["min"] = spec["min"]
                field_meta["max"] = spec["max"]

        struct_fields.append(StructField(name, resolved, True, metadata=field_meta))

    return StructType(struct_fields)

# Build one Spark schema per table for which a metadata sheet was loaded,
# echoing each schema as it is created.
schemas = {}
for tbl in table_metadata:
    schemas[tbl] = create_schema(table_metadata[tbl])
    print(f"Schema for table '{tbl}': {schemas[tbl]}")


# -------------------------------------
# Step 5. Read the join definitions.
# -------------------------------------
# The join sheet is taken from position 1 (the second sheet) of the workbook.
# NOTE(review): confirm that the second sheet is the one holding the joins.
join_info_df = sheets[sheet_names[1]]

# One dict per row, built from the columns "table1", "table2", "join_method",
# "column1" and "column2".
joins = [
    {
        "left_table": record["table1"],
        "right_table": record["table2"],
        "join_method": record["join_method"],
        "left_column": record["column1"],
        "right_column": record["column2"],
    }
    for _, record in join_info_df.iterrows()
]

print("Join definitions:")
for spec in joins:
    left_side = f"{spec['left_table']}.{spec['left_column']}"
    method = spec['join_method'].upper()
    right_side = f"{spec['right_table']}.{spec['right_column']}"
    print(f"  {left_side} {method} JOIN {right_side}")

#### Generate Non-Graph Data

In [None]:

# ========================================
# PART 2: Generate random data for each table and register as temp views
# ========================================

def generate_random_dataframe(schema, num_rows):
    """
    Generate a DataFrame of ``num_rows`` random rows matching ``schema``.

    All values are produced by Spark expressions over ``spark.range(num_rows)``,
    so generation is distributed. Per field type:

    * Integer/Long: ``rand() * (max - min) + min`` cast to the field type,
      with min/max taken from the field metadata (defaults 1 and 1000).
    * Float/Double: same formula with defaults 0.0 and 1000.0.
    * Boolean: ``rand() > 0.5``.
    * Date / Timestamp: '2000-01-01' plus a random number of days below 9000.
    * String: ``uuid()``.
    * Any other type (e.g. arrays): NULL, typed as the field's own data type.

    Args:
        schema: Spark StructType describing the columns to generate.
        num_rows: number of rows to generate.

    Returns:
        Spark DataFrame with one column per schema field, in schema order.
    """
    columns = []

    for field in schema.fields:
        dt = field.dataType
        md = field.metadata or {}

        if isinstance(dt, (IntegerType, LongType)):
            # Use provided min and max if available; otherwise default to 1 and 1000.
            low = float(md.get("min", 1))
            high = float(md.get("max", 1000))
            value = (F.rand() * (high - low) + low).cast(dt)

        elif isinstance(dt, (FloatType, DoubleType)):
            low = float(md.get("min", 0.0))
            high = float(md.get("max", 1000.0))
            value = (F.rand() * (high - low) + low).cast(dt)

        elif isinstance(dt, BooleanType):
            value = F.rand() > 0.5

        elif isinstance(dt, DateType):
            value = F.expr("date_add('2000-01-01', cast(rand() * 9000 as int))")

        elif isinstance(dt, TimestampType):
            value = F.expr("to_timestamp(date_add('2000-01-01', cast(rand() * 9000 as int)))")

        elif isinstance(dt, StringType):
            value = F.expr("uuid()")

        else:
            # Unsupported type: emit NULLs, but cast them so the column keeps
            # the schema's data type instead of becoming an untyped null column.
            value = F.lit(None).cast(dt)

        columns.append(value.alias(field.name))

    # A single projection replaces a chain of withColumn calls (which grows
    # the query plan by one node per column). It also leaves out the helper
    # "id" column of spark.range, so a schema field that happens to be named
    # "id" is kept rather than dropped.
    return spark.range(num_rows).select(*columns)

# Generate one random DataFrame per table listed in the overview sheet and
# keep them in `dfs`, keyed by table name. 'table_a' is skipped here.
# NOTE: SCALED DOWN FOR TESTING — 'table_c' is pinned to 21,000,000 rows.
# For a full-size run, use int(count) for every table instead.
dfs = {}
for tbl, count in zip(table_names, approx_row_counts):
    if tbl == 'table_a':
        continue
    table_schema = schemas[tbl]
    num_rows = 21000000 if tbl == 'table_c' else int(count)
    dfs[tbl] = generate_random_dataframe(table_schema, num_rows)
    print(f"Created DataFrame for table '{tbl}' with {num_rows} random rows.")

# Testing

## Basic Tests

In [None]:
# Load the doubled synthetic edge dataset written earlier; all tests below read from it.
edge_df = spark.read.parquet('/path/to/synthetic_dataset_doubled')

### Sorts

In [None]:
# Sort benchmark, single key. Writing to the "noop" sink forces the full
# computation without producing any output.
sort1 = edge_df.sort('col_a')
(
    sort1.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [None]:
# Sort benchmark, all five columns as sort keys. The "noop" sink forces the
# full computation without producing any output.
sort2 = edge_df.sort('col_a', 'col_b', 'col_c', 'col_d', 'col_e')
(
    sort2.write
    .format("noop")
    .mode("overwrite")
    .save()
)

### GroupBys

In [None]:
# GroupBy benchmark, single key: row count per col_a value.
gb1 = edge_df.groupBy(['col_a']).count()
# save() must actually be called; without the parentheses the expression only
# references the bound method and no Spark job runs.
gb1.write.format("noop").mode("overwrite").save()

In [None]:
# GroupBy benchmark, all five columns as grouping keys.
gb2 = edge_df.groupBy(['col_a', 'col_b', 'col_c', 'col_d', 'col_e']).count()
# save() must actually be called; without the parentheses the expression only
# references the bound method and no Spark job runs.
gb2.write.format("noop").mode("overwrite").save()

## Workflow Tests

### Joins

#### Single Step Join

In [None]:
# table_a is the edge dataset; table_b is one of the generated random tables.
table_a = edge_df
table_b = dfs['table_b']

# (table_a column, table_b column) pairs; the equalities are combined with AND.
a_to_b_keys = [
    ("col_a", "col_b_8"),
    ("col_b", "col_b_3"),
    ("col_c", "col_b_9"),
    ("col_d", "col_b_1"),
]
a_to_b_condition = [table_a[left] == table_b[right] for left, right in a_to_b_keys]

join_test1 = table_a.join(table_b, on=a_to_b_condition, how='left')

# The "noop" sink forces the join to execute without producing any output.
(
    join_test1.write
    .format("noop")
    .mode("overwrite")
    .save()
)

#### Workflow Joins

In [None]:
# table_a and table_b come from the single-step join cell. The remaining
# tables are referenced below but are not bound in the visible cells, so they
# are taken from the generated DataFrames here.
# NOTE(review): assumes `dfs` holds the keys 'table_c', 'table_d' and
# 'table_e' (same naming as 'table_b') — confirm against the overview sheet.
table_c = dfs['table_c']
table_d = dfs['table_d']
table_e = dfs['table_e']

# Chain of left joins that all hang off table_a's columns.
linked_join_test = (
    table_a
    .join(
        table_b,
        [
            table_a["col_a"] == table_b["col_b_8"],
            table_a["col_b"] == table_b["col_b_3"],
            table_a["col_c"] == table_b["col_b_9"],
            table_a["col_d"] == table_b["col_b_1"],
        ],
        how="left"
    )
    .join(
        table_c,
        [
            table_a["col_a"] == table_c["col_c_10"],
            table_a["col_b"] == table_c["col_c_9"],
            table_a["col_e"] == table_c["col_c_11"],
        ],
        how="left"
    )
    .join(
        table_d,
        [
            table_a["col_a"] == table_d["col_d_0"],
            table_a["col_c"] == table_d["col_d_1"],
        ],
        how="left"
    )
    .join(
        table_e,
        table_a["col_a"] == table_e["col_e_0"],
        how="left"
    )
)

# The "noop" sink forces the whole join chain to execute without output.
linked_join_test.write.format("noop").mode("overwrite").save()

### Breadth First Search

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Breadth-first search over the edge dataset, written with DataFrame joins.
# The Spark session created at the top of the notebook is reused.
# spark = SparkSession.builder.getOrCreate()

# The edge list is built from columns "col_b" (source) and "col_d"
# (destination) of the input DataFrame.
# NOTE(review): col_d is generated as an edge feature, not a node id —
# confirm that col_b -> col_d is the intended edge definition.
df = edge_df

edges = df.select(F.col("col_b").alias("src"), F.col("col_d").alias("dst"))

# Define the BFS starting point.
# Change the 'source' variable to the vertex from which you want to start the BFS.
source = 1000  # numeric id of the starting vertex

# Create the initial frontier: the source vertex with distance 0.
frontier = spark.createDataFrame([(source, 0)], ["vertex", "distance"])

# Create a DataFrame to keep track of all visited vertices (and their distance from the source).
visited = frontier

# Loop until there are no new nodes to visit.
# Both count() calls below are Spark actions and run a job each time.
while frontier.count() > 0:
    # 1. Find neighbors: join the current frontier with the edges DataFrame.
    #    Each neighbor gets a distance equal to (current distance + 1).
    new_neighbors = frontier.join(edges, frontier.vertex == edges.src) \
                            .select(edges.dst.alias("vertex"),
                                    (frontier.distance + 1).alias("distance"))
    
    # 2. Exclude vertices that have already been visited (anti-join on vertex)
    #    and drop duplicate discoveries within this level.
    new_neighbors = new_neighbors.join(visited, on="vertex", how="left_anti").distinct()
    
    # 3. If no new vertices are found, exit the loop.
    if new_neighbors.count() == 0:
        break
    
    # 4. Add the new neighbors to the visited set.
    visited = visited.union(new_neighbors).distinct()
    
    # 5. Update the frontier to be the new neighbors.
    frontier = new_neighbors

# The 'visited' DataFrame now contains all vertices reachable from the source,
# along with the minimum number of steps (distance) from the source.
# The "noop" sink forces the final computation without producing any output.
visited.write.format("noop").mode("overwrite").save()
     

### PageRank

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Personalized PageRank over a sample of the edge dataset, written with
# DataFrame joins. The Spark session created at the top of the notebook is reused.
# spark = SparkSession.builder.getOrCreate()

# Only the first 10,000 rows returned by limit() are used.
df = edge_df.limit(10000) # Change as needed

# Set the reset (teleportation) probability and the number of iterations
alpha = 0.15
maxIter = 10

# Choose the personalized seed: take the first value from col_b.
# Note: this rebinds the notebook-level name `seed` used by the data
# generation cell.
seed = df.select("col_b").first()[0]

# 1. Create the vertices DataFrame: union of unique IDs from col_b and col_d.
vertices = (
    df.select(F.col("col_b").alias("id"))
      .union(df.select(F.col("col_d").alias("id")))
      .distinct()
)

# 2. Create the edges DataFrame: define edge from col_b to col_d.
edges = df.select(F.col("col_b").alias("src"), F.col("col_d").alias("dst"))

# 3. Compute out-degrees: count of outgoing edges for each source vertex.
out_degrees = edges.groupBy("src").agg(F.count("*").alias("out_degree"))

# 4. Initialize each vertex with a PageRank value:
#    The seed gets 1.0 and all others start with 0.0.
vertices_rank = vertices.withColumn(
    "rank", F.when(F.col("id") == seed, 1.0).otherwise(0.0)
)

# 5. Iteratively update the PageRank values (fixed number of iterations,
#    no convergence check).
for i in range(maxIter):
    # 5a. For each edge, compute the contribution from its source.
    #     Join the edges with the current vertex ranks and the out-degrees.
    #     NOTE(review): `edges.src` and `out_degrees.src` both originate from
    #     the same `edges` column — verify that Spark resolves this join
    #     condition as an equality between the two sides.
    contribs = (
        edges.join(vertices_rank, edges.src == vertices_rank.id)
             .join(out_degrees, edges.src == out_degrees.src)
             .select(
                 edges.dst.alias("id"),
                 (vertices_rank.rank / out_degrees.out_degree).alias("contrib")
             )
    )
    
    # 5b. Sum the contributions arriving at each vertex.
    contribs_sum = contribs.groupBy("id").agg(F.sum("contrib").alias("sum_contrib"))
    
    # 5c. Compute the total rank from dangling nodes (vertices with no outgoing edges).
    #     The left join leaves out_degree NULL for such vertices; it is
    #     coalesced to 0 and used as the filter.
    dangling = (
        vertices_rank.join(out_degrees, vertices_rank.id == out_degrees.src, "left")
                     .withColumn("out_degree", F.coalesce(F.col("out_degree"), F.lit(0)))
                     .where(F.col("out_degree") == 0)
    )
    # collect() runs a Spark job every iteration; the sum is NULL (None) when
    # there are no dangling vertices.
    dangling_sum = dangling.agg(F.sum("rank").alias("dangling_sum")).collect()[0]["dangling_sum"]
    if dangling_sum is None:
        dangling_sum = 0.0
    
    # 5d. Update each vertex's rank:
    #     - If the vertex is the seed, it gets the reset term (alpha)
    #       plus (1 - alpha) times (its incoming contributions plus dangling rank).
    #     - Otherwise, it just gets (1 - alpha) times its incoming contributions.
    #     Vertices without incoming contributions get sum_contrib = 0.0.
    vertices_rank = (
        vertices.join(contribs_sum, on="id", how="left")
                .na.fill({"sum_contrib": 0.0})
                .withColumn("rank", 
                    F.when(F.col("id") == seed,
                           alpha + (1 - alpha) * (F.col("sum_contrib") + dangling_sum)
                    ).otherwise(
                           (1 - alpha) * F.col("sum_contrib")
                    )
                )
    )

# 6. Display the final personalized PageRank values (show() prints the first rows).
vertices_rank.show()
