In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col

# Reuse the active SparkSession if one exists, otherwise build a new one.
spark = SparkSession.builder.getOrCreate()

# Tree rows as (id, p_id) pairs; node 1 carries None because it has no parent.
data = [(1, None), (2, 1), (3, 1), (4, 2), (5, 2)]

# Column names, in the same order as the tuple fields in `data`.
columns = ["id", "p_id"]


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col

# Entry point for all DataFrame work: reuse the active SparkSession or create one.
spark = SparkSession.builder.getOrCreate()

# Tree table rows as (id, p_id); the node whose p_id is None is the root.
data = [
    (1, None),
    (2, 1),
    (3, 1),
    (4, 2),
    (5, 2),
]
df = spark.createDataFrame(data, ["id", "p_id"])

# --- Step 1: one row per node that is somebody's parent ---
# Rows with a null p_id are dropped (the root points at no parent). The
# column is exposed as "parent_id" so it cannot be confused with `p_id`
# after the join, and distinct() keeps a node with several children from
# appearing more than once, which would multiply its row in the left join.
parent_nodes = (
    df.filter(col("p_id").isNotNull())
      .select(col("p_id").alias("parent_id"))
      .distinct()
)

# --- Step 2: attach the "has children" marker to every node ---
# A left join keeps every original node. `parent_id` is populated when the
# node's id occurs among the parents and is null when it does not.
df_with_children = df.join(
    parent_nodes,
    on=df["id"] == parent_nodes["parent_id"],
    how="left",
)

# --- Step 3: derive the node type from the two markers ---
# Branch order matters: the root test comes first, so a root that also has
# children is still reported as "Root" rather than "Inner".
node_type = (
    when(col("p_id").isNull(), "Root")             # no parent
    .when(col("parent_id").isNotNull(), "Inner")   # has a parent and children
    .otherwise("Leaf")                             # has a parent, no children
)

result = (
    df_with_children
    .withColumn("type", node_type)
    .select(df["id"].alias("id"), "type")
    .orderBy("id")  # deterministic row order for display
)

# Print the (id, type) table.
result.show()


In [0]:
import pandas as pd

# Sample data
data = {
    "id": [1, 2, 3, 4, 5],
    "p_id": [None, 1, 1, 2, 2]
}

df = pd.DataFrame(data)

# Step 1: Identify nodes that are parents (appear in p_id column)
parents = set(df["p_id"].dropna())

# Step 2: Classify each node
def get_node_type(row):
    if pd.isna(row["p_id"]):
        return "Root"
    elif row["id"] in parents:
        return "Inner"
    else:
        return "Leaf"

df["type"] = df.apply(get_node_type, axis=1)

print(df[["id", "type"]])
