In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import *
from pyspark.sql.window import *

# Initialize Spark Session
spark = SparkSession.builder.appName("PeopleData").getOrCreate()

# Define the schema for the DataFrame
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("gender", StringType(), True)
])

# Manually input the data
data = [
    (107, "Days", "F"),
    (145, "Hawbaker", "M"),
    (155, "Hansel", "F"),
    (202, "Blackston", "M"),
    (227, "Criss", "F"),
    (278, "Keffer", "M"),
    (305, "Canty", "M"),
    (329, "Mozingo", "M"),
    (425, "Nolf", "M"),
    (534, "Waugh", "M"),
    (586, "Tong", "M"),
    (618, "Dimartino", "M"),
    (747, "Beane", "M"),
    (878, "Chatmon", "F"),
    (904, "Hansard", "F"),
]

# Create the DataFrame
people = spark.createDataFrame(data, schema=schema)



# Define the schema for the relations DataFrame
schema_relations = StructType([
    StructField("c_id", IntegerType(), True),
    StructField("p_id", IntegerType(), True)
])

# Manually input the data
data_relations = [
    (145, 202),
    (145, 107),
    (278, 305),
    (278, 155),
    (329, 425),
    (329, 227),
    (534, 586),
    (618, 904)
]

# Create the relations DataFrame
relations = spark.createDataFrame(data_relations, schema=schema_relations)

# Show the DataFrame
relations.show()


# Show the DataFrame
people.show()


+----+----+
|c_id|p_id|
+----+----+
| 145| 202|
| 145| 107|
| 278| 305|
| 278| 155|
| 329| 425|
| 329| 227|
| 534| 586|
| 618| 904|
+----+----+

+---+---------+------+
| id|     name|gender|
+---+---------+------+
|107|     Days|     F|
|145| Hawbaker|     M|
|155|   Hansel|     F|
|202|Blackston|     M|
|227|    Criss|     F|
|278|   Keffer|     M|
|305|    Canty|     M|
|329|  Mozingo|     M|
|425|     Nolf|     M|
|534|    Waugh|     M|
|586|     Tong|     M|
|618|Dimartino|     M|
|747|    Beane|     M|
|878|  Chatmon|     F|
|904|  Hansard|     F|
+---+---------+------+



In [0]:
ans_df=relations.join(people, relations.p_id == people.id, how="left") \
    .select(
        relations.c_id.alias("child_id"),
        relations.p_id.alias("parent_id"),
        when(col("gender") == "M", col("name")).otherwise(None).alias("Mother_Name"),
         when(col("gender") == "F", col("name")).otherwise(None).alias("Father_Name")
    )
# Alias the DataFrames
ans_df_alias = ans_df.alias("ans")
people_alias = people.alias("ppl")

# Perform groupBy, aggregation, and join
final_df = (
    ans_df_alias.groupBy("child_id")
    .agg(
        max("Mother_Name").alias("Mother_Name"),
        max("Father_Name").alias("Father_Name")
    )
    .join(people_alias, col("child_id") == col("ppl.id"), how="inner")
    .select(
        col("ppl.name").alias("child_name"),
        col("Mother_Name"),
        col("Father_Name")
    ).orderBy(col("child_name").asc())
)

# Show the result
final_df.show()



+----------+-----------+-----------+
|child_name|Mother_Name|Father_Name|
+----------+-----------+-----------+
| Dimartino|       null|    Hansard|
|  Hawbaker|  Blackston|       Days|
|    Keffer|      Canty|     Hansel|
|   Mozingo|       Nolf|      Criss|
|     Waugh|       Tong|       null|
+----------+-----------+-----------+



In [0]:
# Alias the DataFrames
ans_df_alias = ans_df.alias("ans")
people_alias = people.alias("ppl")

# group by chil_id & then join the chil_id with ppl_id to get the child_name 
final_df = (
    ans_df_alias.groupBy("child_id")
    .agg(
        max("Mother_Name").alias("Mother_Name"),
        max("Father_Name").alias("Father_Name")
    )
    .join(people_alias, col("child_id") == col("ppl.id"), how="inner")
    .select(
        col("ppl.name").alias("child_name"),
        col("Mother_Name"),
        col("Father_Name")
    ).orderBy(col("child_name").asc())
)

# Show the result
final_df.show()


+----------+-----------+-----------+
|child_name|Mother_Name|Father_Name|
+----------+-----------+-----------+
| Dimartino|       null|    Hansard|
|  Hawbaker|  Blackston|       Days|
|    Keffer|      Canty|     Hansel|
|   Mozingo|       Nolf|      Criss|
|     Waugh|       Tong|       null|
+----------+-----------+-----------+

