%md
**Exploding and Pivoting Multi-Category Data Using PySpark**

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
from pyspark.sql.functions import explode
from pyspark.sql.functions import lit
from pyspark.sql.functions import *
# Obtain (or reuse) the Spark session used throughout this notebook.
spark = (
    SparkSession.builder
    .appName("ARAB")
    .getOrCreate()
)

# Sample input: one (person, comma-separated fruit list) tuple per row.
data = [
    ("A", "Apple,Mango,Orange"),
    ("B", "Apple"),
    ("C", "Guava,Cherry"),
    ("D", "Mango,Cherry,Orange"),
]

# Build the input DataFrame with explicit column names, then render it.
df_input = spark.createDataFrame(data, schema=["Person", "Basket"])
df_input.display()



Person,Basket
A,"Apple,Mango,Orange"
B,Apple
C,"Guava,Cherry"
D,"Mango,Cherry,Orange"


In [0]:
# Replace the comma-separated Basket string with an array of its items.
# The second argument to split() is a regex pattern; a bare "," matches literally.
df_split = df_input.withColumn(
    "Basket",
    split(df_input["Basket"], ","),
)
df_split.display()

Person,Basket
A,"List(Apple, Mango, Orange)"
B,List(Apple)
C,"List(Guava, Cherry)"
D,"List(Mango, Cherry, Orange)"


In [0]:


# Emit one row per (Person, Fruit) pair: each element of the Basket array
# becomes its own row, and the array column itself is not carried forward.
df_exploded = df_split.select(
    "Person",
    explode("Basket").alias("Fruit"),
)
df_exploded.display()

Person,Fruit
A,Apple
A,Mango
A,Orange
B,Apple
C,Guava
C,Cherry
D,Mango
D,Cherry
D,Orange


In [0]:


# Step 4: Tag every (Person, Fruit) row with a constant presence marker of 1,
# giving the pivot step a numeric value to aggregate.
df_flagged = df_exploded.select("*", lit(1).alias("HasFruit"))
df_flagged.display()

Person,Fruit,HasFruit
A,Apple,1
A,Mango,1
A,Orange,1
B,Apple,1
C,Guava,1
C,Cherry,1
D,Mango,1
D,Cherry,1
D,Orange,1


In [0]:

# Step 5: Pivot to wide format -- one row per Person and one column per
# distinct Fruit, each cell holding the summed HasFruit flags. Combinations
# that never occur come out as null.
df_pivoted = (
    df_flagged
    .groupBy("Person")
    .pivot("Fruit")
    .sum("HasFruit")
)
df_pivoted.display()

Person,Apple,Cherry,Guava,Mango,Orange
A,1.0,,,1.0,1.0
B,1.0,,,,
C,,1.0,1.0,,
D,,1.0,,1.0,1.0


In [0]:


# Fill missing values with 0: a Person/Fruit pair absent from the pivot is null.
df_filled = df_pivoted.fillna(0)

# Convert the numeric flags to Yes/No.
# The pivot sums HasFruit, so a fruit listed more than once in the same basket
# (e.g. "Apple,Apple") yields 2 rather than 1. Test for "> 0" instead of "== 1"
# so that repeated fruits are still reported as "Yes".
fruit_columns = [c for c in df_filled.columns if c != "Person"]
df_final = df_filled.select(
    "Person",
    *[when(col(c) > 0, "Yes").otherwise("No").alias(c) for c in fruit_columns],
)

# Show result
df_final.display()

Person,Apple,Cherry,Guava,Mango,Orange
A,Yes,No,No,Yes,Yes
B,Yes,No,No,No,No
C,No,Yes,Yes,No,No
D,No,Yes,No,Yes,Yes
