# Module 07a - Advanced Operations - Complex Types - Exercises

## Instructions

This notebook contains exercises based on the concepts learned in Module 07a.

- Complete each exercise in the provided code cells
- Run the data setup cells first to generate/create necessary data
- Test your solutions by running the verification cells (if provided)
- Refer back to the main module notebook if you need help


## Data Setup

Run the cells below to set up the data needed for the exercises.


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col, when, lit
import os

# Start (or reuse) a SparkSession running in local mode for the exercises.
spark = (
    SparkSession.builder
    .appName("Module Exercises")
    .master("local[*]")
    .getOrCreate()
)

# Make sure the data directory exists; the path is relative to the working directory.
data_dir = "../data"
os.makedirs(data_dir, exist_ok=True)

print("SparkSession created successfully!")
print(f"Data directory: {os.path.abspath(data_dir)}")

# Sample employee records and their column names, kept apart for readability.
employee_rows = [
    ("Alice", 25, "Sales", 50000),
    ("Bob", 30, "IT", 60000),
    ("Charlie", 35, "Sales", 70000),
    ("Diana", 28, "IT", 55000),
    ("Eve", 32, "HR", 65000),
]
employee_columns = ["Name", "Age", "Department", "Salary"]
df_employees = spark.createDataFrame(employee_rows, employee_columns)

print("Employee DataFrame created:")
df_employees.show()


SparkSession created successfully!
Data directory: c:\Users\Amitha.GS\data
Employee DataFrame created:
+-------+---+----------+------+
|   Name|Age|Department|Salary|
+-------+---+----------+------+
|  Alice| 25|     Sales| 50000|
|    Bob| 30|        IT| 60000|
|Charlie| 35|     Sales| 70000|
|  Diana| 28|        IT| 55000|
|    Eve| 32|        HR| 65000|
+-------+---+----------+------+



## Exercises

Complete the following exercises based on the concepts from Module 07a.


### Exercise 1: Basic Operation

Complete basic operations on complex types (arrays, structs, and maps) based on Module 07a concepts.

In [None]:
# Project only the Name and Salary columns.
name_and_salary = df_employees.select("Name", "Salary")
name_and_salary.show()


+-------+------+
|   Name|Salary|
+-------+------+
|  Alice| 50000|
|    Bob| 60000|
|Charlie| 70000|
|  Diana| 55000|
|    Eve| 65000|
+-------+------+



In [None]:
# Build a small DataFrame whose Skills column is an array of strings;
# the following cells use it to practise accessing array elements by index.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# getOrCreate() hands back the already-running session if there is one.
spark = SparkSession.builder.getOrCreate()

skill_columns = ["Name", "Skills"]
data = [
    ("Alice", ["Python", "SQL", "Spark"]),
    ("Bob", ["Java", "Scala"]),
    ("Charlie", ["Python", "R", "SQL"]),
]

df = spark.createDataFrame(data, skill_columns)
df.show(truncate=False)


+-------+--------------------+
|Name   |Skills              |
+-------+--------------------+
|Alice  |[Python, SQL, Spark]|
|Bob    |[Java, Scala]       |
|Charlie|[Python, R, SQL]    |
+-------+--------------------+



In [None]:
# Index 0 is the first element of the Skills array.
first_skill = col("Skills").getItem(0).alias("First_Skill")
df.select("Name", first_skill).show()


+-------+-----------+
|   Name|First_Skill|
+-------+-----------+
|  Alice|     Python|
|    Bob|       Java|
|Charlie|     Python|
+-------+-----------+



In [None]:
# Same lookup written as SQL expressions; Skills[0] is the first element.
indexed_skill_exprs = [
    "Skills[0] as First_Skill",
    "Skills[1] as Second_Skill",
]
df.selectExpr("Name", *indexed_skill_exprs).show()


+-------+-----------+------------+
|   Name|First_Skill|Second_Skill|
+-------+-----------+------------+
|  Alice|     Python|         SQL|
|    Bob|       Java|       Scala|
|Charlie|     Python|           R|
+-------+-----------+------------+



In [None]:
# Keep only the rows whose first skill is "Python".
first_skill_is_python = col("Skills").getItem(0) == "Python"
df.filter(first_skill_is_python).show()


+-------+--------------------+
|   Name|              Skills|
+-------+--------------------+
|  Alice|[Python, SQL, Spark]|
|Charlie|    [Python, R, SQL]|
+-------+--------------------+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import array_contains, col

# Keep the rows whose Skills array contains "Python" (membership test, not an index lookup).
has_python = array_contains(col("Skills"), "Python")
df.filter(has_python).show()


+-------+--------------------+
|   Name|              Skills|
+-------+--------------------+
|  Alice|[Python, SQL, Spark]|
|Charlie|    [Python, R, SQL]|
+-------+--------------------+



In [None]:
# Register the skills DataFrame as a temp view so it can be queried with SQL.
# Note: the view is called "employees" but it holds `df` (Name, Skills).
df.createOrReplaceTempView("employees")

python_skills_query = """
    SELECT Name, Skills
    FROM employees
    WHERE array_contains(Skills, 'Python')
"""
spark.sql(python_skills_query).show()


+-------+--------------------+
|   Name|              Skills|
+-------+--------------------+
|  Alice|[Python, SQL, Spark]|
|Charlie|    [Python, R, SQL]|
+-------+--------------------+



In [None]:
# Explode the Skills array: one output row per (Name, Skill) pair.
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode

skill_per_row = explode("Skills").alias("Skill")
df_exploded = df.select("Name", skill_per_row)

df_exploded.show()

+-------+------+
|   Name| Skill|
+-------+------+
|  Alice|Python|
|  Alice|   SQL|
|  Alice| Spark|
|    Bob|  Java|
|    Bob| Scala|
|Charlie|Python|
|Charlie|     R|
|Charlie|   SQL|
+-------+------+



In [None]:
# Using struct: pack two columns into a single nested "Details" column.
from pyspark.sql.functions import struct

# Inside the struct, Age is exposed as "experience" and Salary as "projects".
details_struct = struct(
    col("Age").alias("experience"),
    col("Salary").alias("projects"),
)
df_with_struct = df_employees.withColumn("Details", details_struct)

df_with_struct.printSchema()


root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)
 |-- Details: struct (nullable = false)
 |    |-- experience: long (nullable = true)
 |    |-- projects: long (nullable = true)



In [None]:
from pyspark.sql.functions import struct, col

# Build the struct directly from the Age and Salary columns, with no aliases;
# the schema printed below shows the resulting field names.
age_salary_struct = struct(col("Age"), col("Salary"))
df_struct = df_employees.withColumn("Details", age_salary_struct)

df_struct.show(truncate=False)
df_struct.printSchema()


+-------+---+----------+------+-----------+
|Name   |Age|Department|Salary|Details    |
+-------+---+----------+------+-----------+
|Alice  |25 |Sales     |50000 |{25, 50000}|
|Bob    |30 |IT        |60000 |{30, 60000}|
|Charlie|35 |Sales     |70000 |{35, 70000}|
|Diana  |28 |IT        |55000 |{28, 55000}|
|Eve    |32 |HR        |65000 |{32, 65000}|
+-------+---+----------+------+-----------+

root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)
 |-- Details: struct (nullable = false)
 |    |-- Age: long (nullable = true)
 |    |-- Salary: long (nullable = true)



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import MapType, StringType, IntegerType

# Reuse the running SparkSession (or start one if none exists).
spark = SparkSession.builder.getOrCreate()

# Each row pairs a name with a Python dict; no explicit schema is passed,
# so the type of the Details column is inferred (see printSchema below).
# NOTE: this rebinds `data` and `df`, replacing the earlier array-based DataFrame.
map_columns = ["Name", "Details"]
data = [
    ("Alice", {"experience": 5, "projects": 10}),
    ("Bob", {"experience": 3, "projects": 5}),
]
df = spark.createDataFrame(data, map_columns)

df.show(truncate=False)
df.printSchema()


+-----+---------------------------------+
|Name |Details                          |
+-----+---------------------------------+
|Alice|{experience -> 5, projects -> 10}|
|Bob  |{experience -> 3, projects -> 5} |
+-----+---------------------------------+

root
 |-- Name: string (nullable = true)
 |-- Details: map (nullable = true)
 |    |-- key: string
 |    |-- value: long (valueContainsNull = true)



In [None]:
# Check whether the "projects" key exists in the Details map or not.
from pyspark.sql.functions import map_contains_key

has_projects = map_contains_key(col("Details"), "projects").alias("Has_Projects")
df.select("Name", has_projects).show()


+-----+------------+
| Name|Has_Projects|
+-----+------------+
|Alice|        true|
|  Bob|        true|
+-----+------------+



In [None]:
# Convert the map to rows: each key/value entry becomes its own row.
from pyspark.sql.functions import explode

map_entries = explode("Details").alias("Key", "Value")
df.select("Name", map_entries).show()


+-----+----------+-----+
| Name|       Key|Value|
+-----+----------+-----+
|Alice|experience|    5|
|Alice|  projects|   10|
|  Bob|experience|    3|
|  Bob|  projects|    5|
+-----+----------+-----+



## Summary

Great job completing the exercises! Review your solutions and compare them with the solutions notebook if needed.
