In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType, BooleanType, ArrayType, MapType
from pyspark.sql.functions import col, struct, when

In [2]:
# Start (or reuse) a local Spark session that uses all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Test12")
    .getOrCreate()
)

In [3]:
# Sample rows laid out as ((firstname, middlename, lastname), id, gender, salary).
structureData = [
    (("James", "", "Smith"), "36636", "M", 3100),
    (("Michael", "Rose", ""), "40288", "M", 4300),
    (("Robert", "", "Williams"), "42114", "M", 1400),
    (("Maria", "Anne", "Jones"), "39192", "F", 5500),
    (("Jen", "Mary", "Brown"), "", "F", -1),
]

# Schema matching the rows above: "name" is a nested struct of three nullable
# strings, followed by the flat columns "id", "gender" and "salary".
structureSchema = StructType([
    StructField(
        "name",
        StructType([
            StructField(part, StringType(), True)
            for part in ("firstname", "middlename", "lastname")
        ]),
    ),
    StructField("id", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True),
])

# Build the DataFrame, then display its schema and (untruncated) contents.
df = spark.createDataFrame(structureData, structureSchema)
df.printSchema()
df.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+--------------------+-----+------+------+
|name                |id   |gender|salary|
+--------------------+-----+------+------+
|[James, , Smith]    |36636|M     |3100  |
|[Michael, Rose, ]   |40288|M     |4300  |
|[Robert, , Williams]|42114|M     |1400  |
|[Maria, Anne, Jones]|39192|F     |5500  |
|[Jen, Mary, Brown]  |     |F     |-1    |
+--------------------+-----+------+------+



## Adding & Changing struct of the DataFrame
- Using the PySpark SQL function `struct()`, we can change the structure of an existing DataFrame by combining existing columns into a new StructType column.

In [5]:
# Add a new struct column named "OtherInfo" holding four fields:
#   - "identifier"   (the existing "id" column, renamed)
#   - "gender"       (the existing "gender" column)
#   - "salary"       (the existing "salary" column)
#   - "Salary_Grade" (derived: "Low" below 2000, "Medium" below 4000, else "High")
# The original top-level columns "id", "gender" and "salary" are then dropped,
# since their values now live inside "OtherInfo".
#
# "salary" is already declared as IntegerType in structureSchema, so it is
# compared directly without casting.
# NOTE: the row whose salary is the placeholder value -1 is graded "Low",
# because -1 < 2000.
updatedDF = df.withColumn(
    "OtherInfo",
    struct(
        col("id").alias("identifier"),
        col("gender").alias("gender"),
        col("salary").alias("salary"),
        when(col("salary") < 2000, "Low")
        .when(col("salary") < 4000, "Medium")
        .otherwise("High")
        .alias("Salary_Grade"),
    ),
).drop("id", "gender", "salary")

updatedDF.printSchema()
updatedDF.show(truncate = False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- OtherInfo: struct (nullable = false)
 |    |-- identifier: string (nullable = true)
 |    |-- gender: string (nullable = true)
 |    |-- salary: integer (nullable = true)
 |    |-- Salary_Grade: string (nullable = false)

+--------------------+------------------------+
|name                |OtherInfo               |
+--------------------+------------------------+
|[James, , Smith]    |[36636, M, 3100, Medium]|
|[Michael, Rose, ]   |[40288, M, 4300, High]  |
|[Robert, , Williams]|[42114, M, 1400, Low]   |
|[Maria, Anne, Jones]|[39192, F, 5500, High]  |
|[Jen, Mary, Brown]  |[, F, -1, Low]          |
+--------------------+------------------------+

