In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Single Spark entry point shared by every cell below; getOrCreate() returns an
# already-running session when one exists instead of starting another.
spark = (
    SparkSession.builder
    .appName("Struct Type")
    .getOrCreate()
)

In [3]:
# Sample user rows, one tuple per user.
# Field order: (user_id, name, age, city, salary)
raw_data = [
    ("U001", "Abhishek", 28, "Hyderabad", 50000),
    ("U002", "Neha", 32, "Delhi", 62000),
    ("U003", "Ravi", 25, "Bangalore", 45000),
    ("U004", "Pooja", 29, "Mumbai", 58000),
]

In [2]:
from pyspark.sql.types import (StructType, StructField, StringType, IntegerType, LongType)

In [4]:
# Explicit schema for the user rows. The third StructField argument is the
# nullable flag: only user_id is declared non-nullable.
user_schema = StructType(
    [
        StructField("user_id", StringType(), False),
        StructField("name", StringType(), True),
        StructField("age", IntegerType(), True),
        StructField("city", StringType(), True),
        StructField("salary", LongType(), True),
    ]
)

In [5]:
# Build the users DataFrame against the explicit schema, then print the
# resulting schema followed by the rows.
df_users = spark.createDataFrame(raw_data, user_schema)
df_users.printSchema()
df_users.show()

root
 |-- user_id: string (nullable = false)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: long (nullable = true)

+-------+--------+---+---------+------+
|user_id|    name|age|     city|salary|
+-------+--------+---+---------+------+
|   U001|Abhishek| 28|Hyderabad| 50000|
|   U002|    Neha| 32|    Delhi| 62000|
|   U003|    Ravi| 25|Bangalore| 45000|
|   U004|   Pooja| 29|   Mumbai| 58000|
+-------+--------+---+---------+------+



Wrong data: a row whose value does not match the schema

In [6]:
# Deliberately invalid row: age is the string "Thirty", while user_schema
# declares the age column as IntegerType.
raw_data2 = [
    ("U005", "Amit", "Thirty", "Chennai", 40000),
]

In [8]:
# Demonstrates schema enforcement: "Thirty" is not a valid IntegerType value, so
# createDataFrame is expected to reject the row while verifying it against
# user_schema.
#
# The error is caught and printed so the notebook still runs top to bottom, and
# the attempt is bound to its own name so that an unexpected success can never
# replace the valid df_users that later cells depend on.
try:
    df_users_invalid = spark.createDataFrame(data=raw_data2, schema=user_schema)
except TypeError as schema_error:
    # Schema verification failures are raised as TypeError (or a subclass of it).
    print(f"Row rejected by schema verification: {schema_error}")
else:
    print("Unexpected: the invalid row was accepted; check schema verification settings.")

ArrayType

In [9]:
# ArrayType is a data type, so import it from pyspark.sql.types, where it is
# defined. pyspark.sql.functions only exposes it as an internal, undocumented
# re-export that should not be relied on.
from pyspark.sql.types import ArrayType

In [10]:
# One row per user: (user_id, list of interest names).
interest_data = [
    ("U001", ["AI", "ML", "Cloud"]),
    ("U002", ["Testing", "Automation"]),
    ("U003", ["Data Engineering", "Spark", "Kafka"]),
    ("U004", ["UI/UX"]),
]

In [11]:
# Schema for the interests rows: a required user_id plus a nullable array of
# strings. The third StructField argument is the nullable flag.
interest_schema = StructType(
    [
        StructField("user_id", StringType(), False),
        StructField("interests", ArrayType(StringType()), True),
    ]
)

In [12]:
# Build the interests DataFrame and display it; truncate=False keeps the full
# array contents visible instead of cutting long cells short.
df_interests = spark.createDataFrame(data=interest_data, schema=interest_schema)
df_interests.printSchema()
df_interests.show(truncate=False)

root
 |-- user_id: string (nullable = false)
 |-- interests: array (nullable = true)
 |    |-- element: string (containsNull = true)

+-------+--------------------------------+
|user_id|interests                       |
+-------+--------------------------------+
|U001   |[AI, ML, Cloud]                 |
|U002   |[Testing, Automation]           |
|U003   |[Data Engineering, Spark, Kafka]|
|U004   |[UI/UX]                         |
+-------+--------------------------------+



Explode

In [16]:
from pyspark.sql.functions import explode
# explode() emits one output row per array element, so each user_id is repeated
# once for every entry in that user's interests array.
df_interests.select(
    col("user_id"),
    explode(col("interests")).alias("interest"),
).show()

+-------+----------------+
|user_id|        interest|
+-------+----------------+
|   U001|              AI|
|   U001|              ML|
|   U001|           Cloud|
|   U002|         Testing|
|   U002|      Automation|
|   U003|Data Engineering|
|   U003|           Spark|
|   U003|           Kafka|
|   U004|           UI/UX|
+-------+----------------+



MapType

In [18]:
from pyspark.sql.types import MapType

In [20]:
# One row per user: (user_id, dict mapping a device name to an integer value).
device_data = [
    ("U001", {"mobile": 120, "laptop": 300}),
    ("U002", {"tablet": 80}),
    ("U003", {"mobile": 200, "desktop": 400}),
    ("U004", {"laptop": 250}),
]

In [21]:
# Schema for the device rows: a required user_id plus a nullable map with
# string keys and integer values. The third StructField argument is the
# nullable flag.
device_schema = StructType(
    [
        StructField("user_id", StringType(), False),
        StructField("devices", MapType(StringType(), IntegerType()), True),
    ]
)

In [22]:
# Build the devices DataFrame, then print its schema and rows. show() uses its
# default truncation, so long map cells are cut short in the output.
df_devices = spark.createDataFrame(data=device_data, schema=device_schema)
df_devices.printSchema()
df_devices.show()

root
 |-- user_id: string (nullable = false)
 |-- devices: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = true)

+-------+--------------------+
|user_id|             devices|
+-------+--------------------+
|   U001|{mobile -> 120, l...|
|   U002|      {tablet -> 80}|
|   U003|{mobile -> 200, d...|
|   U004|     {laptop -> 250}|
+-------+--------------------+



Nested data (a StructType inside a StructType)

In [23]:
# One row per user: (user_id, address), where address is itself a tuple of
# (city, state, pincode).
nested_data = [
    ("U001", ("Hyderabad", "Telangana", 500081)),
    ("U002", ("Delhi", "Delhi", 110001)),
    ("U003", ("Bangalore", "Karnataka", 560001)),
]

In [24]:
# Inner struct describing a single address; every field is nullable.
address_schema = StructType(
    [
        StructField("city", StringType(), nullable=True),
        StructField("state", StringType(), nullable=True),
        StructField("pincode", IntegerType(), nullable=True),
    ]
)

# Outer schema: the address column reuses address_schema as its data type,
# which makes it a nested struct column.
profile_schema = StructType(
    [
        StructField("user_id", StringType(), nullable=True),
        StructField("address", address_schema, nullable=True),
    ]
)

In [25]:
# Build the profile DataFrame with the nested address struct, then print its
# schema and rows.
df_profile = spark.createDataFrame(data=nested_data, schema=profile_schema)
df_profile.printSchema()
df_profile.show()

root
 |-- user_id: string (nullable = true)
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- pincode: integer (nullable = true)

+-------+--------------------+
|user_id|             address|
+-------+--------------------+
|   U001|{Hyderabad, Telan...|
|   U002|{Delhi, Delhi, 11...|
|   U003|{Bangalore, Karna...|
+-------+--------------------+



In [26]:
# Dot notation ("address.city") reaches into the address struct and pulls the
# nested field out as its own top-level column.
df_profile.select("user_id", "address.city", "address.state").show()

+-------+---------+---------+
|user_id|     city|    state|
+-------+---------+---------+
|   U001|Hyderabad|Telangana|
|   U002|    Delhi|    Delhi|
|   U003|Bangalore|Karnataka|
+-------+---------+---------+



In [27]:
from pyspark.sql.functions import col
# Add salary_int as an int-typed copy of the bigint salary column. withColumn
# returns a new DataFrame; the result is not assigned, so the cell only
# displays the resulting DataFrame's column summary.
df_users.withColumn("salary_int", col("salary").cast("int"))

DataFrame[user_id: string, name: string, age: int, city: string, salary: bigint, salary_int: int]

In [None]:
from pyspark.sql.functions import to_date
# df_orders is not created by any earlier cell in this notebook, so this cell
# failed with a NameError on a fresh kernel. A small illustrative sample with
# string-typed dates is built here so the to_date() example can actually run.
order_rows = [
    ("O001", "U001", "2024-01-15"),
    ("O002", "U002", "2024-02-03"),
    ("O003", "U003", "2024-02-28"),
]
order_schema = StructType([
    StructField("order_id", StringType(), nullable=False),
    StructField("user_id", StringType(), nullable=True),
    StructField("order_date", StringType(), nullable=True),
])
df_orders = spark.createDataFrame(order_rows, order_schema)

# Parse the "yyyy-MM-dd" strings into dates, replacing the string order_date
# column. The result is not assigned, so the cell displays the new DataFrame.
df_orders.withColumn(
    "order_date",
    to_date(col("order_date"), "yyyy-MM-dd"))