# STRUCT TYPE VS MAP FIELD

In [1]:
# Initialise findspark before the first `pyspark` import in this notebook.
# NOTE(review): findspark is documented to locate the local Spark installation
# and make `pyspark` importable — confirm against the environment in use.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Obtain the notebook-wide session from the builder with no extra
# configuration; every DataFrame below is created through `spark`.
spark = (
    SparkSession.builder
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/30 00:47:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType, MapType


## Define Struct Type

In [4]:
# Student schema: a mandatory integer id, a nested `names` struct made of
# three nullable string parts, and a nullable boolean flag.
schema_students = StructType([
    StructField(name="id", dataType=IntegerType(), nullable=False),
    StructField(
        name="names",
        dataType=StructType([
            StructField(name="first_name", dataType=StringType(), nullable=True),
            StructField(name="middle_name", dataType=StringType(), nullable=True),
            StructField(name="last_name", dataType=StringType(), nullable=True),
        ]),
        # `nullable` is deliberately not passed here, so the default applies.
    ),
    StructField(name="active", dataType=BooleanType(), nullable=True),
])

## Struct Type: Fixed Set of Fields

Every row supplies exactly the three name parts declared in the `names` struct.

In [5]:
# Rows shaped as (id, (first, middle, last), active).
# Each `names` tuple carries exactly three values.
data_students_v1 = [
    (1, ("Jacob", "Kyle", "Smith"), True),
    (2, ("Linda", "Mia", "Jonh"), False),
    (3, ("Oliver", "James", "Johnson"), False),
]

In [6]:
# Build the DataFrame from the fixed-size rows using the student schema,
# then print it with full-width columns.
df_students_v1 = spark.createDataFrame(data_students_v1, schema=schema_students)
df_students_v1.show(truncate=False)

                                                                                

+---+------------------------+------+
|id |names                   |active|
+---+------------------------+------+
|1  |{Jacob, Kyle, Smith}    |true  |
|2  |{Linda, Mia, Jonh}      |false |
|3  |{Oliver, James, Johnson}|false |
+---+------------------------+------+



## Struct Type with Null Values

In [7]:
# Same row shape as before, but some name parts are missing (None):
# one row lacks a middle name and one row has no name parts at all.
data_students_v2 = [
    (1, ("Jacob", None, "Smith"), True),
    (2, ("Linda", "Mia", "Jonh"), False),
    (3, (None, None, None), False),
]

# Build the DataFrame from the rows containing None values, reusing the
# student schema, and print it with full-width columns.
df_students_v2 = spark.createDataFrame(data_students_v2, schema=schema_students)
df_students_v2.show(truncate=False)

+---+--------------------+------+
|id |names               |active|
+---+--------------------+------+
|1  |{Jacob, NULL, Smith}|true  |
|2  |{Linda, Mia, Jonh}  |false |
|3  |{NULL, NULL, NULL}  |false |
+---+--------------------+------+



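## Struct Type Different Size

A struct has a fixed number of fields: `names` tuples that are shorter or longer than the three declared fields are rejected by `createDataFrame` with an error rather than being padded or truncated.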

In [8]:
# Rows whose `names` tuples have differing lengths: two, three and four values.
data_students_v3 = [
    (1, ("Jacob", "Smith"), True),
    (2, ("Linda", "Mia", "Jonh"), False),
    (3, ("Oliver", "James", "Johnson", "Thomas"), False),
]

# The `names` tuples in data_students_v3 do not all have the three values the
# schema declares, so building the DataFrame is expected to fail. The failure
# is the point of this cell: catch it and print it so the notebook still runs
# from top to bottom instead of stopping here.
try:
    df_students_v3 = spark.createDataFrame(data=data_students_v3,schema=schema_students)
    df_students_v3.show(truncate=False)
except ValueError as exc:
    # PySparkValueError derives from ValueError, so no extra import is needed.
    print(f"{type(exc).__name__}: {exc}")

PySparkValueError: [LENGTH_SHOULD_BE_THE_SAME] obj and fields should be of the same length, got 2 and 3.

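## Map Field

Unlike a struct, a `MapType` column lets each row carry its own set of keys, so rows may hold a different number of entries.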

In [9]:
# Catalog schema: a mandatory integer id, a nullable item name, and a nullable
# string-to-string map holding the item's attributes.
schema_catalog = StructType([
    StructField(name="id", dataType=IntegerType(), nullable=False),
    StructField(name="item", dataType=StringType(), nullable=True),
    StructField(
        name="description",
        dataType=MapType(keyType=StringType(), valueType=StringType()),
        nullable=True,
    ),
])

In [10]:
# Rows shaped as (id, item, description). Each description dict has its own
# keys and its own number of entries; one value is None.
data_catalog = [
    (1, "SmartTV", {"brand": "LG", "status": "available"}),
    (2, "Microwave", {"category": "kitchen", "status": None}),
    (
        3,
        "Smartphone",
        {
            "brand": "Iphone",
            "model": "13Pro",
            "connectivity": "4G",
            "camera": "13px",
        },
    ),
]

In [11]:
# Build the catalog DataFrame from the rows and the map-based schema,
# then print it with full-width columns.
df_catalog = spark.createDataFrame(data_catalog, schema=schema_catalog)
df_catalog.show(truncate=False)

+---+----------+---------------------------------------------------------------------+
|id |item      |description                                                          |
+---+----------+---------------------------------------------------------------------+
|1  |SmartTV   |{brand -> LG, status -> available}                                   |
|2  |Microwave |{category -> kitchen, status -> NULL}                                |
|3  |Smartphone|{model -> 13Pro, connectivity -> 4G, camera -> 13px, brand -> Iphone}|
+---+----------+---------------------------------------------------------------------+



                                                                                