# StructType VS StructField

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/30 00:15:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Define structure using StructType and StructFiel

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType

In [4]:
data_catalog = [
    (1, 'Harry poter', 'Books', 120, True),
    (2, 'Microwave', 'Kitchen', None, True),
    (3, 'Jacket',180, None,None,),
    (4, None, 'Furniture', 250, False),
    (5, 'Xbox360', 'Toys', None, False)
]

In [5]:
schema_catalog = StructType([
    StructField("id",IntegerType(),False),
    StructField("item",StringType(),True),
    StructField("category",StringType(),True),
    StructField("price",IntegerType(),True),
    StructField("available",BooleanType(),True)
])

In [6]:
df_catalog = spark.createDataFrame(data=data_catalog,schema=schema_catalog)
df_catalog.show()

                                                                                

+---+-----------+---------+-----+---------+
| id|       item| category|price|available|
+---+-----------+---------+-----+---------+
|  1|Harry poter|    Books|  120|     true|
|  2|  Microwave|  Kitchen| NULL|     true|
|  3|     Jacket|      180| NULL|     NULL|
|  4|       NULL|Furniture|  250|    false|
|  5|    Xbox360|     Toys| NULL|    false|
+---+-----------+---------+-----+---------+



In [7]:
df_catalog.printSchema()

root
 |-- id: integer (nullable = false)
 |-- item: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- available: boolean (nullable = true)



## Define Nested Structure

In [8]:
data_students = [
    (1, ('Jacob', 'Kyle', 'Smith'), (8, 10, 2), ("math", True, 4)),
    (2, ('Linda', 'Mia', 'Jonh'), (8, 10, 2), ("science", False, 10)),
    (3, ('Oliver', 'James', 'Johnson'), (7, 9, 3), ("english", True, 7)),
    (4, ('Emma', 'Sophia', 'Williams'), (6, 8, 5), ("history", False, 9)),
    (5, ('Liam', 'Lucas', 'Brown'), (9, 7, 4), ("math", True, 6)),
    (6, ('Ava', 'Isabella', 'Jones'), (10, 8, 6), ("science", True, 8)),
    (7, ('Noah', 'Mason', 'Davis'), (5, 7, 2), ("english", False, 5)),
    (8, ('Olivia', 'Emily', 'Miller'), (8, 9, 3), ("history", True, 10)),
    (9, ('Elijah', 'Logan', 'Garcia'), (7, 6, 4), ("math", False, 3)),
    (10, ('Sophia', 'Charlotte', 'Martinez'), (9, 8, 7), ("science", True, 7)),
]

In [9]:
schema_students = StructType([
    StructField("id",IntegerType(),False),
    StructField("names",StructType([
        StructField("first_name",StringType(),False),
        StructField("middle_name",StringType(),True),
        StructField("last_name",StringType(),False)
    ])),
    StructField("notes",StructType([
        StructField("exam_1",IntegerType(),True),
        StructField("exam_2",IntegerType(),True),
        StructField("exam_3",IntegerType(),True)
    ])),
    StructField("area",StructType([
        StructField("subject",StringType(),True),
        StructField("approve",BooleanType(),True),
        StructField("assistance",IntegerType(),True)
    ]))
])

In [10]:
df_students = spark.createDataFrame(data=data_students,schema=schema_students)
df_students.show(truncate=False)

                                                                                

+---+-----------------------------+----------+--------------------+
|id |names                        |notes     |area                |
+---+-----------------------------+----------+--------------------+
|1  |{Jacob, Kyle, Smith}         |{8, 10, 2}|{math, true, 4}     |
|2  |{Linda, Mia, Jonh}           |{8, 10, 2}|{science, false, 10}|
|3  |{Oliver, James, Johnson}     |{7, 9, 3} |{english, true, 7}  |
|4  |{Emma, Sophia, Williams}     |{6, 8, 5} |{history, false, 9} |
|5  |{Liam, Lucas, Brown}         |{9, 7, 4} |{math, true, 6}     |
|6  |{Ava, Isabella, Jones}       |{10, 8, 6}|{science, true, 8}  |
|7  |{Noah, Mason, Davis}         |{5, 7, 2} |{english, false, 5} |
|8  |{Olivia, Emily, Miller}      |{8, 9, 3} |{history, true, 10} |
|9  |{Elijah, Logan, Garcia}      |{7, 6, 4} |{math, false, 3}    |
|10 |{Sophia, Charlotte, Martinez}|{9, 8, 7} |{science, true, 7}  |
+---+-----------------------------+----------+--------------------+



In [11]:
df_students.printSchema()

root
 |-- id: integer (nullable = false)
 |-- names: struct (nullable = true)
 |    |-- first_name: string (nullable = false)
 |    |-- middle_name: string (nullable = true)
 |    |-- last_name: string (nullable = false)
 |-- notes: struct (nullable = true)
 |    |-- exam_1: integer (nullable = true)
 |    |-- exam_2: integer (nullable = true)
 |    |-- exam_3: integer (nullable = true)
 |-- area: struct (nullable = true)
 |    |-- subject: string (nullable = true)
 |    |-- approve: boolean (nullable = true)
 |    |-- assistance: integer (nullable = true)

