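# `to_json` Function

This notebook builds a DataFrame with nested struct columns and then uses PySpark's `to_json` to serialize each struct column into a JSON string column.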

In [1]:
import findspark
# Run before any pyspark import so the local Spark installation is
# discoverable from this kernel (findspark adds pyspark to sys.path).
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Reuse the already-running session if there is one; otherwise start a new
# session with the default configuration.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/30 00:17:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/30 00:17:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Base nested DataFrame

In [4]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType

In [5]:
# One tuple per student, laid out to match schema_students:
#   (id,
#    (first_name, middle_name, last_name),
#    (exam_1, exam_2, exam_3),
#    (subject, approve, assistance))
data_students = [
    (1, ("Jacob", "Kyle", "Smith"), (8, 10, 2), ("math", True, 4)),
    (2, ("Linda", "Mia", "Jonh"), (8, 10, 2), ("science", False, 10)),
    (3, ("Oliver", "James", "Johnson"), (7, 9, 3), ("english", True, 7)),
    (4, ("Emma", "Sophia", "Williams"), (6, 8, 5), ("history", False, 9)),
    (5, ("Liam", "Lucas", "Brown"), (9, 7, 4), ("math", True, 6)),
    (6, ("Ava", "Isabella", "Jones"), (10, 8, 6), ("science", True, 8)),
    (7, ("Noah", "Mason", "Davis"), (5, 7, 2), ("english", False, 5)),
    (8, ("Olivia", "Emily", "Miller"), (8, 9, 3), ("history", True, 10)),
    (9, ("Elijah", "Logan", "Garcia"), (7, 6, 4), ("math", False, 3)),
    (10, ("Sophia", "Charlotte", "Martinez"), (9, 8, 7), ("science", True, 7)),
]

# Explicit schema for the student rows: a required integer id followed by
# three nested struct columns (names, notes, area).
schema_students = StructType([
    StructField("id", IntegerType(), nullable=False),
    StructField(
        "names",
        StructType([
            StructField("first_name", StringType(), nullable=False),
            StructField("middle_name", StringType(), nullable=True),
            StructField("last_name", StringType(), nullable=False),
        ]),
    ),
    StructField(
        "notes",
        StructType([
            StructField("exam_1", IntegerType(), nullable=True),
            StructField("exam_2", IntegerType(), nullable=True),
            StructField("exam_3", IntegerType(), nullable=True),
        ]),
    ),
    StructField(
        "area",
        StructType([
            StructField("subject", StringType(), nullable=True),
            StructField("approve", BooleanType(), nullable=True),
            StructField("assistance", IntegerType(), nullable=True),
        ]),
    ),
])

In [6]:
# Build the nested DataFrame from the literal rows and the explicit schema,
# then display every row without truncating the struct columns.
df_students = spark.createDataFrame(
    data=data_students,
    schema=schema_students,
)
df_students.show(truncate=False)

                                                                                

+---+-----------------------------+----------+--------------------+
|id |names                        |notes     |area                |
+---+-----------------------------+----------+--------------------+
|1  |{Jacob, Kyle, Smith}         |{8, 10, 2}|{math, true, 4}     |
|2  |{Linda, Mia, Jonh}           |{8, 10, 2}|{science, false, 10}|
|3  |{Oliver, James, Johnson}     |{7, 9, 3} |{english, true, 7}  |
|4  |{Emma, Sophia, Williams}     |{6, 8, 5} |{history, false, 9} |
|5  |{Liam, Lucas, Brown}         |{9, 7, 4} |{math, true, 6}     |
|6  |{Ava, Isabella, Jones}       |{10, 8, 6}|{science, true, 8}  |
|7  |{Noah, Mason, Davis}         |{5, 7, 2} |{english, false, 5} |
|8  |{Olivia, Emily, Miller}      |{8, 9, 3} |{history, true, 10} |
|9  |{Elijah, Logan, Garcia}      |{7, 6, 4} |{math, false, 3}    |
|10 |{Sophia, Charlotte, Martinez}|{9, 8, 7} |{science, true, 7}  |
+---+-----------------------------+----------+--------------------+



In [7]:
# Print the schema tree to confirm names, notes and area are struct columns.
df_students.printSchema()

root
 |-- id: integer (nullable = false)
 |-- names: struct (nullable = true)
 |    |-- first_name: string (nullable = false)
 |    |-- middle_name: string (nullable = true)
 |    |-- last_name: string (nullable = false)
 |-- notes: struct (nullable = true)
 |    |-- exam_1: integer (nullable = true)
 |    |-- exam_2: integer (nullable = true)
 |    |-- exam_3: integer (nullable = true)
 |-- area: struct (nullable = true)
 |    |-- subject: string (nullable = true)
 |    |-- approve: boolean (nullable = true)
 |    |-- assistance: integer (nullable = true)



## `to_json` Function

In [8]:
from pyspark.sql.functions import to_json, col

In [9]:
# Keep id as-is and replace each struct column with its JSON string
# representation, preserving the original column names and order.
df_students_json = df_students.select(
    col("id"),
    *[
        to_json(col(struct_name)).alias(struct_name)
        for struct_name in ("names", "notes", "area")
    ]
)

df_students_json.show(truncate=False)

+---+------------------------------------------------------------------------+-----------------------------------+-----------------------------------------------------+
|id |names                                                                   |notes                              |area                                                 |
+---+------------------------------------------------------------------------+-----------------------------------+-----------------------------------------------------+
|1  |{"first_name":"Jacob","middle_name":"Kyle","last_name":"Smith"}         |{"exam_1":8,"exam_2":10,"exam_3":2}|{"subject":"math","approve":true,"assistance":4}     |
|2  |{"first_name":"Linda","middle_name":"Mia","last_name":"Jonh"}           |{"exam_1":8,"exam_2":10,"exam_3":2}|{"subject":"science","approve":false,"assistance":10}|
|3  |{"first_name":"Oliver","middle_name":"James","last_name":"Johnson"}     |{"exam_1":7,"exam_2":9,"exam_3":3} |{"subject":"english","approve":true,"assi

In [10]:
# Print the schema tree to confirm the struct columns are now plain strings.
df_students_json.printSchema()

root
 |-- id: integer (nullable = false)
 |-- names: string (nullable = true)
 |-- notes: string (nullable = true)
 |-- area: string (nullable = true)

