### to_json()
* it converts a map or struct column into a JSON string column

In [0]:
from pyspark.sql.types import StringType,StructType,StructField
from pyspark.sql.functions import to_json
# One sample row; the Python dict in the second position is inferred as a
# map column, as the printed schema shows.
schema = ['name', 'properties']
data = [
    ('maheer', {'hair': 'black', 'eye': 'brown'}),
]
df = spark.createDataFrame(data, schema)
df.show(truncate=False)
df.printSchema()

# Add a column holding the map serialised as a JSON string.
df1 = df.withColumn('propString', to_json(df['properties']))
df1.show(truncate=False)
df1.printSchema()

+------+-----------------------------+
|name  |properties                   |
+------+-----------------------------+
|maheer|{eye -> brown, hair -> black}|
+------+-----------------------------+

root
 |-- name: string (nullable = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+------+-----------------------------+------------------------------+
|name  |properties                   |propString                    |
+------+-----------------------------+------------------------------+
|maheer|{eye -> brown, hair -> black}|{"eye":"brown","hair":"black"}|
+------+-----------------------------+------------------------------+

root
 |-- name: string (nullable = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- propString: string (nullable = true)



In [0]:
# Same sample values, but this time `properties` is declared as a struct
# with named fields `hair` and `eye` via an explicit schema.
data = [
    ('maheer', ('black', 'brown')),
]
schema = StructType([
    StructField('name', StringType()),
    StructField(
        'properties',
        StructType([
            StructField('hair', StringType()),
            StructField('eye', StringType()),
        ]),
    ),
])
df = spark.createDataFrame(data, schema)
df.show()
df.printSchema()

# Serialise the struct column to a JSON string; the struct field names
# become the JSON keys.
df1 = df.withColumn('PropJsonString', to_json(df['properties']))
df1.show(truncate=False)
df1.printSchema()

+------+--------------+
|  name|    properties|
+------+--------------+
|maheer|{black, brown}|
+------+--------------+

root
 |-- name: string (nullable = true)
 |-- properties: struct (nullable = true)
 |    |-- hair: string (nullable = true)
 |    |-- eye: string (nullable = true)

+------+--------------+------------------------------+
|name  |properties    |PropJsonString                |
+------+--------------+------------------------------+
|maheer|{black, brown}|{"hair":"black","eye":"brown"}|
+------+--------------+------------------------------+

root
 |-- name: string (nullable = true)
 |-- properties: struct (nullable = true)
 |    |-- hair: string (nullable = true)
 |    |-- eye: string (nullable = true)
 |-- PropJsonString: string (nullable = true)



### from_json()
* it parses a JSON string column into a struct column using a supplied schema

In [0]:
# Input for the from_json demo: `props` is a plain string column that
# happens to contain JSON text.
schema = ["id", "props"]
data = [
    ('maheer', '{"hair":"black","eye":"brown"}'),
]
df = spark.createDataFrame(data, schema)
df.show(truncate=False)
df.printSchema()

+------+------------------------------+
|id    |props                         |
+------+------------------------------+
|maheer|{"hair":"black","eye":"brown"}|
+------+------------------------------+

root
 |-- id: string (nullable = true)
 |-- props: string (nullable = true)



In [0]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType,StructField,StringType
# Schema describing the JSON held in `props`: two string fields.
structTypeSchema = StructType([
    StructField('hair', StringType()),
    StructField('eye', StringType()),
])

# Parse the JSON string into a struct column using that schema.
df1 = df.withColumn(
    'probsStruct',
    from_json(df['props'], structTypeSchema),
)
df1.show(truncate=False)
df1.printSchema()

+------+------------------------------+--------------+
|id    |props                         |probsStruct   |
+------+------------------------------+--------------+
|maheer|{"hair":"black","eye":"brown"}|{black, brown}|
+------+------------------------------+--------------+

root
 |-- id: string (nullable = true)
 |-- props: string (nullable = true)
 |-- probsStruct: struct (nullable = true)
 |    |-- hair: string (nullable = true)
 |    |-- eye: string (nullable = true)



In [0]:
# Promote each field of the parsed struct to its own top-level column.
df2 = (
    df1
    .withColumn('hair', df1['probsStruct']['hair'])
    .withColumn('eye', df1['probsStruct']['eye'])
)

In [0]:
# Render df2 as a table; `display` is presumably the notebook-provided
# (Databricks-style) helper rather than something imported here - TODO confirm.
display(df2)

id,props,probsStruct,hair,eye
maheer,"{""hair"":""black"",""eye"":""brown""}","List(black, brown)",black,brown


### get_json_object()
* it extracts a value from a JSON string column using a JSON path expression (e.g. `$.gender`) and returns null when the path is not present in a row

In [0]:
# Two rows of JSON text with a nested `address` object; only the first row
# carries a top-level "gender" key.
data = [
    ('maheer', '{"address":{"city":"hyd","state":"telangana"},"gender":"male"}'),
    ('wafa', '{"address":{"city":"bangalore","state":"karnataka"},"eye":"blue"}'),
]
schema = ['name', 'props']
df = spark.createDataFrame(data, schema)
df.show(truncate=False)
df.printSchema()

+------+-----------------------------------------------------------------+
|name  |props                                                            |
+------+-----------------------------------------------------------------+
|maheer|{"address":{"city":"hyd","state":"telangana"},"gender":"male"}   |
|wafa  |{"address":{"city":"bangalore","state":"karnataka"},"eye":"blue"}|
+------+-----------------------------------------------------------------+

root
 |-- name: string (nullable = true)
 |-- props: string (nullable = true)



In [0]:
from pyspark.sql.functions import get_json_object
# Extract the top-level "gender" value from each JSON string; rows without
# that key come back as null (see the `wafa` row in the output).
df1 = df.select(
    df['name'],
    get_json_object(df['props'], '$.gender').alias('gender'),
)
df1.show()

+------+------+
|  name|gender|
+------+------+
|maheer|  male|
|  wafa|  null|
+------+------+

