In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [4]:
# Build (or reuse) the Spark session used throughout this exercise.
spark = (
    SparkSession.builder
    .master("local")
    # A threshold of -1 disables automatic broadcast joins.
    .config("spark.sql.autoBroadcastJoinThreshold", -1)
    # NOTE(review): with a "local" master the executor presumably shares the
    # driver JVM, so this limit may not take effect as intended - confirm.
    .config("spark.executor.memory", "500mb")
    .appName("Ex13")
    .getOrCreate()
)

22/07/08 10:58:30 WARN Utils: Your hostname, computador resolves to a loopback address: 127.0.1.1; using 192.168.15.156 instead (on interface wlp2s0)
22/07/08 10:58:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/07/08 10:58:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# First attempt: load the JSON file with the reader's default options
# (no multiline option is set here).
df = spark.read.json('./data/data.json')

                                                                                

## Como os dados estão distribuídos em múltiplas linhas, o arquivo não é lido corretamente

In [6]:
# Print the first rows to inspect how the file was parsed.
df.show()

+-----------------+----+------------+
|  _corrupt_record|  id|        type|
+-----------------+----+------------+
|                {|null|        null|
|   	"id": "0001",|null|        null|
|	"type": "donut",|null|        null|
| 	"name": "Cake",|null|        null|
|    	"ppu": 0.55,|null|        null|
|      	"batters":|null|        null|
|              		{|null|        null|
|     			"batter":|null|        null|
|            				[|null|        null|
|             null|1001|     Regular|
|             null|1002|   Chocolate|
|             null|1003|   Blueberry|
|             null|1004|Devil's Food|
|            				]|null|        null|
|             		},|null|        null|
|      	"topping":|null|        null|
|              		[|null|        null|
|             null|5001|        None|
|             null|5002|      Glazed|
|             null|5005|       Sugar|
+-----------------+----+------------+
only showing top 20 rows



## Releitura do df com a opção multiline, separando os dados pelos índices

In [38]:
# Re-read the same file, this time with the reader's multiline option enabled
# so that a JSON document spanning several lines is parsed as one record.
df = (
    spark.read
    .option('multiline', 'true')
    .json('./data/data.json')
)

In [39]:
# Print the re-read DataFrame to check the parsed columns.
df.show()

+--------------------+----+----+----+--------------------+-----+
|             batters|  id|name| ppu|             topping| type|
+--------------------+----+----+----+--------------------+-----+
|[[[1001, Regular]...|0001|Cake|0.55|[[5001, None], [5...|donut|
+--------------------+----+----+----+--------------------+-----+



### batters.batter e topping são estruturados como arrays - é necessário separar os dados

In [40]:
# Show the DataFrame's one-line summary of column names and types.
display(df)

DataFrame[batters: struct<batter:array<struct<id:string,type:string>>>, id: string, name: string, ppu: double, topping: array<struct<id:string,type:string>>, type: string]

In [41]:
# Print the schema as a tree to see the nested struct/array fields.
df.printSchema()

root
 |-- batters: struct (nullable = true)
 |    |-- batter: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- ppu: double (nullable = true)
 |-- topping: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |-- type: string (nullable = true)



## Uso de explode() separa os dados de arrays

In [42]:
# Preview explode(): each element of the batters.batter array becomes its own row.
batter_rows = df.select(explode(col('batters.batter')))
batter_rows.show()

+--------------------+
|                 col|
+--------------------+
|     [1001, Regular]|
|   [1002, Chocolate]|
|   [1003, Blueberry]|
|[1004, Devil's Food]|
+--------------------+



### Criando a coluna topping_explode, contendo os dados de explode(topping)

In [43]:
# Add one row per element of the topping array, keeping the exploded struct
# in a new column next to the original columns.
df2 = df.withColumn(
    'topping_explode',
    explode(col('topping')),
)

In [44]:
# Print the DataFrame to check the new topping_explode column.
df2.show()

+--------------------+----+----+----+--------------------+-----+--------------------+
|             batters|  id|name| ppu|             topping| type|     topping_explode|
+--------------------+----+----+----+--------------------+-----+--------------------+
|[[[1001, Regular]...|0001|Cake|0.55|[[5001, None], [5...|donut|        [5001, None]|
|[[[1001, Regular]...|0001|Cake|0.55|[[5001, None], [5...|donut|      [5002, Glazed]|
|[[[1001, Regular]...|0001|Cake|0.55|[[5001, None], [5...|donut|       [5005, Sugar]|
|[[[1001, Regular]...|0001|Cake|0.55|[[5001, None], [5...|donut|[5007, Powdered S...|
|[[[1001, Regular]...|0001|Cake|0.55|[[5001, None], [5...|donut|[5006, Chocolate ...|
|[[[1001, Regular]...|0001|Cake|0.55|[[5001, None], [5...|donut|   [5003, Chocolate]|
|[[[1001, Regular]...|0001|Cake|0.55|[[5001, None], [5...|donut|       [5004, Maple]|
+--------------------+----+----+----+--------------------+-----+--------------------+



### Criação das colunas contendo os id e os tipos de topping, bem como a deleção das colunas topping e topping_explode

In [45]:
# Promote the fields of the exploded topping struct to top-level columns,
# then discard the source array and the intermediate struct column.
topping_id_col = col('topping_explode.id')
topping_type_col = col('topping_explode.type')
df2 = (
    df2
    .withColumn('topping_id', topping_id_col)
    .withColumn('topping_type', topping_type_col)
    .drop('topping', 'topping_explode')
)

In [46]:
# Print the DataFrame to check the flattened topping_id / topping_type columns.
df2.show()

+--------------------+----+----+----+-----+----------+--------------------+
|             batters|  id|name| ppu| type|topping_id|        topping_type|
+--------------------+----+----+----+-----+----------+--------------------+
|[[[1001, Regular]...|0001|Cake|0.55|donut|      5001|                None|
|[[[1001, Regular]...|0001|Cake|0.55|donut|      5002|              Glazed|
|[[[1001, Regular]...|0001|Cake|0.55|donut|      5005|               Sugar|
|[[[1001, Regular]...|0001|Cake|0.55|donut|      5007|      Powdered Sugar|
|[[[1001, Regular]...|0001|Cake|0.55|donut|      5006|Chocolate with Sp...|
|[[[1001, Regular]...|0001|Cake|0.55|donut|      5003|           Chocolate|
|[[[1001, Regular]...|0001|Cake|0.55|donut|      5004|               Maple|
+--------------------+----+----+----+-----+----------+--------------------+



### Criando a coluna batter_explode, contendo os dados de explode(batters.batter)

In [48]:
# Add one row per element of the batters.batter array, keeping the exploded
# struct in a new column.
df2 = df2.withColumn(
    'batter_explode',
    explode(col('batters.batter')),
)

In [49]:
# Print the DataFrame to check the new batter_explode column.
df2.show()

+--------------------+----+----+----+-----+----------+--------------------+--------------------+
|             batters|  id|name| ppu| type|topping_id|        topping_type|      batter_explode|
+--------------------+----+----+----+-----+----------+--------------------+--------------------+
|[[[1001, Regular]...|0001|Cake|0.55|donut|      5001|                None|     [1001, Regular]|
|[[[1001, Regular]...|0001|Cake|0.55|donut|      5001|                None|   [1002, Chocolate]|
|[[[1001, Regular]...|0001|Cake|0.55|donut|      5001|                None|   [1003, Blueberry]|
|[[[1001, Regular]...|0001|Cake|0.55|donut|      5001|                None|[1004, Devil's Food]|
|[[[1001, Regular]...|0001|Cake|0.55|donut|      5002|              Glazed|     [1001, Regular]|
|[[[1001, Regular]...|0001|Cake|0.55|donut|      5002|              Glazed|   [1002, Chocolate]|
|[[[1001, Regular]...|0001|Cake|0.55|donut|      5002|              Glazed|   [1003, Blueberry]|
|[[[1001, Regular]...|0001|Cak

### Criação das colunas contendo os id e os tipos de batter, bem como a deleção das colunas batters e batter_explode

In [51]:
# Promote the fields of the exploded batter struct to top-level columns,
# then discard the nested batters struct and the intermediate struct column.
batter_id_col = col('batter_explode.id')
batter_type_col = col('batter_explode.type')
df2 = (
    df2
    .withColumn('batter_id', batter_id_col)
    .withColumn('batter_type', batter_type_col)
    .drop('batters', 'batter_explode')
)

In [52]:
# Print the final, fully flattened DataFrame.
df2.show()

+----+----+----+-----+----------+--------------------+---------+------------+
|  id|name| ppu| type|topping_id|        topping_type|batter_id| batter_type|
+----+----+----+-----+----------+--------------------+---------+------------+
|0001|Cake|0.55|donut|      5001|                None|     1001|     Regular|
|0001|Cake|0.55|donut|      5001|                None|     1002|   Chocolate|
|0001|Cake|0.55|donut|      5001|                None|     1003|   Blueberry|
|0001|Cake|0.55|donut|      5001|                None|     1004|Devil's Food|
|0001|Cake|0.55|donut|      5002|              Glazed|     1001|     Regular|
|0001|Cake|0.55|donut|      5002|              Glazed|     1002|   Chocolate|
|0001|Cake|0.55|donut|      5002|              Glazed|     1003|   Blueberry|
|0001|Cake|0.55|donut|      5002|              Glazed|     1004|Devil's Food|
|0001|Cake|0.55|donut|      5005|               Sugar|     1001|     Regular|
|0001|Cake|0.55|donut|      5005|               Sugar|     1002|