In [2]:
# Importando as bibliotecas utilizadas
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [4]:
# Create (or reuse) the named Spark session. Eager evaluation is switched on so
# that a DataFrame left as the last expression of a cell is shown as a
# pandas-like table.
APP_NAME = 'PySpark - UNION e DROP'
EAGER_EVAL_OPTION = 'spark.sql.repl.eagerEval.enabled'

spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .config(EAGER_EVAL_OPTION, True)
    .getOrCreate()
)

In [10]:
# Read the books PARQUET file into a DataFrame, keeping only 5 rows so the
# examples below stay easy to follow.
LIVROS_PARQUET = './DATASETS/LIVROS.parquet'

livros = (
    spark.read
    .parquet(LIVROS_PARQUET)
    .limit(5)
)
livros

id,cnpj_editora,data_lancamento,ean,isbn10,numero_paginas,preco
102961160,54.317.982/0001-79,2011-09-19,4367115749184,0-06-594558-1,516,113.84
43636240,16.938.042/0001-08,2019-05-11,1239569754256,0-611-23680-X,371,207.67
52928059,57.624.038/0001-52,2003-08-03,5462162528221,1-109-29542-1,198,107.89
88482271,16.249.378/0001-63,2022-09-07,5713629047213,1-229-68842-0,632,143.95
28125446,90.425.178/0001-77,2011-07-25,2226757040245,0-657-66391-3,384,156.49


In [14]:
# Stack the dataset on top of itself. For union() both sides must have the same
# number of columns, and they should be the same columns. Repeated rows are
# kept, so the result below has every row twice.
uni = livros.union(livros)
uni

id,cnpj_editora,data_lancamento,ean,isbn10,numero_paginas,preco
102961160,54.317.982/0001-79,2011-09-19,4367115749184,0-06-594558-1,516,113.84
43636240,16.938.042/0001-08,2019-05-11,1239569754256,0-611-23680-X,371,207.67
52928059,57.624.038/0001-52,2003-08-03,5462162528221,1-109-29542-1,198,107.89
88482271,16.249.378/0001-63,2022-09-07,5713629047213,1-229-68842-0,632,143.95
28125446,90.425.178/0001-77,2011-07-25,2226757040245,0-657-66391-3,384,156.49
102961160,54.317.982/0001-79,2011-09-19,4367115749184,0-06-594558-1,516,113.84
43636240,16.938.042/0001-08,2019-05-11,1239569754256,0-611-23680-X,371,207.67
52928059,57.624.038/0001-52,2003-08-03,5462162528221,1-109-29542-1,198,107.89
88482271,16.249.378/0001-63,2022-09-07,5713629047213,1-229-68842-0,632,143.95
28125446,90.425.178/0001-77,2011-07-25,2226757040245,0-657-66391-3,384,156.49


In [18]:
# Build a DataFrame with the same columns as `livros`, in reverse order.
# The order is derived from livros.columns instead of a hand-typed list, so it
# cannot drift from the actual schema or pick up a typo in a column name.
livros_inversos = livros.select(*reversed(livros.columns))
livros_inversos

preco,numero_paginas,isbn10,ean,data_lancamento,cnpj_editora,id
113.84,516,0-06-594558-1,4367115749184,2011-09-19,54.317.982/0001-79,102961160
207.67,371,0-611-23680-X,1239569754256,2019-05-11,16.938.042/0001-08,43636240
107.89,198,1-109-29542-1,5462162528221,2003-08-03,57.624.038/0001-52,52928059
143.95,632,1-229-68842-0,5713629047213,2022-09-07,16.249.378/0001-63,88482271
156.49,384,0-657-66391-3,2226757040245,2011-07-25,90.425.178/0001-77,28125446


In [19]:
# Try a plain union of the two DataFrames. union() pairs columns by position,
# not by name, so the rows coming from `livros` land under the wrong headers
# (e.g. the ids appear under `preco`) - see the second half of the output.
livros_inversos.union(livros)

preco,numero_paginas,isbn10,ean,data_lancamento,cnpj_editora,id
113.84,516,0-06-594558-1,4367115749184,2011-09-19,54.317.982/0001-79,102961160.0
207.67,371,0-611-23680-X,1239569754256,2019-05-11,16.938.042/0001-08,43636240.0
107.89,198,1-109-29542-1,5462162528221,2003-08-03,57.624.038/0001-52,52928059.0
143.95,632,1-229-68842-0,5713629047213,2022-09-07,16.249.378/0001-63,88482271.0
156.49,384,0-657-66391-3,2226757040245,2011-07-25,90.425.178/0001-77,28125446.0
102961160.0,54.317.982/0001-79,2011-09-19,4367115749184,0-06-594558-1,516,113.84
43636240.0,16.938.042/0001-08,2019-05-11,1239569754256,0-611-23680-X,371,207.67
52928059.0,57.624.038/0001-52,2003-08-03,5462162528221,1-109-29542-1,198,107.89
88482271.0,16.249.378/0001-63,2022-09-07,5713629047213,1-229-68842-0,632,143.95
28125446.0,90.425.178/0001-77,2011-07-25,2226757040245,0-657-66391-3,384,156.49


In [22]:
# unionByName() matches columns by name instead of by position, so the values
# stay under the right headers even though the column order differs.
livros_inversos.unionByName(livros)

preco,numero_paginas,isbn10,ean,data_lancamento,cnpj_editora,id
113.84,516,0-06-594558-1,4367115749184,2011-09-19,54.317.982/0001-79,102961160
207.67,371,0-611-23680-X,1239569754256,2019-05-11,16.938.042/0001-08,43636240
107.89,198,1-109-29542-1,5462162528221,2003-08-03,57.624.038/0001-52,52928059
143.95,632,1-229-68842-0,5713629047213,2022-09-07,16.249.378/0001-63,88482271
156.49,384,0-657-66391-3,2226757040245,2011-07-25,90.425.178/0001-77,28125446
113.84,516,0-06-594558-1,4367115749184,2011-09-19,54.317.982/0001-79,102961160
207.67,371,0-611-23680-X,1239569754256,2019-05-11,16.938.042/0001-08,43636240
107.89,198,1-109-29542-1,5462162528221,2003-08-03,57.624.038/0001-52,52928059
143.95,632,1-229-68842-0,5713629047213,2022-09-07,16.249.378/0001-63,88482271
156.49,384,0-657-66391-3,2226757040245,2011-07-25,90.425.178/0001-77,28125446


* Se for o caso, pode-se fazer a união de dataframes com colunas diferentes (as colunas ausentes são preenchidas com valores nulos) usando o parâmetro do `unionByName`:
> allowMissingColumns=True

In [24]:
# Remove fully duplicated rows, i.e. rows that are equal in every column.
# drop_duplicates() is an equivalent spelling of the same call.
uni.dropDuplicates()

id,cnpj_editora,data_lancamento,ean,isbn10,numero_paginas,preco
102961160,54.317.982/0001-79,2011-09-19,4367115749184,0-06-594558-1,516,113.84
43636240,16.938.042/0001-08,2019-05-11,1239569754256,0-611-23680-X,371,207.67
52928059,57.624.038/0001-52,2003-08-03,5462162528221,1-109-29542-1,198,107.89
88482271,16.249.378/0001-63,2022-09-07,5713629047213,1-229-68842-0,632,143.95
28125446,90.425.178/0001-77,2011-07-25,2226757040245,0-657-66391-3,384,156.49


In [29]:
# Remove duplicates looking at a single column only.
# In the rows shown, every `numero_paginas` value in `livros` is distinct, so
# all five rows are kept here.
colunas_chave = ['numero_paginas']
livros.drop_duplicates(colunas_chave)

id,cnpj_editora,data_lancamento,ean,isbn10,numero_paginas,preco
52928059,57.624.038/0001-52,2003-08-03,5462162528221,1-109-29542-1,198,107.89
43636240,16.938.042/0001-08,2019-05-11,1239569754256,0-611-23680-X,371,207.67
28125446,90.425.178/0001-77,2011-07-25,2226757040245,0-657-66391-3,384,156.49
102961160,54.317.982/0001-79,2011-09-19,4367115749184,0-06-594558-1,516,113.84
88482271,16.249.378/0001-63,2022-09-07,5713629047213,1-229-68842-0,632,143.95
