In [22]:
# Importando as bibliotecas utilizadas
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window

In [5]:
# Build (or reuse) the Spark session for this notebook.
session_builder = SparkSession.builder.appName('PySpark - Funções Janeladas')
# Eager evaluation lets a DataFrame left as the last expression of a cell be
# rendered as a table, pandas-style, without calling .show().
session_builder = session_builder.config('spark.sql.repl.eagerEval.enabled', True)
spark = session_builder.getOrCreate()

In [8]:
# Load the source tables, keeping only the columns used below and giving each
# DataFrame an alias matching its variable name.
compras = (
    spark.read
    .parquet('./DATASETS/COMPRAS.parquet')
    .select('id', 'data', 'cd_livro', 'cd_cliente')
    .alias('compras')
)
livros = (
    spark.read
    .parquet('./DATASETS/LIVROS.parquet')
    .select('id', 'data_lancamento', 'preco')
    .alias('livros')
)
# All columns of the authors table are kept.
autores = (
    spark.read
    .parquet('./DATASETS/AUTORES.parquet')
    .alias('autores')
)

In [21]:
# Combine purchases, books and authors into a single DataFrame.
# Both joins are inner joins; the 'id' columns coming from livros and autores
# are dropped afterwards so that only the purchase 'id' remains.
df = (
    compras
    .join(livros, on=compras['cd_livro'] == livros['id'], how='inner')
    .join(autores, on=autores['id'] == livros['id'], how='inner')
    .drop(livros['id'], autores['id'])
)
# Leave the DataFrame as the last expression so the notebook renders it.
df

id,data,cd_livro,cd_cliente,data_lancamento,preco,titulo,autor
12389,2021-07-24,30334762,3339828,2013-05-08,123.47,Em Busca do Tempo...,Marcel Proust
12534,2021-07-15,14347542,7799936,2007-04-05,182.43,Fogo Morto,José Lins do Rego
12574,2020-05-19,10325500,6273720,2000-09-09,30.62,A Obscena Senhora D,Hilda Hilst
12675,2021-07-11,35940339,649001,2014-08-11,213.6,Adeus às Armas,Ernest Hemingway
13457,2021-10-26,21006591,4312106,2019-02-24,96.84,Paulicéia Desvair...,Mário de Andrade
13478,2020-05-10,19488257,670483,2013-05-19,17.11,O Ódio que Você S...,Angie Thomas
13562,2021-10-28,35940339,1275633,2014-08-11,213.6,Adeus às Armas,Ernest Hemingway
13679,2020-10-19,30144651,1261140,2011-04-14,242.2,Zero,Ignácio De Loyola...
13687,2020-05-29,21923195,4098904,2015-02-22,38.89,O Ex-Mágico,Murilo Rubião
13796,2021-10-15,16778973,4024706,2017-06-23,99.81,Crime e Castigo,Fiódor Dostoiévski


In [36]:
# Window specifications used by the examples below.

# Global row numbering ordered by purchase id.
# NOTE: there is no partitionBy here, so Spark moves every row into a single
# partition to evaluate this window (it logs "No Partition Defined for Window
# operation!"). Acceptable for a small demo, not for large data.
win_num = Window.orderBy('id')

# Purchase sequence of each customer, ordered by purchase date.
# 'id' is added as a tie-breaker: 'data' alone is not unique within a customer,
# which would make row_number() non-deterministic for same-day purchases and,
# because the default frame of an ordered window is RANGE-based, would give
# every same-day row the same cumulative sum instead of a per-row running total.
# Assumes 'id' uniquely identifies a purchase -- TODO confirm against the data.
win_ordem_compra = Window.partitionBy('cd_cliente').orderBy('data', 'id')

# Release order of the books of each author.
win_lancamento_livro = Window.partitionBy('autor').orderBy('data_lancamento')

In [41]:
# Window-function examples (illustrative only, not meant for real use):
#   num               - row number over win_num
#   ordem_compra      - row number over win_ordem_compra
#   acumulado_cliente - sum of 'preco' over win_ordem_compra, rounded to 2 decimals
#
# Disabled steps kept for reference (release order of each book per author):
#   .dropDuplicates(['cd_livro', 'autor'])  # remove duplicates on these two columns
#   .withColumn('lancamento_livro', F.row_number().over(win_lancamento_livro))
df.select(
    '*',
    F.row_number().over(win_num).alias('num'),
    F.row_number().over(win_ordem_compra).alias('ordem_compra'),
    F.round(F.sum('preco').over(win_ordem_compra), 2).alias('acumulado_cliente'),
)

24/03/24 19:48:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/24 19:48:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/24 19:48:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/24 19:48:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/24 19:48:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/24 19:48:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/03/24 1

id,data,cd_livro,cd_cliente,data_lancamento,preco,titulo,autor,num,ordem_compra,acumulado_cliente
184365,2020-10-31,100520231,1010444,2009-08-17,35.55,Galáxias,Haroldo de Campos,3330,1,35.55
162408,2021-06-16,36059407,1010444,2021-01-20,222.98,As Histórias Comp...,Franz Kafka,2874,2,258.53
629783,2021-09-14,16581063,1010444,2013-05-16,145.25,Macunaíma – O Her...,Mário de Andrade,11521,3,403.78
594608,2021-10-11,30099528,1010444,2000-02-06,208.39,Retrato do Artist...,James Joyce,10824,4,612.17
561293,2021-11-02,30144651,1010444,2011-04-14,242.2,Zero,Ignácio De Loyola...,10118,5,854.37
648950,2022-01-15,22112497,1010444,2021-01-06,10.84,Mrs Dalloway,Virginia Woolf,11896,6,865.21
184567,2022-03-29,20414016,1010444,2011-06-09,212.1,Vidas Secas,Graciliano Ramos,3335,7,1077.31
26947,2022-05-04,36030824,1010444,2005-01-26,226.36,Tremor de Terra,Luiz Vilela,317,8,1303.67
472081,2022-06-19,26925428,1010444,2011-09-04,193.19,"Sing, Unburied, S...",Jesmyn Ward,8520,9,1496.86
27189,2022-06-26,12489208,1010444,2011-10-30,160.53,Triste Fim de Pol...,Lima Barreto,322,10,1657.39
