In [1]:
# # Dependency
# # Install findspark
# !pip install findspark

In [1]:
# Import findspark and initialise it before the pyspark imports below.
import findspark
findspark.init()

# Import required modules
import pyspark
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, DateType, TimestampType
# NOTE: importing `sum` here shadows Python's built-in `sum` for the rest of the notebook;
# the aggregation cells use the `F.` prefix instead.
from pyspark.sql.functions import col, sum, from_json, unix_timestamp, window
import pyspark.sql.functions as F

In [2]:
# Kafka connector: ask spark-submit to pull the Spark/Kafka integration package
# when the JVM is launched. This has to be set before the Spark session is created.
import os

# The 3.5.1 coordinate is the one in use; 3.3.0 was the previously tried alternative.
# NOTE(review): the connector version presumably has to match the installed Spark version - confirm.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1 "
    "pyspark-shell"
)

In [3]:
# Create the Spark session used by every cell below (application name "case").
spark = (
    SparkSession.builder
    .appName("case")
    .getOrCreate()
)

24/05/25 21:29:51 WARN Utils: Your hostname, cj resolves to a loopback address: 127.0.1.1; using 192.168.15.34 instead (on interface enp2s0)
24/05/25 21:29:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/cj/.ivy2/cache
The jars for the packages stored in: /home/cj/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-beb94895-e474-44ab-9066-e24e5033b593;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.1 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
:: resolution report :: resolve 632ms :: artifacts dl 19ms
	:: modules

In [4]:
# Subscribe to the Kafka topic that carries the data stream we want to pull.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "vendas-deshboard-bronze")
    .option("failOnDataLoss", "false")
    # Alternatives for startingOffsets: "earliest" or "latest".
    .option("startingOffsets", "earliest")
    .load()
)
# Last expression of the cell: shows the streaming DataFrame's column summary.
df

DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]

In [5]:
# Schema applied to the JSON payload of each Kafka message value.
# Every field is declared with nullable=False, yet the schema printed after
# from_json reports all columns as nullable = true.
schema_dados = StructType([
    StructField(nome, tipo, False)
    for nome, tipo in (
        ("id_vendedor", IntegerType()),
        ("id_cliente", IntegerType()),
        ("id_produto", IntegerType()),
        ("id_venda", IntegerType()),
        ("quantidade", IntegerType()),
        ("valor_unitario", DoubleType()),
        ("valor_total", DoubleType()),
        ("desconto", DoubleType()),
        ("data", DateType()),
    )
])

In [6]:
# Read each message value as a string, parse it as JSON with schema_dados,
# and expand the parsed struct into one top-level column per field.
df_conversao = (
    df.selectExpr("CAST(value AS STRING)")
    .withColumn("jsonData", from_json(col("value"), schema_dados))
    .select("jsonData.*")
)

df_conversao.printSchema()

root
 |-- id_vendedor: integer (nullable = true)
 |-- id_cliente: integer (nullable = true)
 |-- id_produto: integer (nullable = true)
 |-- id_venda: integer (nullable = true)
 |-- quantidade: integer (nullable = true)
 |-- valor_unitario: double (nullable = true)
 |-- valor_total: double (nullable = true)
 |-- desconto: double (nullable = true)
 |-- data: date (nullable = true)



In [7]:
# Rename the columns to shorter names for the analysis.
# Pairs are (source column, output name); several columns keep their name.
df_conversao_dados = df_conversao.select(
    *[
        col(origem).alias(destino)
        for origem, destino in (
            ("id_vendedor", "vendedor"),
            ("id_cliente", "cliente"),
            ("id_produto", "produto"),
            ("id_venda", "venda"),
            ("quantidade", "quantidade"),
            ("valor_unitario", "valor_unitario"),
            ("valor_total", "total"),
            ("desconto", "desconto"),
            ("data", "data"),
        )
    ]
)

In [23]:
# Analysis object: per-product totals. Despite the "media" in the variable name,
# every aggregate here is a sum (quantity, sale value, discount, final value).
# NOTE(review): valor_final multiplies `quantidade` by `total` (the valor_total field);
# if valor_total already accounts for the quantity this double-counts - confirm with the producer.
df_media_vendas = (
    df_conversao_dados
    .select(
        F.col("produto"),
        F.col("quantidade"),
        F.col("total"),
        F.col("desconto"),
        (F.col("quantidade") * F.col("total") - F.col("desconto")).alias("valor_final"),
    )
    .groupBy("produto")
    .agg(
        F.sum("quantidade").alias("QTD"),
        F.sum("total").alias("VALOR VENDA"),
        F.sum("desconto").alias("DESCONTO"),
        F.sum("valor_final").alias("VALOR FINAL"),
    )
    # Drop the group whose product id is null.
    .filter(F.col("produto").isNotNull())
    .orderBy(F.col("VALOR FINAL").desc())
)
df_media_vendas.printSchema()

# Pack every aggregated column into a single JSON string column named "value".
df_media_vendas_json = df_media_vendas.select(
    F.to_json(F.struct(*df_media_vendas.columns)).alias("value")
)
df_media_vendas_json.printSchema()

root
 |-- produto: integer (nullable = true)
 |-- QTD: long (nullable = true)
 |-- VALOR VENDA: double (nullable = true)
 |-- DESCONTO: double (nullable = true)
 |-- VALOR FINAL: double (nullable = true)

root
 |-- value: string (nullable = true)



In [24]:
# Publish the aggregated result to the Kafka topic "vendas-deshboard-gold".
# The original chain called .format("console") and then .format("kafka"); only the
# last format takes effect, so the console format and its console-only "truncate"
# option were dead configuration and have been removed. The effective sink is unchanged.
# NOTE(review): checkpointLocation is used by Spark as a directory despite the
# ".txt" suffix; the path is kept as-is so an existing checkpoint stays valid.
query = (
    df_media_vendas_json
    .selectExpr("CAST(value AS STRING)")
    .writeStream
    .format("kafka")
    .outputMode("complete")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "vendas-deshboard-gold")
    .option("checkpointLocation", "./check.txt")
    .start()
)
# start() is not followed by awaitTermination(), so the cell returns and the
# stream can later be ended with query.stop().

24/05/25 21:40:40 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/05/25 21:40:40 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

In [22]:
# Stop the streaming query currently bound to `query`.
query.stop()

In [15]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()

In [None]:
# Debug sink: write the aggregated table to the console in complete mode.
# NOTE(review): this cell sits after spark.stop() and rebinds `query`; on a
# top-to-bottom run the session is already stopped here - confirm its intended position.
query = (
    df_media_vendas.writeStream
    .outputMode("complete")
    .format("console")
    .start()
)