In [1]:
# Dependencia
# Instala o findspark
!pip install findspark

Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Import findspark and initialize it (done before pyspark is imported below)
import findspark
findspark.init()

# Import required modules
import pyspark
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, DateType, TimestampType
from pyspark.sql.functions import col, sum, from_json, unix_timestamp, window
import pyspark.sql.functions as F

In [3]:
# Connector: pass the Spark <-> Kafka SQL connector package through
# PYSPARK_SUBMIT_ARGS. This is set before the SparkSession is created in the
# following cell.
import os

os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1 "
    "pyspark-shell"
)

In [4]:
# Create (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("case")
    .getOrCreate()
)

24/06/13 12:03:02 WARN Utils: Your hostname, cj resolves to a loopback address: 127.0.1.1; using 192.168.15.34 instead (on interface enp2s0)
24/06/13 12:03:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/cj/.ivy2/cache
The jars for the packages stored in: /home/cj/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e1a6d306-ea1a-48e8-b01a-065677dd452f;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.1 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
:: resolution report :: resolve 458ms :: artifacts dl 27ms
	:: modules

In [5]:
# Subscribe to the Kafka topic that carries the data stream we want to pull from.
from_kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "vendas-deshboard-bronze")
    .option("failOnDataLoss", "false")
    # startingOffsets: "earliest" is used here; "latest" is the alternative
    # the author noted.
    .option("startingOffsets", "earliest")
    .load()
)

# Display the streaming DataFrame (shows the Kafka source columns).
from_kafka_df

DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]

In [6]:
# Schema of the JSON payload carried in each Kafka message value.
# Every field is declared with nullable=False, same as before.
from_kafka_schema = (
    StructType()
    .add("id_vendedor", IntegerType(), False)
    .add("id_cliente", IntegerType(), False)
    .add("id_produto", IntegerType(), False)
    .add("id_venda", IntegerType(), False)
    .add("quantidade", IntegerType(), False)
    .add("valor_unitario", DoubleType(), False)
    .add("valor_total", DoubleType(), False)
    .add("desconto", DoubleType(), False)
    .add("data", DateType(), False)
)

In [7]:
# Take each record's payload (the Kafka `value` column) as a string.
from_kafka_value_str = from_kafka_df.selectExpr("CAST(value AS STRING)")

# Parse the JSON text with the declared schema, then flatten the resulting
# struct so each JSON field becomes a top-level column.
parsed_json = from_json(col("value"), from_kafka_schema).alias("jsonData")
from_kafka_bronze_df = (
    from_kafka_value_str
    .select(parsed_json)
    .select("jsonData.*")
)

from_kafka_bronze_df.printSchema()

root
 |-- id_vendedor: integer (nullable = true)
 |-- id_cliente: integer (nullable = true)
 |-- id_produto: integer (nullable = true)
 |-- id_venda: integer (nullable = true)
 |-- quantidade: integer (nullable = true)
 |-- valor_unitario: double (nullable = true)
 |-- valor_total: double (nullable = true)
 |-- desconto: double (nullable = true)
 |-- data: date (nullable = true)



In [8]:
# Per-record amounts: `ideal` is valor_total, `venda` is valor_total minus desconto.
sale_amounts = from_kafka_bronze_df.select(
    col("valor_total").alias("ideal"),
    "desconto",
    (col("valor_total") - col("desconto")).alias("venda"),
)

# Global totals over the whole stream (single aggregated row, no grouping key).
sale_totals = sale_amounts.agg(
    F.sum("ideal").alias("ideal"),
    F.sum("desconto").alias("desconto"),
    F.sum("venda").alias("venda"),
)

# Share of `venda` and of `desconto` relative to `ideal`, as percentages.
pct_venda = ((col("venda") / col("ideal")) * 100).alias("percentual_venda")
pct_desconto = ((col("desconto") / col("ideal")) * 100).alias("percentual_desconto")

gold_df = (
    sale_totals
    .select("*", pct_venda, pct_desconto)
    # Drop the row while no total is available yet (sum is null).
    .where(col("ideal").isNotNull())
    # Constant tag column with the literal value 'pizza'.
    .withColumn("grafico", F.lit("pizza"))
)
gold_df.printSchema()

# Pack every gold column into one JSON string column named `value`.
gold_json = gold_df.select(
    F.to_json(F.struct(*gold_df.columns)).alias("value")
)
gold_json.printSchema()

root
 |-- ideal: double (nullable = true)
 |-- desconto: double (nullable = true)
 |-- venda: double (nullable = true)
 |-- percentual_venda: double (nullable = true)
 |-- percentual_desconto: double (nullable = true)
 |-- grafico: string (nullable = false)

root
 |-- value: string (nullable = true)



In [9]:
# Stream the aggregated JSON rows to the gold Kafka topic.
# For local debugging the sink format can be switched to "console".
gold_stream_writer = (
    gold_json
    .selectExpr("CAST(value AS STRING)")
    .writeStream
    .format("kafka")
    # "complete" re-emits the full aggregated result on every trigger.
    .outputMode("complete")
)

# start() launches the query and returns its handle; blocking on it
# (awaitTermination) is done in a separate cell.
to_kafka_gold = (
    gold_stream_writer
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "vendas-deshboard-gold")
    .option("checkpointLocation", "./check.txt")
    # NOTE(review): `truncate` looks like a leftover from the console sink;
    # presumably it has no effect on the kafka sink — confirm.
    .option("truncate", False)
    .start()
)

24/06/13 12:03:08 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [10]:
# Block this cell until the streaming query terminates (or the kernel is interrupted).
to_kafka_gold.awaitTermination()

24/06/13 12:03:09 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
24/06/13 12:03:12 WARN HDFSBackedStateStoreProvider: The state for version 516 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/06/13 12:18:53 WARN NetworkClient: [Producer clientId=producer-1] Connection to node 1 (/127.0.0.1:9092) could not be established. Broker may not be available.
24/06/13 12:18:53 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 1 (/127.0.0.1:9092) could not be established. Broker may not be available.
24/06/13 12:18:54 WARN NetworkClient: [Producer clientId=producer-1] Connection to node 1 (/127.0.0.1:9092) could not be established. Broker may not be available.
24/06/13 12:18:54 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection 

KeyboardInterrupt: 

In [None]:
# Stop the streaming query started above. The handle returned by
# writeStream...start() is `to_kafka_gold`; no `query` variable exists at
# this point in the notebook, so `query.stop()` would raise NameError.
to_kafka_gold.stop()

In [None]:
# Shut down the Spark session created earlier in the notebook.
spark.stop()

In [None]:
# NOTE(review): `df_media_vendas` is not defined in any cell visible in this
# notebook, and this cell sits after `spark.stop()` — looks like leftover
# scratch code (console-sink debug query); confirm before running or remove.
query = df_media_vendas.writeStream.outputMode("complete").format("console").start()