In [22]:
# Importar as bibliotecas
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession

In [23]:
spark = SparkSession.builder \
    .appName("spark-streaming") \
    .config("spark.jars.packages",
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1,"
            "org.mongodb.spark:mongo-spark-connector_2.12:10.2.0") \
    .getOrCreate()


In [28]:
# Criar o dataframe do tipo stream, apontando para o servidor kafka e o tópico a ser consumido.
df = (spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "spark-master:9092")
        .option("subscribe", "topico-mongo.spark-streaming.dados")
        .option("startingOffsets", "earliest") 
        .load()
)

In [33]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Schema externo do JSON recebido via Kafka
schema = StructType([
    StructField("payload", StructType([
        StructField("after", StringType(), True)  # <<< importante: after é uma string!
    ]))
])

# Schema interno que está codificado dentro da string JSON do campo "after"
after_schema = StructType([
    StructField("_id", StructType([
        StructField("$oid", StringType(), True)
    ])),
    StructField("id_projeto", IntegerType(), True),
    StructField("tempo", IntegerType(), True)
])

# 1. Primeiro: desserializa o valor inteiro do Kafka (JSON principal)
df1 = df.select(from_json(col("value").cast("string"), schema).alias("data"))

# 2. Segundo: desserializa o campo 'after' (string JSON) com o schema interno
df2 = df1.select(from_json(col("data.payload.after"), after_schema).alias("after"))

# 3. Seleciona os campos finais
dx = df2.select("after.id_projeto", "after.tempo")

In [34]:
ds = (dx.writeStream 
    .outputMode("append") 
    .format("console")
    .option("truncate", False)
    .start()
)

25/07/12 02:21:03 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-332d904a-cad9-4c01-aa68-097bc7515d66. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/07/12 02:21:03 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                r, value.deserializer, enable.auto.commit, max.poll.records, auto.1) / 1]r, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
[Stage 25:>                                                         (0 + 0) / 1]r, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
1) /

-------------------------------------------
Batch: 0
-------------------------------------------
+----------+-----+
|id_projeto|tempo|
+----------+-----+
|32        |25   |
|78        |80   |
|56        |43   |
|56        |43   |
|78        |80   |
|78        |80   |
|78        |80   |
|78        |80   |
|78        |80   |
|NULL      |NULL |
|78        |80   |
|23        |43   |
|23        |43   |
|23        |43   |
|23        |43   |
|67        |21   |
+----------+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
-------------------------------------------
Batch: 3
-------------------------------------------
+----------+-----+
|id_projeto|tempo|
+----------+-----+
|67        |21   |
+----------+-----+

+----------+-----+
|id_projeto|tempo|
+----------+-----+
|67        |21   |
+----------+-----+



                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
-------------------------------------------
Batch: 6
-------------------------------------------
+----------+-----+
|id_projeto|tempo|
+----------+-----+
|NULL      |NULL |
+----------+-----+

+----------+-----+
|id_projeto|tempo|
+----------+-----+
|NULL      |NULL |
+----------+-----+

-------------------------------------------
Batch: 6
-------------------------------------------
-------------------------------------------
Batch: 4
-------------------------------------------
+----------+-----+
|id_projeto|tempo|
+----------+-----+
|NULL      |NULL |
+----------+-----+

+----------+-----+
|id_projeto|tempo|
+----------+-----+
|67        |21   |
+----------+-----+

