In [2]:
# Importar as bibliotecas
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("trabalho-final") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1,org.mongodb.spark:mongo-spark-connector_2.12:10.4.0") \
    .config("spark.mongodb.read.connection.uri", "mongodb://spark-mongo:27017/spark-db.music") \
    .config("spark.mongodb.write.connection.uri", "mongodb://spark-mongo:27017/spark-db.music") \
    .getOrCreate()


:: loading settings :: url = jar:file:/home/myuser/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/myuser/.ivy2/cache
The jars for the packages stored in: /home/myuser/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e1e0a611-30d5-4863-afab-95c90397cf33;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.4.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.4.1 in central
	found org.apache.kafka#kafka-clients;3.3.2 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.1 in central
	found org.slf4j#slf4j-api;2.0.6 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in cen

# Colocando dados (.json) no MongoDB

In [4]:
# lendo o arquivo .json do HDFS

caminho_json = "hdfs://spark-master:9000/user/myuser/Musical_Instruments_5.json"

df = spark.read.json(caminho_json)

# Mostrar os dados
df.show()  # truncate=False evita cortar strings longas

                                                                                

+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|      asin| helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|             summary|unixReviewTime|
+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|1384719342|  [0, 0]|    5.0|Not much to write...|02 28, 2014|A2IBPI20UZIR0U|cassandra tu "Yea...|                good|    1393545600|
|1384719342|[13, 14]|    5.0|The product does ...|03 16, 2013|A14VAT5EAX3D9S|                Jake|                Jake|    1363392000|
|1384719342|  [1, 1]|    5.0|The primary job o...|08 28, 2013|A195EZSQDW3E21|Rick Bennette "Ri...|It Does The Job Well|    1377648000|
|1384719342|  [0, 0]|    5.0|Nice windscreen p...|02 14, 2014|A2C00NNG1ZQQG2|RustyBill "Sunday...|GOOD WINDSCREEN F...|    1392336000|
|1384719342|  [0, 0]|    5.0|This pop filter i...|02 21

In [5]:
df.write \
  .format("mongodb") \
  .option("spark.mongodb.write.connection.uri", "mongodb://spark-mongo:27017/spark-db.music") \
  .mode("append") \
  .save()



# Colocando Dados no Postgress

In [6]:
# lendo o arquivo .csv do HDFS

caminho_csv = "hdfs://spark-master:9000/user/myuser/spotify_history.csv"

# Ler o arquivo CSV do HDFS
df = spark.read.csv(caminho_csv, header=True, inferSchema=True)

df.show()

                                                                                

+--------------------+-------------------+----------+---------+--------------------+--------------------+--------------------+------------+----------+-------+-------+
|   spotify_track_uri|                 ts|  platform|ms_played|          track_name|         artist_name|          album_name|reason_start|reason_end|shuffle|skipped|
+--------------------+-------------------+----------+---------+--------------------+--------------------+--------------------+------------+----------+-------+-------+
|2J3n32GeLmMjwuAzy...|2013-07-08 02:44:34|web player|     3185| Say It, Just Say It|        The Mowgli's|Waiting For The Dawn|    autoplay|  clickrow|  FALSE|  FALSE|
|1oHxIPqJyvAYHy0PV...|2013-07-08 02:45:37|web player|    61865|Drinking from the...|       Calvin Harris|           18 Months|    clickrow|  clickrow|  FALSE|  FALSE|
|487OPlneJNni3NWC8...|2013-07-08 02:50:24|web player|   285386|         Born To Die|        Lana Del Rey|Born To Die - The...|    clickrow|   unknown|  FALSE|  FALSE

In [7]:
# Gravar dados no banco postgresql
df.write.format("jdbc") \
	.option("url","jdbc:postgresql://spark-postgres:5432/spark-db") \
	.option("dbtable","music") \
	.option("user","postgres") \
	.option("password","postgres123") \
	.option("driver","org.postgresql.Driver") \
	.save()



# Processando os dados em tempo real consumindo o topico (MongoDB) do Kafka

In [4]:
# Criar o dataframe do tipo stream, apontando para o servidor kafka e o tópico a ser consumido.
df = (spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "spark-master:9092")
        .option("subscribe", "topico-mongo.spark-db.music")
        .option("startingOffsets", "earliest") 
        .load()
)

In [5]:
schema = StructType([
    StructField("payload", StructType([
        StructField("after", StringType(), True)  # <<< importante: after é uma string!
    ]))
])

# Schema interno que está codificado dentro da string JSON do campo "after"
after_schema = StructType([
    StructField("_id", StructType([
        StructField("$oid", StringType(), True)
    ])),
    StructField("asin", StringType(), True),
    StructField("helpful", ArrayType(
        StructType([StructField("$numberLong", StringType(), True)])
    )),
    StructField("overall", DoubleType(), True),
    StructField("reviewText", StringType(), True),
    StructField("reviewTime", StringType(), True),
    StructField("reviewerID", StringType(), True),
    StructField("reviewerName", StringType(), True),
    StructField("summary", StringType(), True),
    StructField("unixReviewTime", StructType([
        StructField("$numberLong", StringType(), True)
    ]))
])

# 1. Primeiro: desserializa o valor inteiro do Kafka (JSON principal)
df1 = df.select(from_json(col("value").cast("string"), schema).alias("data"))

# 2. Segundo: desserializa o campo 'after' (string JSON) com o schema interno
df2 = df1.select(from_json(col("data.payload.after"), after_schema).alias("after"))

dx = df2.select("after.asin", "after.helpful", "after.overall", "after.reviewTime", "after.reviewerName", "after.summary")

#pegando as compras com o overall maior que 4
df_maior_overall = dx.filter(dx.overall > 4.0)

# Escrevendo os dados processados no MongoDB

In [6]:
df_maior_overall.writeStream \
    .format("mongodb") \
    .option("checkpointLocation", "/tmp/checkpoints/mongo") \
    .option("database", "spark-db") \
    .option("collection", "best-products") \
    .outputMode("append") \
    .start()

25/07/21 01:14:42 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.query.StreamingQuery at 0x7f1b712ca270>

25/07/21 01:14:43 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


# Processando os dados em tempo real consumindo o topico (Postgres) do Kafka

In [7]:
# Criar o dataframe do tipo stream, apontando para o servidor kafka e o tópico a ser consumido.
df_postgress = (spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "spark-master:9092")
        .option("subscribe", "music.public.music")
        .option("startingOffsets", "earliest") 
        .load()
)

In [None]:
df_valores = df_postgress.selectExpr("CAST(value AS STRING)")
query = (
    df_valores.writeStream
    .outputMode("append")
    .format("console")
    .option("truncate", False)
    .start()
)

In [14]:
schema = StructType([
    StructField("payload", StructType([
        StructField("after", StringType(), True)  # <<< importante: after é uma string!
    ]))
])

spotify_schema = StructType([
    StructField("spotify_track_uri", StringType(), False),
    StructField("ts", TimestampType(), False),
    StructField("platform", StringType(), False),
    StructField("ms_played", IntegerType(), False),
    StructField("track_name", StringType(), False),
    StructField("artist_name", StringType(), False),
    StructField("album_name", StringType(), False),
    StructField("reason_start", StringType(), True),
    StructField("reason_end", StringType(), True),
    StructField("shuffle", BooleanType(), True),
    StructField("skipped", BooleanType(), True)
])

# 1. Primeiro: desserializa o valor inteiro do Kafka (JSON principal)
df1 = df.select(from_json(col("value").cast("string"), schema).alias("data"))

# 2. Segundo: desserializa o campo 'after' (string JSON) com o schema interno
df2 = df1.select(from_json(col("data.payload.after"), spotify_schema).alias("after"))

# Use watermark para permitir agregações com stateful streaming

dx = df2.select(
    to_timestamp(col("after.ts")).alias("ts"),
    col("after.platform").alias("platform")
).withWatermark("ts", "10 minutes")

df_agg_plataforma = dx.groupBy("platform").agg(count("*").alias("total_execucoes"))

ds = (
    df_agg_musicas.writeStream
    .outputMode("update")  # <---- aqui muda
    .format("console")
    .option("truncate", False)
    .start()
)

25/07/21 01:20:29 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-9b7775a0-ec4d-4e1e-bc7a-3be6d5598947. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp c                                                                                r, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
1) / 1]r, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
[Stage 5:>                                                          (0 + 0) / 1]r, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
[Stage 5:>                                                          (0 + 0) / 1]r, value.deserializer, enable.auto.commit, max.pol

-------------------------------------------
Batch: 0
-------------------------------------------
+-----------------+---------------+
|spotify_track_uri|total_execucoes|
+-----------------+---------------+
|NULL             |10261          |
+-----------------+---------------+



In [None]:
df_postgress.selectExpr("CAST(value AS STRING)").writeStream \
    .format("console") \
    .option("truncate", False) \
    .start()

# Escrevendo os dados processados no Postgress

In [19]:
def write_to_postgres(batch_df, batch_id):
    (batch_df.write
        .format("jdbc")
        .option("url", "jdbc:postgresql://spark-postgres:5432/spark-db")
        .option("dbtable", "music")
        .option("user", "postgres")
        .option("password", "postgres123")
        .option("driver", "org.postgresql.Driver")
        .mode("append")
        .save()
    )

# Stream que escreve no Postgres
df_agg_plataforma.writeStream \
    .foreachBatch(write_to_postgres) \
    .outputMode("update") \
    .option("checkpointLocation", "/tmp/checkpoints/execucoes_plataforma") \
    .start()

25/07/21 01:27:14 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.query.StreamingQuery at 0x7f1ba081e0c0>

25/07/21 01:27:14 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
25/07/21 01:27:16 ERROR MicroBatchExecution: Query [id = 3a7db58f-8903-4aca-a622-139e657705d7, runId = 1cc88b14-00fb-4a0d-be53-c1fb4bf377ec] terminated with error
py4j.Py4JException: An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/py4j/clientserver.py", line 617, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pyspark/sql/utils.py", line 120, in call
    raise e
  File "/usr/local/lib/python3.12/dist-packages/pyspark/sql/utils.py", line 117, in call
    self.func(DataFrame(jdf, wrapped_session_jdf), batch_id)
  File "/tmp/ipykernel_8942/1030542473.py", line 10