In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType

In [2]:
# Build (or reuse) the SparkSession used by the Structured Streaming job.
# The Kafka source connector is requested through `spark.jars.packages`,
# so it is resolved when the session starts.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("KafkaRatingConsumer")
    .config("spark.cassandra.connection.host", "cassandra")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1")
    .getOrCreate()
)

:: loading settings :: url = jar:file:/usr/local/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-40ff0354-b425-4a51-826c-52eccbe575af;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.1 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
downloading https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-

In [3]:
# Schema of the JSON messages carried in the Kafka record value.
# Every field is nullable (the default for both StructField and StructType.add).
# NOTE(review): `timestamp` is read as a string; the sample output looks like
# epoch seconds — confirm whether a numeric/timestamp type is wanted downstream.
schema = (
    StructType()
    .add("userId", IntegerType())
    .add("movieId", IntegerType())
    .add("rating", FloatType())
    .add("timestamp", StringType())
)

In [4]:
# Read from Kafka: subscribe to the ratings topic as a streaming source.
# Source options are passed in one call; keys and values are unchanged.
df_raw = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "kafka:9092",
        "subscribe": "movielens_ratings",
        "startingOffsets": "earliest",
    })
    .load()
)

In [5]:
# Decode the JSON values: cast the Kafka `value` column to a string, parse it
# against `schema`, then flatten the parsed struct into top-level columns.
df = (
    df_raw
    .select(col("value").cast("string").alias("value"))
    .select(from_json("value", schema).alias("data"))
    .select("data.*")
)

In [6]:
# Print each micro-batch to the console; append mode emits only new rows.
query = (
    df.writeStream
    .format("console")
    .outputMode("append")
    .start()
)

25/04/29 14:40:24 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-db71ac8d-5e44-43df-9e0b-fa6302dd1a42. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/04/29 14:40:24 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/04/29 14:40:24 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
| 25450|  91395|   2.1|1745935684|
|121502|    763|   3.7|1745935687|
|  4914|  45303|   2.0|1745935689|
| 93804|  31030|   3.8|1745935694|
| 59662|  79601|   3.0|1745935698|
| 18106|    935|   4.8|1745935703|
|136410|  89658|   3.9|1745935707|
| 71188| 113567|   3.5|1745935711|
| 81776|    510|   3.5|1745935715|
| 86614|   7045|   1.6|1745935719|
| 94884|  99609|   4.1|1745935722|
| 41803| 105974|   2.4|1745935727|
| 81438| 108844|   1.1|1745935730|
| 86962| 103418|   1.2|1745935733|
| 34199|  87229|   2.8|1745935737|
| 17757|   3896|   4.5|1745935741|
|109836|   8835|   1.1|1745935744|
| 90376| 131104|   1.1|1745935747|
| 26389|   5441|   2.3|1745935752|
| 20535|    439|   0.9|1745935755|
+------+-------+------+----------+
only showing top 20 rows



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|119154|   5581|   3.4|1745937627|
+------+-------+------+----------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
| 86197| 103328|   3.0|1745937631|
+------+-------+------+----------+



In [None]:
# Block this cell until the streaming query terminates.
# If the wait is cut short (kernel interrupt, or an exception raised by the
# query), stop the query explicitly so it does not keep running in the
# background of the still-alive Spark session.
try:
    query.awaitTermination()
finally:
    # Only stop a query that is still running; a terminated one needs nothing.
    if query.isActive:
        query.stop()