In [1]:
import os

import pyspark

# Connector artifacts are declared once so that the two places that hand them
# to Spark (PYSPARK_SUBMIT_ARGS and spark.jars.packages) cannot drift apart.
# The Kafka source must be built for the running Spark release, so its version
# is taken from the installed PySpark instead of being pinned by hand.
# NOTE(review): the `_2.12` Scala suffix is kept from the original coordinates;
# confirm it matches the Scala build of the Spark distribution in use.
KAFKA_CONNECTOR = f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark.__version__}"
# NOTE(review): this notebook previously requested 8.11.0 in PYSPARK_SUBMIT_ARGS
# and 8.12.2 in spark.jars.packages. A single version is kept here; confirm it
# against the Elasticsearch cluster version.
ES_CONNECTOR = "org.elasticsearch:elasticsearch-spark-30_2.12:8.11.0"
SPARK_PACKAGES = ",".join([KAFKA_CONNECTOR, ES_CONNECTOR])

# Must be set before the JVM is launched, i.e. before getOrCreate() below.
os.environ['PYSPARK_SUBMIT_ARGS'] = f'--packages {SPARK_PACKAGES} pyspark-shell'

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, to_timestamp, when, window, expr
from pyspark.sql.types import StructType, StringType, DoubleType, TimestampType

# Create the Spark session
spark = (
    SparkSession.builder
    .appName("IoT Kafka")
    .config("spark.jars.packages", SPARK_PACKAGES)
    .getOrCreate()
)

# Schema of the JSON payload carried in each Kafka record:
# a string device id, seven numeric measurements/coordinates, and the
# timestamp, which is kept as a string at this stage.
schema = StructType().add("device_id", StringType())
for numeric_field in (
    "temperature",
    "humidity",
    "pressure",
    "vibration",
    "latitude",
    "longitude",
):
    # StructType.add appends the field in place and returns the same schema.
    schema = schema.add(numeric_field, DoubleType())
schema = schema.add("timestamp", StringType())

# Read from Kafka.
# The broker address can be overridden with the KAFKA_BOOTSTRAP_SERVERS
# environment variable. A hard-coded host that the Spark process cannot
# resolve makes the query fail with "No resolvable bootstrap urls given in
# bootstrap.servers". The default is the original "broker:9092".
kafka_bootstrap_servers = os.environ.get("KAFKA_BOOTSTRAP_SERVERS", "broker:9092")

raw_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
    .option("subscribe", "iot_raw_data")
    # Only records published after the query starts are consumed.
    .option("startingOffsets", "latest")
    .load()
)

# Decode the Kafka record value as a string, parse it as JSON with `schema`,
# flatten the resulting struct into top-level columns, then convert the
# "timestamp" column from string to a Spark timestamp.
json_payload = col("value").cast("string")
parsed_df = (
    raw_df
    .select(from_json(json_payload, schema).alias("data"))
    .select("data.*")
    .withColumn("timestamp", to_timestamp(col("timestamp")))
)

# Drop every row that has a null in any column.
clean_df = parsed_df.dropna(how="any")

# Business rules: tag each reading with a "status" column.
# The rules are evaluated top to bottom and the first match wins; readings
# matching none of them are "Normal".
# NOTE(review): a reading with temperature > 50 and humidity < 10 is tagged
# "Alerte" rather than "Critique" because the temperature rule is tested
# first. Confirm that this precedence is intended.
is_overheating = col("temperature") > 50
is_hot_and_dry = (col("humidity") < 10) & (col("temperature") > 40)
pressure_out_of_range = (col("pressure") < 950) | (col("pressure") > 1050)
is_shaking_under_abnormal_pressure = (col("vibration") > 5) & pressure_out_of_range

status_rule = (
    when(is_overheating, "Alerte")
    .when(is_hot_and_dry, "Critique")
    .when(is_shaking_under_abnormal_pressure, "Alerte")
    .otherwise("Normal")
)
alert_df = clean_df.withColumn("status", status_rule)

# Inactive sensors (no data for more than 1 h) are not handled here; they are
# left to separate logic (batch job / stateful join).

# Example of regional correlation: average temperature and humidity per
# region over 10-minute event-time windows, a region being the
# latitude/longitude rounded to 0.1 degree.
# A watermark is declared on the event-time column so that Spark can drop the
# state of old windows and so that the aggregation can be written in append
# mode. Without it the aggregation state grows without bound.
# NOTE(review): the 10-minute lateness tolerance is an assumption; tune it to
# how late sensor readings can actually arrive.
regional_alerts = (
    alert_df
    .withWatermark("timestamp", "10 minutes")
    .withColumn("region_lat", expr("round(latitude, 1)"))
    .withColumn("region_lon", expr("round(longitude, 1)"))
    .groupBy(window("timestamp", "10 minutes"), "region_lat", "region_lon")
    .agg(
        expr("avg(temperature)").alias("avg_temp"),
        expr("avg(humidity)").alias("avg_humidity"),
    )
    # A region is flagged when its average temperature exceeds 45.
    .withColumn(
        "regional_status",
        when(col("avg_temp") > 45, "Alerte regionale").otherwise("OK"),
    )
)

import getpass

# The Elasticsearch password is read from the ES_PASSWORD environment variable
# (or prompted for) instead of being stored in the notebook.
es_password = os.environ.get("ES_PASSWORD") or getpass.getpass("Elasticsearch password: ")

# Forget queries that terminated during earlier runs of this cell so that
# awaitAnyTermination() below only reacts to the queries started here.
spark.streams.resetTerminated()

# Write the enriched readings to Elasticsearch
query = (
    alert_df.writeStream
    .format("org.elasticsearch.spark.sql")
    .option("checkpointLocation", "/tmp/checkpoint_iot")
    .option("es.nodes", "es01")
    .option("es.port", "9200")
    .option("es.nodes.wan.only", "true")
    .option("es.net.ssl", "true")
    .option("es.net.ssl.cert.allow.self.signed", "true")
    .option("es.net.http.auth.user", "elastic")
    .option("es.net.http.auth.pass", es_password)
    .option("es.resource", "agritech_iot_data_enriched")
    .outputMode("append")
    .start()
)

# Raw Kafka stream echoed to the console (debug). The handle is kept so the
# query can be inspected or stopped later.
debug_query = (
    raw_df.selectExpr("CAST(value AS STRING)").writeStream
    .format("console")
    .outputMode("append")
    .start()
)

# Block until any running query stops. If a query terminated with an error,
# the exception is raised here, whichever query it was.
spark.streams.awaitAnyTermination()

StreamingQueryException: Query [id = 092e1c15-d10e-408c-9771-402cb70d901c, runId = 09e3817b-a974-4351-9dd0-1b79511c5b57] terminated with exception: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0) (1304e2995f76 executor driver): org.apache.kafka.common.KafkaException: Failed to construct kafka consumer
	at org.apache.kafka.clients.consumer.KafkaConsumer.<init>(KafkaConsumer.java:823)
	at org.apache.kafka.clients.consumer.KafkaConsumer.<init>(KafkaConsumer.java:665)
	at org.apache.kafka.clients.consumer.KafkaConsumer.<init>(KafkaConsumer.java:613)
	at org.apache.spark.sql.kafka010.consumer.InternalKafkaConsumer.createConsumer(KafkaDataConsumer.scala:124)
	at org.apache.spark.sql.kafka010.consumer.InternalKafkaConsumer.<init>(KafkaDataConsumer.scala:61)
	at org.apache.spark.sql.kafka010.consumer.InternalKafkaConsumerPool$ObjectFactory.create(InternalKafkaConsumerPool.scala:207)
	at org.apache.spark.sql.kafka010.consumer.InternalKafkaConsumerPool$ObjectFactory.create(InternalKafkaConsumerPool.scala:202)
	at org.apache.commons.pool2.BaseKeyedPooledObjectFactory.makeObject(BaseKeyedPooledObjectFactory.java:82)
	at org.apache.commons.pool2.impl.GenericKeyedObjectPool.create(GenericKeyedObjectPool.java:780)
	at org.apache.commons.pool2.impl.GenericKeyedObjectPool.borrowObject(GenericKeyedObjectPool.java:439)
	at org.apache.commons.pool2.impl.GenericKeyedObjectPool.borrowObject(GenericKeyedObjectPool.java:350)
	at org.apache.spark.sql.kafka010.consumer.InternalKafkaConsumerPool.borrowObject(InternalKafkaConsumerPool.scala:85)
	at org.apache.spark.sql.kafka010.consumer.KafkaDataConsumer.retrieveConsumer(KafkaDataConsumer.scala:573)
	at org.apache.spark.sql.kafka010.consumer.KafkaDataConsumer.getOrRetrieveConsumer(KafkaDataConsumer.scala:558)
	at org.apache.spark.sql.kafka010.consumer.KafkaDataConsumer.$anonfun$get$1(KafkaDataConsumer.scala:294)
	at org.apache.spark.util.UninterruptibleThread.runUninterruptibly(UninterruptibleThread.scala:77)
	at org.apache.spark.sql.kafka010.consumer.KafkaDataConsumer.runUninterruptiblyIfPossible(KafkaDataConsumer.scala:618)
	at org.apache.spark.sql.kafka010.consumer.KafkaDataConsumer.get(KafkaDataConsumer.scala:290)
	at org.apache.spark.sql.kafka010.KafkaBatchPartitionReader.next(KafkaBatchPartitionReader.scala:64)
	at org.apache.spark.sql.execution.datasources.v2.PartitionIterator.hasNext(DataSourceRDD.scala:119)
	at org.apache.spark.sql.execution.datasources.v2.MetricsIterator.hasNext(DataSourceRDD.scala:156)
	at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.$anonfun$hasNext$1(DataSourceRDD.scala:63)
	at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.$anonfun$hasNext$1$adapted(DataSourceRDD.scala:63)
	at scala.Option.exists(Option.scala:376)
	at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.hasNext(DataSourceRDD.scala:63)
	at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.advanceToNextIter(DataSourceRDD.scala:97)
	at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.hasNext(DataSourceRDD.scala:63)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:513)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.elasticsearch.spark.rdd.EsRDDWriter.write(EsRDDWriter.scala:82)
	at org.elasticsearch.spark.sql.streaming.EsStreamQueryWriter.run(EsStreamQueryWriter.scala:62)
	at org.elasticsearch.spark.sql.streaming.EsSparkSqlStreamingSink.$anonfun$addBatch$5(EsSparkSqlStreamingSink.scala:72)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: org.apache.kafka.common.config.ConfigException: No resolvable bootstrap urls given in bootstrap.servers
	at org.apache.kafka.clients.ClientUtils.parseAndValidateAddresses(ClientUtils.java:89)
	at org.apache.kafka.clients.ClientUtils.parseAndValidateAddresses(ClientUtils.java:48)
	at org.apache.kafka.clients.consumer.KafkaConsumer.<init>(KafkaConsumer.java:731)
	... 41 more

Driver stacktrace:

In [None]:
# Shell escapes: install netcat, then probe the Kafka listener.
# `nc -z` only checks that a TCP connection to broker:9092 can be opened
# (no data is sent); `-v` prints the outcome.
!apt update && apt install -y netcat
!nc -zv broker 9092



In [2]:
import os
from getpass import getpass

import requests

# Connectivity check: query the Elasticsearch root endpoint with basic auth.
# The password is read from ES_PASSWORD (or prompted for) instead of being
# stored in the notebook.
es_password = os.environ.get("ES_PASSWORD") or getpass("Elasticsearch password: ")

# NOTE: verify=False disables TLS certificate verification; acceptable only
# for a quick check against a self-signed development cluster.
# The timeout keeps the cell from hanging if the host is unreachable.
requests.get(
    "https://es01:9200/",
    auth=("elastic", es_password),
    verify=False,
    timeout=10,
)





<Response [200]>

In [1]:
# Shell escape: query the Elasticsearch root endpoint over HTTPS with basic
# auth; `-k` skips TLS certificate verification.
# The recorded output shows that curl is not installed in this environment.
!curl -u elastic:Eselpil2 https://es01:9200 -k


/bin/bash: line 1: curl: command not found


In [2]:
import pyspark

# Print the version of the installed PySpark package.
installed_pyspark_version = pyspark.__version__
print(installed_pyspark_version)


3.5.0


In [None]:
import os

import pyspark

# Connector artifacts are declared once so that the two places that hand them
# to Spark (PYSPARK_SUBMIT_ARGS and spark.jars.packages) cannot drift apart.
# The Kafka source must be built for the running Spark release, so its version
# is taken from the installed PySpark instead of being pinned by hand.
# NOTE(review): the `_2.12` Scala suffix is kept from the original coordinates;
# confirm it matches the Scala build of the Spark distribution in use.
KAFKA_CONNECTOR = f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark.__version__}"
# NOTE(review): this notebook previously requested 8.11.0 in PYSPARK_SUBMIT_ARGS
# and 8.12.2 in spark.jars.packages. A single version is kept here; confirm it
# against the Elasticsearch cluster version.
ES_CONNECTOR = "org.elasticsearch:elasticsearch-spark-30_2.12:8.11.0"
SPARK_PACKAGES = ",".join([KAFKA_CONNECTOR, ES_CONNECTOR])

# Must be set before the JVM is launched, i.e. before getOrCreate() below.
os.environ['PYSPARK_SUBMIT_ARGS'] = f'--packages {SPARK_PACKAGES} pyspark-shell'

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, to_timestamp, when, window, expr
from pyspark.sql.types import StructType, StringType, DoubleType, TimestampType

# Create the Spark session
spark = (
    SparkSession.builder
    .appName("IoT Kafka")
    .config("spark.jars.packages", SPARK_PACKAGES)
    .getOrCreate()
)

# Schema of the JSON payload carried in each Kafka record:
# a string device id, seven numeric measurements/coordinates, and the
# timestamp, which is kept as a string at this stage.
schema = StructType().add("device_id", StringType())
for numeric_field in (
    "temperature",
    "humidity",
    "pressure",
    "vibration",
    "latitude",
    "longitude",
):
    # StructType.add appends the field in place and returns the same schema.
    schema = schema.add(numeric_field, DoubleType())
schema = schema.add("timestamp", StringType())

# Read from Kafka.
# The broker address can be overridden with the KAFKA_BOOTSTRAP_SERVERS
# environment variable. A hard-coded host that the Spark process cannot
# resolve makes the query fail with "No resolvable bootstrap urls given in
# bootstrap.servers". The default is the original "broker:9092".
kafka_bootstrap_servers = os.environ.get("KAFKA_BOOTSTRAP_SERVERS", "broker:9092")

raw_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
    .option("subscribe", "iot_raw_data")
    # Only records published after the query starts are consumed.
    .option("startingOffsets", "latest")
    .load()
)

# Decode the Kafka record value as a string, parse it as JSON with `schema`,
# flatten the resulting struct into top-level columns, then convert the
# "timestamp" column from string to a Spark timestamp.
json_payload = col("value").cast("string")
parsed_df = (
    raw_df
    .select(from_json(json_payload, schema).alias("data"))
    .select("data.*")
    .withColumn("timestamp", to_timestamp(col("timestamp")))
)

# Drop every row that has a null in any column.
clean_df = parsed_df.dropna(how="any")

# Business rules: tag each reading with a "status" column.
# The rules are evaluated top to bottom and the first match wins; readings
# matching none of them are "Normal".
# NOTE(review): a reading with temperature > 50 and humidity < 10 is tagged
# "Alerte" rather than "Critique" because the temperature rule is tested
# first. Confirm that this precedence is intended.
is_overheating = col("temperature") > 50
is_hot_and_dry = (col("humidity") < 10) & (col("temperature") > 40)
pressure_out_of_range = (col("pressure") < 950) | (col("pressure") > 1050)
is_shaking_under_abnormal_pressure = (col("vibration") > 5) & pressure_out_of_range

status_rule = (
    when(is_overheating, "Alerte")
    .when(is_hot_and_dry, "Critique")
    .when(is_shaking_under_abnormal_pressure, "Alerte")
    .otherwise("Normal")
)
alert_df = clean_df.withColumn("status", status_rule)

# Inactive sensors (no data for more than 1 h) are not handled here; they are
# left to separate logic (batch job / stateful join).

# Example of regional correlation: average temperature and humidity per
# region over 10-minute event-time windows, a region being the
# latitude/longitude rounded to 0.1 degree.
# A watermark is declared on the event-time column so that Spark can drop the
# state of old windows and so that the aggregation can be written in append
# mode. Without it the aggregation state grows without bound.
# NOTE(review): the 10-minute lateness tolerance is an assumption; tune it to
# how late sensor readings can actually arrive.
regional_alerts = (
    alert_df
    .withWatermark("timestamp", "10 minutes")
    .withColumn("region_lat", expr("round(latitude, 1)"))
    .withColumn("region_lon", expr("round(longitude, 1)"))
    .groupBy(window("timestamp", "10 minutes"), "region_lat", "region_lon")
    .agg(
        expr("avg(temperature)").alias("avg_temp"),
        expr("avg(humidity)").alias("avg_humidity"),
    )
    # A region is flagged when its average temperature exceeds 45.
    .withColumn(
        "regional_status",
        when(col("avg_temp") > 45, "Alerte regionale").otherwise("OK"),
    )
)

import getpass

# The Elasticsearch password is read from the ES_PASSWORD environment variable
# (or prompted for) instead of being stored in the notebook.
es_password = os.environ.get("ES_PASSWORD") or getpass.getpass("Elasticsearch password: ")

# Forget queries that terminated during earlier runs of this cell so that
# awaitAnyTermination() below only reacts to the queries started here.
spark.streams.resetTerminated()

# Write the enriched readings as JSON files
query1 = (
    alert_df.writeStream
    .format("json")
    .option("path", "/notebooks/iot_output_json")
    .option("checkpointLocation", "/notebooks/iot_checkpoint_json")
    .outputMode("append")
    .start()
)

# Write the enriched readings to Elasticsearch
query2 = (
    alert_df.writeStream
    .format("org.elasticsearch.spark.sql")
    .option("checkpointLocation", "/tmp/checkpoint_iot")
    .option("es.nodes", "es01")
    .option("es.port", "9200")
    .option("es.nodes.wan.only", "true")
    .option("es.net.ssl", "true")
    .option("es.net.ssl.cert.allow.self.signed", "true")
    .option("es.net.http.auth.user", "elastic")
    .option("es.net.http.auth.pass", es_password)
    .option("es.resource", "agritech_iot_data_enriched")
    .outputMode("append")
    .start()
)

# Raw Kafka stream echoed to the console (debug)
query3 = (
    raw_df.selectExpr("CAST(value AS STRING)").writeStream
    .format("console")
    .outputMode("append")
    .start()
)

# Block until any of the three queries stops. Awaiting them one after another
# would hide a failure of query2 or query3 for as long as query1 keeps
# running. If a query terminated with an error, the exception is raised here.
spark.streams.awaitAnyTermination()
