In [14]:
import os

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, to_timestamp, when, window, expr
from pyspark.sql.types import StructType, StringType, DoubleType, TimestampType

# Create the Spark session with the Kafka and Elasticsearch connectors.
#
# The Kafka connector has to be built for the Spark release that is actually
# running, so its version is derived from the installed PySpark instead of
# being hard-coded (a mismatched connector is a common cause of Kafka client
# start-up failures).
# NOTE(review): the Scala suffix and the Elasticsearch connector version are
# assumed to match this Spark build -- confirm against the cluster.
# NOTE: spark.jars.packages is a launch-time setting; if a session already
# exists in this kernel, getOrCreate() reuses it and the packages are not
# re-resolved. Restart the kernel after changing the connector list.
SCALA_VERSION = "2.12"
KAFKA_CONNECTOR = (
    f"org.apache.spark:spark-sql-kafka-0-10_{SCALA_VERSION}:{pyspark.__version__}"
)
ES_CONNECTOR = f"org.elasticsearch:elasticsearch-spark-30_{SCALA_VERSION}:8.12.2"

spark = (
    SparkSession.builder
    .appName("IoT Kafka")
    .config("spark.jars.packages", f"{KAFKA_CONNECTOR},{ES_CONNECTOR}")
    .getOrCreate()
)

# Schema of the JSON payload carried in the Kafka message value.
# The timestamp is read as a string here and converted to a real timestamp later.
schema = StructType().add("device_id", StringType())
for numeric_field in (
    "temperature",
    "humidity",
    "pressure",
    "vibration",
    "latitude",
    "longitude",
):
    schema.add(numeric_field, DoubleType())
schema.add("timestamp", StringType())

# Subscribe to the raw IoT topic on Kafka, starting from the newest offsets.
kafka_options = {
    "kafka.bootstrap.servers": "broker:9092",
    "subscribe": "iot_raw_data",
    "startingOffsets": "latest",
}
raw_df = spark.readStream.format("kafka").options(**kafka_options).load()

# Decode the Kafka value as JSON, flatten the struct into top-level columns,
# and turn the string timestamp into a proper timestamp column.
json_payload = from_json(col("value").cast("string"), schema)
parsed_df = (
    raw_df
    .select(json_payload.alias("data"))
    .select("data.*")
    .withColumn("timestamp", to_timestamp("timestamp"))
)

# Drop every row that has a null in any column.
clean_df = parsed_df.na.drop()

# Business rules: derive a "status" column for each reading.
#
# Branches are evaluated in order and the first match wins, so the
# "Critique" rule is checked first. If the plain `temperature > 50` rule came
# first, any hot-and-dry reading above 50 would be labelled "Alerte" and the
# "Critique" branch could only fire for temperatures in (40, 50].
# NOTE(review): this assumes "Critique" is the more severe status -- confirm
# with the rule owners.
alert_df = clean_df.withColumn(
    "status",
    when((col("humidity") < 10) & (col("temperature") > 40), "Critique")
    .when(col("temperature") > 50, "Alerte")
    .when((col("vibration") > 5) & ((col("pressure") < 950) | (col("pressure") > 1050)), "Alerte")
    .otherwise("Normal")
)

# Inactive sensors (no data for more than 1 hour) are not handled here; they
# are left to separate logic (batch job / stateful join).

# Example of regional correlation: mean temperature and humidity per region,
# where a region is the latitude/longitude rounded to 0.1 degree, computed
# over 10-minute windows of the reading timestamp.
region_keyed_df = (
    alert_df
    .withColumn("region_lat", expr("round(latitude, 1)"))
    .withColumn("region_lon", expr("round(longitude, 1)"))
)
ten_minute_window = window("timestamp", "10 minutes")
regional_means_df = region_keyed_df.groupBy(
    ten_minute_window, "region_lat", "region_lon"
).agg(
    expr("avg(temperature)").alias("avg_temp"),
    expr("avg(humidity)").alias("avg_humidity"),
)
# NOTE(review): no watermark is defined on "timestamp"; one would be needed
# before this aggregation can be written with an append-mode sink.
regional_alerts = regional_means_df.withColumn(
    "regional_status",
    when(col("avg_temp") > 45, "Alerte regionale").otherwise("OK"),
)

# Write the enriched readings to Elasticsearch.

# Credentials come from the environment so they are never stored in the
# notebook. ES_PASSWORD is mandatory (KeyError if it is not set).
ES_USER = os.environ.get("ES_USER", "elastic")
ES_PASSWORD = os.environ["ES_PASSWORD"]

# es.mapping.id must name a column that exists in the written DataFrame.
# Build a deterministic document id from the device and the reading time so
# that replaying the same Kafka records overwrites documents instead of
# duplicating them.
indexed_df = alert_df.withColumn(
    "unique_id",
    expr("sha2(concat_ws('|', device_id, cast(`timestamp` as string)), 256)"),
)

query = (
    indexed_df.writeStream
    .format("org.elasticsearch.spark.sql")
    .option("checkpointLocation", "/tmp/checkpoint_iot")
    .option("es.nodes", "es01")
    .option("es.port", "9200")
    .option("es.nodes.wan.only", "true")
    .option("es.net.ssl", "true")
    .option("es.net.ssl.cert.allow.self.signed", "true")
    .option("es.net.http.auth.user", ES_USER)
    .option("es.net.http.auth.pass", ES_PASSWORD)
    # Index name only: mapping types ("index/type") no longer exist in Elasticsearch 8.
    .option("es.resource", "iot_data_enriched")
    .option("es.mapping.id", "unique_id")
    # The Elasticsearch structured-streaming sink only supports append mode;
    # alert_df is a row-by-row projection, so append emits every reading once.
    .outputMode("append")
    .start()
)

# Block this cell until the streaming query stops or fails.
query.awaitTermination()

StreamingQueryException: [STREAM_FAILED] Query [id = 3fe3b645-6033-4ce8-8275-4e393678d9ae, runId = 1d9174d5-b439-46a9-afe7-ecdeeee57e0c] terminated with exception: Failed to create new KafkaAdminClient

In [None]:
import socket

# Connectivity probe for the Kafka broker.
# Uses the standard library instead of installing netcat through apt, so the
# check needs no root access, no package download, and works on any image.
try:
    with socket.create_connection(("broker", 9092), timeout=5):
        print("broker:9092 is reachable")
except OSError as exc:
    # Covers DNS failures, refused connections and timeouts alike.
    print(f"broker:9092 is NOT reachable: {exc}")



In [5]:
import os
import requests

# Sanity check that Elasticsearch answers over HTTPS with the configured
# credentials. The password is read from the environment (ES_PASSWORD must be
# set) rather than being stored in the notebook.
requests.get(
    "https://es01:9200",
    auth=(os.environ.get("ES_USER", "elastic"), os.environ["ES_PASSWORD"]),
    verify=False,  # certificate verification is disabled; acceptable only for a dev cluster
    timeout=10,  # do not hang the kernel if the node is unreachable
)





<Response [200]>