In [1]:
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.sql.functions import col, when, unix_timestamp
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

# Initialisation de la session Spark
spark = SparkSession.builder \
    .appName("IoT Data Pipeline") \
    .getOrCreate()

# Configuration de la source de données (ici on imagine une source Kafka)






In [None]:
# Spécifier la configuration de Kafka
kafka_stream_df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "iot_data_topic") \
    .load()

# Les données de Kafka sont stockées dans des colonnes 'key' et 'value' sous forme de 'binary'.
# Nous devons décoder les données et les transformer en un DataFrame structuré.

# Définir un schéma pour les données IoT
schema = StructType([
    StructField("machine_id", StringType(), True),
    StructField("timestamp", StringType(), True),
    StructField("temperature", FloatType(), True),
    StructField("vibration", FloatType(), True),
])

# Décoder les données et les structurer
iot_data_df = kafka_stream_df.selectExpr("CAST(value AS STRING)") \
    .select(from_json(col("value"), schema).alias("data")) \
    .select("data.*")

# Affichage des premières lignes
iot_data_df.show(5)


## Nettoyage des données

In [None]:
# Supprimer les lignes où des valeurs critiques sont manquantes (par exemple, température ou vibration)
iot_data_clean_df = iot_data_df.dropna(subset=["temperature", "vibration"])

# Affichage après nettoyage
iot_data_clean_df.show(5)


## calcul de statut des machines

In [None]:
# Définir des seuils pour les statuts des machines
iot_data_clean_df = iot_data_clean_df.withColumn(
    "machine_status",
    when((col("temperature") > 40) | (col("vibration") > 1000), "Critique")
    .when((col("temperature") > 35) & (col("temperature") <= 40), "Alerte")
    .otherwise("Normal")
)

iot_data_clean_df.show(5)


## Calcul de la durée de l'état actuel

In [None]:
# Convertir le timestamp en type datetime pour pouvoir calculer la durée
iot_data_clean_df = iot_data_clean_df.withColumn(
    "timestamp", unix_timestamp("timestamp", "yyyy-MM-dd HH:mm:ss").cast("timestamp")
)

# Calculer la durée en minutes entre chaque événement (fenêtre temporelle)
from pyspark.sql.window import Window
from pyspark.sql import functions as F

windowSpec = Window.partitionBy("machine_id").orderBy("timestamp")

iot_data_clean_df = iot_data_clean_df.withColumn(
    "duration_in_current_status",
    (F.unix_timestamp("timestamp") - F.unix_timestamp(F.lag("timestamp", 1).over(windowSpec))) / 60
)

iot_data_clean_df.show(5)


## Détection des anomalies et ajout de métriques supplémentaires

In [None]:
# Anomalie basée sur une température excessive
iot_data_clean_df = iot_data_clean_df.withColumn(
    "is_anomaly",
    when((col("temperature") > 40) | (col("vibration") > 1000), True).otherwise(False)
)

iot_data_clean_df.show(5)


## Envoi des données transformées vers Elasticsearch

In [None]:
# Configuration de l'index Elasticsearch
es_config = {
    "es.nodes": "localhost",
    "es.port": "9200",
    "es.index.auto.create": "true",
    "es.write.operation": "index"
}

# Envoi des données transformées vers Elasticsearch en streaming
iot_data_clean_df.writeStream \
    .format("org.elasticsearch.spark.sql") \
    .option("checkpointLocation", "/tmp/checkpoint/") \
    .options(**es_config) \
    .start() \
    .awaitTermination()


In [1]:
import os
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0,org.elasticsearch:elasticsearch-spark-30_2.12:8.11.0 pyspark-shell'
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, from_json, unix_timestamp, lag, expr
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StringType, DoubleType, TimestampType

# 1. Initialiser Spark Session
spark = SparkSession.builder \
    .appName("IoTDataProcessing") \
    .getOrCreate()

spark.sparkContext.setLogLevel("WARN")

# 2. Définir le schéma des données JSON venant de Kafka
schema = StructType() \
    .add("timestamp", StringType()) \
    .add("machine_id", StringType()) \
    .add("region", StringType()) \
    .add("season", StringType()) \
    .add("temperature", DoubleType()) \
    .add("humidity", DoubleType()) \
    .add("soil_moisture", DoubleType()) \
    .add("vibration", DoubleType()) \
    .add("pressure", DoubleType())

# 3. Lire les données Kafka
raw_df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "broker:9092") \
    .option("subscribe", "iot_raw_data") \
    .load()

json_df = raw_df.selectExpr("CAST(value AS STRING) as json") \
    .select(from_json(col("json"), schema).alias("data")) \
    .select("data.*")

# 4. Convertir timestamp et enrichir
iot_df = json_df.withColumn("timestamp", col("timestamp").cast(TimestampType()))

# Enrichissement 1 : température > 50°C
iot_df = iot_df.withColumn(
    "alerte_temperature",
    when(col("temperature") > 50, "Alerte").otherwise("Normal")
)

# Enrichissement 2 : humidité < 10% et température > 40°C
iot_df = iot_df.withColumn(
    "alerte_sol_sec",
    when((col("humidity") < 10) & (col("temperature") > 40), "Critique").otherwise("Normal")
)

# Enrichissement 3 : vibrations > 5 + pression > 1.5
iot_df = iot_df.withColumn(
    "alerte_vibration_pression",
    when((col("vibration") > 5) & (col("pressure") > 1.5), "Alerte").otherwise("Normal")
)

# Enrichissement 4 : machine inactive > 1h
iot_df = iot_df.withColumn("timestamp_sec", unix_timestamp("timestamp"))
windowSpec = Window.partitionBy("machine_id").orderBy("timestamp")
iot_df = iot_df.withColumn(
    "delai_lecture",
    col("timestamp_sec") - lag("timestamp_sec", 1).over(windowSpec)
)
iot_df = iot_df.withColumn(
    "statut_capteur",
    when(col("delai_lecture") > 3600, "Défaillant").otherwise("Actif")
)

# Enrichissement 5 : Alerte régionale (si ≥ 2 alertes dans une région)
iot_df.createOrReplaceTempView("iot_data")

alerte_regionale = spark.sql("""
SELECT region, COUNT(*) as nb_alertes
FROM iot_data
WHERE alerte_temperature = 'Alerte' OR alerte_vibration_pression = 'Alerte'
GROUP BY region
HAVING nb_alertes >= 2
""")

iot_df = iot_df.join(
    alerte_regionale.withColumn("alerte_regionale", expr("'Alerte régionale'")),
    on="region",
    how="left"
).fillna({"alerte_regionale": "Rien à signaler"})

# Statut global
iot_df = iot_df.withColumn(
    "statut_global",
    when((col("alerte_sol_sec") == "Critique") | (col("statut_capteur") == "Défaillant"), "Critique")
    .when((col("alerte_temperature") == "Alerte") |
          (col("alerte_vibration_pression") == "Alerte") |
          (col("alerte_regionale") == "Alerte régionale"), "Alerte")
    .otherwise("Normal")
)

# 5. Écrire dans Elasticsearch
iot_df.writeStream \
    .format("org.elasticsearch.spark.sql") \
    .option("checkpointLocation", "/tmp/checkpoint_iot") \
    .option("es.nodes", "es01") \
    .option("es.port", "9200") \
    .option("es.resource", "iot_data_enriched/doc") \
    .outputMode("append") \
    .start()


Py4JJavaError: An error occurred while calling o133.start.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: org.elasticsearch.spark.sql. Please find packages at `https://spark.apache.org/third-party-projects.html`.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:724)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.streaming.DataStreamWriter.startInternal(DataStreamWriter.scala:370)
	at org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:251)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.lang.ClassNotFoundException: org.elasticsearch.spark.sql.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:592)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:525)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:633)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:633)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:633)
	... 14 more
