In [None]:
import socket

# Connectivity probe: checks that something accepts TCP connections on the
# given host/port. The address is defined once so the success message cannot
# drift from the address actually probed.
probe_host, probe_port = 'localhost', 9092

# The `with` block closes the socket on every path (success or failure).
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    # Bound the wait: without a timeout connect() may block for a long time
    # when packets are silently dropped instead of refused.
    s.settimeout(5)
    try:
        s.connect((probe_host, probe_port))
        print(f"Connexion réussie à {probe_host}:{probe_port}")
    except Exception as e:
        # Timeouts and refused connections are both OSError subclasses.
        print("Erreur de connexion:", e)


Exercice 1

In [None]:
from kafka import KafkaProducer
import json

# Producer that serialises every value as UTF-8 encoded JSON.
producer = KafkaProducer(
    bootstrap_servers='localhost:29092',
    value_serializer=lambda v: json.dumps(v).encode('utf-8')
)

try:
    message = {"msg": "Hello Kafka"}
    producer.send('weather_stream', message)
    # send() only queues the record; flush() waits for the buffered records.
    producer.flush()
    print("Message envoyé au topic weather_stream")
finally:
    # Release the producer's connections even when sending fails, so that
    # re-running the cell does not accumulate open producers.
    producer.close()


Exercice 2

In [None]:
from kafka import KafkaConsumer
import json

topic = 'weather_stream'

# Consumer reading the topic from its earliest available offset and decoding
# every value from UTF-8 JSON.
consumer = KafkaConsumer(
    topic,
    bootstrap_servers='localhost:29092',
    auto_offset_reset='earliest',
    value_deserializer=lambda m: json.loads(m.decode('utf-8'))
)

print(f"Lecture des messages du topic {topic}")

try:
    # NOTE(review): no consumer_timeout_ms is configured, so this loop is
    # expected to run until the cell is interrupted — confirm this is intended.
    for message in consumer:
        print("Message reçu:", message.value)
finally:
    # Runs on interruption too: close the consumer instead of leaking its
    # connections into the kernel.
    consumer.close()


Exercice 3

In [None]:
import requests
from kafka import KafkaProducer
import json
import time

def get_weather(lat, lon, timeout=10):
    """Fetch the current weather for a coordinate from the Open-Meteo forecast API.

    Args:
        lat: Latitude sent as the ``latitude`` query parameter.
        lon: Longitude sent as the ``longitude`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(
        "https://api.open-meteo.com/v1/forecast",
        # Let requests build and encode the query string.
        params={"latitude": lat, "longitude": lon, "current_weather": "true"},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

def run_producer(lat, lon, iterations=3, interval=10):
    """Poll the weather API and publish each reading to the ``weather_stream`` topic.

    Args:
        lat: Latitude passed to ``get_weather`` and copied into each message.
        lon: Longitude passed to ``get_weather`` and copied into each message.
        iterations: Number of polls to perform (default 3).
        interval: Seconds to wait between two polls (default 10).

    Each message is a dict with the keys ``latitude``, ``longitude`` and
    ``weather`` (the ``current_weather`` object of the API response). Polls
    whose response has no ``current_weather`` data are reported and skipped.
    """
    producer = KafkaProducer(
        bootstrap_servers='localhost:29092',
        value_serializer=lambda v: json.dumps(v).encode('utf-8')
    )
    topic = 'weather_stream'

    print(f"Streaming météo pour latitude={lat}, longitude={lon} vers le topic {topic}")

    try:
        for attempt in range(iterations):
            weather_data = get_weather(lat, lon)
            current_weather = weather_data.get('current_weather', {})
            if current_weather:
                message = {
                    'latitude': lat,
                    'longitude': lon,
                    'weather': current_weather
                }
                producer.send(topic, message)
                producer.flush()
                print("Message envoyé:", message)
            else:
                print("Pas de données météo reçues")
            # Pause only between polls: sleeping after the last one would just
            # delay the return.
            if attempt < iterations - 1:
                time.sleep(interval)
    finally:
        # Always release the producer, including when the API call raises.
        producer.close()

# Coordinates streamed by this cell.
latitude, longitude = 48.8566, 2.3522

run_producer(lat=latitude, lon=longitude)


Exercice 4

In [None]:
# Stop a Spark session left over from a previously executed cell, if any, so
# that the session configured below is created afresh.
try:
    spark.stop()
except NameError:
    # Fresh kernel: no `spark` variable exists yet, nothing to stop.
    pass
except Exception as exc:
    # Still best effort, but no longer swallows KeyboardInterrupt/SystemExit
    # and no longer hides why the previous session could not be stopped.
    print("Arrêt de la session Spark précédente impossible:", exc)

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, expr, when, to_json, struct
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, TimestampType

# Spark session for the transformation job; `spark.jars.packages` names the
# Kafka connector artifact the session needs.
spark = (
    SparkSession.builder
    .appName("KafkaSpark4")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.1")
    .getOrCreate()
)

# Show the runtime versions next to the connector coordinates given above.
print(f"Spark version: {spark.version}")
print(f"Scala version: {spark.sparkContext._jvm.scala.util.Properties.versionString()}")

# Layout of the JSON payload expected on the `weather_stream` topic.
schema = (
    StructType()
    .add("latitude", DoubleType())
    .add("longitude", DoubleType())
    .add(
        "weather",
        StructType()
        .add("temperature", DoubleType())
        .add("windspeed", DoubleType())
        .add("time", StringType()),
    )
)

# Streaming read of the topic, starting from the earliest offsets.
df_raw = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "weather_stream")
    .option("startingOffsets", "earliest")
    .load()
)

# The Kafka value is binary: cast it to a string, then parse it as JSON.
df_json = df_raw.selectExpr("CAST(value AS STRING) as json_str")

parsed_payload = from_json(col("json_str"), schema).alias("data")
df_parsed = df_json.select(parsed_payload).select("data.*")

def _alert_level(value, low, high):
    """Map a numeric column to an alert level string.

    ``value < low`` gives "level_0", ``low <= value <= high`` gives "level_1"
    and ``value > high`` gives "level_2". There is deliberately no
    ``otherwise``: a null measurement yields a null level instead of being
    reported as the highest alert.
    """
    return (
        when(value < low, "level_0")
        .when(value <= high, "level_1")
        .when(value > high, "level_2")
    )


df_weather = (
    df_parsed
    # Records that do not match the schema (for instance the plain
    # {"msg": "Hello Kafka"} test message sent to this same topic) parse to a
    # null `weather` struct; drop them instead of emitting all-null rows.
    .filter(col("weather").isNotNull())
    .select(
        col("latitude"),
        col("longitude"),
        col("weather.temperature").cast("double").alias("temperature"),
        col("weather.windspeed").cast("double").alias("windspeed"),
        col("weather.time").cast("timestamp").alias("event_time"),
    )
    .withColumn("wind_alert_level", _alert_level(col("windspeed"), 10, 20))
    .withColumn("heat_alert_level", _alert_level(col("temperature"), 25, 35))
)

# Columns serialised into the JSON value of each outgoing Kafka record.
output_columns = [
    "event_time",
    "temperature",
    "windspeed",
    "wind_alert_level",
    "heat_alert_level",
]
df_out = df_weather.select(to_json(struct(*output_columns)).alias("value"))

# Stream the transformed records to the `weather_transformed` topic.
query = (
    df_out.writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "weather_transformed")
    .option("checkpointLocation", "/tmp/spark_checkpoint_weather_transform")
    .start()
)

# Block the cell while the streaming query is running.
query.awaitTermination()

Exercice 5

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window, count, avg, min, max
from pyspark.sql.types import StructType, StructField, TimestampType, DoubleType, StringType

# Stop a Spark session left over from a previously executed cell, if any, so
# that the session configured below is created afresh.
try:
    spark.stop()
except NameError:
    # Fresh kernel: no `spark` variable exists yet, nothing to stop.
    pass
except Exception as exc:
    # Still best effort, but no longer swallows KeyboardInterrupt/SystemExit
    # and no longer hides why the previous session could not be stopped.
    print("Arrêt de la session Spark précédente impossible:", exc)

# Spark session for the aggregation job, with the Kafka connector artifact.
spark = (
    SparkSession.builder
    .appName("Exo5_AgrégatsSeparés")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.1")
    .getOrCreate()
)

# Layout of the JSON records expected on the `weather_transformed` topic.
schema = (
    StructType()
    .add("event_time", TimestampType())
    .add("temperature", DoubleType())
    .add("windspeed", DoubleType())
    .add("wind_alert_level", StringType())
    .add("heat_alert_level", StringType())
    .add("city", StringType())
    .add("country", StringType())
)

# Streaming read of the topic, starting from the earliest offsets.
df_raw = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "weather_transformed")
    .option("startingOffsets", "earliest")
    .load()
)

# The Kafka value is binary: cast it to a string, then parse it as JSON.
df_json = df_raw.selectExpr("CAST(value AS STRING) as json_str")
parsed_payload = from_json(col("json_str"), schema).alias("data")
df_parsed = df_json.select(parsed_payload).select("data.*")

# Sliding window: 5-minute windows evaluated every minute.
window_duration = "5 minutes"
slide_duration = "1 minute"

# Watermark of 10 minutes on the event-time column.
with_watermark = df_parsed.withWatermark("event_time", "10 minutes")

# An alert is any record whose wind or heat level is level_1 or level_2.
is_wind_alert = col("wind_alert_level").isin("level_1", "level_2")
is_heat_alert = col("heat_alert_level").isin("level_1", "level_2")
alerts = with_watermark.filter(is_wind_alert | is_heat_alert)

# Both aggregates share the same grouping: window, city and country.
grouping = [window("event_time", window_duration, slide_duration), "city", "country"]

alerts_count = alerts.groupBy(*grouping).agg(count("*").alias("alert_count"))

# `min` / `max` are the pyspark.sql.functions imported by this cell.
temp_stats = with_watermark.groupBy(*grouping).agg(
    avg("temperature").alias("avg_temperature"),
    min("temperature").alias("min_temperature"),
    max("temperature").alias("max_temperature"),
)

# Console sink for the alert counts (update mode: changed rows only).
alerts_query = (
    alerts_count.writeStream
    .outputMode("update")
    .format("console")
    .option("truncate", False)
    .start()
)

# Console sink for the temperature statistics.
temp_query = (
    temp_stats.writeStream
    .outputMode("update")
    .format("console")
    .option("truncate", False)
    .start()
)

# Wait on the session's queries as a set. Calling awaitTermination() on
# alerts_query first would never reach the second call while that query runs,
# so a failure of temp_query would go unnoticed.
spark.streams.awaitAnyTermination()


Exercice 6


In [None]:
import requests
from kafka import KafkaProducer
import json
import time

def get_weather(lat, lon, timeout=10):
    """Fetch the current weather for a coordinate from the Open-Meteo forecast API.

    Args:
        lat: Latitude sent as the ``latitude`` query parameter.
        lon: Longitude sent as the ``longitude`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(
        "https://api.open-meteo.com/v1/forecast",
        # Let requests build and encode the query string.
        params={"latitude": lat, "longitude": lon, "current_weather": "true"},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

def run_producer(lat, lon, iterations=3, interval=10):
    """Poll the weather API and publish each reading to the ``weather_stream`` topic.

    Args:
        lat: Latitude passed to ``get_weather`` and copied into each message.
        lon: Longitude passed to ``get_weather`` and copied into each message.
        iterations: Number of polls to perform (default 3).
        interval: Seconds to wait between two polls (default 10).

    Each message is a dict with the keys ``latitude``, ``longitude`` and
    ``weather`` (the ``current_weather`` object of the API response). Polls
    whose response has no ``current_weather`` data are reported and skipped.
    """
    producer = KafkaProducer(
        bootstrap_servers='localhost:29092',
        value_serializer=lambda v: json.dumps(v).encode('utf-8')
    )
    topic = 'weather_stream'

    print(f"Streaming météo pour latitude={lat}, longitude={lon} vers le topic {topic}")

    try:
        for attempt in range(iterations):
            weather_data = get_weather(lat, lon)
            current_weather = weather_data.get('current_weather', {})
            if current_weather:
                message = {
                    'latitude': lat,
                    'longitude': lon,
                    'weather': current_weather
                }
                producer.send(topic, message)
                producer.flush()
                print("Message envoyé:", message)
            else:
                print("Pas de données météo reçues")
            # Pause only between polls: sleeping after the last one would just
            # delay the return.
            if attempt < iterations - 1:
                time.sleep(interval)
    finally:
        # Always release the producer, including when the API call raises.
        producer.close()

# Coordinates streamed by this cell.
latitude, longitude = 48.8566, 2.3522

run_producer(lat=latitude, lon=longitude)


Exercice 7

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, TimestampType, DoubleType, StringType

# Stop a Spark session left over from a previously executed cell, if any, so
# that the session configured below is created afresh.
try:
    spark.stop()
except NameError:
    # Fresh kernel: no `spark` variable exists yet, nothing to stop.
    pass
except Exception as exc:
    # Still best effort, but no longer swallows KeyboardInterrupt/SystemExit
    # and no longer hides why the previous session could not be stopped.
    print("Arrêt de la session Spark précédente impossible:", exc)

# Spark session for the Kafka-to-local-files job, with the Kafka connector.
spark = (
    SparkSession.builder
    .appName("SimpleKafkaToLocal")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.1")
    .getOrCreate()
)

# Layout of the JSON records expected on the `weather_transformed` topic.
schema = (
    StructType()
    .add("event_time", TimestampType())
    .add("temperature", DoubleType())
    .add("windspeed", DoubleType())
    .add("wind_alert_level", StringType())
    .add("heat_alert_level", StringType())
    .add("city", StringType())
    .add("country", StringType())
)

# NOTE(review): this reader targets localhost:9092 while the other Spark
# readers in the notebook use kafka:9092 — confirm which address is intended.
df_raw = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "weather_transformed")
    .option("startingOffsets", "earliest")
    .load()
)

# The Kafka value is binary: cast it to a string, then parse it as JSON.
df_json = df_raw.selectExpr("CAST(value AS STRING) as json_str")
parsed_payload = from_json(col("json_str"), schema).alias("data")
df_parsed = df_json.select(parsed_payload).select("data.*")

# Keep only the records carrying a wind or heat alert (level_1 or level_2).
has_wind_alert = col("wind_alert_level").isin("level_1", "level_2")
has_heat_alert = col("heat_alert_level").isin("level_1", "level_2")
alerts = df_parsed.filter(has_wind_alert | has_heat_alert)

# Append the alerts as JSON files under ./alerts_data.
query = (
    alerts.writeStream
    .outputMode("append")
    .format("json")
    .option("path", "./alerts_data")
    .option("checkpointLocation", "./alerts_checkpoint")
    .start()
)

# Block the cell while the streaming query is running.
query.awaitTermination()


Exercice 8


In [None]:
from pyspark.sql.types import StructType, StructField, TimestampType, DoubleType, StringType
from pyspark.sql.functions import col
import matplotlib.pyplot as plt
import pandas as pd

# Load the alert sample with pandas and make sure event_time is a datetime.
pdf = pd.read_json("./alerts_data/alerts_test.json").assign(
    event_time=lambda frame: pd.to_datetime(frame["event_time"])
)

# Spark schema applied when converting the pandas frame.
schema = (
    StructType()
    .add("event_time", TimestampType())
    .add("temperature", DoubleType())
    .add("windspeed", DoubleType())
    .add("wind_alert_level", StringType())
    .add("heat_alert_level", StringType())
    .add("city", StringType())
    .add("country", StringType())
)

# `spark` is the session created by an earlier cell of the notebook.
df = spark.createDataFrame(pdf, schema=schema)
df.show()


# Temperature over time.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(pdf["event_time"], pdf["temperature"])
ax.set(title="Évolution température", xlabel="Temps", ylabel="Température (°C)")
plt.show()

# Wind speed over time.
# NOTE(review): the axis label says m/s; confirm that the wind speed values in
# the data really use that unit.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(pdf["event_time"], pdf["windspeed"])
ax.set(title="Évolution vitesse du vent", xlabel="Temps", ylabel="Vitesse du vent (m/s)")
plt.show()

# Number of records per alert level, for wind and for heat.
alerts_count_wind = pdf.groupby("wind_alert_level").size()
alerts_count_heat = pdf.groupby("heat_alert_level").size()

# Draw the two series side by side: drawing them at the same x positions with
# transparency makes one series hide the other. Aligning on the union of levels
# also gives a level missing from one series a 0 instead of no bar slot.
alert_counts = pd.DataFrame(
    {"Vent": alerts_count_wind, "Chaleur": alerts_count_heat}
).fillna(0)

fig, ax = plt.subplots()
alert_counts.plot.bar(ax=ax, rot=0)
ax.set(
    title="Nombre d'alertes vent et chaleur par niveau",
    xlabel="Niveau d'alerte",
    ylabel="Nombre d'alertes",
)
ax.legend()
plt.show()


Exercice 9

In [None]:
from datetime import datetime

def fetch_weather_history_yearly(start_year, end_year, latitude=48.8566, longitude=2.3522,
                                 city="Paris", country="France",
                                 timezone="Europe/Paris", timeout=30):
    """Download hourly temperature and wind speed, one request per calendar year.

    Args:
        start_year: First year to fetch (inclusive).
        end_year: Last year to fetch (inclusive). An empty range returns ``[]``.
        latitude: Latitude sent to the archive API (default: 48.8566).
        longitude: Longitude sent to the archive API (default: 2.3522).
        city: Value stored under ``city`` in every record (default "Paris").
        country: Value stored under ``country`` in every record (default "France").
        timezone: ``timezone`` query parameter (default "Europe/Paris").
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A list of dicts with the keys ``event_time``, ``temperature``,
        ``windspeed``, ``city`` and ``country``; one dict per hourly value.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    # Imported locally on purpose: other cells of the notebook execute
    # `import datetime`, which rebinds the global name `datetime` to the module
    # and would make a global `datetime.today()` fail when this runs afterwards.
    from datetime import date

    # Evaluate "today" once so that every year sees the same reference date.
    today = date.today()
    url = "https://archive-api.open-meteo.com/v1/archive"

    all_records = []
    for year in range(start_year, end_year + 1):
        start_date = f"{year}-01-01"
        # The current year is only requested up to today.
        end_date = today.isoformat() if year == today.year else f"{year}-12-31"
        print(f"Fetching data for {start_date} to {end_date}...")
        params = {
            "latitude": latitude,
            "longitude": longitude,
            "start_date": start_date,
            "end_date": end_date,
            "hourly": "temperature_2m,windspeed_10m",
            "timezone": timezone
        }
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        hourly = response.json()["hourly"]
        # The three hourly lists are parallel: zip them into one record each.
        all_records.extend(
            {
                "event_time": dt,
                "temperature": temp,
                "windspeed": wind,
                "city": city,
                "country": country
            }
            for dt, temp, wind in zip(
                hourly["time"], hourly["temperature_2m"], hourly["windspeed_10m"]
            )
        )
    return all_records


Exercice 10

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, row_number
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql.window import Window
import json
import datetime

spark = SparkSession.builder.appName("Exo10_RecordsClimatiques_Local").getOrCreate()

# event_time is read as a string; a date is derived from it below.
schema = StructType([
    StructField("event_time", StringType()),
    StructField("temperature", DoubleType()),
    StructField("windspeed", DoubleType()),
    StructField("precipitation", DoubleType(), True),
    StructField("city", StringType()),
    StructField("country", StringType())
])

# multiLine: this same file is loaded elsewhere in the notebook with
# pd.read_json (without lines=True), i.e. as a regular JSON document. Spark's
# default reader expects one JSON value per line and would turn a document
# spanning several lines into all-null rows.
df = (
    spark.read
    .schema(schema)
    .option("multiLine", True)
    .json("./alerts_data/alerts_test.json")
)
df = df.withColumn("date", to_date(col("event_time")))

# Every record below is picked per city: rank the rows, keep rank 1.
w = Window.partitionBy("city")

# Hottest day: highest daily maximum temperature.
max_temp_per_day = df.groupBy("city", "date").max("temperature").withColumnRenamed("max(temperature)", "max_temp")
max_temp_record = (
    max_temp_per_day
    .withColumn("rn", row_number().over(w.orderBy(col("max_temp").desc())))
    .filter(col("rn") == 1)
    .select("city", "date", "max_temp")
)

# Coldest day: lowest daily minimum temperature.
min_temp_per_day = df.groupBy("city", "date").min("temperature").withColumnRenamed("min(temperature)", "min_temp")
min_temp_record = (
    min_temp_per_day
    .withColumn("rn", row_number().over(w.orderBy(col("min_temp").asc())))
    .filter(col("rn") == 1)
    .select("city", "date", "min_temp")
)

# Strongest wind: single observation with the highest wind speed.
max_wind_record = (
    df.withColumn("rn", row_number().over(w.orderBy(col("windspeed").desc())))
    .filter(col("rn") == 1)
    .select("city", "event_time", "windspeed")
)

# Wettest day. The column test alone cannot detect missing precipitation data:
# when the reading schema declares `precipitation`, the column exists even if
# the file holds no such field. Days whose total is NULL (no value at all) are
# therefore dropped, so that absent data yields no record rather than a null one.
if "precipitation" in df.columns:
    rain_per_day = (
        df.groupBy("city", "date").sum("precipitation")
        .withColumnRenamed("sum(precipitation)", "total_precip")
        .filter(col("total_precip").isNotNull())
    )
    rain_record = (
        rain_per_day
        .withColumn("rn", row_number().over(w.orderBy(col("total_precip").desc())))
        .filter(col("rn") == 1)
        .select("city", "date", "total_precip")
    )
else:
    rain_record = None

def convert_dates(obj):
    """Return a copy of ``obj`` with dates turned into ISO-format strings.

    Dicts and lists are walked recursively; ``datetime.date`` and
    ``datetime.datetime`` values are replaced by ``isoformat()``. Any other
    value is returned unchanged.
    """
    if isinstance(obj, (datetime.date, datetime.datetime)):
        return obj.isoformat()
    if isinstance(obj, list):
        return [convert_dates(item) for item in obj]
    if isinstance(obj, dict):
        return {key: convert_dates(value) for key, value in obj.items()}
    return obj

# Collect each record frame on the driver as a list of row dicts.
record_frames = {
    "max_temp": max_temp_record,
    "min_temp": min_temp_record,
    "max_wind": max_wind_record,
}
records = {
    label: frame.toPandas().to_dict(orient="records")
    for label, frame in record_frames.items()
}
# The precipitation record is optional.
if rain_record is not None:
    records["max_precip"] = rain_record.toPandas().to_dict(orient="records")
else:
    records["max_precip"] = []

# Dates are not JSON serialisable: convert them to strings first.
records_json_ready = convert_dates(records)

with open("./alerts_data/weather_records.json", "w") as out_file:
    json.dump(records_json_ready, out_file, indent=2)

print("Records calculés et écrits dans alerts_data/weather_records.json")


Exercice 11

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, month, year, avg, count, when
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
import json
import datetime

spark = SparkSession.builder.appName("Exo11_ClimatologieUrbaine_Local").getOrCreate()

# event_time is read as a string; month and year are derived from it below.
schema = StructType([
    StructField("event_time", StringType()),
    StructField("temperature", DoubleType()),
    StructField("windspeed", DoubleType()),
    StructField("wind_alert_level", StringType()),
    StructField("city", StringType()),
    StructField("country", StringType())
])

# multiLine: this same file is loaded elsewhere in the notebook with
# pd.read_json (without lines=True), i.e. as a regular JSON document. Spark's
# default reader expects one JSON value per line and would turn a document
# spanning several lines into all-null rows.
df = (
    spark.read
    .schema(schema)
    .option("multiLine", True)
    .json("./alerts_data/alerts_test.json")
)

df = df.withColumn("month", month(col("event_time"))).withColumn("year", year(col("event_time")))

# Share of rows in the group whose wind alert level is level_1 or level_2.
is_wind_alert = col("wind_alert_level").isin("level_1", "level_2")
alert_ratio = count(when(is_wind_alert, 1)) / count("*")

# One profile row per (country, city, month).
seasonal_profile = df.groupBy("country", "city", "month").agg(
    avg("temperature").alias("temperature_month_avg"),
    avg("windspeed").alias("windspeed_month_avg"),
    alert_ratio.alias("alert_probability"),
)

# Bring the profile to the driver as a list of row dicts.
profile_dict = seasonal_profile.toPandas().to_dict(orient="records")

with open("./alerts_data/seasonal_profile.json", "w") as out_file:
    json.dump(profile_dict, out_file, indent=2)

print("Profils saisonniers calculés et écrits dans alerts_data/seasonal_profile.json")


Exercice 12

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, month, year, avg, stddev, min, max, expr, count, when, percentile_approx
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
import json
import os
import datetime

spark = SparkSession.builder.appName("Exo12_ValidationEnrichissement").getOrCreate()

# Columns of the seasonal profile file; month is read as a double.
schema = StructType([
    StructField("country", StringType()),
    StructField("city", StringType()),
    StructField("month", DoubleType()),  
    StructField("temperature_month_avg", DoubleType()),
    StructField("windspeed_month_avg", DoubleType()),
    StructField("alert_probability", DoubleType())
])

# multiLine is required: seasonal_profile.json is written by this notebook with
# json.dump(..., indent=2), i.e. one pretty-printed array spanning many lines.
# Spark's default reader expects one JSON value per line and would yield only
# null rows for such a file.
df = (
    spark.read
    .schema(schema)
    .option("multiLine", True)
    .json("./alerts_data/seasonal_profile.json")
)

from pyspark.sql.window import Window
from pyspark.sql.functions import countDistinct

# Number of distinct months available for each (country, city).
grouped = df.groupBy("country", "city").agg(countDistinct("month").alias("months_count"))

# A profile is incomplete when fewer than 12 distinct months are present.
incomplete_profiles = grouped.where(col("months_count") < 12).collect()

if not incomplete_profiles:
    print("Tous les profils ont 12 mois complets")
else:
    print("Profils incomplets détectés:")
    for row in incomplete_profiles:
        print(f"{row['city']}, {row['country']} : {row['months_count']} mois détectés")


def _summary_stats(source_col, suffix):
    """Build the six summary aggregates of one metric, aliased with ``suffix``."""
    # `min` / `max` are the pyspark.sql.functions imported by this cell.
    return [
        avg(source_col).alias(f"avg_{suffix}"),
        stddev(source_col).alias(f"std_{suffix}"),
        min(source_col).alias(f"min_{suffix}"),
        max(source_col).alias(f"max_{suffix}"),
        expr(f"percentile_approx({source_col}, 0.5)").alias(f"median_{suffix}"),
        expr(f"percentile_approx({source_col}, array(0.25, 0.75))").alias(f"q25_q75_{suffix}"),
    ]


# Temperature statistics, then wind statistics, then the mean alert probability.
df_enriched = df.groupBy("country", "city", "month").agg(
    *_summary_stats("temperature_month_avg", "temp"),
    *_summary_stats("windspeed_month_avg", "wind"),
    avg("alert_probability").alias("avg_alert_prob"),
)

# Keep only the rows whose extremes fall inside the accepted ranges.
temperature_in_range = (col("min_temp") >= -50) & (col("max_temp") <= 60)
wind_in_range = (col("min_wind") >= 0) & (col("max_wind") <= 60)
df_valid = df_enriched.filter(temperature_in_range & wind_in_range)

profile_enriched_pd = df_valid.toPandas()

# Write the validated profiles as one JSON file in a dedicated directory.
output_dir = "./alerts_data/seasonal_profile_enriched"
os.makedirs(output_dir, exist_ok=True)

enriched_path = os.path.join(output_dir, "profile_enriched.json")
with open(enriched_path, "w") as out_file:
    json.dump(profile_enriched_pd.to_dict(orient="records"), out_file, indent=2)

print("Profils saisonniers validés, enrichis et sauvegardés en local sous", output_dir)

Exercice 13

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, month, year, avg, stddev, min, max, count, when, expr, to_date
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
import json
import datetime
import os
import pandas as pd

spark = SparkSession.builder.appName("Exo13_DetectionAnomalies").getOrCreate()

# Columns of the enriched profile file. The two quartile columns are declared
# as strings here.
profile_schema = StructType([
    StructField("country", StringType()),
    StructField("city", StringType()),
    StructField("month", DoubleType()),
    StructField("avg_temp", DoubleType()),
    StructField("std_temp", DoubleType()),
    StructField("min_temp", DoubleType()),
    StructField("max_temp", DoubleType()),
    StructField("median_temp", DoubleType()),
    StructField("q25_q75_temp", StringType()), 
    StructField("avg_wind", DoubleType()),
    StructField("std_wind", DoubleType()),
    StructField("min_wind", DoubleType()),
    StructField("max_wind", DoubleType()),
    StructField("median_wind", DoubleType()),
    StructField("q25_q75_wind", StringType()),
    StructField("avg_alert_prob", DoubleType())
])

# multiLine is required: profile_enriched.json is written by this notebook with
# json.dump(..., indent=2), i.e. one pretty-printed array spanning many lines.
# Spark's default reader expects one JSON value per line and would yield only
# null rows for such a file.
profiles_df = (
    spark.read
    .schema(profile_schema)
    .option("multiLine", True)
    .json("./alerts_data/seasonal_profile_enriched/profile_enriched.json")
)

# Sample real-time observation to compare against the historical profile.
realtime = dict(
    event_time="2025-09-23T15:00:00",
    city="Paris",
    country="France",
    temperature=30,
    windspeed=20,
    wind_alert_level="level_2",
)

# Month of the observation, used as part of the profile key.
event_dt = datetime.datetime.strptime(realtime["event_time"], "%Y-%m-%dT%H:%M:%S")
realtime_month = event_dt.month

# Look up the profile row matching (country, city, month) on the driver.
profiles_pd = profiles_df.toPandas()
same_country = profiles_pd["country"] == realtime["country"]
same_city = profiles_pd["city"] == realtime["city"]
same_month = profiles_pd["month"] == realtime_month
profile_row = profiles_pd[same_country & same_city & same_month]
if profile_row.empty:
    print("Pas de profil historique pour cette clé")
else:
    profile = profile_row.iloc[0]
    anomalies = []

    # Temperature: anomaly when it deviates from the profile mean by more than 5.
    if abs(realtime["temperature"] - profile["avg_temp"]) > 5:
        anomalies.append({
            "variable": "temperature",
            "observed_value": realtime["temperature"],
            "expected_value": profile["avg_temp"],
            "anomaly_type": "temperature_anomaly"
        })

    # Wind: anomaly above mean + 2 standard deviations; no threshold can be
    # computed when the standard deviation is missing.
    if pd.isna(profile["std_wind"]):
        wind_thresh = None
    else:
        wind_thresh = profile["avg_wind"] + 2 * profile["std_wind"]
    # Compare against None explicitly: a truthiness test would also skip a
    # legitimate threshold of 0.0.
    if wind_thresh is not None and realtime["windspeed"] > wind_thresh:
        anomalies.append({
            "variable": "windspeed",
            "observed_value": realtime["windspeed"],
            "expected_value": profile["avg_wind"],
            "anomaly_type": "windspeed_anomaly"
        })

    # Attach the event identification to every detected anomaly.
    event_anomalies = [
        {
            "city": realtime["city"],
            "country": realtime["country"],
            "event_time": realtime["event_time"],
            "variable": an["variable"],
            "observed_value": an["observed_value"],
            "expected_value": an["expected_value"],
            "anomaly_type": an["anomaly_type"]
        }
        for an in anomalies
    ]

    if event_anomalies:
        # Read the clock once so year and month always come from the same instant.
        now = datetime.datetime.now()
        out_dir = f"./hdfs-data/{realtime['country']}/{realtime['city']}/anomalies/{now.year}/{now.month:02d}"
        os.makedirs(out_dir, exist_ok=True)
        # Colons are removed from the timestamp used in the file name.
        fname = f"{out_dir}/anomalies_{realtime['event_time'].replace(':','')}.json"
        with open(fname, "w") as f:
            json.dump(event_anomalies, f, indent=2)
        print(f"Anomalies détectées et sauvegardées dans {fname}")
    else:
        print("Aucune anomalie détectée")


Exercice 14 - dashboard.py