# In [None]:
# 02_ingestion_usgs.ipynb
from pyspark.sql import SparkSession
import requests, json, time, os
from datetime import datetime

# Obtain the Spark session for this notebook (getOrCreate reuses an active
# session if one already exists); fetch_and_save reads and writes through it.
spark = SparkSession.builder.appName("USGS_Ingestion").getOrCreate()

# Root location under which the flattened event data is written.
RAW_BASE = "hdfs://namenode:8020/raw/usgs"   # or "/mnt/data/raw/usgs" when running locally
# Local scratch directory for the downloaded feed snapshots; created up front
# so that writing a snapshot never fails on a missing directory.
os.makedirs("/mnt/data/usgs_temp", exist_ok=True)

def fetch_and_save(feed_url="https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_hour.geojson"):
    """Download a USGS GeoJSON feed, flatten its events and append them as Parquet.

    The raw response is first written as a JSON snapshot to
    ``/mnt/data/usgs_temp``, then loaded with Spark, flattened to one row per
    feature and appended under ``RAW_BASE/events/YYYY/MM/DD/``.

    Args:
        feed_url: URL of the GeoJSON feed to download. Defaults to the USGS
            "all earthquakes, past hour" summary feed.

    Returns:
        The Parquet output path, or ``None`` when the feed contained no
        features (the snapshot is still saved, but nothing is written under
        ``RAW_BASE``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    from datetime import timezone

    import pyspark.sql.functions as F

    r = requests.get(feed_url, timeout=30)
    # Fail loudly on HTTP errors instead of parsing an error page as feed data.
    r.raise_for_status()
    data = r.json()

    # Take the clock once so the date partition and the file name can never
    # disagree when the call straddles midnight UTC.
    now = datetime.now(timezone.utc)
    timestamp = now.strftime("%Y%m%dT%H%M%SZ")

    local_path = f"/mnt/data/usgs_temp/earthquakes_{timestamp}.json"
    with open(local_path, "w", encoding="utf-8") as f:
        json.dump(data, f)

    # With an empty "features" array Spark cannot infer the feature struct,
    # and the "feat.properties.*" expansion below would fail during analysis.
    features = data.get("features") if isinstance(data, dict) else None
    if not features:
        print("No events in feed; nothing written for", timestamp)
        return None

    # Read the snapshot back with Spark (json.dump emits a single line, which
    # is what the default line-delimited JSON reader expects).
    # NOTE(review): a bare local path is resolved against Spark's default
    # filesystem; if that is HDFS this may need a "file://" URI - confirm
    # against the cluster configuration.
    df = spark.read.json(local_path)

    # Normalize structure: one row per feature, properties flattened to columns.
    df2 = df.selectExpr("explode(features) as feat").selectExpr(
        "feat.properties.*",
        "feat.geometry.coordinates as coords",
        "feat.id as event_id"
    ).withColumnRenamed("time", "time_ms")

    # time_ms is epoch milliseconds; casting epoch seconds yields a timestamp.
    # GeoJSON coordinates are ordered [longitude, latitude, depth].
    df2 = df2.withColumn("event_time", (F.col("time_ms") / 1000).cast("timestamp")) \
             .withColumn("longitude", F.col("coords").getItem(0)) \
             .withColumn("latitude", F.col("coords").getItem(1)) \
             .withColumn("depth_km", F.col("coords").getItem(2))

    out_path = f"{RAW_BASE}/events/{now.strftime('%Y/%m/%d')}/earthquakes_{timestamp}.parquet"
    df2.write.mode("append").parquet(out_path)
    print("Saved to", out_path)
    return out_path

# Demo driver: when executed as a script (or notebook cell), poll the feed a
# few times. The guard must use the dunder names __name__ / "__main__";
# single underscores raise NameError because `_name_` is undefined.
if __name__ == "__main__":
    iterations = 3  # adjust, or remove the loop entirely, for production
    for i in range(iterations):
        fetch_and_save()
        # Wait 5 minutes (300 s) between polls, but not after the last one.
        if i < iterations - 1:
            time.sleep(300)