In [0]:
%restart_python

In [0]:
# Load the landing table that the rest of the notebook transforms.
landing_table_name = "workspace.default.eco2mix_regional_landing"
landed_df = spark.table(landing_table_name)


In [0]:
# Columns retained from the landing table; anything not listed is dropped.
simple_cols = [
    "code_insee_region", "libelle_region", "nature",
    "date", "heure", "date_heure",
    "consommation", "thermique", "nucleaire", "eolien",
    "solaire", "hydraulique", "bioenergies", "ech_physiques",
]

# `select` accepts the list directly, so no unpacking is required.
landed_df_simple = landed_df.select(simple_cols)

display(landed_df_simple)


In [0]:
# Print summary statistics for the measure columns of the trimmed frame.
described_cols = [
    "consommation",
    "thermique",
    "nucleaire",
    "eolien",
    "solaire",
    "hydraulique",
    "bioenergies",
    "ech_physiques",
]
landed_df_simple.describe(*described_cols).show()

Casting types

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

# Columns that are cast to integer below.
integer_cols = [
    "consommation",
    "thermique",
    "nucleaire",
    "eolien",
    "solaire",
    "hydraulique",
    "bioenergies",
    "ech_physiques",
]

# Parse the date and timestamp columns first. `date` uses an explicit
# yyyy-MM-dd pattern; `date_heure` is parsed without a pattern (the original
# note says the values are ISO formatted -- confirm against the landing data).
landed_df_typed = (
    landed_df_simple
    .withColumn("date", F.to_date("date", "yyyy-MM-dd"))
    .withColumn("date_heure", F.to_timestamp("date_heure"))
)

# numeric columns -> int: replace each column in place with its integer cast.
for int_col in integer_cols:
    landed_df_typed = landed_df_typed.withColumn(
        int_col, F.col(int_col).cast(IntegerType())
    )


In [0]:
# Derive calendar parts from the parsed `date_heure` timestamp.
# The input must be `landed_df_typed` (the frame produced by the casting cell);
# the previous reference to `df_typed` named a variable that no cell defines,
# so this cell raised NameError on a fresh kernel / Run All.
landed_df_processed = (
    landed_df_typed
    .withColumn("year",          F.year("date_heure"))
    .withColumn("month",         F.month("date_heure"))
    .withColumn("day_of_month",  F.dayofmonth("date_heure"))
    .withColumn("hour_of_day",   F.hour("date_heure"))
    .withColumn("minute_of_hour",F.minute("date_heure"))
)


In [0]:
# Inspect the resulting schema, then print the first five rows untruncated.
landed_df_processed.printSchema()
landed_df_processed.show(n=5, truncate=False)


In [0]:
# Save the processed frame as a Delta table; "overwrite" mode replaces any
# existing table contents.
silver_writer = landed_df_processed.write.format("delta").mode("overwrite")
silver_writer.saveAsTable("silver_eco2mix_regional_tr_simple")

In [0]:
# Read the saved table back and render it.
silver_table_name = "silver_eco2mix_regional_tr_simple"
silver_df = spark.table(silver_table_name)
display(silver_df)