CELL 1 — Notebook context
EDA - Gold Time Series Dataset
Objective:
- Validate data quality
- Check temporal continuity
- Understand seasonality
- Confirm ML readiness

CELL 2 — Basic row count & time range

In [0]:
from pyspark.sql.functions import min, max

# Load the gold table once; the other cells visible in this notebook reuse this DataFrame.
df_gold_timeseries = spark.table("energydemand.default.eco2mix_regional_gold")

# Row count gives a first sanity check on dataset size.
total_rows = df_gold_timeseries.count()
print("Total rows:", total_rows)

# Earliest and latest timestamp covered by the table.
# NOTE: `min` / `max` here are the pyspark.sql.functions aggregates imported
# above, which shadow the Python builtins of the same name in this notebook.
df_gold_timeseries.agg(
    min("date_heure").alias("start_ts"),
    max("date_heure").alias("end_ts"),
).display()

CELL 3 — Schema verification


In [0]:
# Print column names, data types and nullability so the schema can be checked by eye.
df_gold_timeseries.printSchema()

CELL 4 — Null check on the target column y_consumption (count must be zero)

In [0]:
from pyspark.sql.functions import col, sum, when

# Count rows whose target value is missing: isNull() yields a boolean, which is
# cast to 0/1 and summed. `sum` is the pyspark.sql.functions aggregate imported
# above, not the Python builtin.
df_gold_timeseries.agg(
    sum(col("y_consumption").isNull().cast("int")).alias("null_y")
).display()

CELL 5 — Descriptive statistics



In [0]:
# Summary statistics (count, mean, stddev, min, max) for the target column only.
df_gold_timeseries.describe("y_consumption").display()

CELL 6 — Distribution (sampling)

In [0]:

# Pull a reproducible ~10% random sample of the target column to the driver as
# a pandas DataFrame, so the distribution can be plotted locally.
sample_pd = (
    df_gold_timeseries
    .select("y_consumption")
    .sample(fraction=0.1, seed=42)  # fixed seed keeps the sample reproducible
    .toPandas()
)

# 50-bin histogram of the sampled consumption values.
sample_pd.hist(bins=50)


CELL 7 — Temporal continuity check


In [0]:
from pyspark.sql.functions import lag, unix_timestamp
from pyspark.sql.window import Window
# Global window over every row, ordered by timestamp.
# NOTE(review): there is no partitionBy, so Spark evaluates this window in a
# single partition; acceptable for a modest table, slow for a large one.
# NOTE(review): if the table holds several rows per timestamp (e.g. one per
# region — TODO confirm against the schema), consecutive rows share a timestamp
# and diff_seconds is 0 for them, which skews the statistics below.
w = Window.orderBy("date_heure")

# prev_ts: timestamp of the preceding row (null for the first row).
# diff_seconds: elapsed seconds between a row and its predecessor.
df_gaps = df_gold_timeseries.withColumn(
    "prev_ts", lag("date_heure").over(w)
).withColumn(
    "diff_seconds", unix_timestamp("date_heure") - unix_timestamp("prev_ts")
)

# Distribution of step sizes; a perfectly regular series has min == max.
df_gaps.describe("diff_seconds").display()

In [0]:
# CELL 8 — Count missing time steps
# Counts the consecutive-row intervals in df_gaps that are longer than one hour
# (3600 seconds). Each such interval is a place where the series jumps over at
# least one timestamp, so a high count (e.g., 2306) means 2306 separate
# discontinuities in the dataset — not 2306 missing hours in total.

gap_count = df_gaps.where("diff_seconds > 3600").count()
print("Number of gaps (> 1 hour):", gap_count)

In [0]:
# CELL 9 — Hourly seasonality
# Mean consumption per hour_of_day value, listed in ascending hour order.
(
    df_gold_timeseries
    .groupBy("hour_of_day")
    .agg({"y_consumption": "avg"})
    .sort("hour_of_day")
    .display()
)


Databricks visualization. Run in Databricks to view.

In [0]:
# CELL 10 — Daily seasonality
# Mean consumption per day_of_month value, listed in ascending day order.
(
    df_gold_timeseries
    .groupBy("day_of_month")
    .agg({"y_consumption": "avg"})
    .sort("day_of_month")
    .display()
)


Databricks visualization. Run in Databricks to view.

In [0]:
# CELL 11 — Monthly seasonality
# Mean consumption per month value, listed in ascending month order.
(
    df_gold_timeseries
    .groupBy("month")
    .agg({"y_consumption": "avg"})
    .sort("month")
    .display()
)


Databricks visualization. Run in Databricks to view.

In [0]:
#CELL 13 — Outlier detection (simple)
from pyspark.sql.functions import avg, stddev
# Flag rows whose consumption lies more than OUTLIER_SIGMA standard deviations
# away from the global mean of y_consumption.
OUTLIER_SIGMA = 5  # half-width of the accepted band, in standard deviations

stats = df_gold_timeseries.select(
    avg("y_consumption").alias("mean"),
    stddev("y_consumption").alias("std")
).collect()[0]
mean_val = stats["mean"]
std_val = stats["std"]

# avg/stddev come back as None when the column has no non-null values, and the
# sample standard deviation is undefined for a single value (None, or NaN on
# some Spark configurations). Arithmetic on None raises TypeError, and a NaN
# bound would make the `<` comparison match every row, so the undefined case is
# handled explicitly instead of building a filter from it.
std_is_defined = std_val is not None and std_val == std_val  # NaN != NaN
if mean_val is None or not std_is_defined:
    print("Outlier detection skipped: mean/std of y_consumption is undefined")
    df_outliers = df_gold_timeseries.limit(0)  # empty frame, same schema
else:
    lower_bound = mean_val - OUTLIER_SIGMA * std_val
    upper_bound = mean_val + OUTLIER_SIGMA * std_val
    df_outliers = df_gold_timeseries.filter(
        (col("y_consumption") > upper_bound) |
        (col("y_consumption") < lower_bound)
    )
print("Outlier count:", df_outliers.count())
