In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
from ydata_profiling import ProfileReport
import sys
from pathlib import Path

# Make the project root importable so that `src.*` modules resolve from this notebook.
# The root is taken as the parent of the current working directory (project/ is the
# parent of notebooks/), so the kernel is expected to be started from notebooks/.
project_root = Path.cwd().parent
# Guard the append: re-running this cell must not stack duplicate sys.path entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
print("Project root added to PYTHONPATH:", project_root)
# This import has to come after the sys.path setup above, hence it is not in the top import block.
from src.data.load import load_raw_data



# Notebook-wide presentation defaults: seaborn "whitegrid" theme for plots,
# and no cap on the number of columns pandas shows when rendering a frame.
sns.set_theme(style="whitegrid")
pd.options.display.max_columns = None

# Read the project configuration (YAML); it supplies the raw-data path and the target column name.
config_path = "../configs/base.yaml"
# Pin the encoding so reading the file does not depend on the OS locale default.
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# yaml.safe_load returns None for an empty document; fail here with a clear message
# rather than with "'NoneType' object is not subscriptable" on the lookups below.
if not isinstance(config, dict):
    raise ValueError(f"Config file {config_path} is empty or is not a YAML mapping")

data_path = Path(config["raw_data_path"])
target_col = config["target"]

print("Config loaded successfully!")
print(f"Data path: {data_path}")
print(f"Target column:{target_col}")

# 3. Load data
# load_raw_data is called with a plain string path, so the Path from the config is converted.
df = load_raw_data(str(data_path))
# Every later cell indexes df[target_col]; fail here with a readable message if the
# configured target does not match the loaded columns.
if target_col not in df.columns:
    raise KeyError(
        f"Target column {target_col!r} not found in loaded data; "
        f"available columns: {list(df.columns)}"
    )
print(f"Data loaded successfully! Shape = {df.shape}")
df.head()

NameError: name '__file__' is not defined

In [None]:
# Overview of the loaded frame: dtypes / non-null counts, summary statistics, missing counts.
print("\n Data type and missing information:")
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping it
# in print() would append a stray "None" line after the report.
df.info()

print("\n Descriptive statistics:")
# Transposed so each column of df becomes one row of the summary table.
display(df.describe().T)

print("\n missing value statistics:")
print(df.isna().sum())

In [None]:
# Basic sanity checks, followed by a per-zone continuity check of the time axis.
print(df.dtypes)
print("Unique zones:", df["zone_id"].nunique(), sorted(df["zone_id"].unique().tolist()))

# Order rows by zone and then by time, with a fresh 0..n-1 index, so that
# consecutive rows inside a zone are neighbours in time.
df = df.sort_values(["zone_id", "datetime"], ignore_index=True)

# Hours elapsed since the previous row of the same zone. The first row of each
# zone has no predecessor (NaN) and is filled with 1, i.e. counted as a regular step.
gaps = df.groupby("zone_id")["datetime"].diff().dt.total_seconds() / 3600
gaps = gaps.fillna(1)
print("Share of non-1h steps:", gaps.ne(1).mean())


In [None]:
# Distributions of the target and the four wind columns: one histogram per column,
# then the matching summary statistics.
num_cols = [target_col, "U10", "V10", "U100", "V100"]
ax = df.hist(column=num_cols, bins=40, figsize=(12, 8))
# y slightly above 1 lifts the figure-level title clear of the top row of subplots.
plt.gcf().suptitle("Distributions", y=1.02)
plt.show()

df[num_cols].describe().transpose()

In [None]:
# Temporary analysis features: Euclidean magnitude of the (U, V) pairs at the two levels.
# These columns are added to df in place and are reused by the outlier cell further down.
df["_WS10"] = np.hypot(df["U10"], df["V10"])
df["_WS100"] = np.hypot(df["U100"], df["V100"])

# Scatter a reproducible sample (at most 30,000 rows, fixed seed) of target vs. _WS100.
# hue uses "zone_id", the zone column name every other cell of this notebook relies on;
# the previous "ZONEID" did not match that naming.
# NOTE(review): assumes load_raw_data exposes the zone column as zone_id — confirm.
sns.scatterplot(data=df.sample(min(len(df), 30000), random_state=0),
                x="_WS100", y=target_col, hue="zone_id", s=10, alpha=0.4, legend=False)
plt.title("Target vs. WS100 (sample)")
plt.show()

df[["_WS10", "_WS100", target_col]].corr(numeric_only=True)

In [None]:
# Seasonality: mean target per zone by hour of day, then by calendar month.
# The hour/month helper columns live on a copy (tmp), so df itself is not modified.
tmp = df.assign(hour=df["datetime"].dt.hour, month=df["datetime"].dt.month)

# One line per zone: average target for each hour value.
hourly = tmp.groupby(["zone_id", "hour"], as_index=False)[target_col].mean()
sns.lineplot(x="hour", y=target_col, hue="zone_id", data=hourly)
plt.title("Mean Target by Hour")
plt.show()

# Same view aggregated by month.
monthly = tmp.groupby(["zone_id", "month"], as_index=False)[target_col].mean()
sns.lineplot(x="month", y=target_col, hue="zone_id", data=monthly)
plt.title("Mean Target by Month")
plt.show()

In [None]:
# Missing rates: share of null values in each column, highest first.
miss = df.isnull().mean().sort_values(ascending=False)
print("Missing rate:\n", miss)

# IQR outlier flag
def iqr_outlier(s, k=1.5):
    """Flag the values of a Series that fall outside its IQR fences.

    Parameters
    ----------
    s : pandas.Series
        Numeric values to test.
    k : float, default 1.5
        Fence width as a multiple of the interquartile range.

    Returns
    -------
    pandas.Series
        Boolean mask with the same index as ``s``; True where a value is
        below ``Q1 - k*IQR`` or above ``Q3 + k*IQR``.
    """
    lower_q = s.quantile(0.25)
    upper_q = s.quantile(0.75)
    spread = upper_q - lower_q
    low_fence = lower_q - k * spread
    high_fence = upper_q + k * spread
    return s.lt(low_fence) | s.gt(high_fence)

# Count IQR outliers for the target, the derived _WS columns and the raw wind columns.
out_cols = [target_col, "_WS10", "_WS100", "U10", "V10", "U100", "V100"]
# Object-dtype columns are skipped; nulls are dropped before the quantiles are taken.
flags = {
    c: iqr_outlier(df[c].dropna())
    for c in out_cols
    if df[c].dtype != "O"
}
print({name: int(mask.sum()) for name, mask in flags.items()})

In [None]:
# Nothing executed here — notes for FE (feature engineering):
# - All rolling stats must be computed AFTER a shift(1) within each ZONEID.
# - Train/val/test must be split by time BEFORE model fitting.
# - TIMESTAMP should NOT be fed directly to LightGBM; instead use sine/cosine or drop it.
# NOTE(review): ZONEID / TIMESTAMP look like raw column names; the cells in this notebook
# use zone_id / datetime — confirm which names apply at feature-engineering time.
pass

In [None]:
# Zone distribution: one box per zone showing the spread of the target.
fig, box_ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=df, x="zone_id", y=target_col, ax=box_ax)
box_ax.set_title("Distribution of Wind Power by Zone")
plt.show()

In [None]:
# Time trend (zone 1): target value over time for the rows whose zone_id equals 1.
zone_1 = df[df["zone_id"] == 1]
plt.figure(figsize=(12, 4))
# The x-axis uses "datetime", the time column the sorting and seasonality cells rely on;
# the previous "TIMESTAMP" did not match that naming.
# NOTE(review): assumes load_raw_data exposes the time column as datetime — confirm.
plt.plot(zone_1["datetime"], zone_1[target_col], linewidth=0.7)
plt.title("Wind Power Over Time (Zone 1)")
plt.xlabel("Time")
plt.ylabel("Power")
plt.show()

In [None]:
# Correlation matrix: pairwise correlations of all numeric columns, drawn as an annotated heatmap.
corr = df.corr(numeric_only=True)
heat_fig, heat_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap="Blues", ax=heat_ax)
heat_ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Generate the HTML profiling report for the current frame and write it to the notebooks folder.
# NOTE(review): at this point df also carries the temporary _WS10 / _WS100 columns added
# earlier in the notebook, so they will show up in the report — confirm that is intended.
profile = ProfileReport(
    df,
    title="Wind Forecast EDA Report",
    explorative=True,
)
profile.to_file(output_file="../notebooks/EDA_report.html")

print("HTML EDA report saved to notebooks/EDA_report.html")