In [1]:

import pandas as pd

# Load datasets
file_paths = {
    "data_2006_2023": "processed_data/data_2006_2023.csv",
    "elecBalance": "processed_data/elecBalance.csv",
    "energyPrice": "processed_data/energyPrice.csv",
    "GDP": "processed_data/GDP.csv",
    "nao": "processed_data/nao.csv",
    "populationNL": "processed_data/populationNL.csv",
    "renewableEnergy": "processed_data/renwableEnergy.csv",
    "weather": "processed_data/weather.csv",
    "yearlyFinalConsPerSource": "processed_data/yearlyfinalConsPerSource.csv",
}

dataframes = {name: pd.read_csv(path) for name, path in file_paths.items()}


In [2]:
# Add 'Year' column to `yearlyFinalConsPerSource`
dataframes['yearlyFinalConsPerSource']['Year'] = range(2006, 2023)


In [3]:
# Filter `energyPrice` to keep only rows with `PriceComponents` equal to "TotalPrice"
if "PriceComponents" in dataframes["energyPrice"].columns:
    dataframes["energyPrice"] = dataframes["energyPrice"][
        dataframes["energyPrice"]["PriceComponents"] == "TotalPrice"
    ]


In [4]:
# Add or fix the `Year` column in all datasets
for name, df in dataframes.items():
    if "Year" not in df.columns:
        if "Periods" in df.columns:
            # Extract year from 'Periods'
            df["Year"] = pd.to_numeric(df["Periods"].str.extract(r"(\d{4})")[0], errors="coerce")
        elif "datetime" in df.columns:
            # Extract year from 'datetime'
            df["Year"] = pd.to_datetime(df["datetime"], errors="coerce").dt.year
        elif name == "yearlyFinalConsPerSource":
            # Add range of years for yearly consumption data
            df["Year"] = range(2006, 2023)
        else:
            print(f"Cannot derive 'Year' column for dataset: {name}")
    else:
        # Ensure `Year` column is numeric
        df["Year"] = pd.to_numeric(df["Year"], errors="coerce")


In [5]:
start_year = 2009
end_year = 2022

# Filter datasets based on the Year or Date column
for name, df in dataframes.items():
    if "Year" in df.columns:
        dataframes[name] = df[(df["Year"] >= start_year) & (df["Year"] <= end_year)]
    elif "Date" in df.columns:
        df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
        dataframes[name] = df[(df["Date"].dt.year >= start_year) & (df["Date"].dt.year <= end_year)]


In [6]:
# Aggregate hourly loadConsumption data to daily
if "data_2006_2023" in dataframes:
    hourly_data = dataframes["data_2006_2023"]
    # Ensure `datetime` is in datetime format
    hourly_data["datetime"] = pd.to_datetime(hourly_data["datetime"], errors="coerce")
    # Aggregate by date (sum daily consumption)
    daily_load = hourly_data.groupby(hourly_data["datetime"].dt.date)["loadConsumption"].sum().reset_index()
    daily_load.rename(columns={"datetime": "Date", "loadConsumption": "daily_load"}, inplace=True)
    daily_load["Date"] = pd.to_datetime(daily_load["Date"])
    dataframes["data_2006_2023_daily"] = daily_load


In [7]:
# Add `Year` column to datasets where missing
for name, df in dataframes.items():
    if "Year" not in df.columns:
        if "Date" in df.columns:
            df["Year"] = pd.to_datetime(df["Date"], errors="coerce").dt.year
        elif "Periods" in df.columns:
            df["Year"] = df["Periods"].str.extract(r"(\d{4})").astype(int)
        else:
            print(f"Cannot derive `Year` column for dataset: {name}")


In [8]:
def upsample_to_daily(df, date_col="Year", value_cols=None):
    """
    Upsample data to daily granularity using interpolation or forward-fill.
    :param df: Input DataFrame
    :param date_col: Column with yearly or monthly values (Year or Date)
    :param value_cols: Columns to interpolate (default: numeric columns)
    :return: DataFrame with daily granularity
    """
    # Ensure the column exists
    if date_col not in df.columns:
        raise KeyError(f"Column '{date_col}' not found in DataFrame. Available columns: {df.columns}")
    
    # Convert date_col to datetime if it's not already
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    if df[date_col].isnull().all():
        raise ValueError(f"Column '{date_col}' could not be converted to datetime.")

    # Create daily date range and upsample
    daily_df = pd.DataFrame({'Date': pd.date_range(start=df[date_col].min(), end=df[date_col].max(), freq="D")})
    daily_df = daily_df.merge(df, left_on="Date", right_on=date_col, how="left").drop(columns=[date_col])

    # Interpolate specified value columns
    if value_cols:
        daily_df[value_cols] = daily_df[value_cols].interpolate(method="linear").fillna(method="bfill")

    return daily_df


In [9]:
# Check and fix `Year` column across datasets
for name, df in dataframes.items():
    if "Year" not in df.columns:
        print(f"Dataset '{name}' is missing the 'Year' column.")
    else:
        try:
            df["Year"] = pd.to_numeric(df["Year"], errors="coerce").astype("Int64")
            if df["Year"].isna().any():
                print(f"Dataset '{name}' has invalid 'Year' values after conversion.")
        except Exception as e:
            print(f"Error processing 'Year' in dataset '{name}': {e}")


In [10]:
# Check and convert non-numeric columns for datasets where needed
for name, df in dataframes.items():
    if name == "populationNL":
        print(f"Inspecting dataset '{name}'...")
        for col in df.columns:
            try:
                df[col] = pd.to_numeric(df[col], errors="ignore")
            except Exception as e:
                print(f"Could not convert column '{col}' in dataset '{name}': {e}")


Inspecting dataset 'populationNL'...


In [11]:
# upsample_to_daily function
def upsample_to_daily(df, date_col="Year", value_cols=None):
    """
    Upsample data to daily granularity using interpolation and forward/backward filling.
    """
    if date_col not in df.columns:
        raise KeyError(f"Column '{date_col}' not found in DataFrame. Available columns: {df.columns}")

    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    if df[date_col].isnull().all():
        raise ValueError(f"Column '{date_col}' could not be converted to datetime.")

    daily_df = pd.DataFrame({"Date": pd.date_range(start=df[date_col].min(), end=df[date_col].max(), freq="D")})
    daily_df = daily_df.merge(df, left_on="Date", right_on=date_col, how="left").drop(columns=[date_col])

    if value_cols:
        daily_df[value_cols] = daily_df[value_cols].interpolate(method="linear").bfill().ffill()

    return daily_df


In [12]:
# Skip datasets without numeric columns during upsampling
upsampled_dataframes = {}
for name, df in dataframes.items():
    numeric_columns = df.select_dtypes(include="number").columns.tolist()
    if not numeric_columns:
        print(f"Skipping dataset '{name}' - No numeric columns for upsampling.")
        continue
    try:
        upsampled_dataframes[name] = upsample_to_daily(df, date_col="Year", value_cols=numeric_columns)
    except Exception as e:
        print(f"Error processing dataset '{name}': {e}")


Error processing dataset 'data_2006_2023': "['Year'] not in index"
Error processing dataset 'elecBalance': "['Year'] not in index"
Error processing dataset 'energyPrice': "['Year'] not in index"
Error processing dataset 'GDP': "['Year'] not in index"
Error processing dataset 'nao': "['Year'] not in index"
Error processing dataset 'populationNL': "None of [Index(['Year'], dtype='object')] are in the [columns]"
Error processing dataset 'renewableEnergy': "['Year'] not in index"
Error processing dataset 'weather': "['Year'] not in index"
Error processing dataset 'yearlyFinalConsPerSource': "['Year'] not in index"
Error processing dataset 'data_2006_2023_daily': "['Year'] not in index"


In [None]:

# Fix or add the 'Year' column for all datasets
for name, df in dataframes.items():
    if "Year" not in df.columns:
        if "Periods" in df.columns:
            # Extract year from 'Periods'
            df["Year"] = pd.to_numeric(df["Periods"].str.extract(r"(\d{4})")[0], errors="coerce")
        elif "datetime" in df.columns:
            # Extract year from 'datetime'
            df["Year"] = pd.to_datetime(df["datetime"], errors="coerce").dt.year
        elif name == "yearlyFinalConsPerSource":
            # Add range of years for yearly consumption data
            df["Year"] = range(2006, 2023)
        else:
            print(f"Cannot derive 'Year' column for dataset: {name}")
    else:
        # Ensure 'Year' column is numeric
        df["Year"] = pd.to_numeric(df["Year"], errors="coerce")
