In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import calendar
import ipywidgets as widgets
from ipywidgets import interact

# --- Set global Matplotlib parameters ---
# All style settings are applied in a single update so that each key is set
# exactly once (the previous version enabled text.usetex and then disabled it
# again a few lines later, after a redundant re-import of pyplot).
plt.rcParams.update({
    # Typography
    "font.family": "serif",
    "font.size": 20,
    "axes.labelsize": 20,
    "axes.titlesize": 20,
    "legend.fontsize": 20,
    # LaTeX text rendering is disabled. The preamble is kept so that switching
    # text.usetex back to True picks up amsmath without further changes.
    "text.usetex": False,
    "text.latex.preamble": r"\usepackage{amsmath}",
    # NOTE(review): at 1200 dpi a 16x8 inch figure is rasterised at
    # 19200x9600 pixels on every widget redraw - consider a lower value
    # for interactive use and reserve 1200 for exported figures.
    "figure.dpi": 1200,
    # Ticks point into the axes
    "xtick.direction": "in",
    "ytick.direction": "in",
    "xtick.major.size": 5.0,
    "xtick.minor.size": 3.0,
    "ytick.major.size": 5.0,
    "ytick.minor.size": 3.0,
    # Frame and legend geometry
    "axes.linewidth": 1.5,
    "legend.handlelength": 2.0,
})


**Update interactive EDA**

In [2]:
# --- Load the "Data" sheet of both workbooks ---
# training.xlsx holds Years 1 and 2; testing.xlsx holds Year 3.
train_df, test_df = (
    pd.read_excel(workbook, sheet_name='Data')
    for workbook in ('.././datasets/training.xlsx', '.././datasets/testing.xlsx')
)

**Hourly GHI Variation**

**Training Data (Years 1 and 2)**

In [None]:
def plot_hourly_variation(month, site):
    """Draw per-day hourly curves of ``site`` for one month of the training data.

    Two side-by-side panels sharing a y-axis are produced, one for Year 1
    and one for Year 2. Each panel holds one translucent line per day found
    in ``train_df`` for that month, overlaid with a black "Centroid" line:
    the mean of ``site`` at each hour across those days.

    Parameters
    ----------
    month : int
        Month number; matched against ``train_df["Month"]`` and used to
        index ``calendar.month_name`` for the panel titles.
    site : str
        Name of the ``train_df`` column to plot.
    """
    _, panels = plt.subplots(1, 2, figsize=(16, 8), sharey=True)
    in_month = train_df["Month"] == month

    for panel, year in zip(panels, (1, 2)):
        # Rows of the requested month within the current year
        year_rows = train_df.loc[(train_df["Year"] == year) & in_month]
        day_col = year_rows["Day"]

        # One faint line per day
        for day in sorted(day_col.unique()):
            one_day = year_rows.loc[day_col == day]
            panel.plot(one_day["Hour"], one_day[site],
                       marker="o", linestyle="-", lw=1.0, alpha=0.5)

        # Hour-by-hour mean over all days of the month
        hourly_mean = year_rows.groupby("Hour")[site].mean()
        panel.plot(hourly_mean.index, hourly_mean,
                   color='black', marker='D', linestyle='-', lw=2.5, label="Centroid")

        panel.set_title(f"Hourly GHI Variation for {calendar.month_name[month]} (Year {year}) - {site}")
        panel.set_xlabel("Hour of the Day")
        panel.legend(loc="upper left")

    # The y-axis is shared, so only the left panel is labelled
    panels[0].set_ylabel("GHI (Global Horizontal Irradiance)")
    plt.tight_layout()
    plt.show()

# Month dropdown: labels are month names, values are the month numbers
# present in the training data (0 is skipped); April (4) is preselected
month_widget = widgets.Dropdown(
    options=dict(
        (calendar.month_name[m], m)
        for m in sorted(train_df["Month"].unique())
        if m != 0
    ),
    value=4,
    description="Month:",
)

# Site dropdown: every training column whose name contains "GHI"
site_columns = list(filter(lambda name: "GHI" in name, train_df.columns))
site_widget = widgets.Dropdown(
    options=site_columns,
    value=site_columns[0],
    description="Site:",
)

# Redraw the figure whenever the month or the site selection changes
interact(plot_hourly_variation, month=month_widget, site=site_widget);

interactive(children=(Dropdown(description='Month:', index=3, options={'January': np.int64(1), 'February': np.…

In [None]:
def plot_hourly_variation(day, month, site):
    """Plot the hourly profile of ``site`` on one calendar day for Years 1 and 2.

    Two side-by-side panels sharing a y-axis are drawn from ``train_df``:
    Year 1 on the left and Year 2 on the right. A panel whose
    year/month/day combination has no rows shows a "No data" note instead
    of an empty line and legend.

    Parameters
    ----------
    day : int
        Day of the month; matched against ``train_df["Day"]``.
    month : int
        Month number; matched against ``train_df["Month"]`` and used to
        index ``calendar.month_name`` for the titles.
    site : str
        Name of the ``train_df`` column to plot.
    """
    fig, axes = plt.subplots(1, 2, figsize=(16, 8), sharey=True)

    for ax, year in zip(axes, (1, 2)):
        # Rows for the selected day, month, and year
        day_rows = train_df[(train_df["Year"] == year) &
                            (train_df["Month"] == month) &
                            (train_df["Day"] == day)]

        if day_rows.empty:
            # The Day dropdown offers every day number found in the data, so
            # combinations that are not real dates (e.g. April 31) can be
            # selected; say so rather than drawing a blank panel.
            ax.text(0.5, 0.5, "No data for this date",
                    transform=ax.transAxes, ha="center", va="center")
        else:
            ax.plot(day_rows["Hour"], day_rows[site],
                    marker="o", linestyle="-", lw=1.5, label=f"Year {year}")
            ax.legend(loc="upper left")

        ax.set_xlabel("Hour of the Day")
        ax.set_title(f"{calendar.month_name[month]} {day}, Year {year}")

    axes[0].set_ylabel("GHI (Global Horizontal Irradiance)")
    plt.suptitle(f"Hourly GHI Variation on {calendar.month_name[month]} {day} - {site}")
    # Keep the top 5% of the figure free so the suptitle does not collide
    # with the per-panel titles
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

# Create widget for month selection (using month names for clarity)
month_widget = widgets.Dropdown(
    options={calendar.month_name[m]: m for m in sorted(train_df["Month"].unique()) if m != 0},
    value=4,
    description="Month:"
)

# Create widget for day selection (every day number present in the training data)
day_widget = widgets.Dropdown(
    options=sorted(train_df["Day"].unique()),
    value=1,
    description="Day:"
)

# Create widget for site selection (assumes site columns contain "GHI")
site_columns = [col for col in train_df.columns if "GHI" in col]
site_widget = widgets.Dropdown(
    options=site_columns,
    value=site_columns[0],
    description="Site:"
)

# Use the interactive widget: day, month, and site.
# interact() returns the wrapped function; the trailing semicolon keeps that
# function repr from being echoed as the cell's output.
interact(plot_hourly_variation, day=day_widget, month=month_widget, site=site_widget);

interactive(children=(Dropdown(description='Day:', options=(np.int64(1), np.int64(2), np.int64(3), np.int64(4)…

<function __main__.plot_hourly_variation(day, month, site)>

**Testing Data (Year 3)**

In [5]:
def plot_hourly_variation(month, site):
    """Draw per-day hourly curves of ``site`` for one month of the testing data.

    Only rows of ``test_df`` with ``Year == 3`` are used. Each day of the
    month is drawn as a translucent line, and a black "Centroid" line shows
    the mean of ``site`` at each hour across those days.

    Parameters
    ----------
    month : int
        Month number; matched against ``test_df["Month"]`` and used to
        index ``calendar.month_name`` for the title.
    site : str
        Name of the ``test_df`` column to plot.
    """
    # Year 3 rows of the requested month
    month_rows = test_df.loc[(test_df["Year"] == 3) & (test_df["Month"] == month)]
    day_col = month_rows["Day"]

    _, ax = plt.subplots(figsize=(10, 6))

    # One faint line per day
    for day in sorted(day_col.unique()):
        one_day = month_rows.loc[day_col == day]
        ax.plot(one_day["Hour"], one_day[site],
                marker="o", linestyle="-", lw=1.0, alpha=0.5)

    # Hour-by-hour mean over all days of the month
    hourly_mean = month_rows.groupby("Hour")[site].mean()
    ax.plot(hourly_mean.index, hourly_mean,
            color='black', marker='D', linestyle='-', lw=2.5, label="Centroid")

    ax.set_xlabel("Hour of the Day")
    ax.set_ylabel("GHI (Global Horizontal Irradiance)")
    ax.set_title(f"Hourly GHI Variation for {calendar.month_name[month]} (Year 3) - {site}")
    ax.legend(loc="upper left")
    plt.show()

# Month dropdown: labels are month names, values are the month numbers
# present in the testing data (0 is skipped); April (4) is preselected
month_widget = widgets.Dropdown(
    options=dict(
        (calendar.month_name[m], m)
        for m in sorted(test_df["Month"].unique())
        if m != 0
    ),
    value=4,
    description="Month:",
)

# Site dropdown: every testing column whose name contains "GHI"
site_columns = list(filter(lambda name: "GHI" in name, test_df.columns))
site_widget = widgets.Dropdown(
    options=site_columns,
    value=site_columns[0],
    description="Site:",
)

# Redraw the figure whenever the month or the site selection changes
interact(plot_hourly_variation, month=month_widget, site=site_widget);

interactive(children=(Dropdown(description='Month:', index=3, options={'January': np.int64(1), 'February': np.…

In [6]:
def plot_hourly_variation(day, month, site):
    """Plot the hourly profile of ``site`` on one calendar day of Year 3.

    Rows are taken from ``test_df`` where ``Year == 3`` and the month and
    day match. If the combination has no rows, the axes show a "No data"
    note instead of an empty line and legend.

    Parameters
    ----------
    day : int
        Day of the month; matched against ``test_df["Day"]``.
    month : int
        Month number; matched against ``test_df["Month"]`` and used to
        index ``calendar.month_name`` for the title.
    site : str
        Name of the ``test_df`` column to plot.
    """
    # Rows for the selected day and month in Year 3
    day_rows = test_df[(test_df["Year"] == 3) &
                       (test_df["Month"] == month) &
                       (test_df["Day"] == day)]

    _, ax = plt.subplots(figsize=(10, 6))

    if day_rows.empty:
        # The Day dropdown offers every day number found in the data, so
        # combinations that are not real dates (e.g. April 31) can be
        # selected; say so rather than drawing a blank plot.
        ax.text(0.5, 0.5, "No data for this date",
                transform=ax.transAxes, ha="center", va="center")
    else:
        ax.plot(day_rows["Hour"], day_rows[site],
                marker="o", linestyle="-", lw=1.5, label="Year 3")
        ax.legend(loc="upper left")

    ax.set_xlabel("Hour of the Day")
    ax.set_ylabel("GHI (Global Horizontal Irradiance)")
    ax.set_title(f"Hourly GHI Variation on {calendar.month_name[month]} {day}, Year 3 - {site}")
    plt.show()

# Month dropdown: labels are month names, values are the month numbers
# present in the testing data (0 is skipped); April (4) is preselected
month_widget = widgets.Dropdown(
    options=dict(
        (calendar.month_name[m], m)
        for m in sorted(test_df["Month"].unique())
        if m != 0
    ),
    value=4,
    description="Month:",
)

# Day dropdown: every day number present in the testing data, starting at 1
day_widget = widgets.Dropdown(
    options=sorted(test_df["Day"].unique()),
    value=1,
    description="Day:",
)

# Site dropdown: every testing column whose name contains "GHI"
site_columns = list(filter(lambda name: "GHI" in name, test_df.columns))
site_widget = widgets.Dropdown(
    options=site_columns,
    value=site_columns[0],
    description="Site:",
)

# Redraw the figure whenever the day, month, or site selection changes
interact(plot_hourly_variation, day=day_widget, month=month_widget, site=site_widget);

interactive(children=(Dropdown(description='Day:', options=(np.int64(1), np.int64(2), np.int64(3), np.int64(4)…

**Hourly Temperature Variation**

**Training Data (Years 1 and 2)**

In [9]:
def plot_hourly_variation(month, site):
    """Draw per-day hourly temperature curves for one month of the training data.

    Two side-by-side panels sharing a y-axis are produced, one for Year 1
    and one for Year 2. Each panel holds one translucent line per day found
    in ``train_df`` for that month, overlaid with a black "Centroid" line:
    the mean of ``site`` at each hour across those days.

    Parameters
    ----------
    month : int
        Month number; matched against ``train_df["Month"]`` and used to
        index ``calendar.month_name`` for the panel titles.
    site : str
        Name of the ``train_df`` column to plot.
    """
    _, panels = plt.subplots(1, 2, figsize=(16, 8), sharey=True)
    in_month = train_df["Month"] == month

    for panel, year in zip(panels, (1, 2)):
        # Rows of the requested month within the current year
        year_rows = train_df.loc[(train_df["Year"] == year) & in_month]
        day_col = year_rows["Day"]

        # One faint line per day
        for day in sorted(day_col.unique()):
            one_day = year_rows.loc[day_col == day]
            panel.plot(one_day["Hour"], one_day[site],
                       marker="o", linestyle="-", lw=1.0, alpha=0.5)

        # Hour-by-hour mean of the selected column over all days of the month
        hourly_mean = year_rows.groupby("Hour")[site].mean()
        panel.plot(hourly_mean.index, hourly_mean,
                   color='black', marker='D', linestyle='-', lw=2.5, label="Centroid")

        panel.set_title(f"Hourly Temp for {calendar.month_name[month]} (Year {year}) - {site}")
        panel.set_xlabel("Hour of the Day")
        panel.legend(loc="upper left")

    # The y-axis is shared, so only the left panel is labelled
    panels[0].set_ylabel("Temperature")
    plt.tight_layout()
    plt.show()

# Month dropdown: labels are month names, values are the month numbers
# present in the training data (0 is skipped); April (4) is preselected
month_widget = widgets.Dropdown(
    options=dict(
        (calendar.month_name[m], m)
        for m in sorted(train_df["Month"].unique())
        if m != 0
    ),
    value=4,
    description="Month:",
)

# Site dropdown: every training column whose name contains "Temp"
site_columns = list(filter(lambda name: "Temp" in name, train_df.columns))
site_widget = widgets.Dropdown(
    options=site_columns,
    value=site_columns[0],
    description="Site:",
)

# Redraw the figure whenever the month or the site selection changes
interact(plot_hourly_variation, month=month_widget, site=site_widget);

interactive(children=(Dropdown(description='Month:', index=3, options={'January': np.int64(1), 'February': np.…

In [10]:
def plot_hourly_variation(day, month, site):
    """Plot the hourly temperature profile on one calendar day for Years 1 and 2.

    Two side-by-side panels sharing a y-axis are drawn from ``train_df``:
    Year 1 on the left and Year 2 on the right. A panel whose
    year/month/day combination has no rows shows a "No data" note instead
    of an empty line and legend.

    Parameters
    ----------
    day : int
        Day of the month; matched against ``train_df["Day"]``.
    month : int
        Month number; matched against ``train_df["Month"]`` and used to
        index ``calendar.month_name`` for the titles.
    site : str
        Name of the ``train_df`` column to plot.
    """
    fig, axes = plt.subplots(1, 2, figsize=(16, 8), sharey=True)

    for ax, year in zip(axes, (1, 2)):
        # Rows for the selected day, month, and year
        day_rows = train_df[(train_df["Year"] == year) &
                            (train_df["Month"] == month) &
                            (train_df["Day"] == day)]

        if day_rows.empty:
            # The Day dropdown offers every day number found in the data, so
            # combinations that are not real dates (e.g. April 31) can be
            # selected; say so rather than drawing a blank panel.
            ax.text(0.5, 0.5, "No data for this date",
                    transform=ax.transAxes, ha="center", va="center")
        else:
            ax.plot(day_rows["Hour"], day_rows[site],
                    marker="o", linestyle="-", lw=1.5, label=f"Year {year}")
            ax.legend(loc="upper left")

        ax.set_xlabel("Hour of the Day")
        ax.set_title(f"{calendar.month_name[month]} {day}, Year {year}")

    axes[0].set_ylabel("Temperature")
    plt.suptitle(f"Hourly Temp on {calendar.month_name[month]} {day} - {site}")
    # Keep the top 5% of the figure free so the suptitle does not collide
    # with the per-panel titles
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

# Create widget for month selection (using month names for clarity)
month_widget = widgets.Dropdown(
    options={calendar.month_name[m]: m for m in sorted(train_df["Month"].unique()) if m != 0},
    value=4,
    description="Month:"
)

# Create widget for day selection (every day number present in the training data)
day_widget = widgets.Dropdown(
    options=sorted(train_df["Day"].unique()),
    value=1,
    description="Day:"
)

# Create widget for site selection (assumes temperature columns contain "Temp")
site_columns = [col for col in train_df.columns if "Temp" in col]
site_widget = widgets.Dropdown(
    options=site_columns,
    value=site_columns[0],
    description="Site:"
)

# Use the interactive widget: day, month, and site.
# interact() returns the wrapped function; the trailing semicolon keeps that
# function repr from being echoed as the cell's output.
interact(plot_hourly_variation, day=day_widget, month=month_widget, site=site_widget);

interactive(children=(Dropdown(description='Day:', options=(np.int64(1), np.int64(2), np.int64(3), np.int64(4)…

<function __main__.plot_hourly_variation(day, month, site)>

**Testing Data (Year 3)**

In [11]:
def plot_hourly_variation(month, site):
    """Draw per-day hourly temperature curves for one month of the testing data.

    Only rows of ``test_df`` with ``Year == 3`` are used. Each day of the
    month is drawn as a translucent line, and a black "Centroid" line shows
    the mean of ``site`` at each hour across those days.

    Parameters
    ----------
    month : int
        Month number; matched against ``test_df["Month"]`` and used to
        index ``calendar.month_name`` for the title.
    site : str
        Name of the ``test_df`` column to plot.
    """
    # Year 3 rows of the requested month
    month_rows = test_df.loc[(test_df["Year"] == 3) & (test_df["Month"] == month)]
    day_col = month_rows["Day"]

    _, ax = plt.subplots(figsize=(10, 6))

    # One faint line per day
    for day in sorted(day_col.unique()):
        one_day = month_rows.loc[day_col == day]
        ax.plot(one_day["Hour"], one_day[site],
                marker="o", linestyle="-", lw=1.0, alpha=0.5)

    # Hour-by-hour mean of the selected column over all days of the month
    hourly_mean = month_rows.groupby("Hour")[site].mean()
    ax.plot(hourly_mean.index, hourly_mean,
            color='black', marker='D', linestyle='-', lw=2.5, label="Centroid")

    ax.set_xlabel("Hour of the Day")
    ax.set_ylabel("Temperature")
    ax.set_title(f"Hourly Temp for {calendar.month_name[month]} (Year 3) - {site}")
    ax.legend(loc="upper left")
    plt.show()

# Month dropdown: labels are month names, values are the month numbers
# present in the testing data (0 is skipped); April (4) is preselected
month_widget = widgets.Dropdown(
    options=dict(
        (calendar.month_name[m], m)
        for m in sorted(test_df["Month"].unique())
        if m != 0
    ),
    value=4,
    description="Month:",
)

# Site dropdown: every testing column whose name contains "Temp"
site_columns = list(filter(lambda name: "Temp" in name, test_df.columns))
site_widget = widgets.Dropdown(
    options=site_columns,
    value=site_columns[0],
    description="Site:",
)

# Redraw the figure whenever the month or the site selection changes
interact(plot_hourly_variation, month=month_widget, site=site_widget);

interactive(children=(Dropdown(description='Month:', index=3, options={'January': np.int64(1), 'February': np.…

In [12]:
def plot_hourly_variation(day, month, site):
    """Plot the hourly temperature profile on one calendar day of Year 3.

    Rows are taken from ``test_df`` where ``Year == 3`` and the month and
    day match. If the combination has no rows, the axes show a "No data"
    note instead of an empty line and legend.

    Parameters
    ----------
    day : int
        Day of the month; matched against ``test_df["Day"]``.
    month : int
        Month number; matched against ``test_df["Month"]`` and used to
        index ``calendar.month_name`` for the title.
    site : str
        Name of the ``test_df`` column to plot.
    """
    # Rows for the selected day and month in Year 3
    day_rows = test_df[(test_df["Year"] == 3) &
                       (test_df["Month"] == month) &
                       (test_df["Day"] == day)]

    _, ax = plt.subplots(figsize=(10, 6))

    if day_rows.empty:
        # The Day dropdown offers every day number found in the data, so
        # combinations that are not real dates (e.g. April 31) can be
        # selected; say so rather than drawing a blank plot.
        ax.text(0.5, 0.5, "No data for this date",
                transform=ax.transAxes, ha="center", va="center")
    else:
        ax.plot(day_rows["Hour"], day_rows[site],
                marker="o", linestyle="-", lw=1.5, label="Year 3")
        ax.legend(loc="upper left")

    ax.set_xlabel("Hour of the Day")
    ax.set_ylabel("Temperature")
    ax.set_title(f"Hourly Temp on {calendar.month_name[month]} {day}, Year 3 - {site}")
    plt.show()

# Month dropdown: labels are month names, values are the month numbers
# present in the testing data (0 is skipped); April (4) is preselected
month_widget = widgets.Dropdown(
    options=dict(
        (calendar.month_name[m], m)
        for m in sorted(test_df["Month"].unique())
        if m != 0
    ),
    value=4,
    description="Month:",
)

# Day dropdown: every day number present in the testing data, starting at 1
day_widget = widgets.Dropdown(
    options=sorted(test_df["Day"].unique()),
    value=1,
    description="Day:",
)

# Site dropdown: every testing column whose name contains "Temp"
site_columns = list(filter(lambda name: "Temp" in name, test_df.columns))
site_widget = widgets.Dropdown(
    options=site_columns,
    value=site_columns[0],
    description="Site:",
)

# Redraw the figure whenever the day, month, or site selection changes
interact(plot_hourly_variation, day=day_widget, month=month_widget, site=site_widget);

interactive(children=(Dropdown(description='Day:', options=(np.int64(1), np.int64(2), np.int64(3), np.int64(4)…

**Electric Load**

In [13]:
def plot_hourly_load_variation(month):
    """Draw per-day hourly "Load" curves for one month of the training data.

    Two side-by-side panels sharing a y-axis are produced, one for Year 1
    and one for Year 2. Each panel holds one translucent line per day found
    in ``train_df`` for that month, overlaid with a black "Centroid" line:
    the mean of the ``"Load"`` column at each hour across those days.

    Parameters
    ----------
    month : int
        Month number; matched against ``train_df["Month"]`` and used to
        index ``calendar.month_name`` for the titles.
    """
    _, panels = plt.subplots(1, 2, figsize=(16, 6), sharey=True)
    in_month = train_df["Month"] == month

    for panel, year in zip(panels, (1, 2)):
        # Rows of the requested month within the current year
        year_rows = train_df.loc[(train_df["Year"] == year) & in_month]
        day_col = year_rows["Day"]

        # One faint line per day
        for day in sorted(day_col.unique()):
            one_day = year_rows.loc[day_col == day]
            panel.plot(one_day["Hour"], one_day["Load"],
                       marker="o", linestyle="-", lw=1.0, alpha=0.5)

        # Hour-by-hour mean load over all days of the month
        hourly_mean = year_rows.groupby("Hour")["Load"].mean()
        panel.plot(hourly_mean.index, hourly_mean,
                   color='black', marker='D', linestyle='-', lw=2.5, label="Centroid")

        panel.set_title(f"Hourly Load for {calendar.month_name[month]} (Year {year})")
        panel.set_xlabel("Hour of the Day")
        panel.legend(loc="upper left")

    # The y-axis is shared, so only the left panel is labelled
    panels[0].set_ylabel("Electricity Load")
    plt.suptitle(f"Hourly Electricity Load Variation for {calendar.month_name[month]}")
    # Leave the top 5% of the figure to the suptitle
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

# Month dropdown: labels are month names, values are the month numbers
# present in the training data (0 is skipped); April (4) is preselected
month_widget = widgets.Dropdown(
    options=dict(
        (calendar.month_name[m], m)
        for m in sorted(train_df["Month"].unique())
        if m != 0
    ),
    value=4,
    description="Month:",
)

# Redraw the figure whenever the month selection changes
interact(plot_hourly_load_variation, month=month_widget);

interactive(children=(Dropdown(description='Month:', index=3, options={'January': np.int64(1), 'February': np.…

In [14]:
def plot_hourly_load_variation(day, month):
    """Plot the hourly "Load" profile on one calendar day for Years 1 and 2.

    Two side-by-side panels sharing a y-axis are drawn from ``train_df``:
    Year 1 on the left and Year 2 on the right. A panel whose
    year/month/day combination has no rows shows a "No data" note instead
    of an empty line and legend.

    Parameters
    ----------
    day : int
        Day of the month; matched against ``train_df["Day"]``.
    month : int
        Month number; matched against ``train_df["Month"]`` and used to
        index ``calendar.month_name`` for the titles.
    """
    fig, axes = plt.subplots(1, 2, figsize=(16, 6), sharey=True)

    for ax, year in zip(axes, (1, 2)):
        # Rows for the selected day, month, and year
        day_rows = train_df[(train_df["Year"] == year) &
                            (train_df["Month"] == month) &
                            (train_df["Day"] == day)]

        if day_rows.empty:
            # The Day dropdown offers every day number found in the data, so
            # combinations that are not real dates (e.g. April 31) can be
            # selected; say so rather than drawing a blank panel.
            ax.text(0.5, 0.5, "No data for this date",
                    transform=ax.transAxes, ha="center", va="center")
        else:
            ax.plot(day_rows["Hour"], day_rows["Load"],
                    marker="o", linestyle="-", lw=1.5, label=f"Year {year}")
            ax.legend(loc="upper left")

        ax.set_xlabel("Hour of the Day")
        ax.set_title(f"{calendar.month_name[month]} {day}, Year {year}")

    axes[0].set_ylabel("Electricity Load")
    plt.suptitle(f"Hourly Electricity Load Variation on {calendar.month_name[month]} {day}")
    # Leave the top 5% of the figure to the suptitle
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

# Month dropdown: labels are month names, values are the month numbers
# present in the training data (0 is skipped); April (4) is preselected
month_widget = widgets.Dropdown(
    options=dict(
        (calendar.month_name[m], m)
        for m in sorted(train_df["Month"].unique())
        if m != 0
    ),
    value=4,
    description="Month:",
)

# Day dropdown: every day number present in the training data, starting at 1
day_widget = widgets.Dropdown(
    options=sorted(train_df["Day"].unique()),
    value=1,
    description="Day:",
)

# Redraw the figure whenever the day or the month selection changes
interact(plot_hourly_load_variation, day=day_widget, month=month_widget);

interactive(children=(Dropdown(description='Day:', options=(np.int64(1), np.int64(2), np.int64(3), np.int64(4)…