## Loading Eirgrid and Met Eireann Data

In [1]:
from pathlib import Path

import pandas as pd


# Read the cleaned datasets. Each CSV has a 'date' column that is parsed to
# datetimes and used as the index of the resulting frame.
eirgrid = pd.read_csv(
    'eirgrid_data/combined_hourly.csv',
    index_col='date',
    parse_dates=['date'],
)

met_eireann = pd.read_csv(
    'met_eireann_data/combined.csv',
    index_col='date',
    parse_dates=['date'],
)



In [2]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
eirgrid.info()
met_eireann.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 101508 entries, 2014-01-01 00:00:00 to 2025-07-31 23:00:00
Data columns (total 33 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   GMT Offset                 101508 non-null  int64  
 1   NI Generation              101508 non-null  float64
 2   NI Demand                  101508 non-null  float64
 3   NI Wind Availability       101508 non-null  float64
 4   NI Wind Generation         101508 non-null  float64
 5   NI Solar Availability      66448 non-null   float64
 6   NI Solar Generation        66448 non-null   float64
 7   Moyle I/C                  31388 non-null   float64
 8   NI Wind Penetration        31388 non-null   float64
 9   NI Solar Penetration       31388 non-null   float64
 10  IE Generation              101508 non-null  float64
 11  IE Demand                  101508 non-null  float64
 12  IE Wind Availability       101508 non-null  float64


## EDA on Eirgrid data

In [6]:
import matplotlib.pyplot as plt

In [13]:
plt.style.use('ggplot')

# savefig does not create missing directories, so make sure the target exists
plots_dir = Path('plots')
plots_dir.mkdir(parents=True, exist_ok=True)

# Keep the numeric columns, skipping those prefixed 'NI' or 'AI'
columns_to_plot = [
    col for col in eirgrid.select_dtypes(include=['float64', 'int64']).columns
    if not col.startswith(('NI', 'AI'))
]

for column in columns_to_plot:
    # One figure per column, addressed explicitly rather than through pyplot's
    # implicit "current figure" state
    fig, ax = plt.subplots(figsize=(15, 5))
    eirgrid[column].plot(ax=ax)
    ax.set_title(column)
    ax.set_xlabel('Date')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()

    # '/' and ' ' are replaced so the column name forms a single file name
    file_stem = column.lower().replace("/", "_").replace(" ", "_")
    fig.savefig(plots_dir / f'eirgrid_{file_stem}.png')

    # Close the figure so figures do not accumulate in memory across the loop
    plt.close(fig)


## EDA on Met Eireann data

In [14]:
plt.style.use('ggplot')

# savefig does not create missing directories, so make sure the target exists
plots_dir = Path('plots')
plots_dir.mkdir(parents=True, exist_ok=True)

# Float columns only, excluding the columns whose name starts with 'ind'
numeric_columns = [
    col for col in met_eireann.select_dtypes(include=['float64']).columns
    if not col.startswith('ind')
]

# The per-station row filter does not depend on the column being plotted, so
# it is computed once here instead of once per (column, station) pair.
station_frames = [
    (station, met_eireann[met_eireann['Station'] == station])
    for station in met_eireann['Station'].unique()
]

# One figure per column, with one line per station
for column in numeric_columns:
    fig, ax = plt.subplots(figsize=(15, 5))

    for station, station_data in station_frames:
        station_data[column].plot(ax=ax, label=station, alpha=0.7)

    ax.set_title(column)
    ax.set_xlabel('Date')
    ax.set_ylabel(column)
    ax.tick_params(axis='x', labelrotation=45)
    # Legend is anchored outside the axes, to the right of the plot area
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()

    # Same file-name sanitisation as the Eirgrid plots
    file_stem = column.lower().replace("/", "_").replace(" ", "_")
    fig.savefig(plots_dir / f'met_eireann_{file_stem}.png', bbox_inches='tight')

    # Close the figure so figures do not accumulate in memory across the loop
    plt.close(fig)


## Pop-out window for comparing multiple series against each other

In [3]:
# For every column not prefixed 'NI' or 'AI', record the index label of its
# first non-null value.
column_dates = {}

for column in eirgrid.columns:
    if column.startswith(('NI', 'AI')):
        continue  # Northern Ireland / All Ireland columns are not of interest here

    first_valid = eirgrid[column].first_valid_index()
    if first_valid is None:
        continue  # the column holds no data at all

    column_dates[column] = first_valid

# (column, date) pairs ordered from earliest to latest first appearance
sorted_columns = sorted(column_dates.items(), key=lambda pair: pair[1])

# Report the year in which each column first has data
for column, first_date in sorted_columns:
    print(f"{column}: {first_date.year}")

GMT Offset: 2014
IE Generation: 2014
IE Demand: 2014
IE Wind Availability: 2014
IE Wind Generation: 2014
SNSP: 2014
Moyle I/C: 2022
IE Solar Availability: 2022
IE Solar Generation: 2022
IE Hydro: 2022
IE Wind Penetration: 2022
IE Solar Penetration: 2022
Inter-Jurisdictional Flow: 2022
EWIC I/C: 2023


In [4]:
# For each Met Eireann column, report which stations have at least one
# non-null value in it (or that every station does).
Stations = met_eireann['Station'].unique()
total_station_count = len(Stations)

for column in met_eireann.columns:
    has_value = met_eireann[column].notna()
    stations_with_any_values = met_eireann.loc[has_value, 'Station'].unique()

    if len(stations_with_any_values) != total_station_count:
        # Only a subset of stations reports this column: list them
        print(f"Column: {column} has data from: {', '.join(stations_with_any_values)}")
        continue

    print(f'Column {column} has data from all stations')
        

Column Station has data from all stations
Column ind has data from all stations
Column rain has data from all stations
Column ind.1 has data from all stations
Column temp has data from all stations
Column ind.2 has data from all stations
Column wetb has data from all stations
Column dewpt has data from all stations
Column vappr has data from all stations
Column rhum has data from all stations
Column msl has data from all stations
Column: ind.3 has data from: ATHENRY, FINNER, GURTEEN, JOHNSTOWN_CASTLE_2, MACE_HEAD, MALIN_HEAD, MULLINGAR, NEWPORT, ROCHES_POINT, SHANNON_AIRPORT, SHERKIN_ISLAND, VALENTIA_OBSERVATORY
Column: wdsp has data from: ATHENRY, FINNER, GURTEEN, JOHNSTOWN_CASTLE_2, MACE_HEAD, MALIN_HEAD, MULLINGAR, NEWPORT, ROCHES_POINT, SHANNON_AIRPORT, SHERKIN_ISLAND, VALENTIA_OBSERVATORY
Column: ind.4 has data from: ATHENRY, FINNER, GURTEEN, JOHNSTOWN_CASTLE_2, MACE_HEAD, MALIN_HEAD, MULLINGAR, NEWPORT, ROCHES_POINT, SHANNON_AIRPORT, SHERKIN_ISLAND, VALENTIA_OBSERVATORY
Column: w

In [9]:
# Make sure pandas is imported, as we need it for the averaging logic
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib qt

def _plot_series_on_axis(ax, series_name, stations, color, met_columns):
    """Draw one named series on ``ax`` using ``color``.

    A Met Eireann series is drawn once per station with low opacity, followed
    by the across-station average at full opacity. Any other series is read
    from the Eirgrid frame and drawn as a single line.
    """
    if series_name in met_columns:
        per_station = []
        for station in stations:
            station_data = met_eireann[met_eireann['Station'] == station][series_name]
            # Individual stations are faint and kept out of the legend
            ax.plot(station_data.index, station_data, alpha=0.2, color=color, label='_nolegend_')
            per_station.append(station_data)

        # Align the stations by date (one column each) and average across them
        average_series = pd.concat(per_station, axis=1).mean(axis=1)
        ax.plot(average_series.index, average_series, label=f"Average - {series_name}", color=color)
    else:
        ax.plot(eirgrid.index, eirgrid[series_name], label=series_name, color=color)


def interactive_dual_plot(series1, series2, stations=None):
    """
    Create an interactive plot with two y-axes, showing individual station data
    transparently and their average with full opacity.

    Parameters:
    series1: str - First series name (plotted against the left y-axis)
    series2: str - Second series name (plotted against the right y-axis)
    stations: list - List of stations (only needed for Met Eireann data)

    Raises:
    ValueError - a series is a Met Eireann float column and no stations were given
    KeyError - a series is neither a Met Eireann float column nor an Eirgrid column
    """
    met_columns = met_eireann.select_dtypes(include=['float64']).columns

    # Validate both series before any figure is created, so that a bad call
    # does not leave an empty figure window behind.
    for series_name in (series1, series2):
        if series_name in met_columns:
            if not stations:
                raise ValueError(f"{series_name} is a weather parameter. Please provide at least one station.")
        elif series_name not in eirgrid.columns:
            raise KeyError(series_name)

    fig, ax1 = plt.subplots(figsize=(15, 7))
    ax2 = ax1.twinx()  # second y-axis sharing the same x-axis

    color1 = 'tab:blue'
    color2 = 'tab:orange'

    _plot_series_on_axis(ax1, series1, stations, color1, met_columns)
    _plot_series_on_axis(ax2, series2, stations, color2, met_columns)

    # --- Customize the plot ---
    ax1.set_xlabel('Date')
    ax1.set_ylabel(series1, color=color1)
    ax2.set_ylabel(series2, color=color2)
    ax1.tick_params(axis='y', labelcolor=color1)
    ax2.tick_params(axis='y', labelcolor=color2)

    # Merge the handles of both axes into a single legend
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left')

    plt.title(f'{series1} vs {series2}')
    plt.grid(True, linestyle='--', alpha=0.6)
    fig.tight_layout()
    plt.show()


In [12]:
# --- Example Usage ---
# Requires the 'eirgrid' and 'met_eireann' DataFrames to be loaded.
# The figure is shown via plt.show() inside the function.
interactive_dual_plot(
    'IE Solar Generation',
    'sun',
    stations=['SHANNON_AIRPORT'],
)

## Computing Correlation Coefficients

In [13]:
import pandas as pd

def calculate_correlation(series1, series2, stations=None):
    """
    Calculates the Pearson correlation coefficient between two time series.

    If a series is from Met Éireann data and no stations are provided,
    it will use the average of all available stations.

    Parameters:
    series1 (str): The name of the first series (e.g., 'IE Wind Generation').
    series2 (str): The name of the second series (e.g., 'temp').
    stations (list, optional): A list of specific stations to use.
                               Defaults to None (use all).

    Returns:
    float: The correlation coefficient, a value between -1.0 and 1.0, or NaN
           when the two series share no dates on which both have a value.

    Raises:
    ValueError: If a series name is found in neither dataset.
    """

    def get_series(series_name, requested_stations):
        """Return the named series indexed by date (station-averaged for weather data)."""
        met_columns = met_eireann.select_dtypes(include=['float64']).columns

        # --- Handle Met Éireann Data ---
        if series_name in met_columns:
            if requested_stations is None:
                # No stations passed: fall back to every station in the dataframe
                stations_to_use = met_eireann['Station'].unique()
                print(f"-> For '{series_name}', no stations specified. Using average of all {len(stations_to_use)} available stations.")
            else:
                stations_to_use = requested_stations
                print(f"-> For '{series_name}', using average of {len(stations_to_use)} specified station(s).")

            # Average the chosen stations per timestamp. Grouping on the index
            # gives the same per-date mean (NaNs skipped) as pivoting to one
            # column per station, but does not fail when a (date, station)
            # pair appears more than once.
            selected_rows = met_eireann['Station'].isin(stations_to_use)
            final_series = met_eireann.loc[selected_rows, series_name].groupby(level=0).mean()
            final_series.name = series_name
            return final_series

        # --- Handle EirGrid Data ---
        if series_name in eirgrid.columns:
            print(f"-> Using '{series_name}' series from Eirgrid data.")
            return eirgrid[series_name]

        # --- Handle Error ---
        raise ValueError(f"Series '{series_name}' not found in either dataset.")

    # --- Main Function Logic ---
    s1 = get_series(series1, stations)
    s2 = get_series(series2, stations)

    # Combine the two series, which aligns them by their index (date)
    # and drop any rows where one of the series has a missing value.
    combined_df = pd.concat([s1, s2], axis=1).dropna()

    if combined_df.empty:
        print("\nWarning: The two series have no overlapping time periods. Correlation cannot be calculated.")
        # NaN rather than 0.0: a zero would be indistinguishable from a
        # genuinely uncorrelated pair of series.
        return float('nan')

    # Columns are selected by position so that two series with the same name still work
    correlation = combined_df.iloc[:, 0].corr(combined_df.iloc[:, 1])

    print(f"\n✅ The correlation between '{series1}' and '{series2}' is: {correlation:.4f}")
    return correlation



In [None]:
# --- Example Usage ---
# Assuming 'eirgrid' and 'met_eireann' DataFrames are loaded

# Case 1: Eirgrid vs. Met Éireann with no stations passed, so the average
# over all available stations is used.
print("--- Case 1: Eirgrid vs. All Met Éireann Stations (Implicit) ---")
corr1 = calculate_correlation('IE Wind Generation', 'wdsp')



--- Case 1: Eirgrid vs. Specific Met Éireann Stations ---


-> Using 'IE Wind Generation' series from Eirgrid data.
-> For 'wdsp', no stations specified. Using average of all 14 available stations.

✅ The correlation between 'IE Wind Generation' and 'wdsp' is: 0.8223

--- Case 2: Eirgrid vs. All Met Éireann Stations (Implicit) ---
-> Using 'IE Demand' series from Eirgrid data.
-> For 'temp', no stations specified. Using average of all 14 available stations.

✅ The correlation between 'IE Demand' and 'temp' is: -0.0707

--- Case 3: Two Met Éireann Series vs. Each Other ---
-> For 'rain', using average of 2 specified station(s).
-> For 'rhum', using average of 2 specified station(s).

✅ The correlation between 'rain' and 'rhum' is: 0.2432


In [31]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

def _cross_correlation_matrix(eirgrid_df, met_eireann_df):
    """Return Pearson correlations with Eirgrid columns as rows and Met Éireann columns as columns."""
    # Eirgrid: drop the 'NI' / 'AI' prefixed columns, then keep float64 only
    eirgrid_cols_to_keep = [
        col for col in eirgrid_df.columns
        if not col.startswith(('NI', 'AI'))
    ]
    eirgrid_float = eirgrid_df[eirgrid_cols_to_keep].select_dtypes(include=['float64'])

    # Met Éireann: keep float64 only, then average all rows that share a
    # timestamp so that there is a single row per datetime
    met_eireann_avg = (
        met_eireann_df
        .select_dtypes(include=['float64'])
        .groupby(level=0)
        .mean()
    )

    # Inner join keeps only the timestamps present in both frames
    combined_df = eirgrid_float.join(met_eireann_avg, how='inner')

    if eirgrid_float.shape[1] == 0 or met_eireann_avg.shape[1] == 0 or combined_df.empty:
        raise ValueError(
            "Nothing to correlate: both dataframes need at least one float64 "
            "column and at least one shared timestamp."
        )

    # Correlate everything, then keep only the Eirgrid-vs-Met-Éireann block so
    # that correlations within a single dataframe are excluded
    full_correlation_matrix = combined_df.corr(method='pearson')
    return full_correlation_matrix.loc[eirgrid_float.columns, met_eireann_avg.columns]


def create_cross_correlation_heatmap(eirgrid_df, met_eireann_df):
    """
    Calculates and plots the cross-correlation between float64 columns of
    the two dataframes, excluding self-correlation within each dataframe.

    Args:
        eirgrid_df (pd.DataFrame): The Eirgrid dataframe with a datetime index.
        met_eireann_df (pd.DataFrame): The Met Éireann dataframe with a datetime index.

    Raises:
        ValueError: If either dataframe contributes no float64 column, or the
            two dataframes share no timestamps.
    """
    cross_correlation_matrix = _cross_correlation_matrix(eirgrid_df, met_eireann_df)

    # Figure height grows with the number of Eirgrid rows in the heatmap
    fig, ax = plt.subplots(figsize=(8, len(cross_correlation_matrix.index)))
    sns.heatmap(
        cross_correlation_matrix,
        annot=True,
        cmap='coolwarm',
        fmt=".2f",
        linewidths=0.5,
        linecolor='black',
        cbar_kws={'label': 'Pearson Correlation Coefficient'},
        ax=ax,
    )
    ax.set_title('Cross-Correlation Heatmap: Eirgrid vs. Met Éireann Float64 Columns')
    ax.set_ylabel('Eirgrid Variables')
    ax.set_xlabel('Met Éireann Variables')
    plt.show()


create_cross_correlation_heatmap(eirgrid, met_eireann)