This notebook loads a raw Excel file containing monthly wind capacity values (MW) for the Republic of Ireland, skips the unstructured header rows, converts the columns to proper date and numeric types, drops rows with missing or unparseable values, and saves a model-ready CSV file that can be aligned with the energy and weather datasets.

In [None]:
# pandas for data manipulation
import pandas as pd
# os for handling file paths and creating the output folder when saving
import os

The function below cleans the raw monthly wind capacity data. It:

- Skips the non-data header rows
- Extracts the date and capacity columns
- Converts the columns to datetime and numeric types
- Keeps only data from 2008 onward
- Saves a clean CSV file to the processed folder

In [4]:
def clean_wind_capacity_data(input_path, output_path, start_year: int = 2008) -> pd.DataFrame:
    """Clean a raw monthly wind capacity workbook and save it as a CSV file.

    Reads sheet ``"Sheet1"`` of the Excel file, skipping its first two rows and
    keeping only the columns at zero-based positions 2 and 5, which are renamed
    to ``Date`` and ``Wind_Capacity_MW``. Rows with missing or unparseable
    values are dropped, rows dated before ``start_year`` are removed, and the
    result is sorted by date before being written to ``output_path``.

    Parameters
    ----------
    input_path : str or path-like
        Location of the raw Excel workbook.
    output_path : str or path-like
        Destination of the cleaned CSV file. Its parent directory is created
        when it does not exist yet; a bare file name (no directory part) is
        written to the current working directory.
    start_year : int, default 2008
        Earliest calendar year (inclusive) kept in the output.

    Returns
    -------
    pandas.DataFrame
        The cleaned data with columns ``Date`` (datetime) and
        ``Wind_Capacity_MW`` (numeric), sorted by ``Date`` with a fresh
        0..n-1 index.
    """
    # Load the sheet, skip the first 2 rows, and keep only the Date + Capacity columns.
    # The labels read from the file are replaced with fixed, model-friendly names.
    capacity_raw = pd.read_excel(input_path, sheet_name="Sheet1", skiprows=2, usecols=[2, 5])
    capacity_raw.columns = ["Date", "Wind_Capacity_MW"]

    # Drop rows with missing values *before* type conversion so that blank
    # spreadsheet rows never take part in date parsing.
    capacity_present = capacity_raw.dropna(subset=["Date", "Wind_Capacity_MW"])

    # Coerce both columns to their proper types; unparseable entries become NaT/NaN.
    capacity_typed = capacity_present.assign(
        Date=pd.to_datetime(capacity_present["Date"], errors="coerce"),
        Wind_Capacity_MW=pd.to_numeric(capacity_present["Wind_Capacity_MW"], errors="coerce"),
    )

    # Drop rows where conversion failed, keep only start_year onward,
    # then sort chronologically and rebuild the index.
    capacity_clean = (
        capacity_typed.dropna()
        .loc[lambda frame: frame["Date"].dt.year >= start_year]
        .sort_values("Date")
        .reset_index(drop=True)
    )

    # Create the output directory if needed. os.path.dirname() returns "" for a
    # bare file name, and os.makedirs("") raises FileNotFoundError, so only
    # create a directory when the path actually has one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save cleaned file
    capacity_clean.to_csv(output_path, index=False)
    print(f"✅ Cleaned wind capacity data saved to: {output_path}")

    return capacity_clean


In [5]:
# Entry point: run the cleaning step when this cell/script is executed directly
if __name__ == "__main__":
    # Raw workbook in, processed CSV out (relative paths, resolved against the
    # current working directory)
    input_file, output_file = (
        "../data/raw/Wind_monthly_capacity_data.xlsx",
        "../data/processed/Cleaned_Wind_Capacity.csv",
    )

    # Clean the raw data and write the processed CSV
    clean_wind_capacity_data(input_path=input_file, output_path=output_file)

✅ Cleaned wind capacity data saved to: ../data/processed/Cleaned_Wind_Capacity.csv
