In [1]:
from datetime import datetime, timedelta

download_names_list = []

start_date = "20230101"
end_date = "20241220"

date_list = []
current_date = datetime.strptime(start_date, "%Y%m%d")
end_date_datetime = datetime.strptime(end_date, "%Y%m%d")

while current_date <= end_date_datetime:
    date_list.append(current_date.strftime("%Y%m%d"))
    current_date += timedelta(days=1)

for date in date_list:
    download_names_list.append(f"marginalpdbcpt_{date}.1")

years = range(2018, 2023)
for year in years:
    download_names_list.append(f"marginalpdbcpt_{year}.zip")

# Replace the .1 with .2 for the specified files
for i, name in enumerate(download_names_list):
    if name in ["marginalpdbcpt_20230223.1", "marginalpdbcpt_20230528.1"]:
        download_names_list[i] = name.replace('.1', '.2')
    if name in ["marginalpdbcpt_20230121.1"]:
        download_names_list[i] = name.replace('.1', '.3')


In [2]:
import aiohttp
import asyncio

async def download_file(session, url, name):
    """Download a file from a URL and return its content."""
    try:
        headers = {"User-Agent": "Mozilla/5.0"}  # Add headers if needed
        async with session.get(url, headers=headers) as response:
            if response.status == 200:
                # Read the file content as binary
                content = await response.read()
                return name, content  # Return the file name and its content
            else:
                print(f"Failed to download {url} with status {response.status}")
                return name, None
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        return name, None

async def download_all_files(base_url, download_names_list):
    """Download all files and return a dictionary with file names as keys and content as values."""
    async with aiohttp.ClientSession() as session:
        tasks = [download_file(session, base_url + name, name) for name in download_names_list]
        results = await asyncio.gather(*tasks)
        # Print failed downloads
        for name, content in results:
            if content is None:
                print(f"Failed to download {name}")
        # Create a dictionary from the results, filtering out failed downloads (None values)
        return {name: content for name, content in results if content is not None}

# Example usage
base_url = "https://www.omie.es/pt/file-download?parents=marginalpdbcpt&filename="

data = await download_all_files(base_url, download_names_list)

In [7]:
import pandas as pd
from datetime import datetime
import zipfile
from io import BytesIO

def parse_plain_text(content, file_name):
    """
    Parses plain text data and returns a list of rows.
    """
    # Decode content if it's in bytes
    if isinstance(content, bytes):
        content = content.decode('utf-8')  # Adjust encoding if necessary

    lines = content.strip().split('\n')
    all_rows = []
    for line in lines:
        if line.startswith("MARGINALPDBCPT") or not line.strip():
            continue  # Skip header or empty lines
        parts = line.split(';')
        if len(parts) >= 6:
            year, month, day, hour, value1, value2 = parts[:6]
            all_rows.append({
                "Year": year,
                "Month": month,
                "Day": day,
                "HourSlot": hour,
                "PT": value1,
                "ES": value2
            })
    return all_rows

def parse_zip_file(content):
    """
    Parses ZIP file data and returns a list of rows by extracting and parsing each `.1` file inside.
    
    Parameters:
    - content (bytes): The binary content of the ZIP file.
    
    Returns:
    - list of dict: Parsed data rows from all `.1` files within the ZIP.
    """
    all_rows = []
    with zipfile.ZipFile(BytesIO(content)) as z:
        for file_info in z.infolist():
            if file_info.filename.endswith('.1'):
                with z.open(file_info) as f:
                    file_content = f.read().decode('utf-8')
                    all_rows.extend(parse_plain_text(file_content, file_info.filename))
    return all_rows


def process_data(data_dict):
    """
    Process the downloaded data (both .1 and .zip files) and return a DataFrame.

    Parameters:
    - data_dict (dict): A dictionary where keys are file names and values are file contents.

    Returns:
    - pd.DataFrame: Processed data with a combined datetime column in UTC.
    """
    all_rows = []
    for file_name, content in data_dict.items():
        if file_name.endswith(".zip"):
            all_rows.extend(parse_zip_file(content))
        else:
            all_rows.extend(parse_plain_text(content, file_name))
    
    # Create DataFrame from all rows
    df = pd.DataFrame(all_rows, columns=["Year", "Month", "Day", "HourSlot", "PT", "ES"])
    
    # Convert appropriate columns to numeric types
    df["Year"] = df["Year"].astype(int)
    df["Month"] = df["Month"].astype(int)
    df["Day"] = df["Day"].astype(int)
    df["HourSlot"] = df["HourSlot"].astype(int)  # <-- Modified Line
    df["PT"] = pd.to_numeric(df["PT"], errors='raise')
    df["ES"] = pd.to_numeric(df["ES"], errors='raise')


    df['base_date'] = pd.to_datetime(df[['Year', 'Month', 'Day']])
    df.sort_values(by=['base_date', 'HourSlot'], inplace=True)

    list_23 = list(range(24))
    list_23.remove(2)
    list_24 = list(range(24))
    list_25 = list(range(24))
    list_25.append(2)
    list_25.sort()
    def _slot_to_hour(group):
        group_size = len(group)
        if group_size == 23:
            group['Hour'] = list_23
        elif group_size == 24:
            group['Hour'] = list_24
        elif group_size == 25:
            group['Hour'] = list_25
        else:
            raise ValueError
        return group

    df = df.groupby('base_date').apply(_slot_to_hour)

    df['datetime'] = pd.to_datetime(
        dict(
            year=df['Year'],
            month=df['Month'],
            day=df['Day'],
            hour=df['Hour']
        ),
        errors='raise'  # handle invalid dates if any
    )
    # target_day = pd.Timestamp('2018-10-28').date()
    # filtered_df = df[df['datetime'].dt.date == target_day]
    # print(filtered_df)
    # target_day = pd.Timestamp('2018-10-29').date()
    # filtered_df = df[df['datetime'].dt.date == target_day]
    # print(filtered_df)

    df['datetime'] = df['datetime'].dt.tz_localize(
        'CET',
        # Handle ambiguous times (e.g. fall back):
        ambiguous='infer', 
        # Handle nonexistent times (e.g. spring forward):
        # nonexistent='shift_forward'
    )

    # 4) Convert to UTC
    df['datetime'] = df['datetime'].dt.tz_convert('UTC')
    
    # # Drop the separate Year, Month, Day, and Hour columns
    df.drop(columns=["Year", "Month", "Day", "Hour", "HourSlot", "base_date"], inplace=True)

    df.set_index("datetime", inplace=True)
    df.sort_index(inplace=True)
    
    return df

df = process_data(data)
df



  df = df.groupby('base_date').apply(_slot_to_hour)


Unnamed: 0_level_0,PT,ES
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-12-31 23:00:00+00:00,28.10,6.74
2018-01-01 00:00:00+00:00,33.00,4.74
2018-01-01 01:00:00+00:00,32.90,3.66
2018-01-01 02:00:00+00:00,28.10,2.30
2018-01-01 03:00:00+00:00,27.60,2.30
...,...,...
2024-12-20 18:00:00+00:00,145.39,145.39
2024-12-20 19:00:00+00:00,145.38,145.38
2024-12-20 20:00:00+00:00,140.01,140.01
2024-12-20 21:00:00+00:00,131.98,131.98


In [4]:
(df["Value1"] != df["Value2"]).sum()/len(df)

np.float64(0.04506197257383966)

In [None]:
import sys
import os
import pandas as pd

sys.path.append(os.path.join(os.getcwd(), '..', 'src'))

flow_es_to_pt = pd.read_pickle("../.data_cache/flow_es_to_pt.pkl.gz")
flow_pt_to_es = pd.read_pickle("../.data_cache/flow_pt_to_es.pkl.gz")
flow_fr_to_es = pd.read_pickle("../.data_cache/flow_fr_to_es.pkl.gz")
flow_es_to_fr = pd.read_pickle("../.data_cache/flow_es_to_fr.pkl.gz")