In [1]:
from datetime import datetime, timedelta

# Download configuration.
# Daily files are requested for every calendar day in [start_date, end_date]
# (both inclusive, YYYYMMDD). Yearly ZIP archives are requested for every year
# from start_year_zip up to, but not including, the year of start_date.
start_year_zip = 2018
start_date = "20230101"
end_date = "20241220"

start_date_datetime = datetime.strptime(start_date, "%Y%m%d")
end_date_datetime = datetime.strptime(end_date, "%Y%m%d")

# One YYYYMMDD name per day; an end date before the start date yields an empty list.
day_count = (end_date_datetime - start_date_datetime).days + 1
name_list = [
    (start_date_datetime + timedelta(days=offset)).strftime("%Y%m%d")
    for offset in range(day_count)
]

# The upper bound is derived from start_date instead of a hard-coded 2023, so the
# yearly archives always stop where the daily files begin.
years = range(start_year_zip, start_date_datetime.year)

# Daily names first, then the yearly archives (same order as before).
download_names_list = name_list + [f"{year}.zip" for year in years]


In [None]:
import aiohttp
import asyncio

async def _fetch_once(session, url):
    """
    Issue a single GET request for `url`.

    Returns a (status, content) tuple: `content` holds the response body on
    HTTP 200 and is None otherwise; `status` is None when the request itself
    raised (the error is printed, not propagated).
    """
    headers = {"User-Agent": "Mozilla/5.0"}  # Add headers if needed
    try:
        async with session.get(url, headers=headers) as response:
            if response.status == 200:
                return response.status, await response.read()
            return response.status, None
    except Exception as e:
        # Best effort: report the problem and let the caller try the next candidate.
        print(f"Error downloading {url}: {e}")
        return None, None


async def download_file_with_versions(session, base_url, name, max_version=9):
    """
    Attempt to download a file for a given name.
    If the name ends with '.zip', download it directly.
    Otherwise, try versions .1 to .<max_version> in order and stop at the first hit.

    Parameters:
    - session: object exposing `get(url, headers=...)` usable with `async with`
      (the caller in this notebook passes an aiohttp ClientSession).
    - base_url (str): prefix to which the file name is appended.
    - name (str): either '<year>.zip' or a date-like stem without version suffix.
    - max_version (int): highest version suffix to try (default 9).

    Returns:
    - (file_name, content) for the first successful download, or
    - (name, None) when every attempt failed. The requested name is returned so
      the caller can report which download failed; failures no longer raise, so
      one missing file cannot abort a whole batch of concurrent downloads.
    """
    # Check if the name ends with '.zip'
    if name.endswith(".zip"):
        url = base_url + name
        status, content = await _fetch_once(session, url)
        if content is not None:
            return name, content
        if status is not None:
            print(f"Failed to download {url} with status {status}")
        return name, None

    # If not a .zip file, try the numbered versions in ascending order
    for version in range(1, max_version + 1):
        file_name = f"{name}.{version}"
        status, content = await _fetch_once(session, base_url + file_name)
        if content is not None:
            return file_name, content
        if status is not None:
            if version < max_version:
                print(f"Skipped {file_name}. Fetching version {version+1}")
            else:
                print(f"Skipped {file_name}. No more versions to try")

    # All versions failed
    return name, None

async def download_all_files(base_url, name_list):
    """
    Download all files for the given names concurrently.

    Parameters:
    - base_url (str): prefix passed through to download_file_with_versions.
    - name_list (iterable of str): names to download.

    Returns:
    - dict: downloaded file name -> content, containing only the successful downloads.

    A failure for one name (a None content or an exception raised by its download
    coroutine) is printed with the requested name and skipped; it does not discard
    the other downloads.
    """
    names = list(name_list)
    async with aiohttp.ClientSession() as session:
        tasks = [download_file_with_versions(session, base_url, name) for name in names]
        # return_exceptions=True keeps one failing download from aborting the whole batch.
        outcomes = await asyncio.gather(*tasks, return_exceptions=True)

    downloaded = {}
    # gather preserves input order, so outcomes line up with the requested names.
    for name, outcome in zip(names, outcomes):
        if isinstance(outcome, BaseException):
            if not isinstance(outcome, Exception):
                # Cancellation and similar signals are not download failures.
                raise outcome
            print(f"Failed to download any version for {name}: {outcome}")
            continue
        file_name, content = outcome
        if content is None:
            print(f"Failed to download any version for {name}")
            continue
        downloaded[file_name] = content
    return downloaded

# Example usage
# URL prefix; each entry of download_names_list is appended to it (with a
# version suffix for non-.zip names) to form the request URL.
base_url = "https://www.omie.es/pt/file-download?parents=marginalpdbcpt&filename=marginalpdbcpt_" # daily ("diario") files
# Alternative prefix for the intraday ("intradiario") files:
# base_url = "https://www.omie.es/pt/file-download?parents=marginalpibcpt&filename=marginalpibcpt_" # intraday

# Top-level `await` relies on the notebook's running event loop.
# `data` is the dict of file name -> content returned by download_all_files.
data = await download_all_files(base_url, download_names_list)

In [None]:
import pandas as pd
from datetime import datetime
import zipfile
from io import BytesIO
# Set pandas' display width option to 100 characters for printed frames.
pd.set_option('display.width', 100)


def parse_plain_text(content):
    """
    Turn the text of one semicolon-separated price file into a list of row dicts.

    `content` may be `bytes` (decoded as UTF-8) or `str`. Blank lines, lines that
    start with "MARGINALPDBCPT" and lines with fewer than six ';'-separated fields
    are ignored. For every remaining line the first six fields are stored, as
    strings, under the keys Year, Month, Day, HourSlot, PT and ES.
    """
    text = content.decode('utf-8') if isinstance(content, bytes) else content

    field_names = ("Year", "Month", "Day", "HourSlot", "PT", "ES")
    records = []
    for raw_line in text.strip().split('\n'):
        # Header and empty lines carry no data.
        if not raw_line.strip() or raw_line.startswith("MARGINALPDBCPT"):
            continue
        fields = raw_line.split(';')
        if len(fields) < len(field_names):
            continue
        # zip() stops after the six named fields, dropping any trailing ones.
        records.append(dict(zip(field_names, fields)))
    return records

def parse_zip_file(content):
    """
    Extract the rows of every versioned member of a ZIP archive.

    Parameters:
    - content (bytes): The binary content of the ZIP file.

    Returns:
    - list of dict: rows produced by parse_plain_text for each member whose name
      ends in `.1` through `.9`, in archive order. Other members are ignored.
    """
    version_suffixes = tuple(f'.{i}' for i in range(1, 10))

    rows = []
    with zipfile.ZipFile(BytesIO(content)) as archive:
        for member in archive.infolist():
            if not member.filename.endswith(version_suffixes):
                continue
            with archive.open(member) as handle:
                text = handle.read().decode('utf-8')
            rows.extend(parse_plain_text(text))
    return rows


def process_data(data_dict):
    """
    Process the downloaded data (plain-text files and .zip archives) and return a DataFrame.

    Parameters:
    - data_dict (dict): A dictionary where keys are file names and values are file contents.

    Returns:
    - pd.DataFrame: "PT" and "ES" numeric columns, indexed by a timezone-naive
      "datetime" index holding UTC timestamps, sorted ascending. An empty frame
      with the same layout is returned when no rows were parsed.

    Raises:
    - ValueError: if any calendar day has a row count other than 23, 24 or 25;
      the message lists the offending days. Non-numeric prices and local times
      that cannot be localized to Europe/Madrid also raise (from pandas).
    """
    all_rows = []
    for file_name, content in data_dict.items():
        if file_name.endswith(".zip"):
            all_rows.extend(parse_zip_file(content))
        else:
            all_rows.extend(parse_plain_text(content))

    if not all_rows:
        # Nothing parsed: return an empty frame shaped like the regular result.
        return pd.DataFrame(
            columns=["PT", "ES"],
            index=pd.DatetimeIndex([], name="datetime"),
            dtype=float,
        )

    # Create DataFrame from all rows and convert the text fields to numbers
    df = pd.DataFrame(all_rows, columns=["Year", "Month", "Day", "HourSlot", "PT", "ES"])
    df = df.astype({"Year": int, "Month": int, "Day": int, "HourSlot": int})
    df["PT"] = pd.to_numeric(df["PT"], errors='raise')
    df["ES"] = pd.to_numeric(df["ES"], errors='raise')

    df["base_date"] = pd.to_datetime(df[["Year", "Month", "Day"]])
    df = df.sort_values(by=["base_date", "HourSlot"]).reset_index(drop=True)

    # Position of each row within its day (0-based, in HourSlot order) and the
    # number of rows that day has. Computed without groupby.apply so the result
    # does not depend on how apply treats the grouping column.
    per_day = df.groupby("base_date")
    position = per_day.cumcount()
    day_size = per_day["HourSlot"].transform("size")

    unexpected = ~day_size.isin([23, 24, 25])
    if unexpected.any():
        bad_days = (
            df.loc[unexpected, "base_date"]
            .dt.strftime("%Y-%m-%d")
            .value_counts()
            .sort_index()
        )
        details = ", ".join(f"{day} ({count} rows)" for day, count in bad_days.items())
        raise ValueError(f"Unexpected number of hourly rows per day: {details}")

    # Map the position to a local wall-clock hour:
    #   24 rows -> 0..23
    #   23 rows -> hour 2 is skipped:   0, 1, 3, 4, ..., 23
    #   25 rows -> hour 2 occurs twice: 0, 1, 2, 2, 3, ..., 23
    skipped_hour = (day_size == 23) & (position >= 2)
    repeated_hour = (day_size == 25) & (position >= 3)
    hour = position + skipped_hour.astype(int) - repeated_hour.astype(int)

    local_naive = df["base_date"] + pd.to_timedelta(hour, unit="h")
    local_aware = local_naive.dt.tz_localize(
        'Europe/Madrid',
        # The repeated hour is disambiguated from the (sorted) order of the rows:
        ambiguous='infer',
        # A local time that does not exist means the row count was misleading:
        nonexistent='raise'
    )

    # Convert to UTC and drop the timezone so the index is naive UTC
    utc_naive = local_aware.dt.tz_convert('UTC').dt.tz_localize(None)

    return (
        df.assign(datetime=utc_naive)[["datetime", "PT", "ES"]]
        .set_index("datetime")
        .sort_index()
    )

# Build the price table from the downloaded files; the bare name on the last
# line makes the notebook display the resulting frame.
prices_df = process_data(data)
prices_df



In [5]:
# Keep only the rows labelled 2018-01-01 00:00:00 or later (label-based slice on the index).
prices_df = prices_df.loc["2018-01-01 00:00:00":]

In [None]:
from data_fetcher import ENTSOEDataFetcher
import nest_asyncio
from data_fetcher import SimpleInterval
from datetime import datetime
from analyzer import analyze

# Apply nest_asyncio to allow nested event loops (useful for Jupyter notebooks)
nest_asyncio.apply()

fetcher = ENTSOEDataFetcher()

# Request interval: 2018-01-01 00:00 through 2024-12-20 23:00.
# NOTE(review): presumably these datetimes are interpreted as UTC by the fetcher - confirm.
data_request = SimpleInterval(datetime(2018, 1,1,0), datetime(2024, 12,20,23))
fetcher_data = fetcher.get_data(data_request)

# Analyze the data to get aggregated and contributions DataFrames
aggregated, contributions = analyze(fetcher_data)
# aggregated.columns = aggregated.columns.map(lambda x: utils.PSR_TYPE_MAPPING.get(x, x))

# The plots below pair prices with ENTSO-E values row by row, so both indexes must match exactly.
assert prices_df.index.equals(aggregated.index)

In [7]:
# Element-wise sum of the "ES" and "FR" contribution tables; a value missing on
# one side counts as 0. Each row is then collapsed to a single total.
es_plus_fr = contributions["ES"].add(contributions["FR"], fill_value=0)
imports = es_plus_fr.sum(axis=1, skipna=True)

# Row totals of the aggregated table.
consumption = aggregated.sum(axis=1, skipna=True)

# Imports expressed as a percentage of consumption.
percentage_imports = imports.div(consumption) * 100


In [None]:
import matplotlib.pyplot as plt

# Scatter of the import share against the PT price, using the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    percentage_imports,
    prices_df["PT"],
    label='Imports/Consumption',
    color='blue',
    alpha=0.5,
    s=1,
)
ax.set_title('Portuguese imports and price')
ax.set_xlabel('Percentage of imports')
ax.set_ylabel('Price (€/MWh)')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import linregress

x = percentage_imports
y = prices_df["PT"]

# Fit on finite pairs only: a single NaN/inf (e.g. from a 0/0 share) would
# otherwise turn the slope, intercept and correlation into NaN.
x_values = np.asarray(x, dtype=float)
y_values = np.asarray(y, dtype=float)
finite_mask = np.isfinite(x_values) & np.isfinite(y_values)
x_fit, y_fit = x_values[finite_mask], y_values[finite_mask]

# Calculate linear regression
slope, intercept, r_value, p_value, std_err = linregress(x_fit, y_fit)

# linregress already returns the Pearson correlation coefficient as its r-value.
correlation = r_value

# Plotting
plt.figure(figsize=(10, 6))
plt.scatter(x, y, label='Imports/Consumption', color='blue', alpha=0.5, s=1)

# Plot the regression line; two end points are enough for a straight line.
x_line = np.array([x_fit.min(), x_fit.max()])
plt.plot(
    x_line,
    intercept + slope * x_line,
    'r',
    label=f'Linear fit: y={slope:.4f}x+{intercept:.2f}, Correlation: {correlation:.2f}'
)

plt.title('Portuguese imports and price')
plt.xlabel('Imports (% of consumption)')
plt.ylabel('PT Marginal Spot Price (€/MWh)')
plt.legend()
plt.grid(True)

plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import linregress

x = imports
y = prices_df["PT"]

# Fit on finite pairs only so a stray NaN/inf cannot turn the fit into NaN.
x_values = np.asarray(x, dtype=float)
y_values = np.asarray(y, dtype=float)
finite_mask = np.isfinite(x_values) & np.isfinite(y_values)
x_fit, y_fit = x_values[finite_mask], y_values[finite_mask]

# Calculate linear regression
slope, intercept, r_value, p_value, std_err = linregress(x_fit, y_fit)

# linregress already returns the Pearson correlation coefficient as its r-value.
correlation = r_value

# Plotting
plt.figure(figsize=(10, 6))
# x holds absolute imports here, so the points are labelled 'Imports'
# (the previous 'Imports/Consumption' label was carried over from the percentage plot).
plt.scatter(x, y, label='Imports', color='blue', alpha=0.5, s=1)

# Plot the regression line; two end points are enough for a straight line.
x_line = np.array([x_fit.min(), x_fit.max()])
plt.plot(
    x_line,
    intercept + slope * x_line,
    'r',
    label=f'Linear fit: y={slope:.4f}x+{intercept:.2f}, Correlation: {correlation:.2f}'
)

plt.title('Portuguese imports and price')
plt.xlabel('Imports')
plt.ylabel('PT Marginal Spot Price (€/MWh)')
plt.legend()
plt.grid(True)

plt.show()

In [None]:
import plotly.express as px

# Violin plot of PT prices per calendar month. The month is read from the
# index on the fly, so prices_df itself gains no extra column.
violin_options = dict(
    x=prices_df.index.month,
    y="PT",
    box=True,  # draw a box plot inside each violin
    title="Distribution of PT Marginal Spot Prices by Month",
    labels={"x": "Month", "PT": "Price (€/MWh)"},
    template="plotly_white",
)
fig = px.violin(prices_df, **violin_options)

fig.show()

In [None]:
import plotly.express as px

# Same monthly violin plot, with the month also used as the colour dimension.
month_of_year = prices_df.index.month
fig = px.violin(
    prices_df,
    x=month_of_year,
    y="PT",
    color=month_of_year,
    box=True,  # draw a box plot inside each violin
    title="Distribution of PT Marginal Spot Prices by Month",
    labels={"x": "Month", "PT": "Price (€/MWh)", "color": "Month"},
    template="plotly_white",
)

# Give every violin trace the same explicit width.
for violin_trace in fig.data:
    violin_trace.width = 0.8

fig.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Monthly violin plot drawn with seaborn on an explicit Axes; the month comes
# straight from the index, so no helper column is added to prices_df.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` - confirm against the installed version.
fig, ax = plt.subplots(figsize=(12, 6))
sns.violinplot(
    x=prices_df.index.month,
    y=prices_df["PT"],
    palette="muted",
    ax=ax,
)

# Titles, axis labels and a dashed horizontal grid
ax.set_title("Distribution of PT Day-Ahead Marginal Spot Prices by Month")
ax.set_xlabel("Month")
ax.set_ylabel("Price (€/MWh)")
ax.grid(axis="y", linestyle="--", alpha=0.7)

plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Filled contour plot of the bivariate kernel density estimate of
# (import share, PT price), drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(
    x=percentage_imports,
    y=prices_df["PT"],
    cmap="Blues",
    fill=True,
    thresh=0,   # no lower density cut-off: every level is drawn
    levels=20,  # number of contour levels
    ax=ax,
)

# Titles, axis labels and grid
ax.set_title('Contour Plot: Portuguese Imports vs Price')
ax.set_xlabel('Imports (% of consumption)')
ax.set_ylabel('PT Marginal Spot Price (€/MWh)')
ax.grid(True)

plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Filled contour plot of the bivariate kernel density estimate of
# (import share, PT price), this time with a colour bar for the density levels.
plt.figure(figsize=(10, 6))
sns.kdeplot(
    x=percentage_imports,
    y=prices_df["PT"],
    cmap="Blues",  # Color map for the contours
    fill=True,  # Fill the contours
    thresh=0,  # Show all density levels
    levels=20,  # Number of contour levels
    cbar=True,  # Add a color bar
    log_scale=(False, False)  # Keep both the x and y axes on a linear scale
)

# Add labels and title. Nothing in this plot is logarithmic (log_scale is off
# for both axes), so the title no longer claims a logarithmic density.
plt.title('Contour Plot: Portuguese Imports vs Price (Density with Color Bar)')
plt.xlabel('Imports (% of consumption)')
plt.ylabel('PT Marginal Spot Price (€/MWh)')
plt.grid(True)

plt.show()