**Code to download hourly electricity demand data from MEPSO, the transmission system operator (TSO) of North Macedonia.**

In [None]:
import os
import re
import sys
import asyncio
import aiohttp
import pdfplumber
import pandas as pd
from datetime import datetime, timedelta
from tqdm.notebook import tqdm
import contextlib
import nest_asyncio

# Download configuration
# URL template of the daily report PDF; {date} is filled in as DD.MM.YYYY.
BASE_URL = "https://www.mepso.com.mk/files/mk/dnevni/Информација за {date}.pdf"
SAVE_DIR = "pdfs"  # Downloaded PDFs are kept here and reused on later runs
LOG_DIR = "logs"  # Page-text dumps written when the text fallback parser runs
OUTPUT_FILE = "mk_tso_data_hourly_demand.xlsx"
START_DATE = datetime(2024, 12, 1)  # First day to process (inclusive)
END_DATE = datetime(2024, 12, 31)  # Last day to process (inclusive)
# To process up to today, assign a datetime (not a formatted string): END_DATE
# is subtracted from START_DATE when the list of dates is built.
# END_DATE = datetime.now()

MAX_PARALLEL_DOWNLOADS = 10  # Adjust this depending on internet speed

# Make sure the output directories exist before any task writes into them.
os.makedirs(SAVE_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

# Helper to suppress PDFMiner noisy stderr
@contextlib.contextmanager
def suppress_stderr():
    """Temporarily point ``sys.stderr`` at the null device.

    While the ``with`` body runs, anything written to ``sys.stderr`` is
    discarded. The previous ``sys.stderr`` object is put back on exit, also
    when the body raises, and the null-device handle is closed afterwards.
    Only the Python-level ``sys.stderr`` object is swapped.
    """
    with open(os.devnull, 'w') as devnull, contextlib.redirect_stderr(devnull):
        yield

# Extract numbers from text fallback
def extract_vkupen_konzum_from_text(text):
    """Pull 24 hourly values out of the "ВКУПЕН КОНЗУМ" line of a page's text.

    Each line containing the label "ВКУПЕН КОНЗУМ" (Macedonian for "total
    consumption") is scanned for numbers written with a decimal comma and
    optional dot thousands separators (e.g. ``1.234,56``). Numbers without a
    decimal comma are not matched.

    Args:
        text: Plain text of the page, possibly spanning several lines.

    Returns:
        A list of 24 floats taken from the first labelled line that has enough
        numbers, or ``None`` if no such line exists. When the line holds 25 or
        more numbers the first one is skipped (presumably a daily total -
        confirm against the PDF layout); with exactly 24, all are returned.
    """
    number_pattern = r"\d{1,3}(?:\.\d{3})*,\d+|\d+,\d+"

    for row in text.splitlines():
        if "ВКУПЕН КОНЗУМ" not in row:
            continue

        parsed = [
            float(token.replace(".", "").replace(",", "."))
            for token in re.findall(number_pattern, row)
        ]
        if len(parsed) < 24:
            # Not enough numbers on this line; a later line may still qualify.
            continue

        # Drop the leading extra number only when one is present.
        offset = 1 if len(parsed) >= 25 else 0
        return parsed[offset:offset + 24]

    return None

# Single Date Processor
def _hourly_from_tables(tables):
    """Return 24 hourly values from the "ВКУПЕН КОНЗУМ" table row, or None.

    A cell counts as numeric when, after removing dots, commas and spaces,
    only digits remain. Dots are treated as thousands separators and the comma
    as the decimal mark. The first numeric cell of the row is skipped
    (presumably a daily total - confirm against the PDF layout), so a row
    needs at least 25 numeric cells to qualify.
    """
    for table in tables:
        for row in table:
            if not row or not any("ВКУПЕН КОНЗУМ" in str(cell) for cell in row if cell):
                continue
            values = [
                float(str(cell).replace(".", "").replace(",", ".").replace(" ", ""))
                for cell in row
                if cell and str(cell).replace(".", "").replace(",", "").replace(" ", "").isdigit()
            ]
            if len(values) >= 25:
                return values[1:25]
    return None


async def process_date(session, date_obj, all_data, error_log):
    """Download (unless cached) and parse the daily report PDF for one date.

    The PDF is stored under ``SAVE_DIR`` and reused if it already exists. The
    first page is searched for the "ВКУПЕН КОНЗУМ" row, first via table
    extraction and then, if that yields nothing, via the page's plain text.
    In the fallback case the page text is also written to
    ``LOG_DIR/fallback_<date>.txt`` for inspection.

    Args:
        session: An open ``aiohttp.ClientSession`` used for the download.
        date_obj: ``datetime`` of the day to process.
        all_data: List that receives one ``{"date", "hour", "value"}`` dict
            per extracted hour (hours numbered from 1); mutated in place.
        error_log: List that receives ``(date_str, message)`` tuples on
            failure; mutated in place.

    Returns:
        None. Exceptions are not propagated; they are recorded in
        ``error_log`` instead.
    """
    date_str = date_obj.strftime("%d.%m.%Y")
    url = BASE_URL.format(date=date_str)
    filename = os.path.join(SAVE_DIR, f"Информација за {date_str}.pdf")
    fallback_txt_path = os.path.join(LOG_DIR, f"fallback_{date_str}.txt")

    def record(hourly):
        # One output row per hour, numbered from 1.
        iso_date = date_obj.date().isoformat()
        all_data.extend(
            {"date": iso_date, "hour": h, "value": v}
            for h, v in enumerate(hourly, start=1)
        )

    try:
        if not os.path.exists(filename):
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                if resp.status != 200:
                    error_log.append((date_str, "Download failed"))
                    return
                # Read the whole body BEFORE touching the disk: if the read
                # times out or the connection drops, no file is created.
                payload = await resp.read()

            # Write to a side file and move it into place, so an interrupted
            # write can never leave a truncated PDF under the cached name
            # (which later runs would trust and fail to parse).
            partial_path = filename + ".part"
            with open(partial_path, "wb") as f:
                f.write(payload)
            os.replace(partial_path, filename)

        # Open the PDF once: try the table layout first and extract the plain
        # text only when the table route finds nothing.
        with suppress_stderr():
            with pdfplumber.open(filename) as pdf:
                page = pdf.pages[0]
                hourly = _hourly_from_tables(page.extract_tables())
                text = None if hourly is not None else (page.extract_text() or "")

        if hourly is not None:
            record(hourly)
            return

        # Fallback: extract from text, keeping a copy of it for debugging.
        with open(fallback_txt_path, "w", encoding="utf-8") as f:
            f.write(text)

        hourly = extract_vkupen_konzum_from_text(text)
        if hourly:
            record(hourly)
        else:
            error_log.append((date_str, "Too few values extracted"))

    except Exception as e:
        # Best-effort per date: one bad day must not abort the whole batch.
        error_log.append((date_str, f"Exception: {e}"))

# Main Async Runner
async def main():
    """Process every day from START_DATE to END_DATE and save the results.

    Runs ``process_date`` for each day with at most MAX_PARALLEL_DOWNLOADS
    running at once, then writes the collected rows to OUTPUT_FILE (sorted by
    date and hour) and any failures to ``error_log.csv``.
    """
    span_days = (END_DATE - START_DATE).days
    dates = []
    for offset in range(span_days + 1):
        day = START_DATE + timedelta(days=offset)
        # NOTE(review): 29 February is skipped on purpose by this filter;
        # confirm that leap days really should be left out of the output.
        if day.month == 2 and day.day == 29:
            continue
        dates.append(day)

    all_data, error_log = [], []

    print("Processing...")
    timeout = aiohttp.ClientTimeout(total=30)
    connector = aiohttp.TCPConnector(limit_per_host=MAX_PARALLEL_DOWNLOADS)

    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
        pbar = tqdm(total=len(dates), desc="Downloading & Extracting")
        sem = asyncio.Semaphore(MAX_PARALLEL_DOWNLOADS)

        async def sem_task(date):
            # The semaphore caps how many dates are in flight at once.
            async with sem:
                await process_date(session, date, all_data, error_log)
                pbar.update(1)

        tasks = [asyncio.create_task(sem_task(date)) for date in dates]
        await asyncio.gather(*tasks)
        pbar.close()

    # Save results
    if not all_data:
        print("No data was downloaded. Excel file was not created.")
    else:
        # Tasks finish in arbitrary order, so sort before writing.
        ordered = pd.DataFrame(all_data).sort_values(by=["date", "hour"])
        ordered.reset_index(drop=True).to_excel(OUTPUT_FILE, index=False)
        print(f"Data saved to {OUTPUT_FILE}")

    # Save error log if any
    if error_log:
        issues = pd.DataFrame(error_log, columns=["date", "issue"])
        issues.to_csv("error_log.csv", index=False)
        print("Errors encountered. See error_log.csv for details.")

    print("Done.")

# Runner (can be modified if the file is run as a .py file instead of a .ipynb)
if __name__ == "__main__":
    # nest_asyncio is applied before run_until_complete() so the call also
    # works when an event loop is already running, as in a notebook kernel.
    nest_asyncio.apply()
    asyncio.get_event_loop().run_until_complete(main())