**Code to download hourly electricity demand data from the North Macedonian TSO (MEPSO).**

In [None]:
import os
import re
import requests
import pdfplumber
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm

# Settings
# URL template of the daily report; process_date() fills {date} with DD.MM.YYYY.
BASE_URL = "https://www.mepso.com.mk/files/mk/dnevni/Информација за {date}.pdf"

# First and last day of the range to fetch (both ends are included).
START_DATE = datetime(year=2025, month=1, day=1)
END_DATE = datetime(year=2025, month=1, day=10)

# Folders for the downloaded PDFs and for the text dumps of the fallback parser.
SAVE_DIR, LOG_DIR = "pdfs", "logs"
for _folder in (SAVE_DIR, LOG_DIR):
    os.makedirs(_folder, exist_ok=True)

# Accumulators filled by process_date().
all_data = []   # one {"date", "hour", "value"} dict per parsed hour
error_log = []  # (date string, issue description) tuples

# Extract values from text
def extract_vkupen_konzum_from_text(text):
    """Parse the hourly values from the "ВКУПЕН КОНЗУМ" line of a report.

    The text is scanned line by line. On a line containing the marker,
    every number written with a decimal comma is converted to float
    ("." is dropped as thousands separator, "," becomes the decimal point).
    If the line holds 25 or more numbers, the first one is skipped as the
    total and the following 24 are returned; with exactly 24 numbers all of
    them are returned. A marker line with fewer numbers is ignored and the
    scan continues.

    Returns:
        A list of 24 floats, or None when no line qualifies.
    """
    marker = "ВКУПЕН КОНЗУМ"
    number_pattern = re.compile(r"\d{1,3}(?:\.\d{3})*,\d+|\d+,\d+")
    # "." -> removed, "," -> "."
    to_float_notation = {ord("."): None, ord(","): "."}

    for row in text.splitlines():
        if marker not in row:
            continue
        parsed = [
            float(token.translate(to_float_notation))
            for token in number_pattern.findall(row)
        ]
        if len(parsed) < 24:
            continue
        # A leading extra number is the total; skip it when present.
        first = 1 if len(parsed) >= 25 else 0
        return parsed[first:first + 24]
    return None

# Helpers for process_date
def _store_hourly(date_obj, hourly):
    """Append one {"date", "hour", "value"} record per value to all_data (hours start at 1)."""
    day_iso = date_obj.date().isoformat()
    records = [
        {"date": day_iso, "hour": hour, "value": value}
        for hour, value in enumerate(hourly, start=1)
    ]
    all_data.extend(records)


def _hourly_from_tables(page):
    """Return 24 hourly values from the page's "ВКУПЕН КОНЗУМ" table row, or None.

    A row qualifies when it contains the marker and at least 25 numeric
    cells; the first numeric cell is skipped and the next 24 are returned.
    """
    for table in page.extract_tables():
        for row in table:
            if not (row and any("ВКУПЕН КОНЗУМ" in str(cell) for cell in row if cell)):
                continue
            values = [
                float(str(cell).replace(".", "").replace(",", ".").replace(" ", ""))
                for cell in row
                if cell and str(cell).replace(".", "").replace(",", "").replace(" ", "").isdigit()
            ]
            if len(values) >= 25:
                return values[1:25]
    return None


# Main processing function
def process_date(date_obj):
    """Download the report for one day and append its hourly values to all_data.

    Steps:
      1. Download the PDF for date_obj and save it under SAVE_DIR.
      2. Look for the "ВКУПЕН КОНЗУМ" row in the tables of the first page.
      3. If no table row qualifies, dump the page text to
         LOG_DIR/fallback_<date>.txt (for debugging) and parse that text
         with extract_vkupen_konzum_from_text().

    Failures never propagate: a non-200 response, a report without enough
    values, or any exception is recorded as one (date, issue) tuple in
    error_log.

    Args:
        date_obj: datetime of the day to process.
    """
    date_str = date_obj.strftime("%d.%m.%Y")
    url = BASE_URL.format(date=date_str)
    filename = os.path.join(SAVE_DIR, f"Информација за {date_str}.pdf")
    fallback_txt_path = os.path.join(LOG_DIR, f"fallback_{date_str}.txt")

    try:
        # Download PDF
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            error_log.append((date_str, "Download failed"))
            return

        with open(filename, "wb") as f:
            f.write(response.content)

        # Open the PDF once; the text is only extracted when the table
        # lookup did not yield the hourly row.
        with pdfplumber.open(filename) as pdf:
            page = pdf.pages[0]
            hourly = _hourly_from_tables(page)
            text = None if hourly else (page.extract_text() or "")

        if not hourly:
            # Fallback: keep the raw text for debugging and parse it.
            with open(fallback_txt_path, "w", encoding="utf-8") as f:
                f.write(text)
            hourly = extract_vkupen_konzum_from_text(text)

        if hourly:
            _store_hourly(date_obj, hourly)
        else:
            # Re-reading the dump that was just written would parse the very
            # same text again, so the failure is logged once and no second
            # parsing pass is attempted.
            error_log.append((date_str, "Too few values in direct text (fewer than 24 found)"))

    except Exception as e:
        error_log.append((date_str, f"Exception: {e}"))

# process date range
# Every day from START_DATE through END_DATE, leaving out 29 February.
dates = [
    day
    for day in (
        START_DATE + timedelta(days=offset)
        for offset in range((END_DATE - START_DATE).days + 1)
    )
    if (day.month, day.day) != (2, 29)
]

print("Processing...")
for d in tqdm(dates):
    process_date(d)

# Save outputs
# Hourly values go to an Excel workbook.
df = pd.DataFrame(all_data)
df.to_excel("mk_tso_data_hourly_demand.xlsx", index=False)

# The error log is only written when at least one date had a problem.
if error_log:
    error_frame = pd.DataFrame(error_log, columns=["date", "issue"])
    error_frame.to_csv("error_log.csv", index=False)

print("Done, data extracted and saved.")


**An alternative version of the code that downloads the files in parallel to speed up data retrieval.**

In [None]:
import os
import re
import requests
import pdfplumber
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Settings
# URL template of the daily report; process_date() fills {date} with DD.MM.YYYY.
BASE_URL = "https://www.mepso.com.mk/files/mk/dnevni/Информација за {date}.pdf"

# First and last day of the range to fetch (both ends are included).
START_DATE = datetime(year=2025, month=1, day=1)
END_DATE = datetime(year=2025, month=1, day=15)

# Download behaviour.
MAX_RETRIES = 2   # attempts per file
RETRY_DELAY = 5   # seconds
MAX_WORKERS = 8   # number of threads

# File listing the dates (YYYY-MM-DD, one per line) that were already parsed.
COMPLETED_FILE = "completed_dates.txt"

# Folders for the downloaded PDFs and for the text dumps of the fallback parser.
SAVE_DIR, LOG_DIR = "pdfs", "logs"
for _folder in (SAVE_DIR, LOG_DIR):
    os.makedirs(_folder, exist_ok=True)

# Accumulators filled by process_date().
all_data = []   # one {"date", "hour", "value"} dict per parsed hour
error_log = []  # (date string, issue description) tuples

# Helper function to clean and convert numbers
def clean_number(s):
    """Convert a number with "." thousands separators and a "," decimal mark to float.

    All whitespace in s (leading, trailing or embedded, e.g. "1 234,5") is
    removed before conversion, so a table cell padded or wrapped by the PDF
    extraction does not abort the parse.

    Args:
        s: the number as text, e.g. "1.234,56".

    Returns:
        The value as float, e.g. 1234.56.

    Raises:
        ValueError: if the remaining text is not a number.
    """
    compact = "".join(s.split())
    return float(compact.replace(".", "").replace(",", "."))

# Extract values from text
def extract_vkupen_konzum_from_text(text):
    """Parse the hourly values from the "ВКУПЕН КОНЗУМ" line of a report.

    Each line containing the marker is searched for numbers written with a
    decimal comma, which are converted with clean_number(). With 25 or more
    numbers the first one is skipped and the next 24 are returned; with
    exactly 24 numbers all are returned. Marker lines with fewer numbers are
    passed over.

    Returns:
        A list of 24 floats, or None when no line qualifies.
    """
    marker_lines = (ln for ln in text.splitlines() if "ВКУПЕН КОНЗУМ" in ln)
    for ln in marker_lines:
        tokens = re.findall(r"\d{1,3}(?:\.\d{3})*,\d+|\d+,\d+", ln)
        parsed = list(map(clean_number, tokens))
        count = len(parsed)
        if count >= 25:
            return parsed[1:25]
        if count >= 24:
            return parsed[:24]
    return None

# Download file with retries
def download_file(url, filename):
    """Download url to filename, trying up to MAX_RETRIES times.

    An attempt succeeds when the server answers with status 200; the response
    body is then written to filename. Any other status, or any exception
    raised during the request or the write, is printed and counts as a failed
    attempt. RETRY_DELAY seconds are waited between attempts, but not after
    the last one.

    Args:
        url: address of the file.
        filename: path the file is saved to.

    Returns:
        True if the file was downloaded and saved, False if all attempts failed.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                with open(filename, "wb") as f:
                    f.write(response.content)
                return True
            print(f"Attempt {attempt}: Server returned status {response.status_code}")
        except Exception as e:
            print(f"Attempt {attempt}: Exception during download: {e}")

        # Waiting is only useful when another attempt follows.
        if attempt < MAX_RETRIES:
            time.sleep(RETRY_DELAY)
    return False

# Save completed date
def mark_completed(date_obj):
    """Append date_obj, formatted as YYYY-MM-DD, as a new line to COMPLETED_FILE."""
    with open(COMPLETED_FILE, "a") as completed_log:
        entry = f"{date_obj:%Y-%m-%d}\n"
        completed_log.write(entry)

# Check completed dates
def load_completed_dates():
    """Return the lines of COMPLETED_FILE as a set of stripped strings.

    An empty set is returned when the file does not exist.
    """
    if os.path.exists(COMPLETED_FILE):
        with open(COMPLETED_FILE, "r") as completed_log:
            return {entry.strip() for entry in completed_log}
    return set()

# Helpers for process_date
def _store_hourly(date_obj, hourly):
    """Append one {"date", "hour", "value"} record per value to all_data (hours start at 1)."""
    day_iso = date_obj.date().isoformat()
    # The records are built first so that a day's rows are added with a
    # single extend() call.
    records = [
        {"date": day_iso, "hour": hour, "value": value}
        for hour, value in enumerate(hourly, start=1)
    ]
    all_data.extend(records)


def _hourly_from_tables(page):
    """Return 24 hourly values from the page's "ВКУПЕН КОНЗУМ" table row, or None.

    A row qualifies when it contains the marker and at least 25 cells that
    are decimal-comma numbers; the first such cell is skipped and the next 24
    are returned.
    """
    number_pattern = re.compile(r"\d{1,3}(?:\.\d{3})*,\d+|\d+,\d+")
    for table in page.extract_tables():
        for row in table:
            if not (row and any("ВКУПЕН КОНЗУМ" in str(cell) for cell in row if cell)):
                continue
            values = []
            for cell in row:
                if not cell:
                    continue
                # Whitespace is removed and the WHOLE cell must be a number:
                # a prefix-only match would let cells with trailing text
                # through and make the float conversion fail for the day.
                token = "".join(str(cell).split())
                if number_pattern.fullmatch(token):
                    values.append(clean_number(token))
            if len(values) >= 25:
                return values[1:25]
    return None


# Main processing function
def process_date(date_obj):
    """Download the report for one day and append its hourly values to all_data.

    Dates whose YYYY-MM-DD form is in completed_dates are skipped. Otherwise:
      1. The PDF is downloaded to SAVE_DIR with download_file().
      2. The "ВКУПЕН КОНЗУМ" row is looked up in the tables of the first page.
      3. If no table row qualifies, the page text is dumped to
         LOG_DIR/fallback_<date>.txt (for debugging) and parsed with
         extract_vkupen_konzum_from_text().
    On success the date is recorded with mark_completed().

    Failures never propagate: a failed download, a report without enough
    values, or any exception is recorded as one (date, issue) tuple in
    error_log.

    Args:
        date_obj: datetime of the day to process.
    """
    date_str = date_obj.strftime("%d.%m.%Y")
    date_iso = date_obj.strftime("%Y-%m-%d")

    if date_iso in completed_dates:
        # Already completed
        return

    url = BASE_URL.format(date=date_str)
    filename = os.path.join(SAVE_DIR, f"Информација за {date_str}.pdf")
    fallback_txt_path = os.path.join(LOG_DIR, f"fallback_{date_str}.txt")

    try:
        if not download_file(url, filename):
            error_log.append((date_str, "Download failed after retries"))
            return

        # Open the PDF once; the text is only extracted when the table
        # lookup did not yield the hourly row.
        with pdfplumber.open(filename) as pdf:
            page = pdf.pages[0]
            hourly = _hourly_from_tables(page)
            text = None if hourly else (page.extract_text() or "")

        if not hourly:
            # Fallback: keep the raw text for debugging and parse it.
            with open(fallback_txt_path, "w", encoding="utf-8") as f:
                f.write(text)
            hourly = extract_vkupen_konzum_from_text(text)

        if hourly:
            _store_hourly(date_obj, hourly)
            mark_completed(date_obj)
        else:
            # Re-reading the dump that was just written would parse the very
            # same text again, so the failure is logged once and no second
            # parsing pass is attempted.
            error_log.append((date_str, "Too few values in direct text (fewer than 24 found)"))

    except Exception as e:
        error_log.append((date_str, f"Exception: {e}"))

# Load completed dates
completed_dates = load_completed_dates()

# Generate list of dates to process (29 February is left out by the filter)
dates = [
    START_DATE + timedelta(days=i)
    for i in range((END_DATE - START_DATE).days + 1)
    if not ((START_DATE + timedelta(days=i)).month == 2 and (START_DATE + timedelta(days=i)).day == 29)
]

# Parallel processing
print(f"Processing {len(dates)} dates with {MAX_WORKERS} workers...")

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(process_date, d): d for d in dates}

    for future in tqdm(as_completed(futures), total=len(futures)):
        # process_date() logs ordinary failures itself; anything that still
        # escaped a worker is recorded here instead of being dropped.
        worker_error = future.exception()
        if worker_error is not None:
            error_log.append((futures[future].strftime("%d.%m.%Y"), f"Exception: {worker_error}"))

# Save outputs
OUTPUT_FILE = "mk_tso_data_hourly_demand.xlsx"
df = pd.DataFrame(all_data)

# Dates skipped in this run because they were already completed produce no
# rows in all_data. Their rows are carried over from the existing workbook,
# otherwise overwriting the file would lose them.
skipped_dates = {d.strftime("%Y-%m-%d") for d in dates} & completed_dates
if skipped_dates and os.path.exists(OUTPUT_FILE):
    try:
        previous = pd.read_excel(OUTPUT_FILE)
    except Exception as e:
        # Best effort: an unreadable workbook must not block saving new rows.
        print(f"Could not read existing {OUTPUT_FILE}: {e}")
    else:
        if {"date", "hour", "value"} <= set(previous.columns):
            # Keep only the YYYY-MM-DD part in case the dates come back
            # with a time component.
            previous["date"] = previous["date"].astype(str).str[:10]
            carried_over = previous.loc[
                previous["date"].isin(skipped_dates), ["date", "hour", "value"]
            ]
            df = carried_over if df.empty else pd.concat([carried_over, df], ignore_index=True)

present_dates = set(df["date"]) if "date" in df.columns else set()
missing_dates = skipped_dates - present_dates
if missing_dates:
    print(
        f"Warning: {len(missing_dates)} completed date(s) have no rows in {OUTPUT_FILE}; "
        f"delete {COMPLETED_FILE} to download them again."
    )

if not df.empty:
    # Workers finish in arbitrary order, so sort the rows chronologically.
    df = (
        df.drop_duplicates(subset=["date", "hour"], keep="last")
        .sort_values(["date", "hour"])
        .reset_index(drop=True)
    )
df.to_excel(OUTPUT_FILE, index=False)

if error_log:
    pd.DataFrame(error_log, columns=["date", "issue"]).to_csv("error_log.csv", index=False)

print("Done, data extracted and saved.")