**Code to download power generation and power demand data from the TSO (transmission system operator) of Bosnia and Herzegovina (NOSBIH). The output is saved as a long-format CSV file. February 29 is skipped, which makes the time series easier to compare across years.**

In [None]:
import os
import requests
from datetime import datetime, timedelta
import pandas as pd
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup

# --- Period of data ---
# Both bounds are written as day.month.year and are inclusive.
start_date_str = "01.01.2023"
# Option 1: stop at a fixed end date
end_date_str = "04.01.2023"
# Option 2: download all data available to date (uncomment to use)
#end_date_str = datetime.now().strftime("%d.%m.%Y")  # Today's date

# Parse the two bounds into datetime objects (time of day is midnight)
start_date, end_date = (
    datetime.strptime(text, "%d.%m.%Y") for text in (start_date_str, end_date_str)
)

# One entry per calendar day, from start_date through end_date
day_count = (end_date - start_date).days + 1
date_list = [start_date + timedelta(days=offset) for offset in range(day_count)]

# Request setup: every day is fetched with one POST to the site's AJAX endpoint
url = "https://www.nosbih.ba/en/wp-admin/admin-ajax.php"
headers = {
    "User-Agent": "Mozilla/5.0",
    "X-Requested-With": "XMLHttpRequest"
}
# Seconds to wait for the server before giving up on a request. Without a
# timeout, requests waits forever and a single stalled connection would hang
# the whole download.
request_timeout = 30


def _cell_to_float(cell):
    """Return the number in a table cell, or None if it is empty or not numeric."""
    text = cell.text.strip()
    if not text:
        return None
    try:
        return float(text)
    except ValueError:
        return None


all_data = []
# Fetch and parse data
for date in tqdm(date_list, desc="Fetching data"):
    if date.month == 2 and date.day == 29:
        print(f"Skipping {date.strftime('%Y-%m-%d')} (Leap Year Day)")
        continue

    # The request uses day.month.year with a trailing dot; the output uses ISO dates
    date_str = date.strftime("%d.%m.%Y.")
    display_date = date.strftime("%Y-%m-%d")

    form_data = {
        "action": "production",
        "production": f"date={date_str}"
    }

    # Best effort: any failure for one day is reported and the loop moves on,
    # so a single bad day does not abort the whole download.
    try:
        response = requests.post(url, data=form_data, headers=headers, timeout=request_timeout)
        response.raise_for_status()
        # The JSON payload carries an HTML fragment under the 'data' key
        html = response.json()['data']

        soup = BeautifulSoup(html, "html.parser")
        rows = soup.select("table#productionTable tbody tr")

        for row in rows:
            cols = row.find_all("td")
            # Rows with fewer than five cells are skipped
            if len(cols) >= 5:
                all_data.append({
                    "date": display_date,
                    "time": cols[0].text.strip(),
                    "power_generation": _cell_to_float(cols[2]),
                    "electricity_demand": _cell_to_float(cols[4])
                })

    except Exception as e:
        print(f"Failed on {display_date}: {e}")

# Create DataFrame from scraped data. The columns are named explicitly so the
# frame has the expected layout even when nothing was scraped (for example
# when every request failed); without them df["date"] below raises a KeyError.
df = pd.DataFrame(all_data, columns=["date", "time", "power_generation", "electricity_demand"])
if df.empty:
    print("Warning: no data was downloaded; the output will contain only empty values.")
# Keep the value columns numeric even when every entry is missing
df = df.astype({"power_generation": float, "electricity_demand": float})
df["datetime"] = pd.to_datetime(df["date"] + " " + df["time"], format="%Y-%m-%d %H:%M")

# Create full hourly time range (excluding Feb 29). The range runs up to, but
# not including, midnight after end_date. The step is given as a timedelta
# because the 'H' frequency alias is deprecated in recent pandas releases.
full_range = pd.date_range(
    start=start_date,
    end=end_date + timedelta(days=1),
    freq=timedelta(hours=1),
    inclusive='left',
)
full_range = full_range[~((full_range.month == 2) & (full_range.day == 29))]  # Remove leap day

# Create empty DataFrame for full range
full_df = pd.DataFrame({"datetime": full_range})
full_df["date"] = full_df["datetime"].dt.strftime("%Y-%m-%d")
full_df["time"] = full_df["datetime"].dt.strftime("%H:%M")

# Merge full timeline with actual data: the left join keeps every hour of the
# timeline and leaves the value columns empty where nothing was scraped.
merged_df = pd.merge(full_df, df.drop(columns=["date", "time"]), on="datetime", how="left")

# Reorder so 00:00 is at the end of each day
def sort_day_correctly(group):
    """Return a copy of ``group`` sorted by time, with the "00:00" row last.

    Args:
        group: DataFrame with a "time" column of zero-padded time strings.

    Returns:
        A new DataFrame with the same columns and index labels as ``group``,
        ordered by "time" with "00:00" ranked as if it were "24:00". The
        input frame is not modified.
    """
    # Relabel midnight as "24:00" in a temporary key so that a plain string
    # sort places it after every other time of the day.
    sort_key = ["24:00" if t == "00:00" else t for t in group["time"]]
    keyed = group.copy()
    keyed["time_order"] = sort_key
    ordered = keyed.sort_values(by="time_order")
    return ordered.drop(columns="time_order")

# Sort the rows one calendar day at a time using sort_day_correctly
daily_groups = merged_df.groupby(merged_df["datetime"].dt.date, group_keys=False)
df_sorted = daily_groups.apply(sort_day_correctly)

# Replace missing values with None; they are written as empty cells in the CSV
has_value = pd.notnull(df_sorted)
df_sorted = df_sorted.where(has_value, None)

# Save CSV under data/<year>/<month>/, with year and month taken from the start date
folder_path = os.path.join("data", f"{start_date:%Y}", f"{start_date:%m}")
os.makedirs(folder_path, exist_ok=True)

# The file name carries both period bounds with dots replaced by dashes
start_tag = start_date_str.replace('.', '-')
end_tag = end_date_str.replace('.', '-')
output_filename = f"nosbih_{start_tag}_to_{end_tag}.csv"
output_path = os.path.join(folder_path, output_filename)
df_sorted.to_csv(output_path, index=False)

print(f"CSV saved to: {output_path}")

**The following code illustrates the visualization of the downloaded time series data.**

In [None]:
import matplotlib.pyplot as plt
# --- One subplot per day, stacked vertically, x-axis in the order of the rows ---
unique_dates = df_sorted["date"].unique()
n_days = len(unique_dates)
# squeeze=False always returns a 2-D array of axes, so a single day needs no special case
fig, axs = plt.subplots(n_days, 1, figsize=(8, 5 * n_days), sharey=True, squeeze=False)
axs = axs[:, 0]  # the only column of the subplot grid

# (column, legend label, line colour) for each curve drawn on every subplot
series_specs = (
    ("power_generation", "Power Generation", "green"),
    ("electricity_demand", "Electricity Demand", "red"),
)

for ax, date in zip(axs, unique_dates):
    day_df = df_sorted[df_sorted["date"] == date]
    times = day_df["time"]
    for column, label, color in series_specs:
        ax.plot(times, day_df[column], label=label, color=color)
    ax.set_title(f"Electricity Data for {date}")
    ax.set_xlabel("Time of Day")
    ax.set_ylabel("MW")
    # One tick per row, labelled with its time string
    ax.set_xticks(times)
    ax.set_xticklabels(times, rotation=45)
    ax.grid(True)
    ax.legend()

plt.tight_layout()
plt.show()