# Download the ARERA file and parse the Excel workbook


In [None]:
# === DOWNLOAD FILE FROM WEB IF AVAILABLE ===

import os
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

# Page that links to the workbook we want to download.
base_url = "https://www.arera.it/area-operatori/prezzi-e-tariffe"

# Seconds to wait on each HTTP request. requests applies no timeout unless one
# is passed, so without this a stalled connection would block forever.
REQUEST_TIMEOUT = 30

# Fetch the page content; raise_for_status turns 4xx/5xx answers into errors.
response = requests.get(base_url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()

# Parse HTML
soup = BeautifulSoup(response.text, "html.parser")

# Find the first <a> tag whose href contains 'smt.xlsx'
link_tag = soup.find("a", href=lambda x: x and "smt.xlsx" in x)

if link_tag:
    # Resolve the href against the URL that was actually served (after any
    # redirect). urljoin handles root-relative, page-relative and absolute
    # hrefs alike, whereas plain concatenation only works for the first.
    file_url = urljoin(response.url, link_tag["href"])

    # Take the name from the URL path only, so a query string or fragment
    # never ends up in the local file name.
    file_name = os.path.basename(urlsplit(file_url).path)

    # Download the Excel file
    r = requests.get(file_url, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()

    # Save locally, in the current working directory
    with open(file_name, "wb") as f:
        f.write(r.content)

    print(f"Downloaded file: {file_name}")
else:
    print("No Excel file found containing 'smt.xlsx'")

In [None]:
# === LOAD DATA FROM LOCAL XLSX FILE ===

import pandas as pd
import json
from numbers import Number

# === CONSTANTS ===
# Workbook to parse and the sheet to read from it.
EXCEL_FILE = "E2025-3_smt.xlsx"
SHEET_NAME = 0

# Positional (0-based) row that holds the column headings.
HEADER_ROW = 16

# Positional (0-based) columns read from every row: 2 through 20 inclusive.
COLS = [*range(2, 21)]

# Row labels shared by both tables, in output order.
_ROW_LABELS = ("EN €/kWh", "FIX €/Y", "POT €/kW/Y")

# Label -> positional row index, one mapping per table.
RESIDENTIAL_ROWS = dict(zip(_ROW_LABELS, (19, 20, 21)))
NON_RESIDENTIAL_ROWS = dict(zip(_ROW_LABELS, (28, 29, 30)))

# Keys given to the first three selected columns of each row.
PE_KEYS = ["F0", "F1", "F23"]


def normalize_value(v):
    """Map "no value" cells to ``None`` and pass everything else through.

    Two kinds of cell count as "no value":

    * a string that is just a dash once surrounding whitespace is stripped;
    * a float NaN, which is what pandas yields for a blank cell. Left as is,
      ``json.dump`` would write it as the bare token ``NaN``, which is not
      valid JSON.

    Args:
        v: A raw cell value of any type.

    Returns:
        ``None`` for a placeholder or NaN, otherwise ``v`` unchanged.
    """
    if isinstance(v, str) and v.strip() == "-":
        return None
    # NaN is the only float that compares unequal to itself.
    if isinstance(v, float) and v != v:
        return None
    return v


def load_raw_data(file_path, sheet, header_row, cols):
    """Read one sheet without header inference and pull out its headings.

    The sheet is loaded with ``header=None``, so every row, including the
    heading row, is ordinary data addressed by position.

    Args:
        file_path: Path of the workbook, passed to ``pd.read_excel``.
        sheet: Sheet selector, passed as ``sheet_name``.
        header_row: Positional index of the row that holds the headings.
        cols: Positional column indices to take the headings from.

    Returns:
        A ``(frame, labels)`` tuple: the whole sheet as a DataFrame, and the
        heading cells at ``cols`` converted to stripped strings.
    """
    frame = pd.read_excel(file_path, sheet_name=sheet, header=None)
    heading_cells = frame.iloc[header_row, cols]
    labels = heading_cells.astype(str).str.strip().tolist()
    return frame, labels


def extract_grouped_data(df, rows_map, cols, headers):
    """Turn selected sheet rows into labelled, grouped dictionaries.

    Each row is read at the ``cols`` positions and laid out as follows,
    where ``me_idx`` is the position of the first heading equal to
    "Materia energia" (case and surrounding whitespace ignored):

    * cells 0-2 are grouped under ``"PE"`` using the module-level ``PE_KEYS``;
    * cells 3 up to ``me_idx`` are stored under their own headings;
    * for the ``"EN €/kWh"`` row the three cells starting at ``me_idx`` are
      grouped under ``"Materia energia"``; for any other row only the cell
      at ``me_idx`` is stored there;
    * cells from ``me_idx + 3`` onwards are stored under their own headings.

    Headings equal to "Totale" are dropped everywhere. Every cell goes
    through the module-level ``normalize_value`` first.

    Args:
        df: DataFrame holding the raw sheet, addressed by position.
        rows_map: Mapping of row label to positional row index.
        cols: Positional column indices to read from each row.
        headers: Headings aligned one-to-one with ``cols``.

    Returns:
        A list with one ``{"descrizione": label, "valori": {...}}`` dict per
        entry of ``rows_map``, in the mapping's order.

    Raises:
        ValueError: If no heading matches "Materia energia".
    """

    def canon(label):
        # Comparison form of a heading: text, trimmed, upper-cased.
        return str(label).strip().upper()

    # Locate the first "Materia energia" heading.
    me_idx = None
    for position, heading in enumerate(headers):
        if canon(heading) == "MATERIA ENERGIA":
            me_idx = position
            break
    if me_idx is None:
        raise ValueError("Intestazione 'Materia energia' non trovata.")

    # Headings on either side of the three "Materia energia" columns.
    middle_keys = headers[3:me_idx]
    tail_start = me_idx + 3
    tail_keys = headers[tail_start:]

    records = []
    for label, sheet_row in rows_map.items():
        cells = [normalize_value(cell) for cell in df.iloc[sheet_row, cols].tolist()]

        entry = {"PE": dict(zip(PE_KEYS, cells[:3]))}

        # Only the energy row carries a per-band breakdown.
        if label == "EN €/kWh":
            bands = cells[me_idx:me_idx + 3]
            entry["Materia energia"] = dict(zip(["Fascia unica", "F1", "F23"], bands))
        else:
            entry["Materia energia"] = cells[me_idx]

        # Columns between the PE group and "Materia energia".
        for key, cell in zip(middle_keys, cells[3:3 + len(middle_keys)]):
            if canon(key) != "MATERIA ENERGIA":
                entry[key] = cell

        # Columns after the "Materia energia" group.
        for key, cell in zip(tail_keys, cells[tail_start:]):
            if canon(key) not in {"MATERIA ENERGIA", "TOTALE"}:
                entry[key] = cell

        # Final sweep: a "Totale" heading may still sit among the middle columns.
        entry = {key: cell for key, cell in entry.items() if canon(key) != "TOTALE"}

        records.append({"descrizione": label, "valori": entry})

    return records


def round_values(obj, decimals=5):
    """Recursively round every number found in nested dicts and lists.

    Dicts and lists are rebuilt rather than modified in place. Anything that
    is not a dict, a list or a number (strings, ``None``, ...) is returned
    unchanged, and so are booleans.

    Args:
        obj: The value or nested structure to process.
        decimals: Number of decimal places passed to ``round``.

    Returns:
        A structure of the same shape as ``obj`` with its numbers rounded.
    """
    if isinstance(obj, dict):
        return {k: round_values(v, decimals) for k, v in obj.items()}
    if isinstance(obj, list):
        return [round_values(elem, decimals) for elem in obj]
    # bool is a subclass of int and therefore a Number; round(True, 5) would
    # return the int 1, so booleans are left out explicitly.
    if isinstance(obj, Number) and not isinstance(obj, bool):
        return round(obj, decimals)
    return obj


def save_json(data, filename):
    """Write ``data`` to ``filename`` as JSON and print a confirmation.

    The file is UTF-8 encoded, indented by four spaces, and non-ASCII
    characters are written as themselves rather than as escapes.

    Args:
        data: A JSON-serialisable object.
        filename: Path of the file to create or overwrite.
    """
    dump_options = {"indent": 4, "ensure_ascii": False}
    with open(filename, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, **dump_options)
    print(f"File JSON salvato in {filename}")


def main():
    """Parse both tables from the workbook and write them to ``output.json``.

    Loads the sheet once, extracts the rows listed in ``RESIDENTIAL_ROWS`` and
    ``NON_RESIDENTIAL_ROWS``, rounds every number to five decimals and saves
    the result.
    """
    df_raw, headers = load_raw_data(EXCEL_FILE, SHEET_NAME, HEADER_ROW, COLS)

    # Output section title paired with the rows that feed it, in output order.
    sections = (
        ("Abitazioni di residenza anagrafica", RESIDENTIAL_ROWS),
        ("Abitazioni diverse dalla residenza anagrafica", NON_RESIDENTIAL_ROWS),
    )
    final_result = {
        title: extract_grouped_data(df_raw, rows_map, COLS, headers)
        for title, rows_map in sections
    }

    save_json(round_values(final_result, decimals=5), "output.json")


if __name__ == "__main__":
    main()