In [None]:
import os 

import pandas as pd
import geopandas as gpd
from energyemissionsregio.config import DATA_PATH, SHP_PATH
from energyemissionsregio.utils import solve_proxy_equation, get_proxy_var_list
from energyemissionsregio.disaggregation import perform_proxy_based_disaggregation
from energyemissionsregio.plotting_functions import plot_validation_data
from sklearn.metrics import mean_squared_error

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Current working directory of the kernel; later cells build the relative
# paths to the repository's "data" folder from it.
cwd = os.getcwd()

In [None]:
# Read the NUTS2 shapefile and keep only the regions whose code starts
# with "DE" or "ES".
nuts2_shp = gpd.read_file(os.path.join(SHP_PATH, "NUTS2.shp"))
is_de_or_es_shape = nuts2_shp["code"].str.startswith(("DE", "ES"))
nuts2_shp = nuts2_shp[is_de_or_es_shape]

### Iron and steel industries emissions (Eurostat)

In [None]:
# Read the Eurostat iron and steel emissions and keep only the rows whose
# region code starts with "DE" or "ES", reduced to region code and value.
eurostat_data_nuts0 = pd.read_csv(
    os.path.join(DATA_PATH, "ghg_emissions_from_fc_in_iron_and_steel_industries.csv")
)
eurostat_data_nuts0 = eurostat_data_nuts0.loc[
    eurostat_data_nuts0["region_code"].str.startswith(("DE", "ES")),
    ["region_code", "value"],
].copy()

In [None]:
# Display the filtered Eurostat rows for a quick visual check.
eurostat_data_nuts0

### EDGAR data

In [None]:
# Read the EDGAR iron and steel emissions that serve as validation data.
validation_data = pd.read_csv(os.path.join(cwd, "..", "..", "data",
                                           "validation_data", "iron_and_steel_emissions_EDGAR.csv"))

# Keep only the rows whose code starts with "DE" or "ES" and the two
# columns that are used downstream.
validation_data = validation_data[validation_data["code"].str.startswith(("DE", "ES"))][
    ["code", "_sum"]].copy()

# Align the column names with the other data sets of this notebook.
# The frame is rebound instead of renamed in place, and the former
# self-assignment of the "value" column (a no-op) has been dropped.
validation_data = validation_data.rename(columns={"code": "region_code", "_sum": "value"})

In [None]:
# Aggregate the validation data to country level: truncate every region
# code to its first two characters and sum the values per country code.
target_data = (
    validation_data
    .assign(region_code=validation_data["region_code"].str[:2])
    .groupby("region_code")
    .sum()
    .reset_index()
)

In [None]:
# Attach a constant confidence level of 5 to every country total.
# NOTE(review): presumably required by perform_proxy_based_disaggregation - confirm.
target_data = target_data.assign(value_confidence_level=5)

Difference between the Eurostat data to be disaggregated and the EDGAR data used for validating the disaggregation

In [None]:
# Compare the Eurostat values with the EDGAR country totals side by side.
diff_df = (
    eurostat_data_nuts0
    .merge(target_data, on="region_code", suffixes=("_eurostat", "_edgar"))
    # Tonnes to Mt
    .assign(value_edgar=lambda df: df["value_edgar"] / 1e6)
    .assign(diff=lambda df: df["value_eurostat"] - df["value_edgar"])
)

diff_df

In [None]:
# Proxy equation handed to get_proxy_var_list and solve_proxy_equation below;
# here it consists of a single proxy variable name.
proxy_equation = "number_of_iron_and_steel_industries"

In [None]:
# Load the data of every proxy variable referenced by the proxy equation.
proxy_var_list = get_proxy_var_list(proxy_equation)

proxy_data_dict = {}
for proxy_var in proxy_var_list:
    # Prefer an already disaggregated version of the proxy if one exists on
    # disk; otherwise fall back to the file under DATA_PATH, which gets a
    # constant confidence level of 5.
    disagg_csv = os.path.join(cwd, "..", "..", "data", "disaggregated_data", f"{proxy_var}.csv")
    if os.path.exists(disagg_csv):
        proxy_data = pd.read_csv(disagg_csv)
    else:
        proxy_data = pd.read_csv(os.path.join(DATA_PATH, f"{proxy_var}.csv"))
        proxy_data["value_confidence_level"] = 5

    # Keep only the rows whose region code starts with "DE" or "ES" and the
    # three columns used downstream.
    is_de_or_es = proxy_data["region_code"].str.startswith(("DE", "ES"))
    proxy_data = proxy_data.loc[
        is_de_or_es, ["region_code", "value", "value_confidence_level"]
    ].copy()

    # Missing proxy values are treated as zero.
    proxy_data["value"] = proxy_data["value"].fillna(0)
    proxy_data_dict[proxy_var] = proxy_data

solved_proxy_data = solve_proxy_equation(proxy_equation, proxy_data_dict)

disagg_data = perform_proxy_based_disaggregation(target_data, solved_proxy_data, "NUTS0", 5)

In [None]:
# Derive a "NUTS2" code for each disaggregated region from the first four
# characters of its region code.
disagg_data["NUTS2"] = disagg_data["region_code"].str[:4]

# Sum the disaggregated values per NUTS2 code and expose that code as
# "region_code" again.
disagg_data_nuts2 = (
    disagg_data[["NUTS2", "value"]]
    .rename(columns={"NUTS2": "region_code"})
    .groupby("region_code")
    .sum()
    .reset_index()
)

In [None]:
# Calculate the RMSE per country between the validation values ("_true")
# and the disaggregated NUTS2 values ("_disagg").
# Values are left unconverted here; the plot cell labels them as "tonnes".
# NOTE(review): the outer merge yields NaN for regions present in only one of
# the two frames, and mean_squared_error rejects NaN input - confirm that both
# frames cover the same NUTS2 regions.
merged_df_mae = pd.merge(validation_data, disagg_data_nuts2, on="region_code", how="outer", suffixes=("_true", "_disagg"))

# Compute each country mask once instead of once per selected column.
is_de = merged_df_mae["region_code"].str.startswith("DE")
is_es = merged_df_mae["region_code"].str.startswith("ES")

true_values_de = merged_df_mae.loc[is_de, "value_true"]
disagg_values_de = merged_df_mae.loc[is_de, "value_disagg"]

true_values_es = merged_df_mae.loc[is_es, "value_true"]
disagg_values_es = merged_df_mae.loc[is_es, "value_disagg"]

# The `squared=False` argument of mean_squared_error is deprecated since
# scikit-learn 1.4 and scheduled for removal, so the root is taken explicitly.
# The builtin round() is used because it works for NumPy scalars as well as
# for plain Python floats, whereas `.round(2)` exists only on the former.
rmse_de = round(mean_squared_error(true_values_de, disagg_values_de) ** 0.5, 2)
rmse_es = round(mean_squared_error(true_values_es, disagg_values_es) ** 0.5, 2)


In [None]:
# validation_data["value"] = validation_data["value"] / 1e6 # tonnes to Mt
# disagg_data_nuts2["value"] = disagg_data_nuts2["value"] / 1e6 # tonnes to Mt

In [None]:
# Target path of the validation figure, relative to the working directory.
fig_path = os.path.join(
    "..", "..", "figures", "disaggregation_validation",
    "validation_iron_and_steel_industries_emissions.png",
)

# Plot validation vs. disaggregated NUTS2 values together with the RMSE of
# each country.
# NOTE(review): "tonnes", "EDGAR" and "log" are presumably the unit label,
# the validation source label and the colour/axis scale - confirm against
# plot_validation_data.
plot_validation_data(
    validation_data, disagg_data_nuts2,
    nuts2_shp,
    rmse_de, rmse_es,
    "tonnes", "EDGAR", "log",
    fig_path,
)