In [None]:
import os
import pandas as pd
import geopandas as gpd
from energyemissionsregio.config import SHP_PATH
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error


### Validation - Germany

In [None]:
# Load the NUTS1 shapes and keep only the regions whose code starts with "DE".
nuts1 = gpd.read_file(os.path.join(SHP_PATH, "NUTS1.shp"))
is_german_region = nuts1["code"].str.startswith("DE")
nuts1_de = nuts1[is_german_region].copy()

# Lower-cased copy of the region name, stored in its own column.
nuts1_de["name_small"] = nuts1_de["name"].str.lower()

In [None]:
# Destatis data for the year 2020; values are given in hectares.
validation_csv = os.path.join("..", "..", "data", "validation_data", "41141-0101_de_flat.csv")
validation_data = pd.read_csv(validation_csv, delimiter=";")

# Keep only the "Fläche" rows, restricted to the label and value columns.
is_area_row = validation_data["value_variable_label"] == "Fläche"
utilized_agri_area_de_states = validation_data.loc[is_area_row, ["1_variable_attribute_label", "value"]].copy()

# Hectares -> square kilometres (1 ha = 0.01 sq. km).
utilized_agri_area_de_states["value"] = utilized_agri_area_de_states["value"].mul(0.01)

In [None]:
# Lower-cased label column; "name_small" is the key used when merging with nuts1_de.
state_labels = utilized_agri_area_de_states["1_variable_attribute_label"]
utilized_agri_area_de_states["name_small"] = state_labels.str.lower()

In [None]:
# Attach the validation values to the German NUTS1 regions via the lower-cased
# name. This is a left join: every row of nuts1_de is kept, and a region whose
# "name_small" has no exact match in the validation table gets NaN in the
# validation columns.
final_valdiation_data = pd.merge(nuts1_de, utilized_agri_area_de_states, on="name_small", how = "left")

In [None]:
# Display the merged table; NaN in the validation columns marks a region whose
# name did not match any validation label.
final_valdiation_data

In [None]:
# Read the imputed utilized-agricultural-area table.
imputed_csv = os.path.join("..", "..", "data", "imputed_data", "es_utilized_agricultural_area.csv")
imputed_data = pd.read_csv(imputed_csv)

In [None]:
# Restrict the imputed data to regions whose code starts with "DE".
in_germany = imputed_data["region_code"].str.startswith("DE")
imputed_data_de = imputed_data[in_germany].copy()

In [None]:
# Parent-region key: the first three characters of the region code.
# NOTE(review): the column is called "NUTS2", but it is later joined against the
# "code" column of the NUTS1 shapefile, so it appears to hold NUTS1 codes -
# confirm the naming.
imputed_data_de["NUTS2"] = imputed_data_de["region_code"].str.slice(stop=3)

In [None]:
# Sum the imputed values per three-character region key.
imputed_data_de_nuts2 = imputed_data_de.groupby("NUTS2")["value"].sum().reset_index()

In [None]:
# Display the imputed values summed per region key.
imputed_data_de_nuts2

In [None]:
# Join the validation values with the aggregated imputed values on the region
# code. Both inputs carry a "value" column, so the suffixes produce
# "value_validation" and "value_imputed". This is a left join: regions without
# an imputed counterpart keep NaN in "value_imputed".
plot_data = pd.merge(
    final_valdiation_data,
    imputed_data_de_nuts2,
    left_on="code",
    right_on="NUTS2",
    how="left",
    suffixes=("_validation", "_imputed"),
)

# Scatter both series over the same categorical x axis (one tick per region).
fig, ax = plt.subplots(figsize=(9, 4))

ax.scatter(plot_data['1_variable_attribute_label'], plot_data['value_imputed'], color='red', label='Predicted data', alpha=0.7)
ax.scatter(plot_data['1_variable_attribute_label'], plot_data['value_validation'], color='blue', label='Validation data', alpha=0.7)

ax.set_xlabel('Region name')
ax.set_ylabel('Utilized agricultural area (square kilometer)')
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.legend()
ax.grid(True, linestyle='--', alpha=0.6)

# Create the output directory if needed so that saving also works on a fresh checkout.
figure_dir = os.path.join("..", "..", "figures", "missing_value_imputation")
os.makedirs(figure_dir, exist_ok=True)

# Save the figure as a PNG file.
fig.savefig(os.path.join(figure_dir, "uaa_de_validation.png"), bbox_inches='tight')

In [None]:
# Absolute deviation between the validation value and the imputed value.
plot_data["diff"] = (plot_data["value_validation"] - plot_data["value_imputed"]).abs()

In [None]:
# Show the regions ordered from smallest to largest absolute deviation.
plot_data.sort_values(by="diff")