In [None]:
import os
import json
import pandas as pd
from energyemissionsregio.config import DATA_PATH
import matplotlib.pyplot as plt
import seaborn as sns

### potential predictor variables

In [None]:
# Read the "collected_variables_EU" sheet of the variables workbook kept in the raw-data folder.
variables_workbook_path = os.path.join(
    DATA_PATH, "..", "..", "01_raw", "variables_with_details_and_tags.xlsx"
)
var_df = pd.read_excel(variables_workbook_path, sheet_name="collected_variables_EU")

In [None]:
# Candidate predictor variables; each name is also the stem of a "<name>.csv" file read later on.
# NOTE: the name shadows Python's built-in vars(); it is kept because a later cell reads it.
vars = [
    # general
    "population",
    "area",
    # land cover
    "continuous_urban_fabric_cover",
    "discontinuous_urban_fabric_cover",
    "industrial_or_commercial_units_cover",
    "port_areas_cover",
    "airports_cover",
    "mineral_extraction_sites_cover",
    "dump_sites_cover",
    "construction_sites_cover",
    "green_urban_areas_cover",
    "sport_and_leisure_facilities_cover",
    "non_irrigated_arable_land_cover",
    "permanently_irrigated_land_cover",
    "rice_fields_cover",
    "vineyards_cover",
    "fruit_trees_and_berry_plantations_cover",
    "olive_groves_cover",
    "pastures_cover",
    "permanent_crops_cover",
    "complex_cultivation_patterns_cover",
    "agriculture_with_natural_vegetation_cover",
    "agro_forestry_areas_cover",
    "broad_leaved_forest_cover",
    "coniferous_forest_cover",
    "mixed_forest_cover",
    "natural_grasslands_cover",
    "moors_and_heathland_cover",
    "sclerophyllous_vegetation_cover",
    "transitional_woodland_shrub_cover",
    "beaches_dunes_and_sand_cover",
    "bare_rocks_cover",
    "sparsely_vegetated_areas_cover",
    "burnt_areas_cover",
    "glaciers_and_perpetual_snow_cover",
    "inland_marshes_cover",
    "peat_bogs_cover",
    "salt_marshes_cover",
    "salines_cover",
    "intertidal_flats_cover",
    "water_courses_cover",
    "water_bodies_cover",
    "coastal_lagoons_cover",
    "estuaries_cover",
    "sea_and_ocean_cover",
    # industry counts
    "number_of_iron_and_steel_industries",
    "number_of_cement_industries",
    "number_of_refineries",
    "number_of_paper_and_printing_industries",
    "number_of_chemical_industries",
    "number_of_glass_industries",
    "number_of_non_ferrous_metals_industries",
    "number_of_non_metallic_minerals_industries",
    # infrastructure
    "railway_network",
    "road_network",
    "number_of_buildings",
    # air pollution
    "average_air_pollution_due_to_pm2.5",
    "average_air_pollution_due_to_no2",
    "average_air_pollution_due_to_o3",
    "average_air_pollution_due_to_pm10",
    # livestock
    "number_of_buffaloes",
    "number_of_cattle",
    "number_of_pigs",
    "number_of_sheeps",
    "number_of_chickens",
    "number_of_goats",
]

### get predictor data for es_utilized_agricultural_area

In [None]:
# Name of the variable whose missing values are to be imputed.
var_to_impute = "es_utilized_agricultural_area"

# Shallow copy, so the master list of candidate predictors is not modified through x_vars.
x_vars = list(vars)

In [None]:
# Assemble one wide predictor table keyed by region_code, with one column per predictor variable.
X_vars_df = None

for var_name in x_vars:
    var_path = os.path.join(DATA_PATH, f"{var_name}.csv")
    var_data = pd.read_csv(var_path)

    # Only Spanish regions are kept because es_utilized_agricultural_area is available only for
    # Spain. Missing value imputation is done for the regions in Spain with missing data and for
    # all the regions in Germany.
    is_spanish = var_data["region_code"].str.startswith("ES")
    _df = var_data.loc[is_spanish, ["region_code", "value"]].rename(
        columns={"value": var_name}
    )

    # The first predictor seeds the table; every later one is outer-joined on the region code.
    X_vars_df = (
        _df
        if X_vars_df is None
        else pd.merge(X_vars_df, _df, on="region_code", how="outer")
    )

In [None]:
# Data sources such as Corine Land Cover, OpenStreetMap and SEEnergies provide spatial data either
# as raster files or at specific X-Y coordinates. This spatial information is overlaid with the LAU
# regions and aggregated at LAU level to create the regional datasets. A region that no data point
# overlaps has no value, so missing entries are set to zero.
X_vars_df = X_vars_df.fillna(value=0)

In [None]:
# Report pairs of predictors that are strongly correlated with each other (collinearity check).
# Predictors whose absolute pairwise correlation reaches this value are reported.
HIGH_CORR_THRESHOLD = 0.9

relevant_vars_corr = X_vars_df.drop(columns="region_code")
corr_df = relevant_vars_corr.corr()

# The correlation matrix is symmetric, so only the pairs above the diagonal are visited;
# this reports every pair exactly once instead of once per ordering.
predictor_names = list(corr_df.columns)
for position, first_var in enumerate(predictor_names):
    for second_var in predictor_names[position + 1:]:
        # abs(): a strongly negative correlation is just as redundant as a strongly positive one.
        # NaN correlations compare False and are skipped.
        if abs(corr_df.loc[first_var, second_var]) >= HIGH_CORR_THRESHOLD:
            print(f"{first_var} and {second_var} are highly correlated")


In [None]:
# Read the variable to impute and keep only the Spanish regions (region codes starting with "ES").
target_path = os.path.join(DATA_PATH, f"{var_to_impute}.csv")
target_data = pd.read_csv(target_path)

is_spanish_region = target_data["region_code"].str.startswith("ES")
y_var_df = target_data.loc[is_spanish_region, ["region_code", "value"]].rename(
    columns={"value": var_to_impute}
)

# The outer join keeps regions that appear in only one of the two tables.
final_df_with_reg_code = pd.merge(X_vars_df, y_var_df, on="region_code", how="outer")

In [None]:
# Numeric-only view of the merged table: the region identifier is not a model input.
# drop() returns a new frame, so final_df_with_reg_code keeps its region_code column.
final_df = final_df_with_reg_code.drop(columns="region_code")

In [None]:
# Put the columns in alphabetical order.
final_df = final_df.reindex(columns=sorted(final_df.columns))

### correlations between utilized_agricultural_area and predictor vars

In [None]:
# Correlation of every predictor with the target, laid out as a single row for the heatmap.
correlations = (
    final_df.corr()[[var_to_impute]].drop(index=var_to_impute).transpose().round(2)
)

# savefig does not create missing folders, so make sure the output directory exists first.
figure_dir = os.path.join("..", "..", "figures", "missing_value_imputation")
os.makedirs(figure_dir, exist_ok=True)

fig, ax = plt.subplots(figsize=(18, 1))  # Adjust the figure size as needed
# Absolute values are plotted: the colour and the annotation show strength, not direction.
sns.heatmap(
    correlations.abs(),
    annot=True,
    cmap="Blues",
    cbar=True,
    annot_kws={"rotation": 90},
    vmin=0,
    vmax=1,
    ax=ax,
)
ax.set_yticks([])  # the heatmap has a single row, so the y tick is hidden
fig.savefig(os.path.join(figure_dir, f"{var_to_impute}.png"), bbox_inches="tight")
plt.show()

### Save two sets of data for missing value imputation: predictors whose absolute correlation with utilized_agricultural_area is at least 0.1, and predictors whose absolute correlation is at least 0.5.
### The missing value imputation itself is performed on a computational server at FZJ. The imputation script: energyemissionsregio/CAESAR_scripts/xg_boost.py

In [None]:
# Target-vs-predictor correlations as a one-column frame, with the target's own row removed.
# The export cell that follows computes its own correlations and does not rely on this value.
correlations = final_df.corr().loc[:, [var_to_impute]].drop(index=var_to_impute)

In [None]:
# Export one table per correlation threshold: the predictors whose absolute correlation with the
# target reaches the threshold, followed by the target itself as the last column. The names of
# the selected predictors are stored next to it as a JSON list.
CORR_THRESHOLDS = [0.1, 0.5]

imputation_dir = os.path.join("..", "..", "data", "missing_value_imputation")
predictor_vars_dir = os.path.join(imputation_dir, "predictor_vars")
# to_csv() and open() do not create missing folders; this call also creates imputation_dir.
os.makedirs(predictor_vars_dir, exist_ok=True)

# Loop-invariant work, done once: alphabetical column order and target-vs-predictor correlations.
final_df = final_df.reindex(columns=sorted(final_df.columns))
target_corr = final_df.corr()[var_to_impute].drop(var_to_impute)

for corr_threshold in CORR_THRESHOLDS:
    # Keep predictors with correlation <= -threshold or >= threshold.
    # NaN correlations compare False and are therefore left out.
    predictor_vars = list(target_corr[target_corr.abs() >= corr_threshold].index)
    chosen_vars = predictor_vars + [var_to_impute]

    save_df = final_df[chosen_vars]
    save_df.to_csv(
        os.path.join(imputation_dir, f"{var_to_impute}_{corr_threshold}corr.csv"),
        index=False,
    )

    with open(
        os.path.join(predictor_vars_dir, f"{var_to_impute}_{corr_threshold}corr.json"), "w"
    ) as fp:
        json.dump(predictor_vars, fp)

    print(f"saved data for {corr_threshold} correlation threshold")