In [None]:
import os
import json
import pandas as pd
from energyemissionsregio.config import DATA_PATH
import matplotlib.pyplot as plt
import seaborn as sns
from cmcrameri import cm

### potential predictor variables

In [None]:
# Read the "collected_variables_EU" sheet of the raw variable workbook.
variables_workbook = os.path.join(
    DATA_PATH, "..", "..", "01_raw", "variables_with_details_and_tags.xlsx"
)
var_df = pd.read_excel(variables_workbook, sheet_name="collected_variables_EU")

### potential predictor variables

In [None]:
# Names of the candidate predictor variables. Each name is later used as the
# stem of a CSV file ("<name>.csv") and as the column name of that variable.
x_vars_lau = [
    # population and area
    "population",
    "area",
    # land-cover variables (names ending in "_cover")
    "continuous_urban_fabric_cover",
    "discontinuous_urban_fabric_cover",
    "industrial_or_commercial_units_cover",
    "port_areas_cover",
    "airports_cover",
    "mineral_extraction_sites_cover",
    "dump_sites_cover",
    "construction_sites_cover",
    "green_urban_areas_cover",
    "sport_and_leisure_facilities_cover",
    "non_irrigated_arable_land_cover",
    "permanently_irrigated_land_cover",
    "rice_fields_cover",
    "vineyards_cover",
    "fruit_trees_and_berry_plantations_cover",
    "olive_groves_cover",
    "pastures_cover",
    "permanent_crops_cover",
    "complex_cultivation_patterns_cover",
    "agriculture_with_natural_vegetation_cover",
    "agro_forestry_areas_cover",
    "broad_leaved_forest_cover",
    "coniferous_forest_cover",
    "mixed_forest_cover",
    "natural_grasslands_cover",
    "moors_and_heathland_cover",
    "sclerophyllous_vegetation_cover",
    "transitional_woodland_shrub_cover",
    "beaches_dunes_and_sand_cover",
    "bare_rocks_cover",
    "sparsely_vegetated_areas_cover",
    "burnt_areas_cover",
    "glaciers_and_perpetual_snow_cover",
    "inland_marshes_cover",
    "peat_bogs_cover",
    "salt_marshes_cover",
    "salines_cover",
    "intertidal_flats_cover",
    "water_courses_cover",
    "water_bodies_cover",
    "coastal_lagoons_cover",
    "estuaries_cover",
    "sea_and_ocean_cover",
    # industry counts
    "number_of_iron_and_steel_industries",
    "number_of_cement_industries",
    "number_of_refineries",
    "number_of_paper_and_printing_industries",
    "number_of_chemical_industries",
    "number_of_glass_industries",
    "number_of_non_ferrous_metals_industries",
    "number_of_non_metallic_minerals_industries",
    # transport networks and buildings
    "railway_network",
    "road_network",
    "number_of_buildings",
    # air pollution
    "average_air_pollution_due_to_pm25",
    "average_air_pollution_due_to_no2",
    "average_air_pollution_due_to_o3",
    "average_air_pollution_due_to_pm10",
]

In [None]:
# Build one wide predictor table for Spain: every variable in x_vars_lau is read
# from "<DATA_PATH>/<variable>.csv" and joined on "region_code", giving one
# column per variable.
X_vars_df = None

for var_name in x_vars_lau:
    _df = pd.read_csv(os.path.join(DATA_PATH, f"{var_name}.csv"))

    # Keep only region codes starting with "ES". na=False makes a missing
    # region_code count as "no match"; without it the mask would contain NA and
    # boolean indexing would raise.
    is_spanish_region = _df["region_code"].str.startswith("ES", na=False)
    _df = _df.loc[is_spanish_region, ["region_code", "value"]].copy()

    _df = _df.fillna(0)  # filling NAs for point vars. Non-point vars have no NAs in Spain

    # The generic "value" column takes the name of the variable it holds.
    _df = _df.rename(columns={"value": var_name})

    if X_vars_df is None:
        X_vars_df = _df
    else:
        # Outer join: a region that is absent from one file is kept (with NaN).
        X_vars_df = pd.merge(X_vars_df, _df, on="region_code", how="outer")

In [None]:
# Display the merged predictor table for a visual check.
X_vars_df

### identify zero-variance columns (candidates to drop)

In [None]:

# Variance of every numeric column of the predictor table.
numerical_cols = X_vars_df.select_dtypes(include=["number"])
variance = numerical_cols.var()

# Labels of the columns whose variance is exactly zero.
has_zero_variance = variance.eq(0)
zero_variance_cols = variance.loc[has_zero_variance].index

In [None]:
# Display the labels of the columns whose variance is exactly zero.
zero_variance_cols

### for each pair of highly correlated variables, drop one of the two

In [None]:
# Pairwise correlation between all predictors (the region identifier is left out).
relevant_vars_corr = X_vars_df.drop(columns="region_code")

corr_df = relevant_vars_corr.corr()

# Report every ordered pair of distinct variables whose correlation is >= 0.9.
# Each pair is therefore printed twice, once in each direction.
for idx, row in corr_df.iterrows():
    for key, value in row.items():
        if idx == key or not (value >= 0.9):
            continue
        print(f"{idx} and {key} are highly correlated")

In [None]:
# Target variables; each one is read from "<name>.csv" further down.
vars_to_impute = ["es_utilized_agricultural_area", "es_number_of_dwellings"]

In [None]:
# For every target variable and every correlation threshold, export
#   * a CSV holding the predictors whose absolute correlation with the target
#     reaches the threshold, plus the target itself, and
#   * a JSON file listing the names of those predictors.
imputation_dir = os.path.join("..", "..", "data", "missing_value_imputation")
predictor_vars_dir = os.path.join(imputation_dir, "predictor_vars")

# Neither DataFrame.to_csv nor open() creates missing folders, so make sure the
# output folders exist (this also creates imputation_dir).
os.makedirs(predictor_vars_dir, exist_ok=True)

for var_to_impute in vars_to_impute:
    y_var_df = pd.read_csv(os.path.join(DATA_PATH, f"{var_to_impute}.csv"))

    y_var_df.rename(columns={"value": var_to_impute}, inplace=True)
    y_var_df.drop(columns="year", inplace=True)

    final_df_with_reg_code = pd.merge(X_vars_df, y_var_df, on="region_code", how="outer")

    # Keep only rows that are complete across all predictors and the target.
    final_df = final_df_with_reg_code.drop(columns="region_code").dropna(axis=0)

    # Alphabetical column order; it fixes the order of the exported columns.
    final_df = final_df.reindex(sorted(final_df.columns), axis=1)

    # Correlation of each predictor with the target. It does not depend on the
    # threshold, so it is computed once per target instead of once per threshold.
    target_correlations = final_df.corr()[var_to_impute].drop(var_to_impute)

    for corr_threshold in [0.1, 0.5]:
        # Two-sided selection: |correlation| >= threshold. NaN correlations
        # (e.g. constant columns) never pass the comparison.
        is_selected = target_correlations.abs() >= corr_threshold
        predictor_vars = list(target_correlations.index[is_selected.to_numpy()])

        # Predictors first, target as the last column.
        save_df = final_df[predictor_vars + [var_to_impute]].copy()

        save_df.to_csv(
            os.path.join(imputation_dir, f"{var_to_impute}_{corr_threshold}corr.csv"),
            index=False,
        )

        with open(
            os.path.join(predictor_vars_dir, f"{var_to_impute}_{corr_threshold}corr.json"), "w"
        ) as fp:
            json.dump(predictor_vars, fp)

In [None]:
# Build one table holding all predictors plus every target variable.
corr_df = X_vars_df.copy()
for var_to_impute in vars_to_impute:
    y_var_df = pd.read_csv(os.path.join(DATA_PATH, f"{var_to_impute}.csv"))

    y_var_df.rename(columns={"value": var_to_impute}, inplace=True)
    y_var_df.drop(columns="year", inplace=True)

    corr_df = pd.merge(corr_df, y_var_df, on="region_code", how="outer")

corr_df.drop(columns="region_code", inplace=True)

# Correlation of every predictor with every target, rounded to one decimal.
# After the transpose: one row per target, one column per predictor.
correlations = corr_df.corr()[vars_to_impute].drop(vars_to_impute)
correlations = correlations.round(1)
correlations = correlations.transpose()

# Drop predictors whose rounded correlation is 0 for every target.
correlations = correlations.loc[:, (correlations != 0).any()]

# Split the predictors by strength of correlation. The comparison is made on
# absolute values because the heatmaps below show absolute values: a strongly
# negative correlation (e.g. -0.8) belongs in the "high" panel, not the "low" one.
abs_correlations = correlations.abs()

# "low": |correlation| < 0.3 for all targets.
low_correlations = correlations.loc[:, (abs_correlations < 0.3).all(axis=0)]

# "high": |correlation| >= 0.3 for at least one target.
high_correlations = correlations.loc[:, (abs_correlations >= 0.3).any(axis=0)]

# Plotting the heatmaps: two stacked panels sharing the same colour scale.
fig = plt.figure(figsize=(13, 6))
gs = fig.add_gridspec(2, 1, wspace=0, hspace=2.5)

heatmap_kwargs = dict(
    annot=True,
    cmap=cm.batlow_r,
    cbar=True,
    cbar_kws={'shrink': 0.8, 'pad': 0.01},
    vmin=0,
    vmax=1,
)

# low correlations --------
ax1 = fig.add_subplot(gs[:1, :])
sns.heatmap(low_correlations.abs(), ax=ax1, **heatmap_kwargs)
plt.setp(ax1.get_xticklabels(), rotation=45, ha="right")
plt.setp(ax1.get_yticklabels(), rotation=0)

# high correlations --------
ax2 = fig.add_subplot(gs[1:, :])
sns.heatmap(high_correlations.abs(), ax=ax2, **heatmap_kwargs)
plt.setp(ax2.get_xticklabels(), rotation=45, ha="right")
plt.setp(ax2.get_yticklabels(), rotation=0)

# savefig does not create missing folders, so make sure the target folder exists.
figures_dir = os.path.join("..", "..", "figures", "missing_value_imputation")
os.makedirs(figures_dir, exist_ok=True)

# Save the figure as a PNG file
fig.savefig(os.path.join(figures_dir, "lau_es_corr.png"), bbox_inches='tight')