In [None]:
import os
import pickle
import json
import math

import pandas as pd
from energyemissionsregio.config import DATA_PATH, units
from energyemissionsregio.utils import get_confidence_level

In [None]:
# Working directory of the notebook process. The predictor-variable lists, the
# pickled models and the imputed-data output are all addressed relative to it
# (via "..", "..", "data"), so the notebook must be started from its own folder.
cwd = os.getcwd()

In [None]:
# Variable metadata sheet. Code further down selects a variable's row through
# the "var_name" column and reads its "spatial_level" and
# "var_aggregation_method" entries.
var_df = pd.read_excel(
    os.path.join(DATA_PATH, "..", "..", "01_raw", "variables_with_details_and_tags.xlsx"),
    sheet_name="collected_variables_EU",
)

In [None]:
def get_x_vars_df(predictor_vars):
    """Assemble one wide predictor table keyed by ``region_code``.

    Each name in ``predictor_vars`` is looked up in the module-level ``var_df``
    metadata table, loaded from disk, and outer-merged on ``region_code`` into
    the running result, with its ``value`` column renamed to the variable name.

    Loading depends on the variable:

    * ``spatial_level == "LAU"``: ``<DATA_PATH>/<var_name>.csv`` is read, only
      rows whose region code starts with ``DE`` or ``ES`` are kept, NAs are
      replaced by 0, the region code is cut at the first underscore (the
      original comment describes this as converting LAU to NUTS3 regions) and
      the values are reduced per resulting code with the variable's
      ``var_aggregation_method`` (``SUM``, ``AVG`` or ``MAX``).
    * name starts with ``cproj_``: the ``2020.csv`` climate-projection files of
      DE and ES are read, restricted to the ``RCP4.5`` experiment and stacked
      (DE first). No column subset is applied to these files.
    * anything else: the DE/ES rows of ``<DATA_PATH>/<var_name>.csv``.

    Parameters
    ----------
    predictor_vars : iterable of str
        Variable names; each must have a row in ``var_df``.

    Returns
    -------
    pandas.DataFrame or None
        The merged table, or ``None`` when ``predictor_vars`` is empty.

    Raises
    ------
    ValueError
        If a LAU variable has an aggregation method other than SUM/AVG/MAX.
    IndexError
        If a variable name has no row in ``var_df``.
    """
    # Metadata keyword -> name of the groupby reduction to call
    reducers = {"SUM": "sum", "AVG": "mean", "MAX": "max"}
    country_prefixes = ("DE", "ES")

    def _load_de_es_values(name):
        """Read ``<name>.csv`` and keep region_code/value for DE and ES rows."""
        table = pd.read_csv(os.path.join(DATA_PATH, f"{name}.csv"))
        in_scope = table["region_code"].str.startswith(country_prefixes)
        return table[in_scope][["region_code", "value"]].copy()

    def _load_projection(name, country):
        """Read one country's 2020 projection file, RCP4.5 rows only."""
        table = pd.read_csv(
            os.path.join(DATA_PATH, "..", "climate_projections", country, name, "2020.csv")
        )
        table = table[table["climate_experiment"] == "RCP4.5"].copy()
        return table.drop(columns="climate_experiment")

    combined = None

    for var_name in predictor_vars:
        var_meta = var_df[var_df["var_name"] == var_name]
        is_lau = var_meta["spatial_level"].values[0] == "LAU"

        if is_lau:
            # Filling NAs for point vars (original note: non-point vars have
            # no NAs in Germany)
            values = _load_de_es_values(var_name).fillna(0)

            # Keep only the part of the code before the first underscore
            values["region_code"] = values["region_code"].str.split("_").str[0]

            # Collapse to one row per shortened region code
            agg_method = var_meta["var_aggregation_method"].values[0]
            if agg_method not in reducers:
                raise ValueError("Unknown var aggregation method")
            per_region = values.groupby("region_code")
            values = getattr(per_region, reducers[agg_method])().reset_index()

        elif var_name.startswith("cproj_"):
            values = pd.concat(
                [_load_projection(var_name, "DE"), _load_projection(var_name, "ES")]
            )

        else:
            values = _load_de_es_values(var_name)

        values = values.rename(columns={"value": var_name})

        if combined is None:
            combined = values
        else:
            combined = pd.merge(combined, values, on="region_code", how="outer")

    return combined

In [None]:
# One entry per variable to impute. Each entry carries the correlation
# threshold whose predictor list / model file is used for that variable
# ("best_corr_threshold") and the r2 score from which the confidence level of
# the imputed values is derived.
vars_details = [
    {"var_to_impute": target, "best_corr_threshold": threshold, "r2": score}
    for target, threshold, score in (
        # employment
        ("de_employment_in_textile_and_leather_manufacturing", 0.5, 0.18),
        ("de_employment_in_food_and_beverage_manufacturing", 0.1, 0.29),
        ("de_employment_in_mechanical_and_automotive_engineering", 0.5, 0.92),
        ("de_employment_in_mechatronics_energy_and_electrical", 0.5, 0.88),
        ("de_employment_in_wood_processing", 0.1, 0.51),
        # passenger cars per emission group
        ("de_number_of_passenger_cars_emission_group_euro_1", 0.1, 0.92),
        ("de_number_of_passenger_cars_emission_group_euro_2", 0.5, 0.91),
        ("de_number_of_passenger_cars_emission_group_euro_3", 0.1, 0.93),
        ("de_number_of_passenger_cars_emission_group_euro_4", 0.1, 0.92),
        ("de_number_of_passenger_cars_emission_group_euro_5", 0.5, 0.93),
        ("de_number_of_passenger_cars_emission_group_euro_6r", 0.5, 0.94),
        ("de_number_of_passenger_cars_emission_group_euro_6dt", 0.5, 0.95),
        ("de_number_of_passenger_cars_emission_group_euro_6d", 0.5, 0.54),
        ("de_number_of_passenger_cars_emission_group_euro_other", 0.5, 0.86),
        # building living area
        ("de_residential_building_living_area", 0.1, 0.95),
        ("de_non_residential_building_living_area", 0.5, 0.88),
    )
]
                

In [None]:
# For every target variable: load its predictor list and pre-trained model,
# predict the target for all DE/ES regions where it is missing, and write the
# completed series (observed + imputed values, each with a confidence level)
# to data/imputed_data/<var_to_impute>.csv.
for vars_detail_dict in vars_details:
    var_to_impute = vars_detail_dict["var_to_impute"]
    best_corr_threshold = vars_detail_dict["best_corr_threshold"]
    r2 = vars_detail_dict["r2"]

    # Confidence level attached to every imputed value of this variable,
    # derived from the model's r2 by get_confidence_level.
    imputed_value_confidence_level = get_confidence_level(r2)

    print(var_to_impute)

    # Predictor variable names stored for this target / threshold combination
    with open(
        os.path.join(
            cwd, "..", "..", "data",
            "missing_value_imputation",
            "predictor_vars",
            f"{var_to_impute}_{best_corr_threshold}corr.json",
        )
    ) as f:
        predictor_vars = tuple(json.load(f))

    X_vars_df = get_x_vars_df(predictor_vars)

    y_var_df = pd.read_csv(
        os.path.join(DATA_PATH, f"{var_to_impute}.csv")
    )

    y_var_df = y_var_df[y_var_df["region_code"].str.startswith(("DE", "ES"))][["region_code", "value"]].copy()
    y_var_df.rename(columns={"value": var_to_impute}, inplace=True)

    # Left merge: every region that has predictors is kept; regions without an
    # observed target value end up with NaN in the target column.
    final_df = pd.merge(X_vars_df, y_var_df, on="region_code", how="left")

    # Rows whose target is missing are the ones to impute
    missing_mask = final_df[var_to_impute].isna()
    input_df = final_df[missing_mask].drop(columns=[var_to_impute])

    # Output frame: observed values keep the highest confidence level
    imputed_df = final_df[["region_code", var_to_impute]].copy()
    imputed_df.rename(columns={var_to_impute: "value"}, inplace=True)

    imputed_df["value_confidence_level"] = 5 # VERY HIGH

    # Nothing to predict when no value is missing; the observed data is then
    # written out unchanged instead of running the model on an empty frame.
    if missing_mask.any():
        input_df_no_reg_code = input_df.drop(columns=["region_code"])

        # Construct the file path
        file_path = os.path.join(cwd, "..", "..", "data",
                                "missing_value_imputation",
                                "models",
                                f"{var_to_impute}_xgb_{best_corr_threshold}corr.pkl")

        # Load the model from the pickle file.
        # NOTE(review): pickle.load executes arbitrary code contained in the
        # file - only load model files produced by this project.
        with open(file_path, 'rb') as f:
            model = pickle.load(f)

        y_pred = model.predict(input_df_no_reg_code)

        # Count-like variables become integers, everything else keeps 2 decimals.
        # NOTE(review): astype(int) truncates toward zero rather than rounding -
        # confirm that this is the intended "round off".
        if var_to_impute.startswith(("de_employment", "de_number_of_passenger_cars")):
            y_pred = y_pred.astype(int)
        else:
            y_pred = y_pred.round(2)

        input_df['imputed_values'] = y_pred

        # Sanity check: no negative imputed values for German regions. Written
        # as "no negative value" so that it also holds when no DE region needed
        # imputation (min() of an empty selection is NaN and NaN >= 0 is False).
        de_imputed = input_df.loc[
            input_df["region_code"].str.startswith("DE"), "imputed_values"
        ]
        assert not (de_imputed < 0).any(), (
            f"negative imputed value(s) for {var_to_impute} in DE regions"
        )

        imputed_values = input_df["imputed_values"]
        var_unit = units[var_to_impute]
        if var_unit == "number":
            imputed_values = imputed_values.astype(int)

        # input_df holds exactly the rows selected by missing_mask, in the same
        # order, so the predictions can be written back in one vectorised step
        # (no per-row lookup by region_code).
        imputed_df.loc[missing_mask, "value"] = imputed_values.to_numpy()
        imputed_df.loc[missing_mask, "value_confidence_level"] = imputed_value_confidence_level

    imputed_df.to_csv(os.path.join(
            cwd, "..", "..", "data", "imputed_data", f"{var_to_impute}.csv"
        ), index=False)

