### Once the best model has been chosen from the experiments run on the computational cluster, it is used here to impute the missing data

In [None]:
import os
import pickle
import json
import math

import pandas as pd
from energyemissionsregio.config import DATA_PATH
from energyemissionsregio.utils import get_confidence_level

In [None]:
# Current working directory; the relative data, model and output paths used
# further down are all built from it.
cwd = os.getcwd()

In [None]:
# Target variable whose missing values are imputed.
var_to_impute = "es_utilized_agricultural_area"
# Correlation-threshold tag; it is embedded in the predictor-list and model
# file names that are loaded further down.
best_corr_threshold = 0.1
# NOTE(review): presumably the R² score of the selected model obtained in the
# cluster experiments — confirm.
r2 = 0.83

# Confidence level recorded for every imputed value, derived from r2 by the
# project helper get_confidence_level.
imputed_value_confidence_level = get_confidence_level(r2)

In [None]:
# Names of the predictor variables stored for this target and correlation
# threshold.
predictor_list_path = os.path.join(
    cwd,
    "..",
    "..",
    "data",
    "missing_value_imputation",
    "predictor_vars",
    f"{var_to_impute}_{best_corr_threshold}corr.json",
)
with open(predictor_list_path) as f:
    predictor_vars = tuple(json.load(f))


def _read_region_values(var_name):
    """Return the region_code/value columns of a variable's CSV, DE and ES rows only."""
    raw_df = pd.read_csv(os.path.join(DATA_PATH, f"{var_name}.csv"))
    in_scope = raw_df["region_code"].str.startswith(("DE", "ES"))
    return raw_df[in_scope][["region_code", "value"]].copy()


# Predictor table: one column per predictor variable, joined on region_code.
X_vars_df = None

for var_name in predictor_vars:
    _df = _read_region_values(var_name)

    # Missing predictor values are replaced with 0 (original note: filling NAs
    # for point vars), then the generic "value" column takes the variable name.
    _df = _df.fillna(0).rename(columns={"value": var_name})

    if X_vars_df is None:
        X_vars_df = _df
    else:
        X_vars_df = pd.merge(X_vars_df, _df, on="region_code", how="outer")

# Target column; its NaNs are kept because they mark the rows to impute.
y_var_df = _read_region_values(var_to_impute).rename(
    columns={"value": var_to_impute}
)

# Left join: every region present in the predictor table gets a row, with NaN
# in the target column where no observed value exists.
final_df = pd.merge(X_vars_df, y_var_df, on="region_code", how="left")


In [None]:
# Only rows without an observed target value need a prediction; the (all-NaN)
# target column is removed from the model input.
needs_imputation = final_df[var_to_impute].isna()
input_df = final_df[needs_imputation].drop(columns=[var_to_impute])

# region_code is kept on input_df for later look-ups but dropped from the
# frame handed to the model.
input_df_no_reg_code = input_df.drop(columns=["region_code"])

# Location of the pickled model for this target and correlation threshold.
file_path = os.path.join(
    cwd,
    "..",
    "..",
    "data",
    "missing_value_imputation",
    "models",
    f"{var_to_impute}_xgb_{best_corr_threshold}corr.pkl",
)

# NOTE: unpickling can execute arbitrary code; only load model files that come
# from a trusted source.
with open(file_path, "rb") as f:
    model = pickle.load(f)

# Predictions rounded to two decimals.
y_pred = model.predict(input_df_no_reg_code).round(2)

input_df["imputed_values"] = y_pred

# Sanity check: no negative imputed value for regions whose code starts with "DE".
german_rows = input_df["region_code"].str.startswith("DE")
assert input_df.loc[german_rows, "imputed_values"].min() >= 0


In [None]:

# Output table: one row per region with the target value and a confidence
# level for that value.
imputed_df = final_df[["region_code", var_to_impute]].copy()
imputed_df.rename(columns={var_to_impute: "value"}, inplace=True)

# Observed values get the highest confidence level.
imputed_df["value_confidence_level"] = 5  # VERY HIGH

# Rows whose target value is missing are the ones to fill with predictions.
missing_mask = imputed_df["value"].isna()
missing_codes = imputed_df.loc[missing_mask, "region_code"]

# Predictions keyed by region code, so all rows are filled in one vectorized
# assignment instead of filtering input_df once per row inside iterrows().
imputed_by_region = input_df.set_index("region_code")["imputed_values"]

# Fail loudly (as the per-row lookup did) if a region needing imputation has
# no prediction, rather than silently leaving NaN in the output.
unmatched = missing_codes[~missing_codes.isin(imputed_by_region.index)]
if not unmatched.empty:
    raise ValueError(
        f"No imputed value available for region(s): {sorted(unmatched.unique())}"
    )

# Series.map raises if a region code occurs more than once in the predictions,
# so an ambiguous lookup still errors out.
imputed_df.loc[missing_mask, "value"] = missing_codes.map(imputed_by_region)
imputed_df.loc[missing_mask, "value_confidence_level"] = imputed_value_confidence_level


In [None]:
# Display the completed table for a visual check before it is written to disk.
imputed_df

In [None]:
# Save the table under data/imputed_data, named after the target variable;
# the DataFrame index is not written.
output_path = os.path.join(
    cwd, "..", "..", "data", "imputed_data", f"{var_to_impute}.csv"
)
imputed_df.to_csv(output_path, index=False)