In [1]:
#%% Load packages 

import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from main import *
import concurrent.futures
from cv_hyper_search import *
from eval_cf_MISE import *


Real Data

In [2]:
# Real-world - train
df = pd.read_csv('data/food_df_ana.csv') 
year = 2017
df = df[df['year'] == year]

print(df)

# Prepare real world data
X = df.iloc[:, 5:].to_numpy()
A = df.iloc[:, 4].to_numpy()
Y = df.iloc[:, 2].to_numpy()

n = X.shape[0]
p = X.shape[1]

data = np.concatenate([Y.reshape(n,1), A.reshape(n,1), X],axis=1)

# Data standardization: min-max scaler
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data[:,2:])
data_train = np.concatenate([data[:,0:2], data_scaled], axis=1)

# Real world - test
df = pd.read_csv('data/food_df_ana.csv')
year = 2018
df = df[df['year'] == year]

# Prepare real world data
X = df.iloc[:, 5:].to_numpy()
A = df.iloc[:, 4].to_numpy()
Y = df.iloc[:, 2].to_numpy()

n = X.shape[0]
p = X.shape[1]

data2 = np.concatenate([Y.reshape(n,1), A.reshape(n,1), X],axis=1)

# Data standardization: min-max scaler
scaler = MinMaxScaler()
data_scaled2 = scaler.fit_transform(data2[:,2:])
data_val = np.concatenate([data2[:,0:2], data_scaled2], axis=1)


         country  year  change_msfi  ad_sdg2_aid_lag  ad_sdg2_aid  \
0    Afghanistan  2017    -0.099778       342.626137   328.921352   
1         Angola  2017    -0.015021         7.124646     9.164906   
2        Albania  2017    -0.005155        12.088164    22.409065   
3      Argentina  2017    -0.348958        32.380542    78.922367   
4        Armenia  2017     0.269365        67.599723    62.540959   
..           ...   ...          ...              ...          ...   
87      Viet Nam  2017     0.048244       137.741657   173.772372   
88       Vanuatu  2017     0.048950         1.725439     6.710924   
89         Samoa  2017    -0.419062         4.132086    11.895095   
90  South Africa  2017    -0.060355         8.869795    10.412156   
91      Zimbabwe  2017    -0.030912         2.618697     1.578156   

    ad_exc2_aid_gdp       infl  food_prod  food_imp_net_gdp    log_gdp  \
0         16.075729   4.975952      99.84         15.109628  23.723222   
1          0.195835  29

In [3]:
#%% Optimal hyperparameters for inference models

# Define models to be optimized
models = ['lm', 'nn', 'gps', 'dr', 'sci', 'cgct_gps', 'rf', 'cgct_rf', 'cf','cgct_cf']

def hyperparameter_search(models, data_train, data_val):
    # Create a list to store the results
    hyper_opt_list = []

    # Use ThreadPoolExecutor or ProcessPoolExecutor for parallelization
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Submit each model optimization task to the executor
        futures = {executor.submit(CV_hyperpar_search, data_train, data_val, model, n_trials=50, n_jobs=5, name=f"{model}"): model for model in models}
        
        # Wait for each future to complete and gather results
        for future in concurrent.futures.as_completed(futures):
            model = futures[future]
            result = future.result()  # Get the result of the optimization
            hyper_opt_list.append({
                'model': model,
                'best_hyperpars': result[0],
                'best_loss': result[1]
            })
    
    return hyper_opt_list

hyper_opt_list = hyperparameter_search(models, data_train, data_val)
df_hyper_opt = pd.DataFrame(hyper_opt_list)

print(df_hyper_opt)



[I 2025-02-12 23:22:00,182] A new study created in RDB with name: get_hp_gps
[I 2025-02-12 23:22:00,310] A new study created in RDB with name: get_hp_dr
[I 2025-02-12 23:22:00,330] A new study created in RDB with name: get_hp_nn
[I 2025-02-12 23:22:00,381] A new study created in RDB with name: get_hp_sci
[I 2025-02-12 23:22:00,408] A new study created in RDB with name: get_hp_lm
[I 2025-02-12 23:22:00,410] A new study created in RDB with name: get_hp_cgct_gps
[I 2025-02-12 23:22:00,410] A new study created in RDB with name: get_hp_cf
[I 2025-02-12 23:22:00,411] A new study created in RDB with name: get_hp_rf
[I 2025-02-12 23:22:00,411] A new study created in RDB with name: get_hp_cgct_cf
[I 2025-02-12 23:22:00,411] A new study created in RDB with name: get_hp_cgct_rf
[I 2025-02-12 23:22:00,997] Trial 0 finished with value: 0.2615225742740611 and parameters: {}. Best is trial 0 with value: 0.2615225742740611.
[I 2025-02-12 23:22:01,012] Trial 1 finished with value: 0.2615225742740611 an

      model                                     best_hyperpars  best_loss
0       gps                                                 {}   0.261523
1        lm     {'alpha_lm': 3.181609443405098, 'order_lm': 1}   0.257251
2        rf       {'n_estimators': 116, 'min_samples_leaf': 1}   0.255672
3        cf       {'n_estimators': 328, 'min_samples_leaf': 1}   0.264626
4        nn  {'layer_size_nn': 27, 'lr_nn': 0.0002783096594...   0.246609
5        dr  {'layer_size_dr': 14, 'rep_size_dr': 22, 'lr_d...   0.255556
6       sci  {'alpha_sci': 1, 'layer_size_sci': 14, 'lr_sci...   0.371482
7   cgct_cf  {'layer_size_bae': 10, 'rep_size_bae': 7, 'dro...   0.247234
8  cgct_gps  {'layer_size_bae': 14, 'rep_size_bae': 4, 'dro...   0.248791
9   cgct_rf  {'layer_size_bae': 10, 'rep_size_bae': 10, 'dr...   0.248937


In [5]:
#Export dict list to .txt

model_order = {model: i for i, model in enumerate(models)}
hyper_opt_sort = sorted(hyper_opt_list, key=lambda x: model_order.get(x['model'], float('inf')))
hyper_opt_sort = [entry['best_hyperpars'] for entry in hyper_opt_sort]

file_name = "hyperpars/hyperpars_opt_real.txt"

# Write the list to a file in JSON format for readability
with open(file_name, "w") as f:
    json.dump(hyper_opt_sort, f, indent=4)

Real World Large Dataset

In [None]:
# Real-world - train
df = pd.read_csv('data/food_df_ana_large.csv') 
year = 2017
df = df[df['year'] == year]

# Prepare real world data
X = df.iloc[:, 5:].to_numpy()
A = df.iloc[:, 4].to_numpy()
Y = df.iloc[:, 2].to_numpy()

n = X.shape[0]
p = X.shape[1]

data = np.concatenate([Y.reshape(n,1), A.reshape(n,1), X],axis=1)

# Data standardization: min-max scaler
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data[:,2:])
data_train = np.concatenate([data[:,0:2], data_scaled], axis=1)

# Real world - test
df = pd.read_csv('data/food_df_ana_large.csv')
year = 2018
df = df[df['year'] == year]

# Prepare real world data
X = df.iloc[:, 5:].to_numpy()
A = df.iloc[:, 4].to_numpy()
Y = df.iloc[:, 2].to_numpy()

n = X.shape[0]
p = X.shape[1]

data2 = np.concatenate([Y.reshape(n,1), A.reshape(n,1), X],axis=1)

# Data standardization: min-max scaler
scaler = MinMaxScaler()
data_scaled2 = scaler.fit_transform(data2[:,2:])
data_val = np.concatenate([data2[:,0:2], data_scaled2], axis=1)


In [3]:
#%% Optimal hyperparameters for inference models

# Define models to be optimized
models = ['lm', 'nn', 'gps', 'dr', 'sci', 'cgct_gps', 'rf', 'cgct_rf', 'cf','cgct_cf']

def hyperparameter_search(models, data_train, data_val):
    # Create a list to store the results
    hyper_opt_list = []

    # Use ThreadPoolExecutor or ProcessPoolExecutor for parallelization
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Submit each model optimization task to the executor
        futures = {executor.submit(CV_hyperpar_search, data_train, data_val, model, n_trials=50, n_jobs=5, name=f"{model}small"): model for model in models}
        
        # Wait for each future to complete and gather results
        for future in concurrent.futures.as_completed(futures):
            model = futures[future]
            result = future.result()  # Get the result of the optimization
            hyper_opt_list.append({
                'model': model,
                'best_hyperpars': result[0],
                'best_loss': result[1]
            })
    
    return hyper_opt_list

hyper_opt_list = hyperparameter_search(models, data_train, data_val)
df_hyper_opt = pd.DataFrame(hyper_opt_list)

print(df_hyper_opt)


[I 2025-02-10 23:16:06,614] A new study created in RDB with name: get_hp_cgct_rfsmall
[I 2025-02-10 23:16:06,643] A new study created in RDB with name: get_hp_cfsmall
[I 2025-02-10 23:16:06,667] A new study created in RDB with name: get_hp_scismall
[I 2025-02-10 23:16:06,672] A new study created in RDB with name: get_hp_lmsmall
[I 2025-02-10 23:16:06,676] A new study created in RDB with name: get_hp_cgct_cfsmall
[I 2025-02-10 23:16:06,711] A new study created in RDB with name: get_hp_gpssmall
[I 2025-02-10 23:16:06,716] A new study created in RDB with name: get_hp_drsmall
[I 2025-02-10 23:16:06,718] A new study created in RDB with name: get_hp_cgct_gpssmall
[I 2025-02-10 23:16:06,742] A new study created in RDB with name: get_hp_rfsmall
[I 2025-02-10 23:16:06,753] A new study created in RDB with name: get_hp_nnsmall
[I 2025-02-10 23:16:07,631] Trial 0 finished with value: 0.09070315560844772 and parameters: {}. Best is trial 0 with value: 0.09070315560844772.
[I 2025-02-10 23:16:07,685

      model                                     best_hyperpars  best_loss
0       gps                                                 {}   0.090703
1        lm    {'alpha_lm': 0.2988358475155337, 'order_lm': 1}   0.089921
2        rf      {'n_estimators': 160, 'min_samples_leaf': 12}   0.089119
3        cf       {'n_estimators': 296, 'min_samples_leaf': 1}   0.084995
4        nn  {'layer_size_nn': 14, 'lr_nn': 0.0003834496139...   0.093610
5        dr  {'layer_size_dr': 14, 'rep_size_dr': 22, 'lr_d...   0.095552
6       sci  {'alpha_sci': 1, 'layer_size_sci': 14, 'lr_sci...   0.152214
7  cgct_gps  {'layer_size_bae': 14, 'rep_size_bae': 7, 'dro...   0.084956
8   cgct_cf  {'layer_size_bae': 14, 'rep_size_bae': 4, 'dro...   0.086582
9   cgct_rf  {'layer_size_bae': 7, 'rep_size_bae': 4, 'drop...   0.088425


In [None]:
#Export dict list to .txt

model_order = {model: i for i, model in enumerate(models)}
hyper_opt_sort = sorted(hyper_opt_list, key=lambda x: model_order.get(x['model'], float('inf')))
hyper_opt_sort = [entry['best_hyperpars'] for entry in hyper_opt_sort]

file_name = "hyperpars/hyperpars_opt_real_large.txt"

# Write the list to a file in JSON format for readability
with open(file_name, "w") as f:
    json.dump(hyper_opt_sort, f, indent=4)