In [1]:
import os
import yaml
from itertools import product
import math
from pathlib import Path
import sys

In [2]:
PROJECT_ROOT = Path.cwd().parent  # points to nl-pe/
SRC_PATH = PROJECT_ROOT / "src"
sys.path.insert(0, str(SRC_PATH))  # make nl_pe importable
os.chdir(PROJECT_ROOT)  

In [9]:
def map_vars_to_config(
        data='',
        #embedding and knn
        embedder = '',
        k_knn = None,
        #device
        gpu_batch_size=None,
        #gp params
        noise = None,
        sigma_sig = None,
        #Active Learning
        n_obs_iters = None,
        acquisition_f = None,
        #gr-eps
        epsilon = None,
        #ucb
        ucb_beta_const = None,
        #lse
        lse_tau = None,
        lse_kappa = None,
        #ws
        ws_perc = None,
        #optimization
        k_refit = None,
        k_obs_refit = None,
        lr = None,
        frac_gt_rel_label = None,
        gen_q_suffix = None,
        ):

    base_data_dir = os.path.join('data', 'ir', data)
    embedder_dir = os.path.join(base_data_dir, embedder)

    #set query label for dataset:
    if data == 'beir/nfcorpus' or data == 'beir/trec-covid' or data == 'beir/webis-touche2020':
        query_rel_label = 2
    elif data == 'beir/scifact' or data == 'beir/fiqa':
        query_rel_label = 1
    else:
        raise ValueError(f"Unknown query label value for: {data}")

    #always update base query rel label
    base_query_rel_update = {
        'gp': {'query_rel_label': query_rel_label}
    }

    var_config_mapping = {
        'logging': {
            'DEBUG': {
                'logging': {'level': 'DEBUG'}
            }
        },
        'method':{
            'dense': {
                'agent': {
                    'policy_steps': [
                        {'component': 'HuggingFaceEmbedderSentenceTransformers',
                         'method': 'exact_knn_from_faiss'}]
                }
            },
            'dense_oracle': {
                'agent': {
                    'policy_steps': [
                        {'component': 'HuggingFaceEmbedderSentenceTransformers',
                         'method': 'exact_knn_from_faiss'},
                         {'component': 'AgentLogic',
                          'method': 'batch_all_dense',},
                          {'component': 'AgentLogic',
                          'method': 'gt_rel_oracle',},
                          {'component': 'AgentLogic',
                          'method': 'agg_pointwise_scores',}
                    ]  
                }
            },
            'gp_ws': {
                'agent': {
                    'policy_steps': [
                        {'component': 'HuggingFaceEmbedderSentenceTransformers',
                         'method': 'exact_knn_from_faiss'},
                         {'component': 'GPActiveLearner',
                          'method': 'active_learn',}]
                }
            },
            'gp_al': {
                'agent': {
                    'policy_steps': [
                        {'component': 'HuggingFaceEmbedderSentenceTransformers',
                         'method': 'get_query_embedding'},
                         {'component': 'GPActiveLearner',
                          'method': 'active_learn',}]
                }
            },
            'gp_ws_q_dec': {
                'agent': {
                    'policy_steps': [
                        {'component': 'HuggingFaceEmbedderSentenceTransformers',
                        'method': 'exact_knn_from_faiss'},
                        {'component': 'HuggingFaceEmbedderSentenceTransformers',
                        'method': 'get_reformulation_embeddings'},
                        {'component': 'GPActiveLearner',
                        'method': 'active_learn'},
                    ]
                },
                'gp': {
                    'use_query_reformulations': True,
                    'reform_query_rel_label': frac_gt_rel_label * query_rel_label,
                },
                'data': {
                    'q_text_csv': os.path.join(base_data_dir, gen_q_suffix, 'gen_qs.csv'),
                }
            },
            'gp_al_q_dec': {
                'agent': {
                    'policy_steps': [
                        {'component': 'HuggingFaceEmbedderSentenceTransformers',
                        'method': 'get_query_embedding'},
                        {'component': 'HuggingFaceEmbedderSentenceTransformers',
                        'method': 'get_reformulation_embeddings'},
                        {'component': 'GPActiveLearner',
                        'method': 'active_learn'},
                    ]
                },
                'gp': {
                    'use_query_reformulations': True,
                    'reform_query_rel_label': frac_gt_rel_label * query_rel_label,
                },
                'data': {
                    'q_text_csv': os.path.join(base_data_dir, gen_q_suffix, 'gen_qs.csv'),
                }
            },
        },
        'data': {
            data: {
                'data': {
                    'd_text_csv': os.path.join(base_data_dir, 'docs.csv'),
                    'q_text_csv': os.path.join(base_data_dir, 'test_queries.csv'),
                    'index_path': os.path.join(embedder_dir, 'faiss', 'index'),
                    #new embeddings will have index_ids.pkl (since queries will also get ids)
                    'doc_ids_path': os.path.join(embedder_dir, 'faiss', 'index_doc_ids.pkl'),
                    'qrels_path': os.path.join(base_data_dir, 'qrels', 'test.txt'),
                }
            }
        },
        #device
        'gpu_batch_size': {
            gpu_batch_size: {
                'embedding': {'inference_batch_size': gpu_batch_size},
                'data': {'embedding_batch_size': gpu_batch_size},
                }
        },
        'device': {
            'gpu': {
                'inference_device': 'gpu',
                'tensor_ops_device': 'gpu',
                },
            'cpu': {
                'inference_device': 'cpu',
                'tensor_ops_device': 'cpu',}
        },
        #embedding and knn
        'k_knn': {
            k_knn: {
                'embedding': {'k': k_knn},
                }
        },
        #AL
        'n_obs_iters': {
            n_obs_iters: {
                'active_learning': {'n_obs_iterations': n_obs_iters},
                }
        },
        #GP Params
        'noise': {
            noise: {
                'gp': {'observation_noise': noise},
                }
        },
        'sigma_sig': {
            sigma_sig: {
                'gp': {'signal_noise': sigma_sig},
                }
        },
        #Active Learning
        'acquisition_f': {
            acquisition_f: {
                'active_learning': {'acquisition_f': acquisition_f},
                }
        },
        #UCB
        'ucb_beta_const': {
            ucb_beta_const: {
                'active_learning': {'ucb_beta_const': ucb_beta_const},
                }
        },
        #GR-EPS
        'epsilon': {
            epsilon: {
                'active_learning': {'epsilon': epsilon},
                }
        },
        #LSE
        'lse_tau': {
            lse_tau: {
                'active_learning': {'lse_tau': lse_tau},
                }
        },
        'lse_kappa': {
            lse_kappa: {
                'active_learning': {'lse_kappa': lse_kappa},
                }
        },
        #WS
        'ws_perc': {
            ws_perc: {
                'gp': {'warm_start_percent': ws_perc},
                }
        },
        #optimization
        'refit_after_obs': {
            True: {
                'optimization': {'refit_after_obs': True}
            }
        },
        'ard':{
            True: {
                'optimization': {'ard': True}
            }
        },
        'opt_noise':{
            False: {
                'optimization': {'opt_noise': False}
            }
        },
        'opt_sig_noise':{
            False: {
                'optimization': {'opt_sig_noise': False}
            }
        },
        'k_refit':{
            k_refit: {'optimization': {'k_refit': k_refit}}
        },
        'k_obs_refit': {
            k_obs_refit: {'optimization': {'k_obs_refit': k_obs_refit}}
        },
        'lr': {
            lr: {'optimization': {'lr': lr}}
        },
    }
    
    return var_config_mapping, query_rel_label

In [10]:
def deep_update(original, update):
    for key, value in update.items():
        if isinstance(value, dict):
            original[key] = deep_update(original.get(key, {}), value)
        else:
            original[key] = value
    return original


def save_config(config, base_dir, experiment_name):
    full_path = os.path.join(base_dir, experiment_name)
    os.makedirs(full_path, exist_ok=True)
    with open(os.path.join(full_path, "config.yaml"), 'w') as f:
        yaml.dump(config, f)

    #EMNLP EVAL CONFIG SAVING
    # measures = []
    # for i in range(1, 176):
    #     measures.append(f"ndcg_cut_{i}")
    # for i in range(1, 176):
    #     measures.append(f"map_cut_{i}")
    # for i in range(1, 176):
    #     measures.append(f"P_{i}")
    # for i in range(1, 176):
    #     measures.append(f"recall_{i}")

    # eval_params = {
    #     'measures': measures,
    #     'qrels_path' : os.path.join(os.path.dirname(config['data']['run_path']),'qrels.txt'),
    #     'logging': {'level': 'DEBUG'},
    # }

    # with open(os.path.join(full_path, "eval_config.yaml"), 'w') as f:
    #     yaml.dump(eval_params, f)

In [16]:
EXP_DIR = 'trials/ir/scifact'
if not os.path.exists(EXP_DIR):
    os.makedirs(EXP_DIR)

BASE_CONFIG_PATH = 'configs/base_config_dec23.yaml'

In [17]:
#set the parmeters
#options/explanations of params:
'''
param_grid = {
}
'''

# Define the parameters
param_grid = {
    'logging': ['INFO'],
    'data': ['beir/scifact'],
    'method': ['gp_al_q_dec'], #dense, dense_oracle, gp_ws, gp_al, gp_ws_q_dec, gp_al_q_dec, 
    #embedding and knn
    'embedder': ['miniLM'],
    'k_knn': [100],
    ###device params
    'gpu_batch_size': [1024],
    'device': ['gpu'],
    ###al
    'n_obs_iters': [100],
    ###variances
    #'noise': [0.001],
    #'sigma_sig': [0.01],
    ###acquisition_f
    'acquisition_f': ['greedy_epsilon'], #greedy_epsilon, ucb_const_beta, lse_straddle, lse_margin
    ###gr-eps
    'epsilon': [0,1],
    ###ucb
    #'ucb_beta_const': [1,2],
    ###lse
    #'lse_tau': [1.0],
    #'lse_kappa': [1],
    ###ws
    #'ws_perc': [100],
    ###optimization
    #'ard': [True],
    #'refit_after_obs': [True],
    #'k_refit': [200],
    #'k_obs_refit': [10],
    #'lr': [0.1],
    #'opt_noise': [False],
    #'opt_sig_noise': [False],
    ### q reform
    'frac_gt_rel_label': [1],
    'gen_q_suffix': ['q_generation/5q'],
}



In [18]:
# Load the base config file
with open(BASE_CONFIG_PATH, 'r') as f:
    base_config = yaml.safe_load(f)

#for file naming, don't use these param keys:
PARAM_NAMES_TO_OMMIT = {
                        'logging',
                        #'method',
                        'data', 
                        'embedder',
                        'k_knn',
                        'gpu_batch_size',
                        'device',
                        'n_obs_iters',
                        #af
                        #acquisition_f',
                        #lse
                        'lse_tau',
                        #'lse_kappa',
                        #refitting
                        'ard',
                        'refit_after_obs',
                        'k_refit',
                        'k_obs_refit',
                        'lr',
                        'opt_noise',
                        'opt_sig_noise'
                        } #

# Generate and save config files for each combination
for param_values in product(*param_grid.values()):
    param_values_dict = dict(zip(param_grid.keys(), param_values))
    experiment_name = os.path.join(*[
        str(value)
        for param, value in param_values_dict.items()
        if param not in PARAM_NAMES_TO_OMMIT
    ])
    updated_config = yaml.safe_load(yaml.dump(base_config))  # deep copy

    # Apply updates to the config based on the current parameter values
    var_config_mapping, query_rel_label = map_vars_to_config(
        data=param_values_dict.get('data'),
        #embedding and knn
        embedder=param_values_dict.get('embedder'),
        k_knn=param_values_dict.get('k_knn'),
        #device
        gpu_batch_size=param_values_dict.get('gpu_batch_size'),
        #al
        n_obs_iters=param_values_dict.get('n_obs_iters'),
        acquisition_f=param_values_dict.get('acquisition_f'),
        #gp params
        noise=param_values_dict.get('noise'),
        sigma_sig=param_values_dict.get('sigma_sig'),
        #af params
        ucb_beta_const=param_values_dict.get('ucb_beta_const'),
        epsilon=param_values_dict.get('epsilon'),
        #lse
        lse_tau=param_values_dict.get('lse_tau'),
        lse_kappa=param_values_dict.get('lse_kappa'),
        #ws
        ws_perc=param_values_dict.get('ws_perc'),
        #optimization
        k_refit=param_values_dict.get('k_refit'),
        k_obs_refit=param_values_dict.get('k_obs_refit'),
        lr=param_values_dict.get('lr'),
        #q_reform
        frac_gt_rel_label=param_values_dict.get('frac_gt_rel_label'),
        gen_q_suffix=param_values_dict.get('gen_q_suffix'),
    )
    for param, value in param_values_dict.items():
        if param in var_config_mapping and value in var_config_mapping[param]:
            deep_update(updated_config, var_config_mapping[param][value])
        else:
            print(f"No mapping found for parameter '{param}' with value '{value}'")

    deep_update(
        updated_config,
        {'gp': {'query_rel_label': query_rel_label}}
    )

    save_config(updated_config, EXP_DIR, experiment_name)


No mapping found for parameter 'logging' with value 'INFO'
No mapping found for parameter 'embedder' with value 'miniLM'
No mapping found for parameter 'frac_gt_rel_label' with value '1'
No mapping found for parameter 'gen_q_suffix' with value 'q_generation/5q'
No mapping found for parameter 'logging' with value 'INFO'
No mapping found for parameter 'embedder' with value 'miniLM'
No mapping found for parameter 'frac_gt_rel_label' with value '1'
No mapping found for parameter 'gen_q_suffix' with value 'q_generation/5q'
