In [None]:
import copy
import itertools as it
import numpy as np
import yaml
import os

def expand_config(dict_config):
    """Expand a grid of option lists into every concrete combination.

    Args:
        dict_config: mapping from option name to an iterable of candidate
            values for that option.

    Returns:
        A list with one dict per element of the Cartesian product of the
        candidate values. Keys keep the order of ``dict_config`` and the last
        key varies fastest. The values themselves are not copied, so mutable
        values are shared between the returned dicts.
    """
    if not dict_config:
        # ``zip(*{}.items())`` yields nothing to unpack; an empty grid has
        # exactly one combination: the empty one.
        return [{}]
    keys = list(dict_config)
    return [dict(zip(keys, combo)) for combo in it.product(*dict_config.values())]

In [None]:
def generate_bash(configs, cuda_devices, tasks, generate_func, filename='', scripts="run_polygraph_2.py"):
    """Write a bash script that launches one run per (config, task) pair.

    Args:
        configs: list of option grids; each grid is expanded with
            ``expand_config``.
        cuda_devices: device ids cycled through for ``CUDA_VISIBLE_DEVICES``.
        tasks: task names; each selects
            ``./configs/polygraph_eval_{task}.yaml`` as ``HYDRA_CONFIG``.
        generate_func: list parallel to ``configs``; ``generate_func[i]`` turns
            one expanded config into a list of command-line arguments.
        filename: name of the script created under ``../scripts/``.
        scripts: Python entry point placed after ``python`` in every command.

    The expanded config dict gets its ``"task"`` key set to the current task
    before ``generate_func[i]`` is called.
    """
    device_count = len(cuda_devices)
    script_lines = ['cd ../']
    slot = 0  # running launch counter used to pick devices and place `wait`
    for group_idx, grid in enumerate(configs):
        for conf in expand_config(grid):
            for task_name in tasks:
                conf["task"] = task_name
                needs_two_devices = "12b" in conf["model"]
                visible = f'{cuda_devices[slot % device_count]}'
                if needs_two_devices:
                    # Models whose path contains "12b" also get the next device
                    # and therefore consume an extra slot.
                    visible += f',{cuda_devices[(slot + 1) % device_count]}'
                    slot += 1

                overrides = ' '.join(generate_func[group_idx](conf))
                command = (
                    f'CUDA_VISIBLE_DEVICES={visible} '
                    f'HYDRA_CONFIG=./configs/polygraph_eval_{task_name}.yaml '
                    f'python {scripts} {overrides}'
                )
                # Background the command unless this slot closes a round over
                # the device list, in which case a `wait` line follows it.
                closes_round = (slot + 1) % device_count == 0
                command += '\nwait' if closes_round else ' &'
                script_lines.append(command)
                slot += 1

    with open(f'../scripts/{filename}', 'w') as rsh:
        rsh.write('\n'.join(script_lines))

## Final experiments

In [None]:
def generate_train_str_config(config):
    """Translate one expanded config dict into command-line override strings.

    Args:
        config: a single combination produced by ``expand_config``. It must
            contain ``task``, ``batch_size``, ``subsample_eval_dataset``,
            ``model`` and the subsample keys read below; ``cache_path``,
            ``metric_thrs``, ``layers``, ``samples_n``, ``baselines``,
            ``ablation`` and ``use_truefalse_dataset`` are optional.

    Returns:
        List of override strings, in the order they should appear on the
        command line.

    For tasks whose name contains ``gsm8k``, ``medquad`` or ``samsum`` the
    train-set size is read from ``subsample_gsm8k_train_dataset``; for every
    other task it is read from ``subsample_train_dataset``. In ablation mode
    the background set uses that same size and the cache path is suffixed with
    it, so runs with different train sizes get different cache paths.
    """
    model = config['model']
    model_lower = model.lower()
    ablation = config.get("ablation", False)

    config_list = [
        'ignore_exceptions=False use_density_based_ue=True',
        'batch_size={}'.format(config['batch_size']),
    ]

    if any(name in config["task"] for name in ("gsm8k", "medquad", "samsum")):
        train_size = config['subsample_gsm8k_train_dataset']
    else:
        train_size = config['subsample_train_dataset']
    config_list.append('subsample_train_dataset={}'.format(train_size))

    if ablation:
        background_size = train_size
    else:
        background_size = config['subsample_background_train_dataset']
    config_list.append('subsample_background_train_dataset={}'.format(background_size))
    config_list.append('subsample_eval_dataset={}'.format(config['subsample_eval_dataset']))
    config_list.append('model.path={}'.format(model))

    # "gemma" is matched case-sensitively, the other families case-insensitively.
    if "gemma" in model or any(tag in model_lower for tag in ("mistral", "llama-3", "stablelm-2")):
        config_list.append('+model.attn_implementation=eager')

    if "cache_path" in config:
        if ablation:
            # Use the train size that was actually selected above; the raw
            # `subsample_train_dataset` value is constant for the
            # gsm8k/medquad/samsum sweeps and would make them share one cache.
            config_list.append('cache_path={}_{}'.format(config['cache_path'], train_size))
        else:
            config_list.append('cache_path={}'.format(config['cache_path']))
    if "metric_thrs" in config:
        config_list.append('+metric_thrs={}'.format(config['metric_thrs']))
    if "layers" in config:
        # `layers` maps a model path to its layer-list string.
        config_list.append('+layers={}'.format(config['layers'][model]))
    if "samples_n" in config:
        config_list.append('+generation_params.samples_n={}'.format(config['samples_n']))

    if "baselines" in config:
        if config['baselines'] == "all":
            config_list += [
                '+run_baselines=True',
                '+md_device=cuda',
                '+run_proposed_methods=True',
                '+clean_md_device=cuda',
            ]
        elif config['baselines']:
            config_list += ['+run_baselines=True', '+md_device=cuda']
        else:
            config_list += [
                '+run_proposed_methods=True',
                '+run_layerwise_methods=True',
                '+clean_md_device=cuda',
                '+md_device=cuda',
                '+run_eigenscore=False',
            ]

    if "use_truefalse_dataset" in config:
        config_list.append('+use_truefalse_dataset={}'.format(config['use_truefalse_dataset']))
        if config["use_truefalse_dataset"]:
            config_list.append('train_dataset=../data/publicDataset/truefalsedata.csv +train_text_column=statement +train_label_column=label')
    return config_list

In [None]:
# Per-model layer list passed as `+layers=...`: indices 0..n-1 followed by -1,
# wrapped in literal double quotes. Presumably these are hidden-layer indices of
# each model -- TODO confirm against the consumer of `layers`.
layers = {
    model_name: '"[{}]"'.format(",".join(map(str, [*range(n_ids), -1])))
    for model_name, n_ids in (
        ("google/gemma-2-9b", 41),
        ("meta-llama/Meta-Llama-3.1-8B", 31),
        ("stabilityai/stablelm-2-12b", 39),
        ("Qwen/Qwen2.5-7B", 27),
    )
}

In [None]:
# Final experiments, part 1: writes ../scripts/run_tmd_exps_final_1.sh with one
# run per (model, task) combination, all on CUDA device 0.
tasks = ['sciq', 'coqa', 'triviaqa', 'mmlu', 'truthfullqa', 'samsum']
cuda_devices = [0]

# Every value is a list of alternatives; key order fixes the expansion order.
train_configs = dict(
    batch_size=[1],
    subsample_train_dataset=[5000],
    subsample_gsm8k_train_dataset=[2000],
    subsample_background_train_dataset=[2000],
    subsample_eval_dataset=[2000],
    model=["meta-llama/Meta-Llama-3.1-8B", "google/gemma-2-9b"],
    metric_thrs=["\"[0.3]\""],
    layers=[layers],
    cache_path=['./workdir/output_layers_internal_final'],
    baselines=["all"],  # False and True are the other handled values
    samples_n=[5],
)

generate_bash(
    [train_configs],
    cuda_devices,
    tasks,
    [generate_train_str_config],
    'run_tmd_exps_final_1.sh',
    scripts="run_polygraph.py",
)

In [None]:
# Final experiments, part 2: writes ../scripts/run_tmd_exps_final_2.sh with one
# run per (model, task) combination, all on CUDA device 1.
tasks = ['gsm8k', 'medquad', 'xsum']
cuda_devices = [1]

# Every value is a list of alternatives; key order fixes the expansion order.
train_configs = dict(
    batch_size=[1],
    subsample_train_dataset=[2000],
    subsample_gsm8k_train_dataset=[2000],
    subsample_background_train_dataset=[2000],
    subsample_eval_dataset=[2000],
    model=["meta-llama/Meta-Llama-3.1-8B", "google/gemma-2-9b"],
    metric_thrs=["\"[0.3]\""],
    layers=[layers],
    baselines=["all"],  # False and True are the other handled values
    cache_path=['./workdir/output_layers_internal_final'],
    samples_n=[5],
)

generate_bash(
    [train_configs],
    cuda_devices,
    tasks,
    [generate_train_str_config],
    'run_tmd_exps_final_2.sh',
    scripts="run_polygraph.py",
)

In [None]:
# Final experiments, part 3: writes ../scripts/run_tmd_exps_final_3.sh with one
# run per (model, task) combination, all on CUDA device 2.
tasks = ['pubmedqa', 'cnn', "wmt19_deen"]
cuda_devices = [2]

# Every value is a list of alternatives; key order fixes the expansion order.
train_configs = dict(
    batch_size=[1],
    subsample_train_dataset=[2000],
    subsample_gsm8k_train_dataset=[2000],
    subsample_background_train_dataset=[2000],
    subsample_eval_dataset=[2000],
    model=["meta-llama/Meta-Llama-3.1-8B", "google/gemma-2-9b"],
    metric_thrs=["\"[0.3]\""],
    layers=[layers],
    baselines=["all"],  # False and True are the other handled values
    cache_path=['./workdir/output_layers_internal_final'],
    samples_n=[5],
)

generate_bash(
    [train_configs],
    cuda_devices,
    tasks,
    [generate_train_str_config],
    'run_tmd_exps_final_3.sh',
    scripts="run_polygraph.py",
)

## Train size ablation

In [None]:
# Train-size ablation on sciq: writes ../scripts/run_tmd_exps_abl_1.sh, sweeping
# `subsample_train_dataset` for both models on CUDA device 0.
tasks = ['sciq']
cuda_devices = [0]

# Every value is a list of alternatives; key order fixes the expansion order.
train_configs = dict(
    batch_size=[1],
    subsample_gsm8k_train_dataset=[2000],
    subsample_background_train_dataset=[2000],
    subsample_eval_dataset=[2000],
    model=["meta-llama/Meta-Llama-3.1-8B", "google/gemma-2-9b"],
    subsample_train_dataset=[100, 200, 500, 1000, 2000],
    metric_thrs=["\"[0.3]\""],
    layers=[layers],
    cache_path=['./workdir/output_layers_internal_ablation'],
    baselines=[False],
    ablation=[True],
    samples_n=[5],
)

generate_bash(
    [train_configs],
    cuda_devices,
    tasks,
    [generate_train_str_config],
    'run_tmd_exps_abl_1.sh',
    scripts="run_polygraph.py",
)

In [None]:
# Train-size ablation on truthfullqa: writes ../scripts/run_tmd_exps_abl_1_1.sh,
# sweeping `subsample_train_dataset` for both models on CUDA device 0.
tasks = ['truthfullqa']
cuda_devices = [0]

# Every value is a list of alternatives; key order fixes the expansion order.
train_configs = dict(
    batch_size=[1],
    subsample_gsm8k_train_dataset=[2000],
    subsample_background_train_dataset=[2000],
    subsample_eval_dataset=[2000],
    model=["meta-llama/Meta-Llama-3.1-8B", "google/gemma-2-9b"],
    subsample_train_dataset=[100, 200, 500],
    metric_thrs=["\"[0.3]\""],
    layers=[layers],
    cache_path=['./workdir/output_layers_internal_ablation'],
    baselines=[False],
    ablation=[True],
    samples_n=[5],
)

generate_bash(
    [train_configs],
    cuda_devices,
    tasks,
    [generate_train_str_config],
    'run_tmd_exps_abl_1_1.sh',
    scripts="run_polygraph.py",
)

In [None]:
# Train-size ablation on mmlu and cnn: writes ../scripts/run_tmd_exps_abl_2.sh,
# sweeping `subsample_train_dataset` for both models on CUDA device 1.
tasks = ['mmlu', 'cnn']
cuda_devices = [1]

# Every value is a list of alternatives; key order fixes the expansion order.
train_configs = dict(
    batch_size=[1],
    subsample_gsm8k_train_dataset=[2000],
    subsample_background_train_dataset=[2000],
    subsample_eval_dataset=[2000],
    model=["meta-llama/Meta-Llama-3.1-8B", "google/gemma-2-9b"],
    subsample_train_dataset=[100, 200, 500, 1000, 2000],
    metric_thrs=["\"[0.3]\""],
    layers=[layers],
    cache_path=['./workdir/output_layers_internal_final'],
    baselines=[False],
    ablation=[True],
    samples_n=[5],
)

generate_bash(
    [train_configs],
    cuda_devices,
    tasks,
    [generate_train_str_config],
    'run_tmd_exps_abl_2.sh',
    scripts="run_polygraph.py",
)

In [None]:
# Train-size ablation on medquad: writes ../scripts/run_tmd_exps_abl_3.sh on
# CUDA device 1. For this task generate_train_str_config reads the train size
# from `subsample_gsm8k_train_dataset`, so that is the key being swept.
tasks = ['medquad']
cuda_devices = [1]

# Every value is a list of alternatives; key order fixes the expansion order.
train_configs = dict(
    batch_size=[1],
    subsample_background_train_dataset=[2000],
    subsample_eval_dataset=[2000],
    model=["meta-llama/Meta-Llama-3.1-8B", "google/gemma-2-9b"],
    subsample_gsm8k_train_dataset=[100, 200, 500, 1000, 2000],
    subsample_train_dataset=[100],
    metric_thrs=["\"[0.3]\""],
    layers=[layers],
    cache_path=['./workdir/output_layers_internal_final'],
    baselines=[False],
    ablation=[True],
    samples_n=[5],
)

generate_bash(
    [train_configs],
    cuda_devices,
    tasks,
    [generate_train_str_config],
    'run_tmd_exps_abl_3.sh',
    scripts="run_polygraph.py",
)

In [None]:
# Train-size ablation on gsm8k: writes ../scripts/run_tmd_exps_abl_4.sh on
# CUDA device 2. For this task generate_train_str_config reads the train size
# from `subsample_gsm8k_train_dataset`, so that is the key being swept.
tasks = ['gsm8k']
cuda_devices = [2]

# Every value is a list of alternatives; key order fixes the expansion order.
train_configs = dict(
    batch_size=[1],
    subsample_background_train_dataset=[2000],
    subsample_eval_dataset=[2000],
    model=["meta-llama/Meta-Llama-3.1-8B", "google/gemma-2-9b"],
    subsample_gsm8k_train_dataset=[100, 200, 500, 1000, 2000],
    subsample_train_dataset=[100],
    metric_thrs=["\"[0.3]\""],
    layers=[layers],
    cache_path=['./workdir/output_layers_internal_final'],
    baselines=[False],
    ablation=[True],
    samples_n=[5],
)

generate_bash(
    [train_configs],
    cuda_devices,
    tasks,
    [generate_train_str_config],
    'run_tmd_exps_abl_4.sh',
    scripts="run_polygraph.py",
)

In [None]:
# Train-size ablation on samsum (single model): writes
# ../scripts/run_tmd_exps_abl_5.sh on CUDA device 0. For this task
# generate_train_str_config reads the train size from
# `subsample_gsm8k_train_dataset`, so that is the key being swept.
tasks = ['samsum']
cuda_devices = [0]

# Every value is a list of alternatives; key order fixes the expansion order.
train_configs = dict(
    batch_size=[1],
    subsample_train_dataset=[2000],
    subsample_background_train_dataset=[2000],
    subsample_eval_dataset=[2000],
    model=["meta-llama/Meta-Llama-3.1-8B"],
    subsample_gsm8k_train_dataset=[100, 200, 500, 1000],
    metric_thrs=["\"[0.3]\""],
    layers=[layers],
    cache_path=['./workdir/output_layers_internal_final'],
    baselines=[False],
    ablation=[True],
    samples_n=[5],
)

generate_bash(
    [train_configs],
    cuda_devices,
    tasks,
    [generate_train_str_config],
    'run_tmd_exps_abl_5.sh',
    scripts="run_polygraph.py",
)

## PCA & threshold ablation

In [None]:
def generate_train_str_config(config):
    """Translate one expanded config dict into command-line override strings.

    This definition replaces the earlier one of the same name and additionally
    understands the optional ``n_components`` and ``run_ablation`` keys.

    Args:
        config: a single combination produced by ``expand_config``. It must
            contain ``task``, ``batch_size``, ``subsample_eval_dataset``,
            ``model`` and the subsample keys read below; ``cache_path``,
            ``metric_thrs``, ``n_components``, ``layers``, ``samples_n``,
            ``baselines``, ``ablation``, ``run_ablation`` and
            ``use_truefalse_dataset`` are optional.

    Returns:
        List of override strings, in the order they should appear on the
        command line.

    For tasks whose name contains ``gsm8k``, ``medquad`` or ``samsum`` the
    train-set size is read from ``subsample_gsm8k_train_dataset``; for every
    other task it is read from ``subsample_train_dataset``. When ``ablation``
    is truthy the background set uses that same size and the cache path is
    suffixed with it, so runs with different train sizes get different cache
    paths.
    """
    model = config['model']
    model_lower = model.lower()
    ablation = config.get("ablation", False)

    config_list = [
        'ignore_exceptions=False use_density_based_ue=True',
        'batch_size={}'.format(config['batch_size']),
    ]

    if any(name in config["task"] for name in ("gsm8k", "medquad", "samsum")):
        train_size = config['subsample_gsm8k_train_dataset']
    else:
        train_size = config['subsample_train_dataset']
    config_list.append('subsample_train_dataset={}'.format(train_size))

    if ablation:
        background_size = train_size
    else:
        background_size = config['subsample_background_train_dataset']
    config_list.append('subsample_background_train_dataset={}'.format(background_size))
    config_list.append('subsample_eval_dataset={}'.format(config['subsample_eval_dataset']))
    config_list.append('model.path={}'.format(model))

    # "gemma" is matched case-sensitively, the other families case-insensitively.
    if "gemma" in model or any(tag in model_lower for tag in ("mistral", "llama-3", "stablelm-2")):
        config_list.append('+model.attn_implementation=eager')

    if "cache_path" in config:
        if ablation:
            # Use the train size that was actually selected above; the raw
            # `subsample_train_dataset` value is constant for the
            # gsm8k/medquad/samsum sweeps and would make them share one cache.
            config_list.append('cache_path={}_{}'.format(config['cache_path'], train_size))
        else:
            config_list.append('cache_path={}'.format(config['cache_path']))
    if "metric_thrs" in config:
        config_list.append('+metric_thrs={}'.format(config['metric_thrs']))
    if "n_components" in config:
        config_list.append('+n_components={}'.format(config['n_components']))
    if "layers" in config:
        # `layers` maps a model path to its layer-list string.
        config_list.append('+layers={}'.format(config['layers'][model]))
    if "samples_n" in config:
        config_list.append('+generation_params.samples_n={}'.format(config['samples_n']))

    if "baselines" in config:
        if config['baselines'] == "all":
            config_list += [
                '+run_baselines=True',
                '+md_device=cuda',
                '+run_proposed_methods=True',
                '+clean_md_device=cuda',
            ]
        elif config['baselines']:
            config_list += ['+run_baselines=True', '+md_device=cuda']
        else:
            config_list += [
                '+run_proposed_methods=True',
                '+run_layerwise_methods=True',
                '+clean_md_device=cuda',
                '+md_device=cuda',
                '+run_eigenscore=False',
            ]

    if "run_ablation" in config:
        # The presence of the key (not its value) switches every other method
        # family off; only the `+run_ablation` flag carries the value.
        config_list += [
            '+run_baselines=False',
            '+run_proposed_methods=False',
            '+run_layerwise_methods=False',
            '+clean_md_device=cuda',
            '+md_device=cuda',
            '+run_eigenscore=False',
            '+run_ablation={}'.format(config['run_ablation']),
        ]

    if "use_truefalse_dataset" in config:
        config_list.append('+use_truefalse_dataset={}'.format(config['use_truefalse_dataset']))
        if config["use_truefalse_dataset"]:
            config_list.append('train_dataset=../data/publicDataset/truefalsedata.csv +train_text_column=statement +train_label_column=label')
    return config_list

In [None]:
# Threshold ablation, part 1: writes ../scripts/run_tmd_exps_abl_thr_1.sh, one
# run per task on CUDA device 0 with several `metric_thrs` values in one list.
tasks = ['sciq', 'coqa', 'triviaqa', 'mmlu', 'truthfullqa']  # 'samsum' is not included
cuda_devices = [0]

# Every value is a list of alternatives; key order fixes the expansion order.
train_configs = dict(
    batch_size=[1],
    subsample_gsm8k_train_dataset=[2000],
    subsample_background_train_dataset=[2000],
    subsample_eval_dataset=[2000],
    model=["meta-llama/Meta-Llama-3.1-8B"],
    subsample_train_dataset=[5000],
    metric_thrs=["\"[0.1,0.3,0.5,0.8]\""],
    layers=[layers],
    cache_path=['./workdir/output_ablation_thr'],
    run_ablation=[True],
    samples_n=[5],
)

generate_bash(
    [train_configs],
    cuda_devices,
    tasks,
    [generate_train_str_config],
    'run_tmd_exps_abl_thr_1.sh',
    scripts="run_polygraph.py",
)

In [None]:
# PCA ablation, part 1: writes ../scripts/run_tmd_exps_abl_pca_1.sh, one run
# per task on CUDA device 0 with several `n_components` values in one list.
tasks = ['sciq', 'coqa', 'triviaqa', 'mmlu', 'truthfullqa']  # 'samsum' is not included
cuda_devices = [0]

# Every value is a list of alternatives; key order fixes the expansion order.
train_configs = dict(
    batch_size=[1],
    subsample_gsm8k_train_dataset=[2000],
    subsample_background_train_dataset=[2000],
    subsample_eval_dataset=[2000],
    model=["meta-llama/Meta-Llama-3.1-8B"],
    subsample_train_dataset=[5000],
    metric_thrs=["\"[0.3]\""],
    n_components=["\"[2,5,10,20,30]\""],
    layers=[layers],
    cache_path=['./workdir/output_ablation_pca'],
    run_ablation=[True],
    samples_n=[5],
)

generate_bash(
    [train_configs],
    cuda_devices,
    tasks,
    [generate_train_str_config],
    'run_tmd_exps_abl_pca_1.sh',
    scripts="run_polygraph.py",
)

In [None]:
# Threshold ablation, part 2: writes ../scripts/run_tmd_exps_abl_thr_2.sh, one
# run per task on CUDA device 2 with several `metric_thrs` values in one list.
tasks = ['gsm8k', 'medquad', 'xsum', 'pubmedqa', 'cnn']
cuda_devices = [2]

# Every value is a list of alternatives; key order fixes the expansion order.
train_configs = dict(
    batch_size=[1],
    subsample_gsm8k_train_dataset=[2000],
    subsample_background_train_dataset=[2000],
    subsample_eval_dataset=[2000],
    model=["meta-llama/Meta-Llama-3.1-8B"],
    subsample_train_dataset=[2000],
    metric_thrs=["\"[0.1,0.3,0.5,0.8]\""],
    layers=[layers],
    cache_path=['./workdir/output_ablation_thr'],
    run_ablation=[True],
    samples_n=[5],
)

generate_bash(
    [train_configs],
    cuda_devices,
    tasks,
    [generate_train_str_config],
    'run_tmd_exps_abl_thr_2.sh',
    scripts="run_polygraph.py",
)

In [None]:
# PCA ablation, part 2: writes ../scripts/run_tmd_exps_abl_pca_2.sh, one run
# per task on CUDA device 2 with several `n_components` values in one list.
tasks = ['gsm8k', 'medquad', 'xsum', 'pubmedqa', 'cnn']
cuda_devices = [2]

# Every value is a list of alternatives; key order fixes the expansion order.
train_configs = dict(
    batch_size=[1],
    subsample_gsm8k_train_dataset=[2000],
    subsample_background_train_dataset=[2000],
    subsample_eval_dataset=[2000],
    model=["meta-llama/Meta-Llama-3.1-8B"],
    subsample_train_dataset=[2000],
    metric_thrs=["\"[0.3]\""],
    n_components=["\"[2,5,10,20,30]\""],
    layers=[layers],
    cache_path=['./workdir/output_ablation_pca'],
    run_ablation=[True],
    samples_n=[5],
)

generate_bash(
    [train_configs],
    cuda_devices,
    tasks,
    [generate_train_str_config],
    'run_tmd_exps_abl_pca_2.sh',
    scripts="run_polygraph.py",
)

## Generalization

In [None]:
import yaml

def _escape_override(arg):
    """Escape one complete ``key="value"`` override string for the shell script.

    Newlines become a backslash followed by the letter n, every ``'s`` is
    rewritten to `` is``, and each of ``$ ( ) { } [ ] , =`` is prefixed with a
    backslash. The replacements run over the whole string, so the ``=`` that
    separates key and value is escaped as well.
    """
    arg = arg.replace("\n", "\\n").replace("'s", " is")
    # No replacement produces a character that a later one targets, so the
    # order of the single-character escapes does not matter.
    for char in "$(){}[],=":
        arg = arg.replace(char, "\\" + char)
    return arg


def generate_train_str_config(config, task_name):
    """Build the command-line overrides for one generalization run.

    Args:
        config: a single combination produced by ``expand_config``. Requires
            ``batch_size``, the three ``subsample_*`` keys and ``model``;
            ``cache_path``, ``upd_path``, ``samples_n``, ``metric_thrs``,
            ``layers``, ``generalization``, ``exp_idx``, ``baselines`` and
            ``is_ood`` are optional.
        task_name: evaluated task; used to look up the training datasets in
            the module-level ``gen_tasks`` mapping.

    Returns:
        List of override strings. An empty list is returned when
        ``generalization`` is set but ``gen_tasks`` has no entry for
        ``task_name`` or no experiment at index ``exp_idx``; the caller skips
        such runs.

    Raises:
        NameError: if ``generalization`` is set and ``gen_tasks`` has not been
            defined yet.

    When ``generalization`` is set, every dataset of the selected experiment
    has ``../configs/polygraph_eval_{dataset}.yaml`` loaded and its settings
    emitted as numbered ``+train_*_{i}`` overrides (``i`` starts at 1).
    """
    config_list = [
        'ignore_exceptions=False use_density_based_ue=True',
        'batch_size={}'.format(config['batch_size']),
        'train_test_split=False',
        'subsample_train_dataset={}'.format(config['subsample_train_dataset']),
        'subsample_background_train_dataset={}'.format(config['subsample_background_train_dataset']),
        'subsample_eval_dataset={}'.format(config['subsample_eval_dataset']),
        'model.path={}'.format(config['model']),
    ]

    # "gemma" is matched case-sensitively, the other families case-insensitively.
    model_lower = config['model'].lower()
    if "gemma" in config['model'] or any(tag in model_lower for tag in ("mistral", "llama-3", "stablelm-2")):
        config_list.append('+model.attn_implementation=eager')

    has_cache_path = "cache_path" in config
    per_dataset_cache = config.get("upd_path", False)
    if has_cache_path and not per_dataset_cache:
        config_list.append('cache_path={}'.format(config['cache_path']))
    if "samples_n" in config:
        config_list.append('+generation_params.samples_n={}'.format(config['samples_n']))
    if "metric_thrs" in config:
        config_list.append('+metric_thrs={}'.format(config['metric_thrs']))
    if "layers" in config:
        config_list.append('+layers={}'.format(config['layers'][config['model']]))

    if config.get("generalization", False):
        try:
            ood_cfg = gen_tasks[task_name][config["exp_idx"]]
        except (KeyError, IndexError):
            # No such task / experiment index: signal "skip this run". Other
            # errors (e.g. gen_tasks not defined) are left to propagate.
            return []
        for idx, ds in enumerate(ood_cfg, start=1):
            if has_cache_path and per_dataset_cache:
                # NOTE(review): emitted once per dataset, so an experiment with
                # several datasets yields several cache_path overrides.
                config_list.append('cache_path={}_on_{}'.format(config['cache_path'], ds))
            with open(f"../configs/polygraph_eval_{ds}.yaml") as stream:
                gen_config = yaml.safe_load(stream)
            config_list.append('+max_new_tokens_{}={}'.format(idx, gen_config['max_new_tokens']))
            config_list.append('+train_dataset_{}=\"{}\"'.format(idx, gen_config['dataset']))
            config_list.append('+train_text_column_{}={}'.format(idx, gen_config['text_column']))
            config_list.append('+train_label_column_{}={}'.format(idx, gen_config['label_column']))
            config_list.append(
                _escape_override('+train_prompt_{}=\"{}\"'.format(idx, gen_config['prompt']))
            )
            config_list.append('+train_split_{}={}'.format(idx, gen_config['train_split']))

            if "description" in gen_config:
                config_list.append(
                    _escape_override("+train_description_{}=\"{}\"".format(idx, gen_config['description']))
                )
            if "few_shot_split" in gen_config:
                config_list.append('+few_shot_split_{}={}'.format(idx, gen_config['few_shot_split']))
            if "n_shot" in gen_config:
                config_list.append('+train_n_shot_{}={}'.format(idx, gen_config['n_shot']))

    if "baselines" in config:
        if config['baselines'] == "all":
            config_list += [
                '+run_baselines=False',
                '+run_eigenscore=False',
                '+md_device=cuda',
                '+run_proposed_methods=True',
                '+run_layerwise_methods=True',
                '+clean_md_device=cuda',
            ]
        elif config['baselines']:
            config_list += [
                '+run_baselines=False',
                '+run_eigenscore=False',
                '+run_layerwise_methods=True',
                '+md_device=cuda',
            ]
        else:
            config_list += ['+run_proposed_methods=True', '+clean_md_device=cuda']
    if "is_ood" in config:
        config_list.append('+is_ood={}'.format(config['is_ood']))

    return config_list

In [None]:
def generate_bash(configs, cuda_devices, tasks, generate_func, script_name="polygraph_eval", filename='', n_gpus=1):
    """Write a bash script that launches one run per (config, task) pair.

    This definition replaces the earlier ``generate_bash``: the entry point is
    always ``run_polygraph.py``, ``generate_func[i]`` is called with
    ``(conf, task_name)``, and runs for which it yields no arguments are left
    out of the script.

    Args:
        configs: list of option grids; each grid is expanded with
            ``expand_config``.
        cuda_devices: device ids cycled through for ``CUDA_VISIBLE_DEVICES``.
        tasks: task names; each selects
            ``./configs/polygraph_eval_{task}.yaml`` as ``HYDRA_CONFIG``.
        generate_func: list parallel to ``configs`` of argument builders.
        script_name: accepted but not used by this function.
        filename: path of the script relative to ``../scripts/``.
        n_gpus: with the default of 1 every run gets a single device;
            otherwise the task/model test below decides whether a run gets
            two devices.
    """
    print(tasks)
    device_count = len(cuda_devices)
    script_lines = ['cd ../']
    slot = 0  # running launch counter used to pick devices and place `wait`
    for group_idx, grid in enumerate(configs):
        for conf in expand_config(grid):
            for task_name in tasks:
                # NOTE(review): the "7b"/"8b" substring tests are
                # case-sensitive, so a path containing "8B" does not match "8b".
                single_device = (n_gpus == 1) or (
                    (task_name not in ["gsm8k", "xsum", "medquad"])
                    and (("7b" in conf["model"]) or ("8b" not in conf["model"]))
                )
                visible = f'{cuda_devices[slot % device_count]}'
                if not single_device:
                    # Two-device runs also take the next device and an extra slot.
                    visible += f',{cuda_devices[(slot + 1) % device_count]}'
                    slot += 1

                overrides = ' '.join(generate_func[group_idx](conf, task_name))
                if not overrides.strip():
                    # The builder asked for this run to be skipped.
                    continue

                command = (
                    f'CUDA_VISIBLE_DEVICES={visible} '
                    f'HYDRA_CONFIG=./configs/polygraph_eval_{task_name}.yaml '
                    f'python run_polygraph.py {overrides}'
                )
                # Background the command unless this slot closes a round over
                # the device list, in which case a `wait` line follows it.
                closes_round = (slot + 1) % device_count == 0
                command += '\nwait' if closes_round else ' &'
                script_lines.append(command)
                slot += 1

    with open(f'../scripts/{filename}', 'w') as rsh:
        rsh.write('\n'.join(script_lines))

In [None]:
# Dataset names by group. The suffixes presumably stand for text summarisation
# (ts), translation (tr), short-answer QA (qa_s) and long-answer QA (qa_l) --
# TODO confirm.
datasets_qa_s = ['sciq', 'coqa', 'triviaqa', 'mmlu']
datasets_qa_l = ['truthfullqa', 'pubmedqa']  # 'medquad' is not included
datasets_ts = ['samsum', 'xsum', 'cnn']
datasets_tr = ['wmt14_fren', "wmt19_deen"]

# Groups used by the generalization cells; datasets_tr is not part of them.
all_tasks = [datasets_ts, datasets_qa_s, datasets_qa_l]

In [None]:
import numpy as np
import copy

# Generalization run: writes ../scripts/generalization/run_tmd_exps_v2_all.sh
# for CUDA device 0.
n_exps = 1
tasks = ['truthfullqa', 'samsum']
cuda_devices = [0]

# gen_tasks[task] is a list of experiments. Here each task has one experiment:
# every dataset from all_tasks except the task itself.
gen_tasks = {
    task: [[ds for ds in np.concatenate(all_tasks) if ds != task]]
    for task in tasks
}
print(gen_tasks)

# Every value is a list of alternatives; key order fixes the expansion order.
train_configs = dict(
    batch_size=[1],
    baselines=["all"],
    subsample_train_dataset=[1000],
    subsample_background_train_dataset=[1000],
    subsample_eval_dataset=[2000],
    model=['meta-llama/Meta-Llama-3.1-8B'],
    cache_path=['./workdir/gen_output_loo_new'],
    samples_n=[5],
    generalization=[True],
    metric_thrs=["\"[0.3]\""],
    layers=[layers],
    exp_idx=list(range(n_exps)),
    is_ood=[True],
)

generate_bash(
    [train_configs],
    cuda_devices,
    tasks,
    [generate_train_str_config],
    filename='generalization/run_tmd_exps_v2_all.sh',
)

In [None]:
import numpy as np
import copy

# Generalization run: writes ../scripts/generalization/run_tmd_exps_v2_1.sh
# for CUDA device 0.
n_exps = 1
tasks = ['mmlu', 'truthfullqa', 'samsum', 'sciq']
cuda_devices = [0]

# gen_tasks[task] is a list of experiments. Here each task has one experiment:
# every dataset from all_tasks except the task itself.
gen_tasks = {
    task: [[ds for ds in np.concatenate(all_tasks) if ds != task]]
    for task in tasks
}
print(gen_tasks)

# Every value is a list of alternatives; key order fixes the expansion order.
train_configs = dict(
    batch_size=[1],
    baselines=["all"],
    subsample_train_dataset=[400],
    subsample_background_train_dataset=[1000],
    subsample_eval_dataset=[2000],
    model=['meta-llama/Meta-Llama-3.1-8B'],
    cache_path=['./workdir/gen_output_loo_new'],
    samples_n=[5],
    generalization=[True],
    metric_thrs=["\"[0.3]\""],
    layers=[layers],
    exp_idx=list(range(n_exps)),
    is_ood=[True],
)

generate_bash(
    [train_configs],
    cuda_devices,
    tasks,
    [generate_train_str_config],
    filename='generalization/run_tmd_exps_v2_1.sh',
)

In [None]:
import numpy as np
import copy

# Generalization run: writes ../scripts/generalization/run_tmd_exps_v2_2.sh
# for CUDA device 1.
n_exps = 1
tasks = ['mmlu', 'truthfullqa', 'samsum', 'sciq']
cuda_devices = [1]

# gen_tasks[task] is a list of experiments. Here each task has one experiment:
# the other datasets of every all_tasks group that contains the task.
gen_tasks = {
    task: [[ds for group in all_tasks if task in group for ds in group if ds != task]]
    for task in tasks
}
print(gen_tasks)

# Every value is a list of alternatives; key order fixes the expansion order.
train_configs = dict(
    batch_size=[1],
    baselines=["all"],
    subsample_train_dataset=[400],
    subsample_background_train_dataset=[1000],
    subsample_eval_dataset=[2000],
    model=['meta-llama/Meta-Llama-3.1-8B'],
    cache_path=['./workdir/gen_output_loo_new'],
    samples_n=[5],
    generalization=[True],
    metric_thrs=["\"[0.3]\""],
    layers=[layers],
    exp_idx=list(range(n_exps)),
    is_ood=[True],
)

generate_bash(
    [train_configs],
    cuda_devices,
    tasks,
    [generate_train_str_config],
    filename='generalization/run_tmd_exps_v2_2.sh',
)

In [None]:
import numpy as np
import copy

# Generalization run: writes ../scripts/generalization/run_tmd_exps_v2_3.sh
# for CUDA device 1.
n_exps = 1
tasks = ['mmlu', 'truthfullqa', 'samsum', 'sciq']
cuda_devices = [1]

# gen_tasks[task] is a list of experiments. Here each task has one experiment:
# the other datasets of every all_tasks group that contains the task.
gen_tasks = {
    task: [[ds for group in all_tasks if task in group for ds in group if ds != task]]
    for task in tasks
}
print(gen_tasks)

# Every value is a list of alternatives; key order fixes the expansion order.
train_configs = dict(
    batch_size=[1],
    baselines=["all"],
    subsample_train_dataset=[1000],
    subsample_background_train_dataset=[1000],
    subsample_eval_dataset=[2000],
    model=['meta-llama/Meta-Llama-3.1-8B'],
    cache_path=['./workdir/gen_output_loo_new'],
    samples_n=[5],
    generalization=[True],
    metric_thrs=["\"[0.3]\""],
    layers=[layers],
    exp_idx=list(range(n_exps)),
    is_ood=[True],
)

generate_bash(
    [train_configs],
    cuda_devices,
    tasks,
    [generate_train_str_config],
    filename='generalization/run_tmd_exps_v2_3.sh',
)

In [None]:
import numpy as np
import copy

# Generalization run: writes ../scripts/generalization/run_tmd_exps_v2_4.sh
# for CUDA device 0. `exp_idx` ranges over 0..n_exps-1; tasks with fewer
# experiments than n_exps have no entry at the higher indices.
n_exps = 3
tasks = ['mmlu', 'truthfullqa', 'samsum', 'sciq']
cuda_devices = [0]

# gen_tasks[task] is a list of experiments. Here each experiment is a single
# dataset: one per other member of every all_tasks group containing the task.
gen_tasks = {
    task: [[ds] for group in all_tasks if task in group for ds in group if ds != task]
    for task in tasks
}
print(gen_tasks)

# Every value is a list of alternatives; key order fixes the expansion order.
train_configs = dict(
    batch_size=[1],
    baselines=["all"],
    subsample_train_dataset=[2000],
    subsample_background_train_dataset=[1000],
    subsample_eval_dataset=[2000],
    model=['meta-llama/Meta-Llama-3.1-8B'],
    cache_path=['./workdir/gen_output_loo_new'],
    samples_n=[5],
    generalization=[True],
    metric_thrs=["\"[0.3]\""],
    layers=[layers],
    exp_idx=list(range(n_exps)),
    is_ood=[True],
    upd_path=[True],
)

generate_bash(
    [train_configs],
    cuda_devices,
    tasks,
    [generate_train_str_config],
    filename='generalization/run_tmd_exps_v2_4.sh',
)