## Benchmarking

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from Benchmarking import *

In [3]:
def train(bases, smoothers, chm, gen_expts, datasets, model_root):
    """Run ``bm_train`` for every dataset and every generation experiment.

    Each dataset root is expected to look like
    ``.../<snp>/<admixture>/<population>/``. ``generated_data/`` is appended
    to it, and the three trailing components decide both the model output
    directory (``model_root + "<snp>/<admixture>/<population>/chm<chm>/"``)
    and the ``M`` value handed to ``bm_train``.

    Args:
        bases: base model names, forwarded to ``bm_train``.
        smoothers: smoother names, forwarded to ``bm_train``.
        chm: chromosome identifier; used in the model directory name and
            forwarded to ``bm_train``.
        gen_expts: mapping of experiment name -> list of generations.
        datasets: iterable of dataset root directories (a trailing ``/`` is
            optional).
        model_root: prefix for the model directory; it is concatenated as-is,
            so it should end with ``/``.

    Raises:
        ValueError: if the ``<snp>`` path component is not a known SNP set.
            (Previously this surfaced later as an unbound ``M``.)
    """
    # Value passed to bm_train as M for each SNP set
    # (presumably the window size in SNPs -- confirm against bm_train).
    m_by_snp_set = {"full": 1000, "ukb": 30}

    for dataset_root in datasets:

        # Accept roots with or without a trailing slash so the join below
        # cannot silently produce ".../sevengenerated_data/".
        if not dataset_root.endswith("/"):
            dataset_root += "/"
        data_path = dataset_root + "generated_data/"

        # data_path splits into [..., snp, admixture, population,
        # "generated_data", ""]; index from the end so the SNP set and the
        # model sub-directory always come from the same components,
        # regardless of how deep the dataset lives on disk.
        spec_parts = data_path.split("/")[-5:-2]
        snp = spec_parts[0]
        if snp not in m_by_snp_set:
            raise ValueError(
                "Unknown SNP set {!r} in dataset path {!r}; expected one of {}".format(
                    snp, data_path, sorted(m_by_snp_set)
                )
            )
        M = m_by_snp_set[snp]
        model_path = model_root + "/".join(spec_parts) + "/chm{}/".format(chm)

        for gen_name, gens in gen_expts.items():

            # exist_ok avoids the check-then-create race of exists()+makedirs()
            os.makedirs(model_path, exist_ok=True)

            print("-"*100)
            print("Experiment details")
            print("Dataset used: {}".format(data_path))
            print("Generations used: {}".format(gens))
            print("Models will be stored at: {}".format(model_path))

            bm_train(bases, smoothers, model_path=model_path, data_path=data_path, gens=gens, chm=chm, M=M)
            

In [7]:
# Dataset roots used for training. Only the un-commented entries are active;
# uncomment a line to add that dataset to the sweep.
datasets_benchmark = [
    # --- full genome ---
    # "/home/arvindsk/xgmix_expts/benchmark_data/full/bal_admix/latino/",
    # "/home/arvindsk/xgmix_expts/benchmark_data/full/bal_admix/five/",
    # "/home/arvindsk/xgmix_expts/benchmark_data/full/bal_admix/seven/",

    # --- array data ---
    # "/home/arvindsk/xgmix_expts/benchmark_data/ukb/bal_admix/latino/",
    # "/home/arvindsk/xgmix_expts/benchmark_data/ukb/bal_admix/five/",
    "/home/arvindsk/xgmix_expts/benchmark_data/ukb/bal_admix/seven/",
]


In [8]:
# Directory prefix under which trained models are written (must end with "/",
# because it is concatenated directly with the per-dataset sub-path).
model_root = "/home/wknd37/gnomix/benchmark/models/"

In [None]:
# Chromosome to train on.
chm = 20

# Model families to benchmark: every base is combined with every smoother.
bases = [
    "logreg",
    "lgb",
    "randomstringkernel",
    "stringkernel",
]
smoothers = [
    "crf",
    "xgb",
    "cnn",
]

# Experiment name -> generations used for training.
gen_expts = {"all": [0, 2, 4, 8, 16, 24, 32, 48, 64, 72, 100]}

train(
    bases=bases,
    smoothers=smoothers,
    chm=chm,
    gen_expts=gen_expts,
    datasets=datasets_benchmark,
    model_root=model_root,
)

----------------------------------------------------------------------------------------------------
Experiment details
Dataset used: /home/arvindsk/xgmix_expts/benchmark_data/ukb/bal_admix/seven/generated_data/
Generations used: [0, 2, 4, 8, 16, 24, 32, 48, 64, 72, 100]
Models will be stored at: /home/wknd37/gnomix/benchmark/models/ukb/bal_admix/seven/chm20/
Reading data...
BASE: logreg
Training base
Windows done: 532/532
Re-training base
Windows done: 532/532
BASE: lgb
Training base
Windows done: 63/532

In [5]:
from Benchmarking import get_data

def _load_pickle(path):
    """Unpickle the object stored at ``path``, closing the file afterwards."""
    # NOTE(review): pickle can execute arbitrary code while loading; only
    # point this at model files produced by a trusted training run.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def evaluate(bases, smoothers, chm, val_gens, data_path_fmt, model_path_fmt,
             set_names=("ukb",), pop_names=("latino",),
             train_data_root="/home/arvindsk/xgmix_expts/benchmark_data/{0}/{1}/{2}/generated_data/"):
    """Evaluate stored base/smoother pairs on separately generated validation data.

    For every (set_name, pop_name) combination the validation arrays are
    loaded and windowed once; then each base/smoother pair is unpickled into
    a ``Gnomix`` model, scored with ``bm_eval``, and the resulting metrics
    are saved with ``save_dict`` to ``<model dir>/<base>_<smoother>.benchmark``.

    Args:
        bases: base model names; ``<base>_retrained.pkl`` is loaded for each.
        smoothers: smoother names; ``<base>_<smoother>.pkl`` is loaded for each.
        chm: chromosome identifier used when formatting all paths.
        val_gens: generations whose validation files are loaded.
        data_path_fmt: format string taking (chm, set_name, pop_name) that
            yields the validation data directory.
        model_path_fmt: format string taking (chm, set_name, pop_name) that
            yields the model directory (should end with ``/``).
        set_names: SNP sets to evaluate; ``"ukb"`` uses M=30, anything else
            M=1000. Defaults to the previously hard-coded ``("ukb",)``.
        pop_names: population sets to evaluate. Defaults to the previously
            hard-coded ``("latino",)``.
        train_data_root: format string taking (set_name, admixture, pop_name)
            for the training data, which is read only to obtain model meta
            data. Defaults to the previously hard-coded location.

    Returns:
        None. Metrics are written to disk, one file per base/smoother pair.
    """
    for set_name in set_names:
        M = 30 if set_name == "ukb" else 1000

        for pop_name in pop_names:

            data_path = data_path_fmt.format(chm,set_name,pop_name)
            model_root = model_path_fmt.format(chm,set_name,pop_name)

            # Only the meta data (C, M, A) of the training set is needed to
            # instantiate the model, so a minimal generation list is loaded.
            print("Getting traintime data for model meta data")
            train_data_path = train_data_root.format(set_name,"bal_admix",pop_name)
            _, meta, _ = get_data(train_data_path, M=M, gens=[0,2], chm=chm, verbose=False)

            print("Getting validation set that was separately generated")
            print("Val generations: ",val_gens)
            val_paths = [data_path + "/chm{}/simulation_output/val/gen_".format(chm) + str(gen) + "/" for gen in val_gens]
            X_val_files = [p + "mat_vcf_2d.npy" for p in val_paths]
            labels_val_files = [p + "mat_map.npy" for p in val_paths]
            X_val_raw, labels_val_raw = [load_np_data(f) for f in (X_val_files, labels_val_files)]
            X_val, labels_window_val = data_process(X_val_raw, labels_val_raw, M, 0)
            X_v = np.array(X_val).astype("int8")
            y_v = np.array(labels_window_val).astype("int16")
            # Copy of the raw (un-windowed) validation labels handed to
            # bm_eval as y_snp -- presumably per-SNP labels; confirm.
            y_snp = labels_val_raw.copy()

            for b in bases:
                for s in smoothers:

                    base_path = model_root + "base_models/" + b + "_retrained.pkl"
                    smooth_path = model_root + "smoothers/" + b + "_" + s + ".pkl"
                    metric_path = model_root + b + "_" + s + ".benchmark"

                    print("Base:", base_path)
                    print("Smooth:", smooth_path)

                    model = Gnomix(C=meta["C"], M=meta["M"], A=meta["A"])
                    model.base = _load_pickle(base_path)
                    model.smooth = _load_pickle(smooth_path)

                    metrics = bm_eval(model, val_data=(X_v, y_v), gens=val_gens, y_snp=y_snp,
                                      base_smooth_paths=[base_path, smooth_path])
                    save_dict(metrics,metric_path)

In [6]:
# Generations for which separately simulated validation data is evaluated.
val_gens = [2, 4, 8, 12, 16, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# Path templates, filled with .format(chm, set_name, pop_name):
#   {0} = chromosome, {1} = SNP set, {2} = population set.
data_path_fmt = "/home/arvindsk/xgmix_expts/benchmark_data/val/chm{0}/{1}/{2}/generated_data/"
model_path_fmt = "/home/wknd37/gnomix/benchmark/models/{1}/bal_admix/{2}/chm{0}/"

In [64]:
# Evaluate a single base/smoother pair; pass the full `bases` / `smoothers`
# lists instead to score every trained combination.
evaluate(
    bases=["lgb"],
    smoothers=["crf"],
    chm=chm,
    val_gens=val_gens,
    data_path_fmt=data_path_fmt,
    model_path_fmt=model_path_fmt,
)

Getting traintime data for model meta data
Getting validation set that was separately generated
Val generations:  [2, 4, 8, 12, 16, 20, 30, 40, 50, 60, 70, 80, 90, 100]
Base: /home/wknd37/gnomix/benchmark/models/ukb/bal_admix/latino/chm20/base_models/lgb_retrained.pkl
Smooth: /home/wknd37/gnomix/benchmark/models/ukb/bal_admix/latino/chm20/smoothers/lgb_crf.pkl
{'base_val_acc': 83.98, 'base_val_acc_bal': 80.27, 'base_val_log_loss': 0.5, 'smooth_val_acc': 96.07, 'smooth_val_acc_bal': 94.51, 'smooth_val_log_loss': 0.12, 'model_total_size_mb': 42.4, 'gen_performance': {'gens': [2, 4, 8, 12, 16, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'accs': [98.53, 98.97, 98.71, 98.32, 97.96, 97.39, 97.09, 96.61, 94.76, 95.04, 93.74, 93.54, 92.48, 91.79], 'accs_snp_lvl': [98.52, 98.96, 98.66, 98.23, 97.87, 97.31, 96.98, 96.38, 94.51, 94.6, 93.32, 93.09, 92.02, 91.31]}, 'val_acc_snp_lvl': 95.84}


In [8]:
# Collect the saved benchmark metrics of every base/smoother pair into one
# DataFrame (one row per model, identifying columns first, metrics after).
chm = 20
expt = ["model","base","smooth","pop_name","bal","snp"]
skip_metrics = ["gen_performance"]  # per-generation lists do not fit a flat table
tuples = []
metric_names = []
# full sweep: set_name in ["full","ukb"], pop_name in ["latino","five","seven"]
for set_name in ["ukb"]:
    for pop_name in ["latino"]:
        # Kept under its own name so the training `model_root` prefix defined
        # earlier in the notebook is not overwritten by this cell.
        metrics_root = model_path_fmt.format(chm,set_name,pop_name)
        for base in ["lgb", "logreg", "randomstringkernel", "stringkernel"]:
            # "cnn" (not "cnv") matches the smoother names used for training
            for smooth in ["xgb", "crf", "cnn"]:
                for bal in ["bal_admix"]:
                    # Use the loop variables; `b` / `s` only exist inside evaluate().
                    metric_path = metrics_root + base + "_" + smooth + ".benchmark"
                    metrics = load_dict(metric_path)
                    metric_names = [mkey for mkey in metrics if mkey not in skip_metrics]
                    metric_values = tuple(metrics[m] for m in metric_names)
                    tuples.append((base+"_"+smooth,base,smooth,pop_name,bal,set_name)+metric_values)

print(metric_names)
DF_us = pd.DataFrame(tuples, columns=expt+metric_names)
DF_us


NameError: name 'metrics_benchmark' is not defined