In [1]:
import sys
functions_at = '/proj/src'
sys.path.append(functions_at)
from Py.libs import *
from Py.func import *

## Run identification
model_name = 'acr_CNN'
# Project module that defines this model; its `tuner` attribute is used for the hyperparameter search.
model = importlib.import_module(f'Py.model_{model_name}')
cv_schema = 'acr_cv.json'  # cross-validation schema file, read from `inputs_at` below
key = 'r_1_f_1'  # entry of the schema to run; later parsed as r_<run>_f_<fold>
# JSON file holding the model inputs (if any) for this run
model_inputs = read_json("/proj/results/Py/create_slurm_scripts/acr_CNN_acr_cv/r_1_f_1/model_args.json")
tune = model_inputs[model_name]["tune"]  # plain key lookup; no f-string wrapper needed
#hparams_at = '/proj/ext_dir/results/M2/hp_tuning/best_params.pkl' #todo: add this if you want to use one hyperparameter set for all runs

## Constant variables
base_dir = '/proj/tmp_data/acr_CNN_acr_cv/r_1_f_1' # where to save predictions
inputs_at = '/proj/results/Py/preprocessing'
tmp_at = '/proj/tmp_data'

## Run specific
# callback outputs
tb_cb = f'{base_dir}/callback_data/tb_cb'
mc_cb = f'{base_dir}/callback_data/mc_cb/model.ckpt'
tb_cb_tuning = f'{base_dir}/callback_data/tb_cb_tuning'
# hyperparameter tuning: tuner directory and project name
tuning_save_at = base_dir
tune_dir = f'{model_name}/hp_tuning'
# model artefacts
tuned_model_at = f'{base_dir}/model/model_tuned'
model_save_at = f'{base_dir}/model/model_fitted'
param_save_at = f'{base_dir}/model/best_params.json'
# predictions and log file
pred_save_at = f'{base_dir}/pred'
logs_at = f'{base_dir}/predictions.log'
path_to_pred_file = f'{pred_save_at}/output.csv'
# cross-validation schema contents, indexed by `key` in the cells below
cv_data = read_json(f'{inputs_at}/{cv_schema}')

In [2]:
# Select which pre-processed data set to load, based on the schema file name.
if "cv" in cv_schema:
    id_str = "cv"
elif "st" in cv_schema or "sce" in cv_schema:
    id_str = "st_sce"
else:
    # Without this branch `id_str` stayed undefined and the loads below
    # failed with an unrelated-looking NameError.
    raise ValueError(
        f"Cannot derive the data set id from cv_schema={cv_schema!r}: "
        "expected 'cv', 'st' or 'sce' in its name."
    )

## g_a data
g_a_data = np.load(f'{inputs_at}/acr_g_a_{id_str}.npy')
# NOTE(review): read_pkl presumably unpickles the file - only load trusted artefacts.
scaler_g_a = read_pkl(f'{inputs_at}/g_a.scl')

## p_data
p_data = pd.read_csv(f'{inputs_at}/acr_p_{id_str}.csv')
scaler_p = read_pkl(f'{inputs_at}/acr_p.scl')

## harmonise column names BEFORE adding placeholders: adding an "env" column
## first and then renaming "Env" -> "env" would leave two columns named "env".
p_data = p_data.rename(columns={"Geno_new": "geno", "Env": "env"})

## add run metadata columns
# `key` has the form r_<run>_f_<fold>; keep the run part and the fold part separately.
p_data["run_idx"] = re.sub(r"(r\_\d+)\_(f\_\d+)", r"\1", key)
p_data["fold_idx"] = re.sub(r"(r\_\d+)\_(f\_\d+)", r"\2", key)
p_data["cv_idx"] = None
if "env" not in p_data.columns:
    # placeholder only when the input carries no environment column
    p_data["env"] = None
p_data["model_idx"] = model_name
# schema file name without its ".json" extension
p_data["run_type"] = re.sub(r"(\S+)\.json", r"\1", cv_schema)

## keep only the output columns, in a fixed order
if "acr" in model_name:
    out_cols = ["series", "geno", "type", "idx_col", "run_type", "run_idx", "fold_idx", "model_idx", "cv_idx", "BLUEs_raw", "BLUEs_scaled"]
else:
    out_cols = ["env", "geno", "type", "idx_col", "run_type", "run_idx", "fold_idx", "model_idx", "cv_idx", "BLUEs_raw", "BLUEs_scaled"]

p_data = p_data.loc[:, out_cols]

In [3]:
# create train, val and test sets -----------------------------------------------------------------------------------
split_idx = cv_data[key]  # index lists of the selected run/fold
if "val" in split_idx:
    # original author's note: create_train_val_data subtracts one from the index values
    train_set, val_set, test_set = create_train_val_data(index_train=split_idx["train"],
                                                         index_test=split_idx["test"],
                                                         index_val=split_idx["val"])
else:
    # original author's note: makes a val set out of the training set
    train_set, val_set, test_set = create_train_val_data(index_train=split_idx["train"],
                                                         index_test=split_idx["test"])

## get data as float32 arrays; position 0 is the target, the rest are model inputs
if "acr" in model_name:
    target_data = [p_data.loc[:, "BLUEs_scaled"].values.astype('float32'),
                   g_a_data.astype('float32')]
elif "wtn" in model_name:
    # NOTE(review): ec_data, g_d_data and g_s_data are not loaded by the cells
    # visible in this notebook - they must be defined before this branch can run.
    target_data = [p_data.loc[:, "BLUEs_scaled"].values.astype('float32'),
                   ec_data.astype('float32'),
                   g_a_data.astype('float32'),
                   g_d_data.astype('float32'),
                   g_s_data.astype('float32')]
else:
    # Previously `target_data` stayed undefined here and the slicing below
    # failed with a NameError.
    raise ValueError(f"Unsupported model_name {model_name!r}: expected it to contain 'acr' or 'wtn'.")

# slice every array with the same row indices so targets and inputs stay aligned
train_data = [x[train_set] for x in target_data]
val_data = [x[val_set] for x in target_data]
test_data = [x[test_set] for x in target_data]

In [6]:
# Position 0 of each split holds the target values.
train_y = train_data[0]
val_y = val_data[0]
test_y = test_data[0]
if "acr" in model_name:
    # acr models take a single input array (position 1 of each split)
    train_x = train_data[1]
    val_x = val_data[1]
    test_x = test_data[1]
    model_tuner = model.tuner
else:
    # Previously the inputs and `model_tuner` stayed undefined for other models
    # and the search below failed with a NameError.
    raise NotImplementedError(
        f"Hyperparameter tuning is only set up for 'acr' models, got model_name={model_name!r}."
    )

start_time_tuning = time.time()
# NOTE(review): early stopping watches val_loss while the tuner ranks trials by
# val_mean_squared_error; these agree only if the model's loss is MSE - confirm in model.tuner.
stop_early = EarlyStopping(monitor='val_loss', patience=5, min_delta = 0.001)
tb_cv_tuner = TensorBoard(tb_cb_tuning)
# overwrite=True: previous results under directory/project_name are not reused
tuner = kt.Hyperband(hypermodel=model_tuner,
                     objective=kt.Objective("val_mean_squared_error", direction="min"),
                     max_epochs=10,
                     factor=4,
                     hyperband_iterations=1,
                     overwrite = True,
                     directory=tuning_save_at,
                     project_name=tune_dir,
                     seed=30)
tuner.search(train_x, train_y,
             epochs=10,
             validation_data=(val_x, val_y),
             callbacks=[stop_early, tb_cv_tuner],
             verbose=1)

def _retrieve_top(fetch, failure_msg, counts=(3, 2, 1)):
    """Return the first non-empty result of ``fetch(n)`` for ``n`` in ``counts``.

    Parameters
    ----------
    fetch : callable
        Takes the number of entries to request and returns a list.
    failure_msg : str
        Message of the error raised when no attempt yields a result.
    counts : sequence of int
        Request sizes to try, in order.

    Raises
    ------
    RuntimeError
        If every attempt raised ``tf.errors.NotFoundError`` or returned an
        empty list. The last ``NotFoundError`` (if any) is chained as the cause.
    """
    last_error = None
    for n_requested in counts:
        print(n_requested)
        try:
            found = fetch(n_requested)
        except tf.errors.NotFoundError as err:
            print("An error occurred:", err)
            last_error = err
            continue  # retry with a smaller request
        if found:
            return found
    raise RuntimeError(failure_msg) from last_error


# retrieve the best hyperparameter sets (nothing is written to disk here)
top3_params = _retrieve_top(
    lambda n: tuner.get_best_hyperparameters(num_trials=n),
    "Error: Failed to retrieve best hyperparameters with num_trials=1. Script halted.",
)
params = top3_params[0].values  # best hyperparameter values # can ignore warnings # https://stackoverflow.com/questions/58289342/tf2-0-translation-model-error-when-restoring-the-saved-model-unresolved-object

# retrieve the best models found by the tuner (nothing is written to disk here)
top3_models = _retrieve_top(
    lambda n: tuner.get_best_models(num_models=n),
    "Error: Failed to retrieve best models with num_models=1. Script halted.",
)
best_model = top3_models[0]

Trial 11 Complete [00h 01m 22s]
val_mean_squared_error: 0.0649949461221695

Best val_mean_squared_error So Far: 0.004117930773645639
Total elapsed time: 00h 15m 00s
INFO:tensorflow:Oracle triggered exit
3
3


OSError: No file or directory found at /proj/tmp_data/acr_CNN_acr_cv/r_1_f_1/model/model_tuned

In [16]:
# Build a second model by calling the hypermodel directly with the top-ranked
# hyperparameter set (presumably freshly initialised, unlike `best_model` - confirm).
top_hparams = top3_params[0]
best_model_2 = model_tuner(top_hparams)

In [30]:
# One 'l_u_d_<position>' label per entry of the unit-size list; only the
# number of entries matters, not the sizes themselves.
unit_sizes = [512, 256, 128]
layer_units = [f'l_u_d_{position}' for position, _ in enumerate(unit_sizes)]

In [31]:
# Display the generated labels as the cell output.
layer_units

['l_u_d_0', 'l_u_d_1', 'l_u_d_2']

In [18]:
fit_params = {'fit' : {'batch_size' : 32, # default is 32
                       'epochs' : 100,
                       'verbose' : 1,
                       'shuffle' : True,
                       'tensorboard_fp' : tb_cb,
                       'checkpoint_fp' : mc_cb}}


def _fit_and_predict(candidate):
    """Fit ``candidate`` on the train/val split and predict the test split.

    Returns the pair ``(fitted_model, test_predictions)`` as produced by the
    project helpers ``fit_model`` and ``predict_values``.

    NOTE(review): every call passes the same ``tensorboard_fp`` and
    ``checkpoint_fp``, so successive fits share those paths - confirm that
    ``fit_model`` tolerates this.
    """
    fitted = fit_model(final_model = candidate, params = fit_params,
                       train_x = train_x,
                       train_y = train_y,
                       val_x = val_x,
                       val_y = val_y)
    predictions = predict_values(model = fitted,
                                 test_x = test_x,
                                 test_y = test_y,
                                 index = test_set,
                                 scaler = scaler_p)
    return fitted, predictions


# 1: model returned by the tuner; 2: model built from the best hyperparameters.
# Same order as before: fit and predict model 1, then fit and predict model 2.
my_model_fit_1, pred_vals_test_1 = _fit_and_predict(best_model)
my_model_fit_2, pred_vals_test_2 = _fit_and_predict(best_model_2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100


In [26]:
# Put both prediction tables side by side, matched on their shared 'index'
# column; overlapping column names get the suffixes _p1 / _p2.
combined = pd.merge(
    pred_vals_test_1,
    pred_vals_test_2,
    on='index',
    suffixes=('_p1', '_p2'),
)
combined

Unnamed: 0,index,obs_p1,pred_p1,obs_p2,pred_p2
0,3271,101.862343,96.281662,101.862343,98.144173
1,11678,92.091049,94.363167,92.091049,95.571243
2,2105,98.460823,99.783875,98.460823,100.515800
3,3065,101.085480,103.212029,101.085480,103.231071
4,9893,93.366928,89.850800,93.366928,92.490433
...,...,...,...,...,...
2652,5844,100.537094,98.694656,100.537094,99.489220
2653,598,101.754730,98.325439,101.754730,99.629356
2654,6197,96.329323,99.763008,96.329323,98.856636
2655,11511,91.679192,91.555229,91.679192,90.529572


In [27]:
# Correlation between the test predictions of the two fitted models.
preds_first = combined['pred_p1']
preds_second = combined['pred_p2']
preds_first.corr(preds_second)

0.9398832164522853