In [1]:
import sys

# Make the parent directory importable so the project package can be imported
# from this notebook. Guarded so that re-running the cell does not keep
# appending duplicate '..' entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [11]:
from logics_pack import global_settings, chemistry, predictor
import pandas as pd
import numpy as np
import json
import pickle

# Path lookup table for the project, built relative to the parent directory.
project_paths = global_settings.build_project_paths(
    project_dir='../',
)
# Experiment-settings object backed by the JSON file registered under
# 'EXPERIMENT_SETTINGS_JSON' in the path table.
expset_obj = global_settings.ExperimentSettings(
    project_paths['EXPERIMENT_SETTINGS_JSON'],
)

Training KOR activity predictor (Random Forest Regressor)

In [3]:
# Configuration object passed to predictor.train_predictor() for the KOR run.
conf_k = global_settings.Object()

# Inputs: every location is looked up in the project path table.
conf_k.affinity_path = project_paths['KOR_DATA_PATH']
conf_k.fold_path = project_paths['KOR_FOLD_JSON']
conf_k.fingerprint_path = project_paths['KOR_DATA_FP']
# train_predictor() requires the test fold id as a string, hence the str() cast.
conf_k.test_fold_id = str(global_settings.TEST_FOLD_IDX)

# Outputs: one pickle per CV fold (the %d placeholder is filled with the fold
# number when the models are saved) plus a CSV table of the CV metrics.
kor_predictor_dir = project_paths['PROJECT_DIR'] + "model-kor/predictor/"
conf_k.predictor_model_fmt = kor_predictor_dir + "kor_rfr_cv%d.pkl"
conf_k.result_table_path = kor_predictor_dir + "kor_rfr_cv_results.csv"

In [4]:
# RFR regressor training
# Runs the project's training routine with the KOR configuration and unpacks
# its four results. Later cells use them as follows: cv_fold_keys is converted
# to integer fold ids, vmse / vr2 become the metric columns of the result
# table, and rfr_cvs is indexed per fold to pickle each model.
# NOTE(review): presumably vmse / vr2 are per-fold validation MSE and R2 in the
# same order as cv_fold_keys - confirm against predictor.train_predictor().
rfr_cvs, vmse, vr2, cv_fold_keys = predictor.train_predictor(conf_k)

['0', '1', '2', '3', '4']


In [5]:
# Collect the per-fold metrics into one table and write it to disk.
# The fold keys are converted to integers so they can later fill the %d
# placeholder of the model filename format.
cv_folds = list(map(int, cv_fold_keys))
pred_result = (
    pd.DataFrame(cv_folds, columns=['cv_fold'])
    .assign(vmse=vmse, vr2=vr2)
)
# index=False keeps the row index out of the CSV.
pred_result.to_csv(conf_k.result_table_path, index=False)

In [7]:
# find the best performing cv fold by validation R2
# idxmax() returns an index *label*, so the matching row is fetched with the
# label-based .loc. The earlier positional .iloc lookup only gave the right row
# because pred_result carries the default 0..n-1 index.
best_cv_idx = pred_result['vr2'].idxmax()
# Cast the numpy integer to a built-in int so the value stored in the settings
# file is a plain, JSON-serializable number.
best_cv = int(pred_result.loc[best_cv_idx, 'cv_fold'])
# add best cv info to the experiment setting json file, and overwrite it
expset_obj.update_setting("kor-pred-best-cv", best_cv)
print("best CV fold of KOR predictor: ", best_cv)

best CV fold of KOR predictor:  3


In [13]:
# Pickle each trained model into its own file. The fold number fills the %d
# placeholder of the filename format, and the model is taken from rfr_cvs at
# the same position as the fold id in cv_folds.
for position, fold_id in enumerate(cv_folds):
    model_path = conf_k.predictor_model_fmt % fold_id
    with open(model_path, 'wb') as model_file:
        pickle.dump(rfr_cvs[position], model_file)

Training PIK3CA activity predictor (Random Forest Regressor)

In [18]:
# Configuration object passed to predictor.train_predictor() for the PIK3CA run.
conf_p = global_settings.Object()

# Inputs: every location is looked up in the project path table.
conf_p.affinity_path = project_paths['PIK3CA_DATA_PATH']
conf_p.fold_path = project_paths['PIK3CA_FOLD_JSON']
conf_p.fingerprint_path = project_paths['PIK3CA_DATA_FP']
# train_predictor() requires the test fold id as a string, hence the str() cast.
conf_p.test_fold_id = str(global_settings.TEST_FOLD_IDX)

# Outputs: one pickle per CV fold (the %d placeholder is filled with the fold
# number when the models are saved) plus a CSV table of the CV metrics.
pik3ca_predictor_dir = project_paths['PROJECT_DIR'] + "model-pik3ca/predictor/"
conf_p.predictor_model_fmt = pik3ca_predictor_dir + "pik3ca_rfr_cv%d.pkl"
conf_p.result_table_path = pik3ca_predictor_dir + "pik3ca_rfr_cv_results.csv"

In [19]:
# RFR regressor training
# Runs the project's training routine with the PIK3CA configuration and unpacks
# its four results. This rebinds rfr_cvs, vmse, vr2 and cv_fold_keys, so any
# values held under those names from an earlier run are no longer reachable
# after this cell.
# NOTE(review): presumably vmse / vr2 are per-fold validation MSE and R2 in the
# same order as cv_fold_keys - confirm against predictor.train_predictor().
rfr_cvs, vmse, vr2, cv_fold_keys = predictor.train_predictor(conf_p)

['0', '1', '2', '3', '4']


In [20]:
# Collect the per-fold metrics into one table and write it to disk.
# The fold keys are converted to integers so they can later fill the %d
# placeholder of the model filename format.
cv_folds = list(map(int, cv_fold_keys))
pred_result = (
    pd.DataFrame(cv_folds, columns=['cv_fold'])
    .assign(vmse=vmse, vr2=vr2)
)
# index=False keeps the row index out of the CSV.
pred_result.to_csv(conf_p.result_table_path, index=False)

In [21]:
# find the best performing cv fold by validation R2
# idxmax() returns an index *label*, so the matching row is fetched with the
# label-based .loc. The earlier positional .iloc lookup only gave the right row
# because pred_result carries the default 0..n-1 index.
best_cv_idx = pred_result['vr2'].idxmax()
# Cast the numpy integer to a built-in int so the value stored in the settings
# file is a plain, JSON-serializable number.
best_cv = int(pred_result.loc[best_cv_idx, 'cv_fold'])
# add best cv info to the experiment setting json file, and overwrite it
expset_obj.update_setting("pik3ca-pred-best-cv", best_cv)
print("best CV fold of PIK3CA predictor: ", best_cv)

best CV fold of PIK3CA predictor:  2


In [22]:
# Pickle each trained model into its own file. The fold number fills the %d
# placeholder of the filename format, and the model is taken from rfr_cvs at
# the same position as the fold id in cv_folds.
for position, fold_id in enumerate(cv_folds):
    model_path = conf_p.predictor_model_fmt % fold_id
    with open(model_path, 'wb') as model_file:
        pickle.dump(rfr_cvs[position], model_file)