## Model 'fitness' and its relationship to the model predictivity using machine learning

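The fitness of a model is defined by its 'input-output' sensitivity. This notebook examines how that fitness relates to how well machine learning models can predict the model's output.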

## Initialise Notebook

In [3]:
import os

# Move the working directory to "<project root>/src", where the project root is
# the part of the current working directory up to and including the first
# occurrence of the substring 'project'.
path = os.getcwd()
# find the string 'project' in the path, return index (-1 when absent)
index_project = path.find('project')
if index_project == -1:
    # Without this guard the slice below would silently produce a meaningless
    # path (path[:6]) and fail later with a confusing error.
    raise FileNotFoundError(f"Could not locate 'project' in the working directory: {path}")
# keep everything from the start of the path through the end of 'project'
project_path = path[:index_project + len('project')]
# set the working directory; os.path.join picks the right separator for the OS
os.chdir(os.path.join(project_path, 'src'))
print(f'Project path set to: {os.getcwd()}')

Project path set to: c:\Github\new-peak-project\src


In [4]:
from dotenv import dotenv_values
# Load the key/value settings from the ".env" file (relative path, so it is
# looked up from the current working directory).
config = dotenv_values(".env")
# Show the configured data path; later cells read search results from it and
# create the experiment folder under it.
print(config["DATA_PATH"])

I:\My Drive\DAWSON PHD PROJECT\Biomarker Data Repository\data\new-peak-project\experiments


In [5]:
from models.ModelBuilder import ModelBuilder
from models.Reaction import Reaction
from models.ReactionArchtype import ReactionArchtype
from models.ArchtypeCollections import *

# import scikit-learn
from sklearn.linear_model import LinearRegression
# tree models and support vector machines
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# import pearson correlation
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from copy import deepcopy

## Notebook Parameters

In [7]:
# Folder and file name of the pickled search results that drive this notebook
search_folder_id = 'exp4_drug_model_search_2'
search_result_file_pkl = 'sensitivity_analysis_results.pkl'

import pickle
import pandas as pd

# Resolve the pickle underneath the configured data path, then load it
search_result_path = os.path.join(config["DATA_PATH"], search_folder_id, search_result_file_pkl)
search_results = pd.read_pickle(search_result_path)
# Bare last expression so the notebook renders the table
search_results

Unnamed: 0,o_random_seed,parameter_random_seed,fitness,cv,mean,median
7903,80,83,-35.333503,1.120586,11.739083,5.013761
554,6,60,-11.501563,1.230697,8.707243,4.849242
548,6,54,-11.348729,1.314631,11.554897,3.394198
6335,64,99,-10.967009,1.418260,10.719108,2.444227
7411,75,86,-6.879933,1.370995,6.910913,4.250851
...,...,...,...,...,...,...
3538,36,74,1127.917296,2.784076,8.990375,0.585685
4418,45,63,1173.275919,1.955181,13.786861,1.631351
4383,45,28,1327.515388,1.937048,14.263155,2.072403
7833,80,13,1388.551956,1.341663,22.318707,7.417626


In [8]:
import os 

### parameters 

notebook_name = 'exp6_fitness_vs_predictivity' # name of the notebook
exp_id = '1' # name of the experiment

## Generation of ground truth model 

model_name = 'v3_medium_model_15_10'
no_observable_species = 15
no_feedback_regulations = 10
specie_value_range = (5, 5000)
param_range = (0.05, 20)
param_multiplier_range = (0.5, 1.5)

## Simulation parameters 

simulation_time = 1000 
simulation_step = 100

## Feature data generation 

feature_generation_method = 'uniform'
feature_generation_extra_params = {'min': 0.1, 'max': 10}
feature_generation_size = 1000 
feature_generation_seed = 50 # seed for reproducibility

## General parameters
parallelise = True
save_figures = True 
experiment_id = notebook_name + '_' + exp_id
# The trailing '/' matters: later cells build file paths by plain string
# concatenation onto experiment_folder.
experiment_folder = config['DATA_PATH'] + '/' + experiment_id + '/'
# exist_ok=True replaces the check-then-create pattern: it is race-free and
# keeps the cell safe to re-run when the folder already exists.
os.makedirs(experiment_folder, exist_ok=True)
    
print(experiment_folder)

I:\My Drive\DAWSON PHD PROJECT\Biomarker Data Repository\data\new-peak-project\experiments/exp6_fitness_vs_predictivity_1/


## Analysis

### Main function to perform the analysis


In [None]:
from models.Utils import *
from models.DrugModelSpecification import DrugModelSpecification, Drug
from models.Solver.RoadrunnerSolver import RoadrunnerSolver
from models.SyntheticGen import generate_feature_data, generate_target_data, generate_model_timecourse_data
from models.Utils import last_time_point_method, dynamic_features_method
# File-name stem for the per-model result pickles stored in experiment_folder
# (the model id and '.pkl' are appended when the path is built).
cached_result_name = 'cached_result'
# Accumulates one metric table per model, either freshly computed or loaded
# from its cached pickle.
output_data = []

def evaluate_model(model, model_name, feature_data, feature_data_name, target_data ,test_size=0.2, random_state=4):
    """Fit ``model`` on a random train split and score it on the held-out split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; fitted in place.
    model_name : label stored under the 'Model' key of the result.
    feature_data : predictors, split row-wise by ``train_test_split``.
    feature_data_name : label stored under the 'Feature Data' key of the result.
    target_data : values to predict, split together with ``feature_data``.
    test_size : forwarded to ``train_test_split`` (default 0.2).
    random_state : forwarded to ``train_test_split`` to fix the split (default 4).

    Returns
    -------
    dict
        Keys 'Model', 'Feature Data', 'Mean Squared Error', 'R2 Score',
        'Pearson Correlation' and 'Pearson P-Value', all computed on the
        held-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        feature_data, target_data, test_size=test_size, random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Compute the correlation once; index 0 is the coefficient, index 1 the p-value.
    pearson_result = pearsonr(y_test, y_pred)
    # return a dictionary of the model performance
    return {'Model': model_name, 
            'Feature Data': feature_data_name,
            'Mean Squared Error': mean_squared_error(y_test, y_pred),
            'R2 Score': r2_score(y_test, y_pred),
            'Pearson Correlation': pearson_result[0],
            'Pearson P-Value': pearson_result[1]
            }

import warnings 
# Imported once up front instead of on every loop iteration.
from joblib import Parallel, delayed

# Suppress FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

# Iterate through the search results. For each row: rebuild the ground-truth
# drug model from its seeds, simulate the target, derive features from a fixed
# second model, evaluate every (feature set, estimator, split seed) combination
# and cache the resulting metric table per model.
max_iter = 2    # maximum number of non-cached models to process in this run
curr_iter = 0   # number of non-cached models processed so far
for index, row in tqdm(search_results.iterrows(), total=max_iter):   
    # key parameters for the iteration 
    o_random_seed, p_random_seed = int(row['o_random_seed']), int(row['parameter_random_seed'])
    model_id = index 
    cache_path = experiment_folder + cached_result_name + '_' + str(model_id) + '.pkl'
    # check if the result is already cached (cached models do not count towards max_iter)
    if os.path.exists(cache_path):
        print(f'Result for {model_id} already cached, will use cached result instead')
        output_data.append(pd.read_pickle(cache_path))
        continue 

    # create a drug enabled model 
    try: 
        model_drug_spec = DrugModelSpecification()
        model_drug_spec.generate_specifications(o_random_seed, no_observable_species, no_feedback_regulations, verbose=0)
        drug_0 = Drug('D0', 500, 5000)
        np.random.seed(o_random_seed)
        # NOTE(review): a random 'up'/'down' direction is drawn for every A
        # species but never used; each regulation is added as 'up'. Confirm
        # which is intended and that it matches the model search that produced
        # search_results. Left unchanged, because switching it changes the
        # ground-truth model.
        regulation_dir = []
        for s in model_drug_spec.A_species:
            regulation_dir.append(np.random.choice(['up', 'down']))
            drug_0.add_regulation(s, 'up')
        model_drug_spec.add_drug(drug_0)
        G0_d = model_drug_spec.generate_network(f'drug_model_{index}', specie_value_range, param_range, param_multiplier_range, random_seed=p_random_seed, verbose=0)
        base_parameters = G0_d.get_parameters()
        base_initial_conditions = G0_d.get_state_variables()
        # generate feature and target data
        solver = RoadrunnerSolver()
        solver.compile(G0_d.get_sbml_model())
        # Sample size comes from the notebook parameter rather than a literal.
        feature_data = generate_feature_data(model_drug_spec, G0_d.get_state_variables(), feature_generation_method, feature_generation_extra_params, feature_generation_size, feature_generation_seed)
        target_data, _ = generate_target_data(model_drug_spec, solver, feature_data, {'start': 0, 'end': simulation_time, 'points': simulation_step}, n_cores=8, verbose=False)

        # The ground-truth time-course simulation that used to run here was
        # overwritten by the one below before being read, so it was removed.

        # randomise links and parameters by using a different random seed
        # NOTE(review): the seeds (6), sizes (15, 10), drug targets and model
        # name below are hard-coded, so this second model is the same for every
        # row; only feature_data changes between iterations. Confirm intent.
        new_spec = DrugModelSpecification()
        new_spec.generate_specifications(6, 15, 10, verbose=0)
        drug_0 = Drug('D0', 500, 5000)
        drug_0.add_regulation('A0', 'down')
        drug_0.add_regulation('A1', 'down')
        drug_0.add_regulation('A2', 'down')
        new_spec.add_drug(drug_0)
        new_model = new_spec.generate_network('drug_model_552', specie_value_range, param_range, param_multiplier_range, random_seed=6, verbose=0)
        new_base_parameters = new_model.get_parameters()
        new_base_initial_conditions = new_model.get_state_variables()
        new_solver = RoadrunnerSolver()
        new_solver.compile(new_model.get_sbml_model())
        # generate the timecourse data for the new model, using the same
        # simulation settings as the target data
        time_course_data = generate_model_timecourse_data(new_spec, 
                                                        new_solver, 
                                                        feature_data, 
                                                        {'start': 0, 'end': simulation_time, 'points': simulation_step}, 
                                                        capture_species='all', n_cores=8, verbose=False)
        all_species = new_spec.A_species + new_spec.B_species + new_spec.C_species
        all_phos_species = [s+'p' for s in all_species]
        # apply the data engineering method to the feature data
        last_time_data = last_time_point_method(time_course_data, all_phos_species)
        dynamic_data = dynamic_features_method(time_course_data, all_phos_species, n_cores=8, verbose=False)
        
        # create a linear regression model
        lm = LinearRegression()
        # create a random forest model
        rf = RandomForestRegressor(n_estimators=100, random_state=o_random_seed)
        # create a gradient boosting model
        gb = GradientBoostingRegressor(n_estimators=100, random_state=o_random_seed)
        # create a support vector machine model (inputs standardised first)
        svr = SVR(max_iter=10000)
        scaled_svr = Pipeline([('scaler', StandardScaler()), ('svr', svr)])
        # create a neural network model (simple, inputs standardised first)
        nn = MLPRegressor(hidden_layer_sizes=(20,), max_iter=10000, random_state=o_random_seed)
        scaled_nn = Pipeline([('scaler', StandardScaler()), ('nn', nn)])
        combined_lp_data = pd.concat([feature_data, last_time_data], axis=1)
        combined_dyn_data = pd.concat([feature_data, dynamic_data], axis=1)

        feature_data_list = [feature_data, last_time_data, dynamic_data, combined_lp_data, combined_dyn_data]
        feature_data_names = ['feature_data', 'last_time_data', 'dynamic_data', 'combined_lp_data', 'combined_dyn_data']
        
        all_models = [lm, rf, gb, scaled_svr, scaled_nn]
        all_models_desc = ['Linear Regression', 'Random Forest', 'Gradient Boosting', 'Support Vector Machine', 'Neural Network']
        zipped_model_data = list(zip(all_models, all_models_desc))
        all_features = feature_data_list
        all_features_desc = feature_data_names
        zipped_feature_data = list(zip(all_features, all_features_desc))

        # random states are rand ints between 0 and 10000, for n values 
        np.random.seed(o_random_seed)
        n_random = 10
        all_random_states = np.random.randint(0, 10000, n_random)

        # Evaluate every (feature set, estimator, split seed) combination in
        # parallel against the 'Cp' column of the target data. The loop names
        # differ from the outer feature_data / model_name to avoid shadowing.
        metric_data = Parallel(n_jobs=-1)(
            delayed(evaluate_model)(estimator, estimator_desc, features, features_desc, target_data['Cp'], random_state=rand)
            for (features, features_desc) in zipped_feature_data
            for (estimator, estimator_desc) in zipped_model_data
            for rand in all_random_states)
        
        # save metric data to a pandas dataframe
        metric_df = pd.DataFrame(metric_data)
        # save the metric data to a pickle file
        metric_df.to_pickle(cache_path)
        output_data.append(metric_df)
        
    except Exception as e:
        # Best effort: report the failure and move on to the next model.
        warnings.warn(f'Error in model {model_id}: {e}')
        
    curr_iter += 1    
    # '>=' stops after exactly max_iter processed models; the previous '>'
    # allowed one extra model through.
    if curr_iter >= max_iter:
        print('Max iterations reached, stopping...')
        break

In [None]:
# Persist every collected metric table together in one combined pickle
import pickle

combined_cache_path = experiment_folder + 'cached_result.pkl'
with open(combined_cache_path, 'wb') as cache_file:
    pickle.dump(output_data, cache_file)

In [None]:
# load the cached data
# NOTE: pickle.load executes arbitrary code from the file; only load caches
# written by this notebook.
with open(experiment_folder + 'cached_result.pkl', 'rb') as f:
    output_data = pickle.load(f)
# Only once the combined cache covers every search result are the per-model
# cache files redundant and safe to delete.
if len(output_data) == search_results.shape[0]: 
    # delete all cache data in experiment_folder
    for index in search_results.index:
        per_model_cache = experiment_folder + cached_result_name + '_' + str(index) + '.pkl'
        # Guarded so that re-running this cell after a previous cleanup does
        # not fail with FileNotFoundError.
        if os.path.exists(per_model_cache):
            os.remove(per_model_cache)