## Init

In [1]:
import os

path = os.getcwd()
# Locate the first occurrence of 'project' in the current working directory.
index_project = path.find('project')
if index_project == -1:
    # str.find signals a miss with -1; slicing with that value would silently
    # produce a meaningless 6-character prefix, so fail with a clear message.
    raise FileNotFoundError(
        f"Cannot locate the project root: 'project' does not occur in {path!r}"
    )
# Keep everything up to and including the matched 'project' substring.
project_path = path[:index_project + len('project')]
# Switch to the project's src directory (presumably so that the relative
# '.env' path and the `models.*` imports used by later cells resolve).
os.chdir(project_path+'/src')
print(f'Project path set to: {os.getcwd()}')

Project path set to: c:\Users\dawson\Documents\GitHub\new-peak-project\src


In [2]:
from dotenv import dotenv_values

# Read the key/value settings from the '.env' file in the working directory.
config = dotenv_values(".env")
if not config.get("DATA_PATH"):
    # A missing/empty entry would otherwise surface as a bare KeyError here or
    # as an experiment folder rooted at '/' further down the notebook.
    raise KeyError("DATA_PATH is not set; add it to the .env file in the working directory")
print(config["DATA_PATH"])

C:\Users\dawson\Documents\Google Drive\My Drive\DAWSON PHD PROJECT\Biomarker Data Repository\data\new-peak-project\experiments


In [3]:
from models.ModelBuilder import ModelBuilder
from models.Reaction import Reaction
from models.ReactionArchtype import ReactionArchtype
from models.ArchtypeCollections import *

# pandas is used as `pd` further down; import it explicitly instead of
# relying on it leaking in through a star import
import pandas as pd
# import scikit-learn
from sklearn.linear_model import LinearRegression
# tree models and support vector machines
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# import pearson correlation
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from copy import deepcopy

## Notebook Parameters

The aim is to discover feature datasets that are predictive of the drug target (Pearson correlation > 0.5 between predicted and actual values).

In [4]:
import os 
import numpy as np

### parameters 

notebook_name = 'exp11_model_search' # name of the notebook
# Candidate seeds for the model search: the integers 1..100 inclusive.
try_seeds = list(range(1, 101))


## Generation of ground truth model 

model_name = 'v4_drug_model' # name of the model
# p_overall_seed = 46 # different seed for parameter generation
no_observable_species = 5
no_feedback_regulations = 2
specie_value_range = (1000, 5000)
param_range = (0.05, 20)
param_multiplier_range = (0.5, 1.5)

## Simulation parameters 

simulation_time = 1000 
simulation_step = 100

## Feature data generation 

feature_generation_method = 'lhs'
feature_generation_extra_params = {'min': 0.1, 'max': 10}
feature_generation_size = 1000 
feature_generation_seed = 50 # if -1 then 'o_random_seed' is used
# NOTE(review): no 'o_random_seed' is defined in the cells shown here;
# confirm that the -1 fallback described above still exists.


# Options (no variable in this cell currently selects between them):
# - 'feedback_prune': removes feedback regulations from the model
# - 'random parameter': randomizes a x% of parameter values of the model

# Options (no variable in this cell currently selects between them):
# - 'last_time_point' : only the last time point of the phosphorylated species is used
# - 'dynamic_feature': computes the characteristic 'ten' dynamic feature for each specie data

## General parameters
parallelise = True
save_figures = True 
exp_id = '1'
experiment_id = notebook_name + '_' + str(exp_id)
# The folder string keeps its '/' separators and trailing '/' unchanged.
experiment_folder = config['DATA_PATH'] + '/' + experiment_id + '/'
# exist_ok=True creates the folder when needed without a separate
# check-then-create step.
os.makedirs(experiment_folder, exist_ok=True)
    
print(experiment_folder)

C:\Users\dawson\Documents\Google Drive\My Drive\DAWSON PHD PROJECT\Biomarker Data Repository\data\new-peak-project\experiments/exp11_model_search_1/


## Compute

In [5]:
# Echo the run configuration, one "label value" line per entry.
for _label, _value in (
    ('Experiment ID: ', experiment_id),
    ('Experiment folder: ', experiment_folder),
    ('Tried seeds: ', try_seeds),
):
    print(_label, _value)

Experiment ID:  exp11_model_search_1
Experiment folder:  C:\Users\dawson\Documents\Google Drive\My Drive\DAWSON PHD PROJECT\Biomarker Data Repository\data\new-peak-project\experiments/exp11_model_search_1/
Tried seeds:  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]


In [6]:
### Virtual Cell Creation
# create a drug enabled model 
from models.Utils import *
from models.DrugModelSpecification import DrugModelSpecification, Drug
from models.Solver.RoadrunnerSolver import RoadrunnerSolver
from models.Solver.ScipySolver import ScipySolver
from models.SyntheticGen import generate_feature_data_v2, generate_target_data_diff_build
from models.SyntheticGen import generate_model_timecourse_data_diff_build
from models.Utils import last_time_point_method, dynamic_features_method
from joblib import Parallel, delayed
import pickle
from tqdm import tqdm

def _build_drug_model_spec(seed):
    """Build the drug-enabled model specification for one seed.

    Generates a specification from the notebook-level species / feedback
    settings, then attaches a single drug 'D0' that regulates every species in
    ``A_species`` with a direction ('up' or 'down') drawn from a generator
    seeded with ``seed``.
    """
    model_drug_spec = DrugModelSpecification()
    model_drug_spec.generate_specifications(seed, no_observable_species, no_feedback_regulations, verbose=0)
    drug_0 = Drug('D0', 500, 500)
    rng = np.random.default_rng(seed)
    # add random 'up' and 'down' regulations to the drug
    for specie in model_drug_spec.A_species:
        drug_0.add_regulation(specie, str(rng.choice(['up', 'down'])))
    model_drug_spec.add_drug(drug_0)
    return model_drug_spec


def _summarise_target_correlations(seed, feature_data, target_data):
    """Summarise how the feature columns correlate with the 'Cp' target.

    Returns a dict with the seed, the mean / std / max / min of the
    feature-to-target correlations, and the fraction of features whose
    correlation lies more than three standard deviations from the mean.
    """
    # create a dataframe with the feature data and attach the target column
    feature_data_df = pd.DataFrame(feature_data)
    target_data_df = pd.DataFrame(target_data)
    feature_data_df['target'] = target_data_df['Cp']

    # correlation of every column with the target, sorted descending
    # (the sort is kept so the summary statistics are computed in the same
    # element order as before)
    correlation = feature_data_df.corr()['target'].sort_values(ascending=False)
    correlation_df = pd.DataFrame(correlation).reset_index()
    correlation_df.columns = ['feature', 'correlation']
    # do not include the target's correlation with itself
    correlation_df = correlation_df[correlation_df['feature'] != 'target']

    values = correlation_df['correlation']
    mean = np.mean(values)
    std = np.std(values)
    max_val = np.max(values)
    min_val = np.min(values)
    # NOTE(review): among n values none can lie more than sqrt(n-1) population
    # standard deviations (np.std defaults to ddof=0) from their mean, so with
    # 10 or fewer features this ratio is always 0 - confirm that the 3-sigma
    # rule is the intended outlier criterion.
    outliers = correlation_df[(values > mean + 3*std) | (values < mean - 3*std)]
    outliers_ratio = outliers.shape[0] / correlation_df.shape[0]

    return {
        'seed': seed,
        'mean': mean,
        'std': std,
        'max': max_val,
        'min': min_val,
        'outliers_ratio': outliers_ratio,
    }


def parallel_func(j, seed):
    """Run the model search for one seed and summarise feature/target correlations.

    Parameters
    ----------
    j : int
        Position of the seed in the sweep; accepted but not used.
    seed : int
        Seed for the model specification, the drug regulation directions and
        the per-sample parameter-set seeds.

    Returns
    -------
    dict
        Keys 'seed', 'mean', 'std', 'max', 'min' and 'outliers_ratio'.

    Notes
    -----
    Reads the notebook-level settings (species counts, value/parameter ranges,
    feature generation and simulation parameters) from the global namespace.
    """
    model_drug_spec = _build_drug_model_spec(seed)

    # A fresh generator seeded with `seed` is created on purpose: reusing the
    # generator that drew the regulation directions would change the seeds
    # drawn here and therefore every downstream result.
    rng = np.random.default_rng(seed)
    # unique random seeds for the parameter sets; sized by
    # `feature_generation_size` (previously a separate hard-coded 1000) so it
    # stays in step with the number of generated feature samples
    p_random_seeds = rng.choice(range(1000000), feature_generation_size, replace=False).tolist()

    G0_d = model_drug_spec.generate_network('drug_model_524', 
                                            specie_value_range, 
                                            param_range, 
                                            param_multiplier_range,  
                                            verbose=0,
                                            random_seed=p_random_seeds[0])
    base_initial_conditions = G0_d.get_state_variables()

    # generate one parameter set per random seed
    parameter_sets = [
        model_drug_spec.generate_network(f'param_seed_{p}', 
                                         specie_value_range, param_range, param_multiplier_range,
                                         random_seed=p, verbose=0).get_parameters()
        for p in p_random_seeds
    ]

    solver = ScipySolver()
    solver.compile(G0_d.get_antimony_model())

    feature_data = generate_feature_data_v2(model_drug_spec, base_initial_conditions,
                                            feature_generation_method, feature_generation_extra_params,
                                            feature_generation_size, feature_generation_seed)
    # NOTE(review): `simulation_step` is passed as the number of output points,
    # matching the previously hard-coded {'end': 1000, 'points': 100} - confirm
    # that this is the meaning intended for the parameter.
    target_data, _ = generate_target_data_diff_build(model_drug_spec, solver, 
                                                    feature_data, parameter_sets, 
                                                    {'start': 0, 'end': simulation_time, 'points': simulation_step}, 
                                                    n_cores=-1, verbose=False)

    return _summarise_target_correlations(seed, feature_data, target_data)
    
output_data = []
# Sequential sweep over the candidate seeds. Each summary is appended as soon
# as it is ready, so the results gathered so far remain in `output_data` if
# the loop is interrupted.
for i, seed in enumerate(tqdm(try_seeds, total=len(try_seeds))):
    output_data.append(parallel_func(i, seed))
    


100%|██████████| 100/100 [1:31:00<00:00, 54.61s/it]


In [7]:
# Collect the per-seed summary dicts into one table: one row per seed, with
# the dict keys ('seed', 'mean', 'std', 'max', 'min', 'outliers_ratio') as columns.
output_data_df = pd.DataFrame(output_data)

In [8]:
# Show the per-seed correlation summary as the cell output.
output_data_df

Unnamed: 0,seed,mean,std,max,min,outliers_ratio
0,1,0.013839,0.049041,0.058789,-0.120961,0.0
1,2,-0.028277,0.070601,0.058589,-0.193553,0.0
2,3,0.032184,0.035740,0.084621,-0.043574,0.0
3,4,-0.064193,0.143799,0.063710,-0.289205,0.0
4,5,-0.004387,0.070109,0.115289,-0.143429,0.0
...,...,...,...,...,...,...
95,96,-0.006401,0.038275,0.053404,-0.067344,0.0
96,97,-0.007219,0.048212,0.118273,-0.060487,0.0
97,98,0.006225,0.051514,0.063860,-0.105977,0.0
98,99,0.023616,0.035269,0.072417,-0.035673,0.0


## Analysis