### Init

In [1]:
import os

# Locate the project root from the current working directory and switch into
# its 'src' folder; later cells read '.env' and import the local `models`
# package relative to that folder.
path = os.getcwd()
# index of the first occurrence of 'project' in the path (-1 if absent)
index_project = path.find('project')
if index_project == -1:
    # Without this guard the slice below would silently become path[:6]
    # and os.chdir would fail (or land somewhere unintended).
    raise RuntimeError(
        f"Could not find 'project' in the current working directory: {path}"
    )
# keep everything from the start of the path up to and including 'project'
project_path = path[:index_project + len('project')]
# set the working directory
os.chdir(os.path.join(project_path, 'src'))
print(f'Project path set to: {os.getcwd()}')

Project path set to: c:\Users\dawson\Documents\GitHub\new-peak-project\src


In [2]:
from dotenv import dotenv_values
# Parse the '.env' file found in the current working directory into a mapping
# of setting names to values (the init cell has already moved into '<project>/src').
config = dotenv_values(dotenv_path=".env")
# Echo the configured data root so a wrong or missing setting is visible right away.
print(config["DATA_PATH"])

C:\Users\dawson\Documents\Google Drive\My Drive\DAWSON PHD PROJECT\Biomarker Data Repository\data\new-peak-project\experiments


In [3]:
from models.ModelBuilder import ModelBuilder
from models.Reaction import Reaction
from models.ReactionArchtype import ReactionArchtype
from models.ArchtypeCollections import *

# import scikit-learn
from sklearn.linear_model import LinearRegression
# tree models and support vector machines
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# import pearson correlation
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from copy import deepcopy

### Notebook Parameters

In [4]:
import os 
import numpy as np

### parameters 
# All tunable settings for this notebook live in this cell. `config` must
# already have been loaded from '.env' by an earlier cell.

notebook_name = 's3_compute_time' # name of the notebook
noise_levels = [0.01, 0.05, 0.1, 1, 2, 5, 50] # noise levels to be tested

## Generation of ground truth model 

model_name = 'v4_drug_model' # name of the model
o_random_seed = 5
# p_overall_seed = 46 # different seed for parameter generation
no_observable_species = 5
no_feedback_regulations = 2
specie_value_range = (1000, 5000)
param_range = (0.8, 1.2)
param_multiplier_range = (0.99, 1.01)

## Simulation parameters 

simulation_time = 1000 
simulation_step = 100

## Feature data generation 

feature_generation_method = 'lhs'
feature_generation_extra_params = {'min': 0.1, 'max': 10}
feature_generation_size = 1000 
feature_generation_seed = 50 # if -1 then 'o_random_seed' is used
if feature_generation_seed == -1:
    feature_generation_seed = o_random_seed


# NOTE(review): the two option lists below were free-floating string literals;
# the parameters they describe are not defined in this cell. Kept as comments
# for reference - confirm whether they are still relevant to this notebook.
#
# Options:
# - 'feedback_prune': removes feedback regulations from the model
# - 'random parameter': randomizes a x% of parameter values of the model
#
# Options:
# - 'last_time_point' : only the last time point of the phosphorylated species is used
# - 'dynamic_feature': computes the characteristic 'ten' dynamic feature for each specie data

## General parameters
parallelise = True
save_figures = True 
# experiment id = <notebook>_<seed>_<observable species>_<feedback regulations>
experiment_id = notebook_name + '_' + str(o_random_seed) + '_' + str(no_observable_species) + '_' + str(no_feedback_regulations)
# The trailing '/' is kept so file names can be appended directly to the folder string.
experiment_folder = config['DATA_PATH'] + '/' + experiment_id + '/'
# exist_ok=True makes re-running the cell safe and avoids the check-then-create race.
os.makedirs(experiment_folder, exist_ok=True)
    
print(experiment_folder)

C:\Users\dawson\Documents\Google Drive\My Drive\DAWSON PHD PROJECT\Biomarker Data Repository\data\new-peak-project\experiments/s3_compute_time_5_5_2/


### Virtual Cell Creation

In [5]:
# create a drug enabled model 
from models.Utils import *
from models.DrugModelSpecification import DrugModelSpecification, Drug
from models.Solver.RoadrunnerSolver import RoadrunnerSolver


def generate_virtual_cells(node_size, feature_size=1000):
    """Build a drug-enabled model specification plus a family of parameter sets.

    Relies on the notebook-level settings `o_random_seed`,
    `no_feedback_regulations`, `specie_value_range`, `param_range` and
    `param_multiplier_range`.

    Parameters
    ----------
    node_size : int
        Forwarded to `DrugModelSpecification.generate_specifications` as its
        second argument (presumably the number of species/nodes in the
        network - confirm against that method).
    feature_size : int, optional
        Number of distinct parameter seeds, and therefore parameter sets, to
        generate. Defaults to 1000, the value that was previously hard-coded.

    Returns
    -------
    tuple
        `(model_drug_spec, base_initial_conditions, parameter_sets, solver)`:
        the specification with drug 'D0' added, the state variables of the
        network built from the first seed, one `get_parameters()` result per
        seed, and a `RoadrunnerSolver` compiled from that first network's SBML.

    Raises
    ------
    ValueError
        If `feature_size` is smaller than 1 (the first seed is always needed
        to build the base network).
    """
    if feature_size < 1:
        raise ValueError(f'feature_size must be at least 1, got {feature_size}')

    model_drug_spec = DrugModelSpecification()
    model_drug_spec.generate_specifications(o_random_seed, node_size, no_feedback_regulations, verbose=0)
    # NOTE(review): meaning of the two 500 arguments is defined by `Drug` - confirm.
    drug_0 = Drug('D0', 500, 500)
    rng = np.random.default_rng(o_random_seed)
    # add random 'up' and 'down' regulations to the drug, one per A species
    for s in model_drug_spec.A_species:
        reg_type = str(rng.choice(['up', 'down']))
        drug_0.add_regulation(s, reg_type)
    model_drug_spec.add_drug(drug_0)

    # Re-create the generator from the same seed so the seed list below does not
    # depend on how many draws the regulation loop above consumed.
    rng = np.random.default_rng(o_random_seed)
    # generate `feature_size` random seeds for different parameter sets using numpy, ensure that the seeds are unique
    p_random_seeds = rng.choice(range(1000000), feature_size, replace=False).tolist()

    # The base network (first seed) supplies the initial conditions and the SBML
    # model the solver is compiled from.
    G0_d = model_drug_spec.generate_network('drug_model_524', 
                                            specie_value_range, 
                                            param_range, 
                                            param_multiplier_range,  
                                            verbose=0,
                                            random_seed=p_random_seeds[0])
    base_initial_conditions = G0_d.get_state_variables()

    # One network build per seed; only its parameters are kept.
    parameter_sets = []
    for p in p_random_seeds: 
        model_build = model_drug_spec.generate_network(f'param_seed_{p}', 
                                                specie_value_range, param_range, param_multiplier_range, random_seed=p, verbose=0)
        parameter_sets.append(model_build.get_parameters())

    solver = RoadrunnerSolver()
    solver.compile(G0_d.get_sbml_model())
    return model_drug_spec, base_initial_conditions, parameter_sets, solver

### Generate synthetic 'omics-like' data

In [6]:
from models.SyntheticGen import generate_feature_data, generate_target_data, generate_feature_data_v2, generate_target_data_diff_build


# Benchmark: measure how long target-data generation takes as the network grows.
import time  # imported once here instead of on every loop iteration

node_sizes = [5, 10, 20, 50, 100, 200]
time_results = []  # elapsed seconds, in the same order as `node_sizes`

# Number of feature samples requested per node size. generate_virtual_cells
# builds 1000 parameter sets by default; presumably one parameter set is used
# per feature sample - confirm before changing either value.
n_samples = 1000
# NOTE(review): these values equal simulation_time / simulation_step from the
# parameters cell - confirm the mapping and reuse those settings if intended.
simulation_params = {'start': 0, 'end': 1000, 'points': 100}

for node_size in node_sizes:
    print(f'Generating virtual cells for node size: {node_size}')
    model_drug_spec, base_initial_conditions, parameter_sets, solver = generate_virtual_cells(node_size)

    feature_data = generate_feature_data_v2(model_drug_spec, base_initial_conditions, feature_generation_method, feature_generation_extra_params, n_samples, feature_generation_seed)

    # Only the target-data generation is timed. perf_counter is a monotonic,
    # high-resolution clock, unlike time.time() which follows the wall clock.
    start_time = time.perf_counter()

    target_data, _ = generate_target_data_diff_build(model_drug_spec, solver, 
                                                    feature_data, parameter_sets, 
                                                    dict(simulation_params),  # fresh dict per call, as before
                                                    n_cores=1, verbose=True)
    end_time = time.perf_counter()
    elapsed = end_time - start_time
    print(f'Target data generation took {elapsed} seconds')
    time_results.append(elapsed)
    
print("Node sizes:", node_sizes)
print("Time results for different node sizes:", time_results)


Generating virtual cells for node size: 5


Simulating perturbations: 100%|██████████| 1000/1000 [00:06<00:00, 164.95it/s]


Target data generation took 6.077993631362915 seconds
Generating virtual cells for node size: 10


Simulating perturbations: 100%|██████████| 1000/1000 [00:11<00:00, 86.49it/s]


Target data generation took 11.577911615371704 seconds
Generating virtual cells for node size: 20


Simulating perturbations: 100%|██████████| 1000/1000 [00:31<00:00, 32.25it/s]


Target data generation took 31.011171579360962 seconds
Generating virtual cells for node size: 50


Simulating perturbations: 100%|██████████| 1000/1000 [03:20<00:00,  5.00it/s]


Target data generation took 200.02714157104492 seconds
Generating virtual cells for node size: 100


Simulating perturbations: 100%|██████████| 1000/1000 [18:05<00:00,  1.09s/it]


Target data generation took 1085.885205745697 seconds
Generating virtual cells for node size: 200


Simulating perturbations: 100%|██████████| 1000/1000 [2:01:28<00:00,  7.29s/it] 

Target data generation took 7288.911114692688 seconds
Node sizes: [5, 10, 20, 50, 100, 200]
Time results for different node sizes: [6.077993631362915, 11.577911615371704, 31.011171579360962, 200.02714157104492, 1085.885205745697, 7288.911114692688]



