## Init

In [7]:
import os

# Resolve the project root from the current working directory and move into its
# 'src' folder so that the project-local `models` package and the '.env' file
# used by later cells can be found via relative paths.
path = os.getcwd()
# Position of the first occurrence of 'project' in the path; str.find returns -1
# when the marker is absent.
index_project = path.find('project')
if index_project == -1:
    # Without this guard the slice below would silently become path[:6] and the
    # chdir would target an unrelated (usually non-existent) directory.
    raise RuntimeError(
        f"Cannot locate the project root: 'project' does not occur in {path!r}. "
        "Start the notebook from inside the project directory."
    )
# Keep everything from the start of the path up to and including the marker,
# e.g. 'c:\Github\new-peak-project\src' -> 'c:\Github\new-peak-project'.
# NOTE(review): the first occurrence wins, so a parent folder whose name also
# contains 'project' would be picked instead - confirm this cannot happen.
project_path = path[:index_project + len('project')]
# set the working directory
os.chdir(os.path.join(project_path, 'src'))
print(f'Project path set to: {os.getcwd()}')

Project path set to: c:\Github\new-peak-project\src


In [8]:
from dotenv import dotenv_values

# Read the key/value settings from the '.env' file addressed relative to the
# current working directory.
config = dotenv_values(".env")
# dotenv_values yields an empty mapping when the file cannot be found, and a
# key declared without a value maps to None; fail here with an actionable
# message instead of a bare KeyError (or a TypeError in a later cell).
if not config.get("DATA_PATH"):
    raise KeyError(
        f"DATA_PATH is missing or empty in '.env' (looked in {os.getcwd()})"
    )
print(config["DATA_PATH"])

I:\My Drive\DAWSON PHD PROJECT\Biomarker Data Repository\data\new-peak-project\experiments


In [9]:
from models.ModelBuilder import ModelBuilder
from models.Reaction import Reaction
from models.ReactionArchtype import ReactionArchtype
from models.ArchtypeCollections import *

# import scikit-learn
from sklearn.linear_model import LinearRegression
# tree models and support vector machines
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# import pearson correlation
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from copy import deepcopy

## Notebook Parameters

In [10]:

import os 

### parameters 
# Tunable settings for the whole notebook. Everything below is a plain
# module-level constant that later cells read by name.

# Identifier of this notebook; the general-parameters section reuses it as the
# experiment id and as the name of the output folder.
notebook_name = 'exp8_model_search_v2' # name of the notebook

## Generation of ground truth model 

# NOTE(review): model_name is not referenced by the cells visible here.
model_name = 'test_models'
# Passed to DrugModelSpecification.generate_specifications in the analysis loop.
no_observable_species = 5
no_feedback_regulations = 2
# Passed to generate_network in the analysis loop as (low, high) pairs.
specie_value_range = (1000, 5000)
param_range = (0.05, 20)
param_multiplier_range = (0.5, 1.5)


## Simulation parameters 

# Used as the 'end' entry of the simulation spec given to generate_target_data.
simulation_time = 1000 
# Used as the 'points' entry of the same spec, i.e. it is handed over as a
# number of points rather than a step width.
simulation_step = 100

## Feature data generation 

# Method name and extra options forwarded to generate_feature_data_v2.
feature_generation_method = 'lhs'
feature_generation_extra_params = {'min': 0.1, 'max': 10}
# Intended number of generated feature samples.
# NOTE(review): confirm the analysis loop reads this value rather than a literal.
feature_generation_size = 1000 
# Seed forwarded to generate_feature_data_v2. Intended convention: -1 means
# "use 'o_random_seed'".
# NOTE(review): confirm the analysis loop implements that fallback.
feature_generation_seed = 50 # if -1 then 'o_random_seed' is used

## Search parameters

# The analysis evaluates every (o_random_seed, parameter_random_seed)
# combination: here [1] x [1, 2], i.e. two runs.
o_random_seeds = list(range(1, 2))
parameter_random_seeds = list(range(1, 3))

    
## Data engineering parameters

# Suboptimal Model Generation 

# The two triple-quoted blocks below are bare string expressions: they have no
# runtime effect and only list option names. No variable in this cell selects
# among these options.
# NOTE(review): the corresponding settings appear to be missing - confirm.
'''
Options: 
- 'feedback_prune': removes feedback regulations from the model 
- 'random parameter': randomizes a x% of parameter values of the model
'''

''' 
Options: 
- 'last_time_point' : only the last time point of the phosphorylated species is used
- 'dynamic_feature': computes the characteristic 'ten' dynamic feature for each specie data 
'''

## General parameters
# NOTE(review): `parallelise` and `save_figures` are not read by the cells
# visible here - confirm they are still needed.
parallelise = True
save_figures = True
experiment_id = notebook_name
# Output directory for this experiment, built by plain string concatenation:
# <DATA_PATH>/<experiment_id>/ (the trailing '/' is part of the value).
experiment_folder = config['DATA_PATH'] + '/' + experiment_id + '/'
# exist_ok=True creates the folder (and missing parents) when needed and is a
# no-op when it already exists, replacing the separate exists()/makedirs() pair
# and its check-then-create race.
os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder)

I:\My Drive\DAWSON PHD PROJECT\Biomarker Data Repository\data\new-peak-project\experiments/exp8_model_search_v2/


## Analysis 

In [11]:
from models.Utils import *
from models.DrugModelSpecification import DrugModelSpecification, Drug
from models.Solver.RoadrunnerSolver import RoadrunnerSolver
from models.SyntheticGen import generate_feature_data, generate_target_data, generate_model_timecourse_data, generate_feature_data_v2
from models.Utils import last_time_point_method, dynamic_features_method
from models.Parallel import parallelize_joblib


def evaluate_model(model, model_name, feature_data, feature_data_name, target_data ,test_size=0.2, random_state=4):
    """Fit ``model`` on a training split and score it on the held-out split.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        The estimator to train; it is fitted in place.
    model_name : str
        Label stored under ``'Model'`` in the result.
    feature_data, target_data
        Inputs and targets, split together by ``train_test_split``.
    feature_data_name : str
        Label stored under ``'Feature Data'`` in the result.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 4
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    dict
        Keys ``'Model'``, ``'Feature Data'``, ``'Mean Squared Error'``,
        ``'R2 Score'``, ``'Pearson Correlation'`` and ``'Pearson P-Value'``,
        all computed on the held-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(feature_data, target_data, test_size=test_size, random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Compute the correlation once and reuse it; index 0 is the coefficient and
    # index 1 the p-value.
    pearson = pearsonr(y_test, y_pred)
    # return a dictionary of the model performance
    return {'Model': model_name,
            'Feature Data': feature_data_name,
            'Mean Squared Error': mean_squared_error(y_test, y_pred),
            'R2 Score': r2_score(y_test, y_pred),
            'Pearson Correlation': pearson[0],
            'Pearson P-Value': pearson[1]
            }

import warnings
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Build the seed grid: one record per (o_random_seed, parameter_random_seed)
# pair, with the parameter seed varying fastest, then tabulate it.
seed_records = []
for o_random_seed in o_random_seeds:
    for parameter_random_seed in parameter_random_seeds:
        seed_records.append(
            dict(o_random_seed=o_random_seed, parameter_random_seed=parameter_random_seed)
        )
search_results = pd.DataFrame(seed_records)

# For every (o_random_seed, parameter_random_seed) combination in
# `search_results`: build a drug-enabled model, generate feature/target data,
# and record summary statistics of the target ('Cp') and of the feature-target
# correlations. One dict per successful run is appended to `output_data`.
output_data = []
# Runs that raised while building or simulating the model. They are kept so that
# failures stay inspectable after the loop instead of surfacing only as warnings.
failed_runs = []
# iterate through the search results
max_iter = search_results.shape[0]
curr_iter = 0  # number of seed combinations processed so far (successful or not)
for index, row in tqdm(search_results.iterrows(), total=max_iter):
    # key parameters for the iteration
    o_random_seed, p_random_seed = int(row['o_random_seed']), int(row['parameter_random_seed'])
    # Honour the convention documented next to `feature_generation_seed`:
    # -1 means "reuse o_random_seed" for feature generation.
    feature_seed = o_random_seed if feature_generation_seed == -1 else feature_generation_seed
    # create a drug enabled model
    try:
        model_drug_spec = DrugModelSpecification()
        model_drug_spec.generate_specifications(o_random_seed, no_observable_species, no_feedback_regulations, verbose=0)
        # The numeric arguments are passed through as-is; their meaning is
        # defined by `Drug`.
        drug_0 = Drug('D0', 500, 5000)
        np.random.seed(o_random_seed)
        # Draw a random 'up'/'down' direction per A species.
        # NOTE(review): the drawn direction is only recorded in `regulation_dir`;
        # every regulation is added as 'up'. Behaviour is kept unchanged here -
        # confirm whether the drawn direction was meant to be applied.
        regulation_dir = []
        for s in model_drug_spec.A_species:
            regulation_dir.append(np.random.choice(['up', 'down']))
            drug_0.add_regulation(s, 'up')
        model_drug_spec.add_drug(drug_0)
        G0_d = model_drug_spec.generate_network(f'drug_model_{index}', specie_value_range, param_range, param_multiplier_range, random_seed=p_random_seed, verbose=0)
        base_parameters = G0_d.get_parameters()
        base_initial_conditions = G0_d.get_state_variables()
        # generate feature and target data
        solver = RoadrunnerSolver()
        solver.compile(G0_d.get_sbml_model())
        # The sample count comes from the notebook parameter instead of a
        # literal, so changing `feature_generation_size` actually takes effect.
        feature_data = generate_feature_data_v2(model_drug_spec, G0_d.get_state_variables(), feature_generation_method, feature_generation_extra_params, feature_generation_size, feature_seed)
        target_data, _ = generate_target_data(model_drug_spec, solver, feature_data, {'start': 0, 'end': simulation_time, 'points': simulation_step}, n_cores=8, verbose=False)

        # create a dataframe with the feature data and target data
        feature_data_df = pd.DataFrame(feature_data)
        target_data_df = pd.DataFrame(target_data)
        # add the 'Cp' target column to the feature data
        feature_data_df['target'] = target_data_df['Cp']
        target_data = np.array(target_data_df['Cp'])
        # mean and standard deviation of the target
        mean_target = np.mean(target_data_df['Cp'])
        std_target = np.std(target_data_df['Cp'])

        # correlation of every column with the target, strongest positive first
        correlation = feature_data_df.corr()['target'].sort_values(ascending=False)
        correlation_df = pd.DataFrame(correlation)
        correlation_df = correlation_df.reset_index()
        correlation_df.columns = ['feature', 'correlation']
        # drop the target's correlation with itself
        correlation_df = correlation_df[correlation_df['feature'] != 'target']

        # summary statistics of the feature-target correlations
        mean = np.mean(correlation_df['correlation'])
        std = np.std(correlation_df['correlation'])
        max_val = np.max(correlation_df['correlation'])
        min_val = np.min(correlation_df['correlation'])
        # features whose correlation lies more than 3 standard deviations from the mean
        outliers = correlation_df[(correlation_df['correlation'] > mean + 3*std) | (correlation_df['correlation'] < mean - 3*std)]
        outliers_ratio = outliers.shape[0] / correlation_df.shape[0]
        output_data.append({
            'o_random_seed': o_random_seed,
            'parameter_random_seed': p_random_seed,
            'mean': mean,
            'std': std,
            'max': max_val,
            'min': min_val,
            'outliers_ratio': outliers_ratio,
            'mean_target': mean_target,
            'std_target': std_target,
        })

    except Exception as e:
        # Best effort: one failing seed combination must not abort the search,
        # but the failure is recorded as well as warned about.
        warnings.warn(f'Error in model {index} {o_random_seed} {p_random_seed}: {e}')
        failed_runs.append({
            'index': index,
            'o_random_seed': o_random_seed,
            'parameter_random_seed': p_random_seed,
            'error': repr(e),
        })

    # The former `curr_iter > max_iter` stop check was removed: the loop visits
    # exactly `max_iter` rows, so that condition could never become true.
    curr_iter += 1

if failed_runs:
    warnings.warn(f'{len(failed_runs)} of {max_iter} seed combinations failed; inspect `failed_runs` for details')


100%|██████████| 2/2 [00:00<00:00, 1000.79it/s]


In [12]:
# Tabulate the per-run summary records collected by the search loop and render
# the table as this cell's output.

output_data_df = pd.DataFrame(data=output_data)
output_data_df