# Testing notebook

In [1]:
import os

# Folder-name fragment that marks the repository root (the root folder name ends with it).
marker = 'project'

path = os.getcwd()
# index of the first occurrence of the marker in the current path; -1 when it is absent
index_project = path.find(marker)
if index_project == -1:
    # Without this guard the slice below would become path[:6] (-1 + 7) and the
    # notebook would chdir into an unrelated directory or fail with a confusing error.
    project_path = path
    print(f"'{marker}' not found in {path!r}; working directory left unchanged")
else:
    # keep the path from its start up to and including the marker
    project_path = path[:index_project + len(marker)]
# set the working directory
os.chdir(project_path)
print(f'Project path set to: {os.getcwd()}')


Project path set to: c:\Github\ode-biomarker-project


## Loading Data

In [None]:
from PathLoader import PathLoader

# Build the loader from the two env files. The data-loading code below calls
# path_loader.get_data_path() and prepends the result to every data file path.
path_loader = PathLoader('data_config.env', 'current_user.env')

### Load data

import pandas as pd
import pickle

# NOTE: pickle.load can execute arbitrary code while deserialising. Only load
# cache files that were produced by this project.
# Each file stores several objects back to back, so they are read with repeated
# pickle.load calls in the order they were written.

# GDSC2 drug response data
with open(f'{path_loader.get_data_path()}data/drug-response/GDSC2/cache_gdsc2.pkl', 'rb') as f:
    gdsc2 = pickle.load(f)
    gdsc2_info = pickle.load(f)

# CCLE gene expression data
with open(f'{path_loader.get_data_path()}data/gene-expression/CCLE_Public_22Q2/ccle_expression.pkl', 'rb') as f:
    gene_entrez = pickle.load(f)
    ccle = pickle.load(f)

# CCLE sample info data
with open(f'{path_loader.get_data_path()}data/gene-expression/CCLE_Public_22Q2/ccle_sample_info.pkl', 'rb') as f:
    ccle_sample_info = pickle.load(f)

# STRING database (loaded once; the original cell read this same file a second time further down)
with open(f'{path_loader.get_data_path()}data/protein-interaction/STRING/string_df.pkl', 'rb') as f:
    string_df = pickle.load(f)
    string_df_info = pickle.load(f)
    string_df_alias = pickle.load(f)

# proteomic expression
with open(f'{path_loader.get_data_path()}data/proteomic-expression/goncalves-2022-cell/goncalve_proteome_fillna_processed.pkl', 'rb') as f:
    joined_full_protein_matrix = pickle.load(f)
    joined_sin_peptile_exclusion_matrix = pickle.load(f)

# STRING to goncalves mapping file.
# Forward slashes, like every other path in this cell: the backslash form relied on
# "\p", "\S" and "\g" being unrecognised escape sequences and only resolved on Windows.
with open(f'{path_loader.get_data_path()}data/protein-interaction/STRING/goncalve_to_string_id_df.pkl', 'rb') as f:
    goncalve_to_string_id_df = pickle.load(f)

# cache for the neighbourhood calculations
with open(f'{path_loader.get_data_path()}data/protein-interaction/STRING/palbociclib_nth_degree_neighbours.pkl', 'rb') as f:
    nth_degree_neighbours = pickle.load(f)

# Toolkit Tests

## Feature Transformer

In [None]:
import pandas as pd

import DataFunctions as dfunc

# loading cell line proteomic expression data
# Forward slashes: the backslash form relied on "\p", "\S" and "\C" being unrecognised
# escape sequences (kept literally, with a warning) and only resolved on Windows.
cancercell2022 = pd.read_csv('data/preprocessed/SY-Processed/CancerCell2022_PRISM.csv')

# rows without an AUC value cannot serve as labelled examples
cancercell2022_dropnan = cancercell2022.dropna(subset=['AUC'])

# split the frame into features and the 'AUC' label
feature_data, label_data = dfunc.create_feature_and_label(cancercell2022_dropnan, label_name='AUC')

# same features without the 'Row' column
feature_data_no_row = feature_data.drop(['Row'], axis=1)

In [None]:
from toolkit import FeatureTransformer
from toolkit import impute_by_zero, impute_by_first_quantile, get_network_stat_features, get_random_features

# NOTE(review): impute_by_first_quantile and get_network_stat_features are imported
# but not used in this cell.
F = FeatureTransformer()

# register one transform step and one selection step, each under a string label
F.add_transform_function('impute_by_zero', impute_by_zero)
# the dict is presumably forwarded to get_random_features as keyword arguments — confirm in toolkit
F.add_selection_function('random_select', get_random_features, {"selection_size": 10})

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
# NOTE(review): this splits `feature_data`, which still contains the 'Row' column;
# `feature_data_no_row` is built in the loading cell but never used. Confirm which
# frame is intended.

X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data, random_state=42)

# Print the shapes of the new X objects

print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}, X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')

# Run Feature Transformer 
# F.run takes the training features and labels plus the test features and returns
# three values, unpacked here as the selected feature names and the train/test
# feature sets after selection.

selected_features, sel_train, sel_test = F.run(X_train, y_train, X_test)

print(selected_features, sel_train.shape, sel_test.shape)  

In [None]:
# Clear all user-defined variables in the Jupyter kernel (-f skips the confirmation prompt).
# The cells that follow re-import everything they use.

%reset -f

## Feature Selection Methods

### Create controlled dataset

In [None]:
from sklearn.datasets import make_regression
import pandas as pd

# Controlled regression problem: 500 samples, 1000 features, 10 of them informative.
X, y = make_regression(
    n_samples=500,
    n_features=1000,
    n_informative=10,
    random_state=1,
    shuffle=False,
)

# Wrap the arrays in pandas containers; feature columns are named "0", "1", ...
X = pd.DataFrame(X, columns=list(map(str, range(X.shape[1]))))
y = pd.Series(y)

print(f'Original informative columns: {X.columns[:10]}')

# Permute the column order with a fixed seed.
X = X.sample(frac=1, axis='columns', random_state=0)

print(f'Newly shuffled columns: {X.columns[:10]}')


In [None]:
from toolkit import mrmr_select_fcq

# Run the mRMR (FCQ) selector on the controlled dataset, asking for K=10 features.
# The call returns a (features, scores) pair; nothing is printed here beyond
# whatever verbose=True emits.
features, scores = mrmr_select_fcq(X, y, K=10,verbose=True)

In [None]:
from toolkit import enet_select

# Run enet_select on the controlled dataset, asking for 10 features.
# max_iter, alpha and l1_ratio are presumably forwarded to an elastic-net model — confirm in toolkit.
features, scores = enet_select(X, y, 10, max_iter=10000, alpha=0.1, l1_ratio=0.7)

print(features)
print(scores)

In [None]:
from toolkit import rf_select

# Run rf_select on the controlled dataset, asking for k=10 features.
# n_estimators, max_depth and n_jobs are presumably forwarded to a random-forest model — confirm in toolkit.
features, scores = rf_select(X, y, k=10, n_estimators=100, max_depth=5, n_jobs=-1)

print(features)
print(scores)

In [None]:
from toolkit import f_regression_select

# Run f_regression_select on the controlled dataset, asking for k=10 features.
features, scores = f_regression_select(X, y, k=10)
print(features)
print(scores)

In [None]:
from toolkit import relieff_select

# Run relieff_select on the controlled dataset, asking for k=10 features with n_jobs=4.
features, scores = relieff_select(X, y, k=10, n_jobs=4)
print(features)
print(scores)


In [None]:
# Clear only the results left behind by the feature-selection cells above.
# A full `%reset -f` here would also delete X and y, which the "Selection Functions"
# and "Transforming Functions" cells below still use, so Restart & Run All would
# fail with a NameError.
for stale_name in ('features', 'scores'):
    # pop with a default so the cell also works when a selector cell was skipped
    globals().pop(stale_name, None)
del stale_name

## Selection Functions

In [None]:
# import train_test_split function
# NOTE(review): train_test_split is imported but not used in this cell.
from sklearn.model_selection import train_test_split
from toolkit import select_random_features

# NOTE(review): X and y are not created in this cell; they must still be defined by
# the "Create controlled dataset" cell, so any full namespace reset run before this
# cell makes it fail with a NameError.
selected_features, selected_X = select_random_features(X, y, 10)

print(selected_features, selected_X.shape)

## Transforming Functions


In [None]:
from toolkit import transform_impute_by_zero

# The transform returns a (features, labels) pair; only the shapes are printed as a smoke test.
# NOTE(review): relies on X and y still being defined by an earlier cell.
imputed_X, imputed_y = transform_impute_by_zero(X, y)

print(imputed_X.shape, imputed_y.shape)


## The Powerkit class

In [None]:
'''Testing code, for reference ONLY
'''

# Reference sketch of a single run: split the data, call pipeline_func and eval_func
# once, and collect the outputs in one dictionary.

# rng = 45
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
# get_feature_importance = False
# pipeline_comps = pipeline_func(X_train, y_train)
# eval_returns = eval_func(X_test, y_test, pipeline_components=pipeline_comps)
# # print(eval_returns)

# # combine pipeline_comps and eval_returns into a single dictionary

# final_returns = {}
# final_returns['rng'] = rng
# final_returns['condition'] = 'test'
# final_returns.update(eval_returns)

# if not get_feature_importance:
#     final_returns.pop('feature_importance')

# # convert final_returns into a dataframe, test if it works for multiple rows

# df = pd.DataFrame([final_returns, final_returns])

# df.head()

# NOTE(review): with get_feature_importance = False the key is popped above, so the
# lookup below would raise KeyError if this snippet were re-enabled as written.
# feature_importance = final_returns['feature_importance']

# for x,y in zip(feature_importance[0], feature_importance[1]):
#     print(f'Feature: {x}, Score: {y}')

In [1]:
import os

# Folder-name fragment that marks the repository root (the root folder name ends with it).
marker = 'project'

path = os.getcwd()
# index of the first occurrence of the marker in the current path; -1 when it is absent
index_project = path.find(marker)
if index_project == -1:
    # Without this guard the slice below would become path[:6] (-1 + 7) and the
    # notebook would chdir into an unrelated directory or fail with a confusing error.
    project_path = path
    print(f"'{marker}' not found in {path!r}; working directory left unchanged")
else:
    # keep the path from its start up to and including the marker
    project_path = path[:index_project + len(marker)]
# set the working directory
os.chdir(project_path)
print(f'Project path set to: {os.getcwd()}')

from toolkit import Powerkit, transform_impute_by_zero, select_random_features, select_preset_features, select_stat_features, f_regression_select, mrmr_select_fcq, hypertune_svr

from copy import deepcopy

from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from scipy.stats import pearsonr

from sklearn.feature_selection import f_regression

# create a Powerkit object

from sklearn.datasets import make_regression
import pandas as pd

# Controlled regression problem: 500 samples, 1000 features, 10 of them informative.
X, y = make_regression(
    n_samples=500,
    n_features=1000,
    n_informative=10,
    random_state=1,
    shuffle=False,
)

# Wrap the arrays in pandas containers; feature columns are named "0", "1", ...
X = pd.DataFrame(X, columns=list(map(str, range(X.shape[1]))))
y = pd.Series(y)

print(f'Original informative columns: {X.columns[:10]}')

# Permute the column order with a fixed seed.
X = X.sample(frac=1, axis='columns', random_state=0)

print(f'Newly shuffled columns: {X.columns[:10]}')


def pipeline_func(X_train, y_train, **kwargs):
    """Example training pipeline: transform, select 10 features, fit an SVR.

    inputs
        X_train: training set features
        y_train: training set labels
        **kwargs: accepted but not used by this example

    returns
        dictionary with the fitted 'model', the 'selected_features' the model was
        trained on, and the selector 'scores'
    """
    X_imputed, y_imputed = transform_impute_by_zero(X_train, y_train)

    # Alternative selector kept for reference:
    # f_regression_select(X_imputed, y_imputed, k=10)
    mrmr_features, scores = mrmr_select_fcq(X_imputed, y_imputed, K=10, return_index=False)

    # restrict the training frame to the features chosen by the selector
    selected_features, X_selected = select_preset_features(X_imputed, y_imputed, mrmr_features)

    model = SVR()
    model.fit(X_selected, y_imputed)

    return dict(model=model, selected_features=selected_features, scores=scores)

def eval_func(X_test, y_test, pipeline_components=None, **kwargs):
    """Example evaluation step for a fitted pipeline.

    inputs
        X_test: test set features
        y_test: test set labels
        pipeline_components: dictionary of pipeline components, e.g.
            {'model': model, 'selected_features': selected_features, 'scores': scores}
        **kwargs: accepted but not used by this example

    returns
        dictionary with 'model_performance' (Pearson correlation between y_test
        and the predictions), 'p_vals' (its p-value) and 'feature_importance'
        (a tuple of the selected features and their scores)
    """
    chosen_features = pipeline_components['selected_features']

    # restrict the test frame to the features the model was trained on, then predict
    _, X_selected = select_preset_features(X_test, y_test, chosen_features)
    predictions = pipeline_components['model'].predict(X_selected)

    # performance is measured by Pearson correlation
    corr, p_vals = pearsonr(y_test, predictions)

    # everything the caller should receive goes into a single dictionary
    feature_importance = (chosen_features, pipeline_components['scores'])
    return {
        'model_performance': corr,
        'p_vals': p_vals,
        'feature_importance': feature_importance,
    }

# Build the Powerkit around the controlled dataset and register the example
# pipeline/evaluation pair under the condition name 'test'. The two empty dicts
# are presumably the extra keyword arguments for each function — confirm in toolkit.
powerkit = Powerkit(X, y)
powerkit.add_condition('test', True, pipeline_func, {}, eval_func, {})

# seeds 0..23, one per run
rng_list = list(range(24))

Project path set to: c:\Github\ode-biomarker-project
Original informative columns: Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], dtype='object')
Newly shuffled columns: Index(['993', '859', '298', '553', '672', '971', '27', '231', '306', '706'], dtype='object')


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [7]:

# NOTE(review): _abstract_run has a leading underscore, i.e. it is private by
# convention; confirm whether a public entry point should be used instead.
df = powerkit._abstract_run(rng_list, n_jobs=4, verbose=False)
# not the last expression of the cell, so this preview is not displayed
df.head()

# summarise the runs of condition 'test' from the results frame
contribution = powerkit.get_mean_contribution(df, 'test', adjust_for_accuracy=True, strict_mean=0.75)

In [3]:
# Run condition 'test' through run_until_consensus; with return_meta_df=True the
# call yields three values, unpacked here as rngs, total_df and meta_df.
rngs, total_df, meta_df = powerkit.run_until_consensus('test', n_jobs=2, verbose=True, verbose_level=1, return_meta_df=True)

current_contrib: ['6', '905', '420', '917', '195']
current iteration: 4 current_tol: 0.027931, abs_diff: 490024.096651, abs_prev: 17544047.417729, performance: 0.906521
current_contrib: ['6', '905', '420', '917', '195']
current iteration: 6 current_tol: 0.002560, abs_diff: 46340.458463, abs_prev: 18105015.824048, performance: 0.856003
Consensus Run: condition test is done in 6 iterations
Consensus Run under condition test is NOT converged within 0.001 absolute tolerance


In [5]:
# same summary as in the _abstract_run cell, computed on total_df from the consensus run
contribution = powerkit.get_mean_contribution(total_df, 'test', adjust_for_accuracy=True, strict_mean=0.75)

# TorchApp Tests

# Loading SBML Models

In [None]:
# loading anthony's sbml model

# NOTE(review): the wildcard import hides where names such as SBMLReader come from;
# it is kept because later cells may rely on other libsbml names.
from libsbml import *

reader = SBMLReader()

# Forward slash: "data\export_ECC_Base.xml" relied on "\e" being an unrecognised
# escape sequence (kept literally, with a warning) and only resolved on Windows.
document = reader.readSBML("data/export_ECC_Base.xml")



In [None]:
# pull the model object out of the SBML document read in the previous cell
model = document.getModel()

# report how many errors the document has recorded
print(f'Document errors: {document.getNumErrors()}')

In [None]:
# size summary of the libsbml model: species, reactions, compartments, parameters
print(f'Number of species: {model.getNumSpecies()}')

print(f'Number of reactions: {model.getNumReactions()}')

print(f'Number of compartments: {model.getNumCompartments()}')

print(f'Number of parameters: {model.getNumParameters()}')

In [None]:
# number of rules in the libsbml model
print(f'Number of rules: {model.getNumRules()}')

In [None]:
import roadrunner

# Load the same SBML file into roadrunner.
# Forward slash: "data\export_ECC_Base.xml" relied on "\e" being an unrecognised
# escape sequence (kept literally, with a warning) and only resolved on Windows.
rr = roadrunner.RoadRunner("data/export_ECC_Base.xml")

# size summary of the loaded model, counted from the id lists roadrunner exposes
print(f'Number of floating species: {len(rr.model.getFloatingSpeciesIds())}')

print(f'Number of boundary species: {len(rr.model.getBoundarySpeciesIds())}')

print(f'Number of global parameters: {len(rr.model.getGlobalParameterIds())}')

print(f'Number of compartments: {len(rr.model.getCompartmentIds())}')

In [None]:
# positional arguments are presumably (start time, end time, number of points) — confirm against the roadrunner docs
result = rr.simulate(0, 10, 100)

# plot the simulation result
rr.plot(result)

In [None]:
# presumably restores the model state after the simulation above — confirm against the roadrunner docs
rr.reset()

In [None]:
# list the ids of the floating species in the loaded model
print(f'{rr.model.getFloatingSpeciesIds()}')


In [None]:
# read the "init(IRS)" entry of the model (displayed as the cell output)
rr.model["init(IRS)"]

# To change the initial condition, assign to the same key:
# rr.model["init(IRS)"] = 0.5 # for changing initial conditions

In [None]:
# list the ids of the global parameters (displayed as the cell output)
rr.model.getGlobalParameterIds()



In [None]:
# read a parameter through attribute access on the RoadRunner object
rr.kc_INSULIN_INSR_INSRpY

# To change the parameter, assign to the same attribute:
# rr.kc_INSULIN_INSR_INSRpY = 0.1 # for changing parameter values

In [None]:
# This line only reads the value; changing it would presumably mean assigning to the same key — confirm.
rr.model['kc_INSULIN_INSR_INSRpY'] # another method for changing parameter values

In [None]:
# This line only reads the 'INSR' entry. NOTE(review): confirm whether this key refers
# to the initial condition or the current value (compare the "init(IRS)" form used earlier).
rr.model['INSR'] # another method for changing initial condition values

## Linking CCLE data to Anthony's SBML model initial conditions

## Changing parameters in SBML model to the calibrated set

## Processing dynamic simulation data back to singular vector

## LanODEApp Tests

In [None]:
'''load in core data and libraries'''

# libraries used 

# load CCLE expression data  

# load Anthony's model and optimal parameter sets 

# NOTE(review): `reader` is not created in this cell; it comes from the libsbml cell
# under "Loading SBML Models".
# Forward slash: "data\export_ECC_Base.xml" relied on "\e" being an unrecognised
# escape sequence and only resolved on Windows.
document = reader.readSBML("data/export_ECC_Base.xml")
