In [13]:
import os

import numpy as np
import pandas as pd

path = os.getcwd()
# find the string 'project' in the path, return index
index_project = path.find('project')
# slice the path from the index of 'project' to the end
project_path = path[:index_project+7]
# set the working directory
os.chdir(project_path)
print(f'Project path set to: {os.getcwd()}')

INPUT_DATA_CODE_EXPRESSION_DATA = 'ccle'
INPUT_DATA_CODE_MATCH_RULES = 'integrate_ccle_anthony'
PARAM_FOLDER_NAME = 'create-initial-conditions'
PARAM_COMBINATION_METHOD = 'median' # weighted_median, median and cell_line_specific
SPECIFIC_CELL_LINE = 'MCF7' # only used when PARAM_COMBINATION_METHOD is 'cell_line_specific'
SILENT = False

Project path set to: c:\Github\ode-biomarker-project


In [14]:
### Bring in CCLE data
from PathLoader import PathLoader
from DataLink import DataLink 
path_loader = PathLoader('data_config.env', 'current_user.env')
data_link = DataLink(path_loader, 'data_codes.csv')

### INPUT Data Code for m x n table of expression data (pkl pandas dataframe)
data_link.load_data_code(INPUT_DATA_CODE_EXPRESSION_DATA)
expression_df = data_link.data_code_database[INPUT_DATA_CODE_EXPRESSION_DATA]
if INPUT_DATA_CODE_EXPRESSION_DATA == 'ccle':
    expression_df.set_index('CELLLINE', inplace=True)

In [15]:
# INPUT Data code for match rules table (csv file)
data_link.load_data_code(INPUT_DATA_CODE_MATCH_RULES, verbose=True)
match_rules_df = data_link.data_code_database[INPUT_DATA_CODE_MATCH_RULES]

Data code integrate_ccle_anthony loaded at data/anthony_model_adaptation_ccle_csv.csv with index position 0. Enforced raw loading: True


In [6]:
match_rules_df.head()

Unnamed: 0,specie,initial_value,reference,is_active_form,method
0,AKT,500,AKT3;AKT2;AKT1,False,combination
1,AKTpp,0,,True,
2,AKTpS473,0,,True,
3,AKTpT308,0,,True,
4,CDK2,500,CDK2,False,direct


In [7]:

match_rules_df.shape

(59, 5)

In [8]:
match_rules_df_dropna = match_rules_df.dropna(subset=['reference'])

match_rules_df_dropna.shape

(21, 5)

In [5]:
median_expression_values = {}
for col in expression_df.columns:
    expression_col = expression_df[col]
    expression_col_no_zero = expression_col[expression_col != 0]
    median_expression_values[col] = expression_col_no_zero.median()

median_expression_values 

{'TSPAN6': 3.8308637567517576,
 'TNMD': 0.0840642647884745,
 'DPM1': 6.479295161441588,
 'SCYL3': 2.334853560794812,
 'C1orf112': 3.7420045991348685,
 'FGR': 0.070389327891398,
 'CFH': 1.3812807150244448,
 'FUCA2': 5.691393761374138,
 'GCLC': 4.522620726907423,
 'NFYA': 4.016139427113389,
 'STPG1': 2.571676299723311,
 'NIPAL3': 3.6690263196784336,
 'LAS1L': 5.365097324343457,
 'ENPP4': 2.176322772640463,
 'SEMA3F': 2.2600256559614555,
 'CFTR': 0.137503523749935,
 'ANKIB1': 4.293149314468688,
 'CYP51A1': 5.50747752951693,
 'KRIT1': 4.737416340593031,
 'RAD52': 3.2905716625032397,
 'BAD': 5.632703562014195,
 'LAP3': 5.683556046050848,
 'CD99': 7.148324042480084,
 'HS3ST1': 0.4956951626240688,
 'AOC1': 0.1110313123887439,
 'WNT16': 0.1110313123887439,
 'HECW1': 0.5753123306874368,
 'MAD1L1': 4.285772045848275,
 'LASP1': 6.626512143885318,
 'SNX11': 3.863938450423972,
 'TMEM176A': 0.1890338243900171,
 'M6PR': 6.683205179017107,
 'KLHL13': 1.4698859762744636,
 'CYP26B1': 1.0214797274104517,

In [25]:
median_expression_values=expression_df.median(axis=0)
median_expression_values.head()

TSPAN6      3.804776
TNMD        0.000000
DPM1        6.479295
SCYL3       2.334854
C1orf112    3.742005
dtype: float64

In [26]:
median_expression_values['TSPAN6']

3.8047762856177245

In [30]:
# iterate each row in expression_df

dataset_constructor = {}

# print(columns)

for i, row in expression_df.iterrows():
    model = row.name
    row_constructor = []
    
    if PARAM_COMBINATION_METHOD == 'median':
        # first compute the median expression values for each gene/protein
        median_expression_values = {}
        for col in expression_df.columns:
            expression_col = expression_df[col]
            expression_col_no_zero = expression_col[expression_col != 0] # ensure no zero values 
            median_expression_values[col] = expression_col_no_zero.median()
    
    for j, specie_info in match_rules_df.iterrows():
        specie = specie_info['specie']
        initial_value = specie_info['initial_value']
        references = specie_info['reference']
        # print(specie, initial_value, references)
        ### three transformation methods here
        if PARAM_COMBINATION_METHOD == 'weighted_median':
            raise NotImplementedError('weighted method not implemented yet')
        elif PARAM_COMBINATION_METHOD == 'median':
            
            if isinstance(references, float):
                # references is nan, in this case, use the default value for the specie
                row_constructor.append(initial_value)
            
            else:
                
                references = references.split(';')
                
                if len(references) > 1: 
                    # when there is more than one reference, take the average of the median values
                    # for all references
                    sum_vals = 0
                    for feature_name in references:
                        feature_value, feature_median_value = row[feature_name], median_expression_values[feature_name]
                        sum_vals += feature_value / feature_median_value
                        
                    norm_value = sum_vals / len(references) 
                    specie_value = norm_value * initial_value
                    row_constructor.append(specie_value)
                        
                else:
                    feature_name = references[0]
                    feature_value, feature_median_value, default_specie_value = row[feature_name], median_expression_values[feature_name], initial_value  
                    norm_value = feature_value / feature_median_value    
                    specie_value = norm_value * default_specie_value
                    row_constructor.append(specie_value)
                
            # print(j, specie, len(row_constructor))
                           
        elif PARAM_COMBINATION_METHOD == 'cell_line_specific':
            raise NotImplementedError('cell_line_specific method not implemented yet')
        else:
            raise ValueError('PARAM_COMBINATION_METHOD not recognized')
         
        # append row_constructor to dataset_constructor
    dataset_constructor[model] = row_constructor
    break
    

columns = list(match_rules_df['specie'])
dynamic_features_df = pd.DataFrame.from_dict(dataset_constructor, orient='index', columns=columns)
dynamic_features_df.head()

Unnamed: 0,AKT,AKTpp,AKTpS473,AKTpT308,CDK2,CDK2ccCYCE,CDK2ccCYCEccP21,CDK2ccCYCEccP27,CDK2ccCYCEccP27p,CDK2ccCYCEp,...,RAF,RAFa,RAFi,RB,RBp,RBpp,S6K,S6KpT389,SOS,SOSi
ACH-001113,623.365301,0,0,0,545.040493,500,0,0,0,0,...,465.1835,0,0,610.154184,0,0,593.25771,0,619.154984,0
