In [1]:
### Import dynamic simulations 
from PathLoader import PathLoader
from DataLink import DataLink 
# Build the path loader from the two .env files, then hand it (together with
# the CSV of data codes) to the DataLink accessor that later cells use to
# fetch datasets by code.
# NOTE(review): the exact meaning of each positional argument is defined in
# the project-local PathLoader / DataLink modules — confirm there.
path_loader = PathLoader('data_config.env', 'current_user.env')
TheLink = DataLink(path_loader, 'data_codes.csv')

### Plot a specific species in a specific cell line

In [None]:
# Load the table of dynamic simulations (with verbose loader output).
dynamic_data = TheLink.get_data_from_code('dynamic_simulation_data_all', verbose=True)

# Cell line and species inspected in the next few cells.
cellline, specie = 'ACH-001113', 'INSR'

# Keep only the rows that belong to the chosen cell line.
selected_data = dynamic_data.loc[dynamic_data['Cellline'] == cellline]

selected_data.head()

In [None]:
specie_data = selected_data[specie]
print(specie_data.shape, type(specie_data))
specie_data.head()

In [None]:
# plot the data

import matplotlib.pyplot as plt

# Plot against the simulation time instead of the DataFrame row labels
# (the rows of one cell line do not start at 0 in the full table), and label
# the figure so it can be read on its own.
fig, ax = plt.subplots()
ax.plot(selected_data['Time'], specie_data)
ax.set_xlabel('Time')
ax.set_ylabel(specie)
ax.set_title(f'{specie} in {cellline}')
plt.show()

### Plot all cell lines for one species

In [None]:
# Dynamic simulations of a single protein species across all cell lines:
# keep the two identifier columns plus the species of interest.
specie = 'IRS'
selected_data = dynamic_data.loc[:, ['Cellline', 'Time', specie]]

In [None]:
selected_data.shape

In [None]:
selected_data.head()

In [None]:
# Reshape to wide form: one row per cell line, one column per time point.
pivoted_data = (
    selected_data
    .set_index(['Cellline', 'Time'])[specie]
    .unstack('Time')
)

In [None]:
pivoted_data.head(10)

In [None]:
import matplotlib.pyplot as plt

# Draw one trajectory per cell line on a single, labelled set of axes.
# Each row of pivoted_data is indexed by time, so time ends up on the x axis.
fig, ax = plt.subplots()
for cellline in pivoted_data.index:
    ax.plot(pivoted_data.loc[cellline], label=cellline)

ax.set_xlabel('Time')
ax.set_ylabel(specie)
ax.set_title(f'{specie} across all cell lines')
# Keep the (long) legend outside the axes so it does not cover the curves.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


### Select all species for one cell line

In [None]:
cellline = 'ACH-001113'
selected_data = dynamic_data[dynamic_data['Cellline'] == cellline]

In [None]:
import matplotlib.pyplot as plt

# Overlay the trajectory of every species (all columns after the first two)
# for the selected cell line.
fig, ax = plt.subplots()
time_points = selected_data['Time']
for specie in selected_data.columns[2:]:
    ax.plot(time_points, selected_data[specie])

# No legend is drawn here; one could be added with
# ax.legend(selected_data.columns[2:], bbox_to_anchor=(1.05, 1), loc='upper left')

# Restrict the y axis to 0-500, a visually useful range.
ax.set_ylim(0, 500)
plt.show()

### Calculating Dynamic Simulation Features

In [None]:
import numpy as np 
import matplotlib.pyplot as plt

## calculating AUC and summary features for a specific specie and cell line

specie = 'IRS'
cellline = 'ACH-001113'

selected_data = dynamic_data[dynamic_data['Cellline'] == cellline]
# Re-number the rows from 0 so that idxmax()/idxmin() below return positions
# within this trajectory rather than row labels of the full table.
specie_data = selected_data[specie].reset_index(drop=True)

# plot the data

plt.plot(specie_data)
plt.xlabel('Simulation step')
plt.ylabel(specie)
plt.show()

# calculate AUC with unit spacing between samples.
# np.trapz was renamed np.trapezoid in NumPy 2.0; use whichever is available.

trapezoid = getattr(np, 'trapezoid', None) or np.trapz
auc = trapezoid(specie_data, dx=1)

print('AUC:',auc)

# obtain the max value of the specie

max_value = specie_data.max()

print('Max:',max_value)

# obtain the step at which the max value occurs

max_time = specie_data.idxmax()

print('Max time:',max_time)

# obtain the min value of the specie

min_value = specie_data.min()

print('Min:',min_value)

# obtain the step at which the min value occurs

min_time = specie_data.idxmin()

print('Min time:',min_time)

# mean value of the specie

mean_value = specie_data.mean()

print('Mean:',mean_value)

# median value of the specie

median_value = specie_data.median()

print('Median:',median_value)


# total fold change (TFC) from 0 to end
# NOTE: a start value of 0 makes this inf/NaN rather than raising.

start = specie_data.iloc[0]
end = specie_data.iloc[-1]

tfc = (end - start) / start

print('Total Fold Change:',tfc)

# time to stable value (TSV), a time point t where the value of the specie no longer changes more than 0.01 for all t' > t

tsv = specie_data.shape[0]
change_abs_tolerance = 0.01
difference = specie_data.diff()

# Walk backwards from the end while consecutive samples differ by less than
# the tolerance. difference.iloc[0] is NaN, which never passes the `<` test,
# so the loop stops at tsv == 1 at the latest.
while tsv > 0:
    if abs(difference.iloc[tsv-1]) < change_abs_tolerance:
        tsv = tsv - 1
    else:
        break

print('Time to stable value:',tsv)


In [None]:
# Scale the AUC and the time-based features by the number of simulated points
# in the trajectory (used here as the maximum simulation time).
max_sim_time = len(specie_data)

n_auc, n_max_time, n_min_time, n_tsv = (
    value / max_sim_time for value in (auc, max_time, min_time, tsv)
)

print('Normalised AUC:',n_auc)
print('Normalised max time:',n_max_time)
print('Normalised min time:',n_min_time)
print('Normalised TSV:',n_tsv)


In [None]:
### iterate through all cell lines and species and calculate all dynamic simulation features, return a dataframe

import numpy as np
import pandas as pd

# function to calculate dynamic simulation features for a specie and cell line

def calculate_dynamic_simulation_features(specie, cellline, dynamic_data, normalise_time_based_values=True, change_abs_tolerance=0.01):
    """Summarise one species trajectory of one cell line as nine scalar features.

    Parameters
    ----------
    specie : str
        Column of ``dynamic_data`` holding the simulated values.
    cellline : str
        Value of the ``'Cellline'`` column that selects the trajectory.
    dynamic_data : pandas.DataFrame
        Table with a ``'Cellline'`` column and one column per species. The
        rows of a cell line are used in the order they appear and are treated
        as equally spaced (unit step); no time column is read.
    normalise_time_based_values : bool, default True
        When true, AUC, max time, min time and TSV are divided by the number
        of points in the trajectory.
    change_abs_tolerance : float, default 0.01
        Largest absolute step-to-step change still regarded as "stable" when
        computing the time to stable value.

    Returns
    -------
    list
        ``[auc, max, max_time, min, min_time, mean, median, tfc, tsv]`` where
        ``tfc`` is the total fold change ``(last - first) / first`` and
        ``tsv`` is the time to stable value.

    Raises
    ------
    ValueError
        If no row of ``dynamic_data`` belongs to ``cellline``.
    """
    specie_data = dynamic_data.loc[dynamic_data['Cellline'] == cellline, specie]
    # Re-number from 0 so idxmax()/idxmin() return positions in the trajectory.
    specie_data = specie_data.reset_index(drop=True)

    n_points = specie_data.shape[0]
    if n_points == 0:
        raise ValueError(f'No rows found for cell line {cellline!r} in dynamic_data')

    # Area under the curve with unit spacing. np.trapz was renamed to
    # np.trapezoid in NumPy 2.0; use whichever this NumPy version provides.
    integrate = getattr(np, 'trapezoid', None) or np.trapz
    auc = integrate(specie_data, dx=1)

    # Extremes and the step at which each first occurs.
    max_value = specie_data.max()
    max_time = specie_data.idxmax()
    min_value = specie_data.min()
    min_time = specie_data.idxmin()

    mean_value = specie_data.mean()
    median_value = specie_data.median()

    # Total fold change from the first to the last sample.
    # NOTE: a first value of 0 yields inf/NaN rather than raising.
    start = specie_data.iloc[0]
    end = specie_data.iloc[-1]
    tfc = (end - start) / start

    # Time to stable value: one past the last step whose change from the
    # previous sample is not below the tolerance. The first difference is NaN
    # and therefore always counts as "not stable", so tsv is at least 1.
    not_stable = ~(specie_data.diff().abs() < change_abs_tolerance).to_numpy()
    tsv = int(np.flatnonzero(not_stable)[-1]) + 1

    if normalise_time_based_values:
        # Express AUC and all time-based values relative to the trajectory length.
        auc, max_time, min_time, tsv = (
            value / n_points for value in (auc, max_time, min_time, tsv)
        )

    return [auc, max_value, max_time, min_value, min_time, mean_value, median_value, tfc, tsv]

# Every column after the first two is treated as a species trajectory.
all_species = dynamic_data.columns[2:]
all_celllines = dynamic_data['Cellline'].unique()

dynamic_feature_label = ['auc', 'max', 'max_time', 'min', 'min_time', 'mean', 'median', 'tfc', 'tsv']

# One output row per cell line: the nine features of every species, in
# species order.
new_dataset = []

for c in all_celllines:
    # Filter the full table once per cell line instead of once per
    # (cell line, species) pair; the function then only searches this slice.
    cellline_data = dynamic_data[dynamic_data['Cellline'] == c]
    cellline_dynamic_features = []
    for s in all_species:
        cellline_dynamic_features.extend(calculate_dynamic_simulation_features(s, c, cellline_data))
    new_dataset.append(cellline_dynamic_features)

# Column names follow the '<specie>_<feature>' pattern, in the same order in
# which the values were appended above.
feature_columns = [s + '_' + dynamic_feature for s in all_species for dynamic_feature in dynamic_feature_label]
new_df = pd.DataFrame(new_dataset, columns=feature_columns, index=all_celllines)
new_df.head()


## Visualisation of processed dynamic features vs. original features 

In [None]:
# NOTE: from here on `dynamic_data` holds the table loaded under the
# 'dynamic_features' code (columns named '<specie>_<feature>'), replacing the
# raw simulation table loaded earlier under the same name. Re-run the earlier
# loading cell before going back to the cells above.
dynamic_data = TheLink.get_data_from_code('dynamic_features')
# Expression table; its columns are looked up by gene name in the cells below.
ccle_data = TheLink.get_data_from_code('ccle')

### Automatic Plot Generation 

In [None]:
import matplotlib.pyplot as plt
import os 

# Rules that map each simulated protein to its CCLE gene(s); rows without a
# CCLE reference cannot be plotted and are dropped.
match_rules_file = TheLink.get_data_from_code('integrate_ccle_anthony')
match_rules_files_dropna = match_rules_file.dropna(subset=['CCLE reference'])

dynamic_feature_label = ['auc', 'max', 'max_time', 'min', 'min_time', 'mean', 'median', 'tfc', 'tsv']

# Output folder for the generated figures. exist_ok=True creates it when
# missing without the separate exists() check.
folder_name = 'dynamic_features_ccle_plots'
file_save_path = f'{path_loader.get_data_path()}data/results/{folder_name}/'
os.makedirs(file_save_path, exist_ok=True)

# Protein name -> list of CCLE gene names. References are ';'-separated;
# surrounding whitespace and empty entries (e.g. from a trailing ';') are
# removed so every entry can be used directly as a column name.
species_ccle_matches = {
    specie_name: [gene.strip() for gene in ccle_matches.split(';') if gene.strip()]
    for specie_name, ccle_matches in zip(
        match_rules_files_dropna['Protein Name'],
        match_rules_files_dropna['CCLE reference'],
    )
}
    

# One 3x3 figure per simulated species: each panel shows one dynamic feature
# against the expression of every matched CCLE gene, and is saved to disk
# without being displayed.
# NOTE(review): scatter pairs the two series by position, so ccle_data and
# dynamic_data are assumed to have the same rows in the same order — confirm.
for dynamic_specie, gene_list in species_ccle_matches.items():
    dynamic_features = dynamic_data[[dynamic_specie + '_' + label for label in dynamic_feature_label]]

    fig, axes = plt.subplots(3, 3, figsize=(15, 15))

    for ax, dynamic_feature in zip(axes.flat, dynamic_feature_label):
        feature_values = dynamic_features[dynamic_specie + '_' + dynamic_feature]
        for gene in gene_list:
            ax.scatter(ccle_data[gene], feature_values, alpha=0.5, s=10)
        ax.legend(gene_list, loc='upper left')
        # tick label and axis label sizes
        ax.tick_params(axis='both', which='major', labelsize=12)
        ax.set_xlabel('Gene expression', fontsize=15)
        ax.set_ylabel(dynamic_feature, fontsize=15)

    fig.suptitle(
        f'{dynamic_specie} dynamic features vs gene expression of {gene_list}', fontsize=20)
    fig.tight_layout()
    fig.savefig(f'{file_save_path}{dynamic_specie}_dynamic_features_vs_gene_expression.png')
    # close instead of showing, so figures do not accumulate in memory
    plt.close(fig)



### Visualising One Species vs Two Genes

In [None]:
dynamic_specie = 'CDK46'
ccle_gene = 'CDK6'
ccle_gene_2 = 'CDK4'

In [None]:
dynamic_feature_label = ['auc', 'max', 'max_time', 'min', 'min_time', 'mean', 'median', 'tfc', 'tsv']

# Expression of the two CCLE genes compared against the simulated species.
gene_expression, gene_expression_2 = ccle_data[ccle_gene], ccle_data[ccle_gene_2]

# Feature columns of the chosen species are named '<specie>_<feature>'.
feature_columns = [dynamic_specie + '_' + label for label in dynamic_feature_label]
dynamic_features = dynamic_data[feature_columns]


In [None]:
dynamic_features.head()

In [None]:
# plot all dynamic features against gene expression as a multi-panel plot

import matplotlib.pyplot as plt

fig, axes = plt.subplots(3, 3, figsize=(15,15))

# One panel per dynamic feature, with both genes overlaid in each panel.
for ax, dynamic_feature in zip(axes.flat, dynamic_feature_label):
    feature_values = dynamic_features[dynamic_specie + '_' + dynamic_feature]
    for expression in (gene_expression, gene_expression_2):
        ax.scatter(expression, feature_values, alpha=0.5, s=10)
    ax.legend([ccle_gene, ccle_gene_2], loc='upper left')
    # tick label and axis label sizes
    ax.tick_params(axis='both', which='major', labelsize=12)
    ax.set_xlabel('Gene expression', fontsize=15)
    ax.set_ylabel(dynamic_feature, fontsize=15)

fig.suptitle(f'{dynamic_specie} dynamic features vs gene expression of {ccle_gene} and {ccle_gene_2}', fontsize=20)
fig.tight_layout()
plt.show()

    


### Visualising One Species vs One Gene

In [None]:
dynamic_specie = 'CDK46'
ccle_gene = 'CDK6'

In [None]:
# Only one gene is compared in this section, so only its expression is
# loaded; nothing here depends on the second gene of the previous section.
gene_expression = ccle_data[ccle_gene]
dynamic_feature_label = ['auc', 'max', 'max_time', 'min', 'min_time', 'mean', 'median', 'tfc', 'tsv']

# extract dynamic features for a specie (columns named '<specie>_<feature>')
dynamic_features = dynamic_data[[dynamic_specie + '_' + dynamic_feature for dynamic_feature in dynamic_feature_label]]


In [None]:
dynamic_features.head()

In [None]:
# plot all dynamic features against gene expression as a multi-panel plot

import matplotlib.pyplot as plt

fig, axes = plt.subplots(3, 3, figsize=(15,15))

# One panel per dynamic feature, each plotted against the single chosen gene.
for ax, dynamic_feature in zip(axes.flat, dynamic_feature_label):
    ax.scatter(gene_expression, dynamic_features[dynamic_specie + '_' + dynamic_feature], alpha=0.5, s=10)
    # Only one series is drawn, so the legend names only that gene.
    ax.legend([ccle_gene], loc='upper left')
    # change the x and y axis labels size
    ax.tick_params(axis='both', which='major', labelsize=12)
    ax.set_xlabel('Gene expression', fontsize=15)
    ax.set_ylabel(dynamic_feature, fontsize=15)

fig.suptitle(f'{dynamic_specie} dynamic features vs gene expression of {ccle_gene}', fontsize=20)
fig.tight_layout()
plt.show()

    


### Loading Dynamic Features and Labels

In [2]:
# Unpack the pair returned for this data code into a feature table and a
# label table.
# NOTE(review): this calls `get_data_using_code`, whereas the other cells use
# `get_data_from_code` — confirm both exist on DataLink. Both names are
# assigned again by the GDSC cell at the end of the notebook.
feature_data, label_data = TheLink.get_data_using_code('anthony-ode-gdsc-2-Palbociclib-LN_IC50-default')

## Variance and similarity of dynamic features

Compared against the original features from CCLE expression.

### Random

In [None]:
# Sample metadata; its DepMap_ID and Sanger_Model_ID columns are used in the
# next cell to translate between the two identifier schemes.
ccle_sample_info = TheLink.get_data_from_code('ccle_sample_info')
# The same 'dynamic_features' and 'ccle' codes were already loaded in an
# earlier cell; they are fetched again here.
dynamic_data = TheLink.get_data_from_code('dynamic_features')
ccle_data = TheLink.get_data_from_code('ccle')

In [None]:
# DepMap -> Sanger identifier lookup, without the cell lines that have no
# Sanger model ID.
depmap_to_sanger = (
    ccle_sample_info[['DepMap_ID', 'Sanger_Model_ID']]
    .dropna(subset=['Sanger_Model_ID'])
)
    


In [None]:
# join dynamic features and depmap_to_sanger
#
# Left join (the DataFrame.join default): the value in the 'Unnamed: 0'
# column of each feature row — presumably its DepMap ID, verify against the
# file — is looked up in depmap_to_sanger, adding a 'Sanger_Model_ID' column.
# Rows without a match keep NaN in that column.
dynamic_features = dynamic_data.join(depmap_to_sanger.set_index('DepMap_ID'), on='Unnamed: 0')
dynamic_features.head()

In [None]:
# Drop the original identifier column now that the Sanger ID is attached.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second run no longer raises KeyError.
dynamic_features = dynamic_features.drop(columns=['Unnamed: 0'], errors='ignore')
dynamic_features.head()

In [None]:
# Index the features by Sanger model ID. The guard keeps the cell re-runnable:
# after the first run the column has already become the index.
if 'Sanger_Model_ID' in dynamic_features.columns:
    dynamic_features = dynamic_features.set_index('Sanger_Model_ID')

In [None]:
dynamic_features.head()

In [None]:
gdsc = TheLink.get_data_from_code('gdsc2')

In [None]:
from DataFunctions import create_joint_dataset_from_proteome_gdsc, create_feature_and_label

# Combine the Sanger-indexed dynamic features with the GDSC table for the
# drug 'Palbociclib', then split the result into features and labels.
# NOTE(review): how rows are matched and which column becomes the label is
# defined in the project-local DataFunctions module — confirm there.
# This reassigns feature_data / label_data from the earlier loading cell.
joint_dataset = create_joint_dataset_from_proteome_gdsc('Palbociclib', dynamic_features, gdsc)
feature_data, label_data = create_feature_and_label(joint_dataset)

In [None]:
feature_data.head()

In [None]:
label_data.head()

In [None]:
feature_data.shape