# Session Info

necessary packages to run this notebook

In [None]:
import types


def imports():
    """Yield the ``__name__`` of every module object bound in the global namespace.

    The namespace is snapshotted before iteration. Without the snapshot, binding
    any new global while the generator is being consumed (for example the loop
    variable of ``for m in imports():``) raises
    ``RuntimeError: dictionary changed size during iteration``.

    Yields:
        str: the module's real ``__name__`` (e.g. ``pandas`` for ``pd``); a module
        bound under several aliases is yielded once per alias.
    """
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            yield val.__name__


list(imports())

# Data loading 

methods to pre-cache and load in raw data from the data folder

### Data Pre-processing Guidelines

- **documentation of original download information**
    - original download link 
    - the date of download
    - the doi or bibliography of the linked publication 
    - basic description of the dataset
    - |time-consuming| ideally, a simple description of the methodology used to generate the dataset
        - how were the samples collected?
        - at what time point are the samples lysed? 
        - any further pre-processing steps?  
<br />

- **documentation of dataset(s)** 
    - *the information type represented by the dataset (i.e. gene expression, drug response, etc.)*
    - any supplementary spreadsheet(s) associated with the dataset or metadata
    - sample size (e.g. number of samples, number of genes, etc.)
    - main row and column domains (e.g. genes, samples, drugs, etc.)
    - identifiers used for drug, gene, protein etc. 
    - presence of specific drugs or genes/proteins of interest
        - e.g. CDK4/6 inhibitors: palbociclib, ribociclib and abemaciclib  
<br />  

- **documentation of the pre-processing steps**
    - *the final shape of the processed dataset associated with metadata, e.g. (n_samples, n_genes)*
    - |time-consuming| the technique used to transform the dataset
        - e.g. log2 transformation, z-score normalization, etc.
        - e.g. the method used to impute missing values
        - any removal of data and reasoning (i.e. due to missing values, etc.)
    - index to identifier mapping (e.g. gene index to gene symbol mapping)
        - then, the processed dataset will have indexes matched with a corresponding identifier/symbol 
        - e.g. gene index 0 corresponds to gene symbol A1BG
        - e.g. drug index 0 corresponds to drug palbociclib
        - when performing further filtering, the original index order must be preserved or traced to allow for mapping back to the original identifiers
    - creating a paired dataset from two different datasets
        - e.g. drug response and gene expression
        - e.g. drug response and mutation status
        - e.g. gene expression and mutation status
        - must perform model-to-name mapping between the two datasets and document the mapping logic
            - e.g. models are cell lines, matched by cell line name (no spaces, lower case)
            - e.g. models are cell lines, matched by a common identifier (e.g. Sanger_Model_ID)



### PRISM Screen 24Q2

Description of the dataset

In [1]:
# PathLoader is a project-local helper, configured here from two .env files.
# NOTE(review): presumably these hold the shared data layout and the per-user
# root directory — confirm against the PathLoader module.
from PathLoader import PathLoader
path_loader = PathLoader('data_config.env', 'current_user.env')


In [2]:
# DataLink is a project-local helper built from the path loader and a CSV of data codes.
# NOTE(review): presumably maps dataset codes to files — confirm against the DataLink module.
from DataLink import DataLink
data_link = DataLink(path_loader, 'data_codes.csv')

In [6]:
## Initial Loading of Data

import pandas as pd

# Data root for the current user, as reported by the project's PathLoader.
data_path = path_loader.get_data_path()
# Forward slashes work on Windows and POSIX alike; the previous backslash
# literal only resolved on Windows and relied on invalid escape sequences.
file_path = 'data/drug-response/Repurposing-Public-24Q2/Repurposing_Public_24Q2_Extended_Primary_Data_Matrix.csv'
# The first CSV column is used as the row index.
df = pd.read_csv(data_path + file_path, index_col=0)

In [7]:
df

Unnamed: 0,ACH-000001,ACH-000002,ACH-000004,ACH-000005,ACH-000006,ACH-000007,ACH-000008,ACH-000010,ACH-000011,ACH-000012,...,ACH-002016,ACH-002022,ACH-002023,ACH-002025,ACH-002038,ACH-002039,ACH-002041,ACH-002042,ACH-002046,ACH-002048
BRD:BRD-A00047421-001-01-7,-1.207281,-4.231563,-3.860672,-2.271411,0.277833,-4.011285,-0.615105,-2.625300,-1.827661,-3.335969,...,-1.975287,-2.403487,-0.495741,-3.238121,1.223419,-2.314389,-2.805922,-2.332483,-3.441262,-1.254407
BRD:BRD-A00055058-001-01-0,0.515743,,,,,0.196878,0.347821,,-0.001031,-0.302195,...,,,,,,,,,,
BRD:BRD-A00077618-236-07-6,-0.015577,,,,,-0.095730,0.379480,,0.145346,0.103348,...,,,,,,,,,,
BRD:BRD-A00092689-236-04-9,-0.395123,-0.538376,0.306971,0.035096,-0.124899,-0.861860,-0.137544,-0.206334,0.159798,-0.269911,...,-0.480284,0.156351,0.012775,0.304118,-0.397145,0.202935,-0.335913,-0.334916,-0.343913,0.154135
BRD:BRD-A00100033-001-08-9,-0.449332,,,,,0.257943,-0.596132,,-0.499274,0.063870,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BRD:BRD-U08520523-000-01-0,0.125641,,,,,-0.394254,-0.820576,,-0.352543,,...,,,,,,,,,,
BRD:BRD-U25960968-000-01-9,-0.067407,,,,,-0.275891,-0.323379,,-0.534487,0.540634,...,,,,,,,,,,
BRD:BRD-U45393375-000-01-6,0.448387,,,,,-0.191386,0.035115,,-0.011005,-0.185457,...,,,,,,,,,,
BRD:BRD-U48018661-000-01-9,-0.194616,,,,,0.499452,-0.057839,,0.119944,0.148812,...,,,,,,,,,,


### Depmap Gene Expression 24Q4

In [None]:
# Same PathLoader setup as in the PRISM section; repeated so this section can run on its own.
from PathLoader import PathLoader
path_loader = PathLoader('data_config.env', 'current_user.env')


In [None]:
# Same DataLink setup as in the PRISM section; repeated so this section can run on its own.
from DataLink import DataLink
data_link = DataLink(path_loader, 'data_codes.csv')

In [None]:
## Initial Loading of Data

import pandas as pd

# Data root for the current user, as reported by the project's PathLoader.
data_path = path_loader.get_data_path()
# FIXME: this section is titled "Depmap Gene Expression 24Q4", but the path below
# is the PRISM Repurposing 24Q2 drug-response matrix (copied from the PRISM
# section). Replace it with the gene-expression file once its location is known.
# Forward slashes are used so the path also resolves outside Windows.
file_path = 'data/drug-response/Repurposing-Public-24Q2/Repurposing_Public_24Q2_Extended_Primary_Data_Matrix.csv'
# The first CSV column is used as the row index; note that this rebinds `df`.
df = pd.read_csv(data_path + file_path, index_col=0)

### GDSC 1 

GDSC1 is a drug response dataset, retrieved from [Genomics of Drug Sensitivity in Cancer](http://www.cancerrxgene.org/). The data is stored in the `data/drug-response/GDSC1` folder.

Data Retrieval Date: 2022-06-01

Yang, W., Soares, J., Greninger, P., Edelman, E. J., Lightfoot, H., Forbes, S., Bindal, N., Beare, D., Smith, J. A., Thompson, I. R., Ramaswamy, S., Futreal, P. A., Haber, D. A., Stratton, M. R., Benes, C., McDermott, U., & Garnett, M. J. (2013). Genomics of Drug Sensitivity in Cancer (GDSC): A resource for therapeutic biomarker discovery in cancer cells. Nucleic Acids Research, 41(Database issue), D955–D961. https://doi.org/10.1093/nar/gks1111

#### Methodology

retrieved from [Genomics of Drug Sensitivity in Cancer](https://www.cancerrxgene.org/help#t_curve)

> The GDSC1 dataset was generated jointly by the Wellcome Sanger Institute and Massachusetts General Hospital between 2009 and 2015 using a matched set of cancer cell lines (the GDSC1000).

> Compounds were stored in aliquots at -80°C and were subjected to a maximum of 5 freeze-thaw cycles.

> Cells were seeded in 96-well or 384-well plates and compound dose titrations were delivered using tip based liquid handling apparatus. Cell viability was measured using either Syto60 or Resazurin. Drug treatments in this dataset used two formats:

> - 9-point dose curve incorporating a 2-fold dilution step (256-fold range)
> - 5-point dose curve incorporating a 4-fold dilution step (256-fold range)



In [None]:
## Initial Loading of Data

import pandas as pd

# Forward slashes keep the path valid on every platform and avoid the invalid
# escape sequences of the earlier backslash-separated literal.
gdsc1 = pd.read_excel('data/drug-response/GDSC1/GDSC1_fitted_dose_response_25Feb20.xlsx')
print(gdsc1.head())

In [None]:
# Drug annotation table for GDSC1 (forward slashes keep the path portable).
gdsc1_info = pd.read_csv('data/drug-response/GDSC1/GDSC1_DrugData.csv')

In [None]:
## Cache the GDSC1 frames in a single pickle file

import pickle

# The frames are appended one after the other, so they must be read back in this order.
with open('data/drug-response/GDSC1/cache_gdsc1.pkl', 'wb') as f:
    for _frame in (gdsc1, gdsc1_info):
        pickle.dump(_frame, f)


In [None]:
## Restore the GDSC1 frames from the cache
import pickle

# Read order mirrors the write order (gdsc1 first, then gdsc1_info).
# Only unpickle cache files that this notebook produced itself.
with open('data/drug-response/GDSC1/cache_gdsc1.pkl', 'rb') as f:
    gdsc1, gdsc1_info = pickle.load(f), pickle.load(f)

In [None]:
# investigating the structure of the gdsc1 dataset 

print(gdsc1.head())

In [None]:
print(gdsc1.shape)

In [None]:
print(gdsc1_info.head())

In [None]:
# All GDSC1 response rows whose drug name is exactly 'Palbociclib'.
palbo = gdsc1[gdsc1['DRUG_NAME'].eq('Palbociclib')]

print(palbo.shape)

In [None]:
# All GDSC1 response rows whose drug name is exactly 'Ribociclib'.
ribo = gdsc1[gdsc1['DRUG_NAME'].eq('Ribociclib')]

print(ribo.shape)

In [None]:
# All GDSC1 response rows whose drug name is exactly 'Abemaciclib'.
Abemaciclib = gdsc1[gdsc1['DRUG_NAME'].eq('Abemaciclib')]

print(Abemaciclib.shape)

Technical information
- gdsc1 
- dataset type: drug response
- dataset shape: (310904, 19) (n_measurements, n_features)
- **each row** represents a drug response measurement of a cell line for a given drug
- **each column** are the features of each drug response measurement
- the column `LN_IC50` is the natural logarithm of the fitted IC50 value of the drug response
- the column `AUC` is the area under the curve of the drug response
- the column `DRUG_ID`, `DRUG_NAME` are the **identifiers of the drug**
    - 'DRUG_ID' can be queried to show further information on drug targets from the supplementary spreadsheet 'gdsc1_info.csv' or the `gdsc1_info` object in python 
- the column `COSMIC_ID`, `SANGER_MODEL_ID`, `CELL_LINE_NAME` are the **identifiers of the cell line**
- drug present: **Palbociclib** (cell line n = 901)


pre-processing may not be required on this dataset, however, further pre-processing is needed if paired with other datasets

### CCLE 22Q2

CCLE (Cancer Cell Line Encyclopedia) is a gene expression dataset, retrieved from [Cancer Cell Line Encyclopedia](https://depmap.org/portal/download/all/). Data is pulled with the option 'DepMap Public 22Q2' in the selection menu.

The data is stored in the `data/gene-expression/CCLE_Public_22Q2` folder.

Data Retrieval Date: 2022-06-01

Ghandi, M., Huang, F. W., Jané-Valbuena, J., Kryukov, G. V., Lo, C. C., McDonald, E. R., Barretina, J., Gelfand, E. T., Bielski, C. M., Li, H., Hu, K., Andreev-Drakhlin, A. Y., Kim, J., Hess, J. M., Haas, B. J., Aguet, F., Weir, B. A., Rothberg, M. V., Paolella, B. R., … Sellers, W. R. (2019). Next-generation characterization of the Cancer Cell Line Encyclopedia. Nature, 569(7757), Article 7757. https://doi.org/10.1038/s41586-019-1186-3


#### Methodology

From Ghandi et al, 2019:

> WGS for 329 cell lines and WES for 326 cell lines were performed at the Broad Institute Genomics Platform. Libraries were constructed and sequenced on either an Illumina HiSeq 2000 or Illumina GAIIX, with the use of 101-base-pair (bp) paired-end reads for WGS and 76-bp paired-end reads for WES. Output from Illumina software was processed by the Picard data-processing pipeline to yield BAM files containing well-calibrated, aligned reads. All sample information tracking was performed by automated LIMS messaging.

> Short read alignment and calculation of gene expression
RNA-seq reads were aligned to the GRCh37 build of the human genome reference using STAR 2.4.2a59. The GENCODE v19 annotation was used for the STAR alignment and all other quantifications. Gene level RPKM and read count values were calculated using RNA-SeQC v1.1.860. Exon–exon junction read counts were obtained from STAR. Isoform-level expression in TPM (transcripts per million) was quantified using RSEM v.1.2.22. All methods were run as part of the pipeline developed for the GTEx Consortium (https://gtexportal.org)61.

In [None]:
# importing ccle data

import pandas as pd

# Forward slashes keep the path valid on every platform and avoid the invalid
# escape sequences of the earlier backslash-separated literal.
ccle = pd.read_csv('data/gene-expression/CCLE_Public_22Q2/CCLE_expression.csv')

In [None]:
print(ccle.shape)

print(ccle.describe())

In [None]:
# rename ccle columns

# Two parallel lists, one entry per column of `ccle`:
#   gene_name - the label before the bracket, e.g. 'TSPAN6' from 'TSPAN6 (7105)'
#   entrez    - the id inside the bracket,    e.g. '7105'   from 'TSPAN6 (7105)'
# They are built by appending in column order. The previous version overwrote
# `entrez` through `entrez.index(c)` while iterating over it, which was quadratic
# in the number of columns and depended on every label being unique.
entrez = []
gene_name = []

for c in ccle.columns:
    if c == 'Unnamed: 0':
        # pandas' default label for the unnamed first CSV column (the cell line id)
        entrez.append('CELLLINE')
        gene_name.append('CELLLINE')
        continue

    left, right = c.find('('), c.find(')')
    if left == -1 or right == -1:
        # no '(id)' suffix: keep the label as the gene name, leave the id empty
        entrez.append('')
        gene_name.append(c.strip())
        continue

    # only retain the entrez id in the bracket
    entrez.append(c[left+1:right])
    # strip instead of assuming exactly one space before the bracket
    gene_name.append(c[:left].rstrip())

In [None]:
# Replace the original column labels in place with the gene names built by the
# renaming cell; re-running the renaming cell after this one would therefore
# parse the already-renamed labels.
ccle.columns = gene_name
print(ccle.head())
print(ccle.shape)

In [None]:
# Lookup table pairing each column's gene name with its bracketed id (same order as the ccle columns).
gene_entrez = pd.DataFrame(dict(gene_name=gene_name, entrez=entrez))
print(gene_entrez.head())

In [None]:
import pickle

# Forward slashes so the cache lands at the same location that the
# forward-slash readers elsewhere in this notebook open; with backslashes the
# two literals name different files on non-Windows systems.
# Write order (gene_entrez, then ccle) must match the read order.
with open('data/gene-expression/CCLE_Public_22Q2/ccle_expression.pkl', 'wb') as f:
    pickle.dump(gene_entrez, f)
    pickle.dump(ccle, f)

In [None]:
import pickle

# Forward slashes keep this reader consistent with the other cells that open the
# same cache. Read order mirrors the write order (gene_entrez, then ccle).
# Only unpickle cache files that this notebook produced itself.
with open('data/gene-expression/CCLE_Public_22Q2/ccle_expression.pkl', 'rb') as f:
    gene_entrez = pickle.load(f)
    ccle = pickle.load(f)

In [None]:
print(ccle.shape)

In [None]:
print(ccle.head())

In [None]:
import pandas as pd 

# Cell line annotation sheet shipped alongside the CCLE expression matrix.
ccle_sample_info = pd.read_csv('data/gene-expression/CCLE_Public_22Q2/sample_info.csv')


In [None]:
print(ccle_sample_info.head())

In [None]:
import pickle

# Cache the sample annotation frame as a single-object pickle.
with open('data/gene-expression/CCLE_Public_22Q2/ccle_sample_info.pkl', mode='wb') as f:
    pickle.dump(ccle_sample_info, file=f)

In [None]:
import pickle

# Restore the sample annotation frame; only unpickle caches this notebook wrote itself.
with open('data/gene-expression/CCLE_Public_22Q2/ccle_sample_info.pkl', mode='rb') as f:
    ccle_sample_info = pickle.load(file=f)

Dataset Documentation
- Dataset name: ccle_expression 
- dataset type: gene expression by RNASeq
- dataset shape: (1404, 19222) (n_cells, row_features)
- **each row** represents a gene expression measurement of a cell line
- **each column** after the first column is the specific expression of a gene in a given cell line
- the first column `CELLLINE` is renamed from `Unnamed: 0` (the label pandas assigns to the unnamed first CSV column), represents the cell line `DepMap_ID`, and is the **identifier of the cell line**
- `DepMap_ID` can be queried to show further information on cell line from the supplementary spreadsheet 'ccle_sample_info.csv' or the `ccle_sample_info` object in python, including linkage to other identifiers including `Sanger_Model_ID`

- biologically/clinically implicated genes of note for cdk4/6 inhibitors: #TODO

Pre-processing Documentation
- column renaming was performed: the first column was renamed from `Unnamed: 0` to `CELLLINE`, and the Entrez ids were stripped from the gene column labels and stored in a separate dataframe (`gene_entrez`) as part of data cleaning.

### GDSC 2

GDSC2 is a drug response dataset, retrieved from [Genomics of Drug Sensitivity in Cancer](http://www.cancerrxgene.org/). The data is stored in the `data/drug-response/GDSC2` folder.

In [None]:
## Initial Loading of Data

import pandas as pd

# Forward slashes keep the path valid on every platform and avoid the invalid
# escape sequences of the earlier backslash-separated literal.
gdsc2 = pd.read_excel('data/drug-response/GDSC2/GDSC2_fitted_dose_response_25Feb20.xlsx')
print(gdsc2.head())

In [None]:
# Drug annotation table for GDSC2 (forward slashes keep the path portable).
gdsc2_info = pd.read_csv('data/drug-response/GDSC2/GDSC2_DrugData.csv')



In [None]:
## Cache the GDSC2 frames in a single pickle file

import pickle

# The frames are appended one after the other, so they must be read back in this order.
with open('data/drug-response/GDSC2/cache_gdsc2.pkl', 'wb') as f:
    for _frame in (gdsc2, gdsc2_info):
        pickle.dump(_frame, f)

In [None]:
## Restore the GDSC2 frames from the cache

import pickle

# Read order mirrors the write order (gdsc2 first, then gdsc2_info).
# Only unpickle cache files that this notebook produced itself.
with open('data/drug-response/GDSC2/cache_gdsc2.pkl', 'rb') as f:
    gdsc2, gdsc2_info = pickle.load(f), pickle.load(f)

### Goncalves 2022 Proteomic Cell Paper (n=949)

Data is retrieved from the [Cell publication of the original article under supplemental information](https://www.cell.com/cancer-cell/fulltext/S1535-6108(22)00274-4) on 01-02-2023.

Gonçalves, E., Poulos, R. C., Cai, Z., Barthorpe, S., Manda, S. S., Lucas, N., Beck, A., Bucio-Noble, D., Dausmann, M., Hall, C., Hecker, M., Koh, J., Lightfoot, H., Mahboob, S., Mali, I., Morris, J., Richardson, L., Seneviratne, A. J., Shepherd, R., … Reddel, R. R. (2022). Pan-cancer proteomic map of 949 human cell lines. Cancer Cell, 40(8), 835-849.e8. https://doi.org/10.1016/j.ccell.2022.06.010

The data is stored in the `data/proteomic-expression/goncalves-2022-cell` folder.

This dataset contains the proteomic expression of 949 cell lines. 

#### Methodology

From Gonçalves et al, 2022 (Results page): 

>To construct a pan-cancer proteomic map, proteomes of 949 human cancer cell lines from 28 tissues and more than 40 genetically and histologically diverse cancer types were quantified (Figures 1A and S1A, Table S1). The proteome for each cell line was acquired by DIA-MS from six replicates using a workflow that enables high throughput and minimal instrument downtime (see STAR Methods, Figure S1B). The resulting dataset was derived from 6,864 DIA-MS runs acquired over 10,000 MS h (Table S1), including peptide preparations derived from the human embryonic kidney cell line HEK293T that were used throughout all data acquisition periods and instruments for quality control. These data, together with the spectral library, were deposited in the Proteomics Identification Database (Perez-Riverol et al., 2019) with dataset identifier PXD030304. Raw DIA-MS data were processed with DIA-NN (Demichev et al., 2020), using retention time-dependent normalization and with a spectral library generated by DIA-NN. For full details of data processing steps and parameters, see STAR Methods and Table S1. MaxLFQ (Cox et al., 2014) was then used to quantify a total of 8,498 proteins (Table S2, Figure S1C), with a median of 5,237 proteins (min-max range: 2,523–6,251) quantified per cell line (Table S1, Figure 1A).

For more detailed information on the methodology, see the STAR Methods section of the paper. In brief, protein expression was measured using DIA-MS, and the data was processed using DIA-NN and quantified using MaxLFQ. Then, data was further processed by log2 transformation. 

For more information on MaxLFQ, see the [Cox et al, 2014](https://www.sciencedirect.com/science/article/pii/S1535947620333107). 

In [None]:
import pandas as pd

# loading in the proteomic data

# Forward slashes keep the path valid on every platform and avoid the invalid
# escape sequences of the earlier backslash-separated literal.
main_file = pd.ExcelFile('data/proteomic-expression/goncalves-2022-cell/goncalves-2022-cell-949-protein-matrix.xlsx')
print(main_file.sheet_names)

# header=1: the sheet's second row (0-based index 1) supplies the column labels.
full_protein_matrix = pd.read_excel(main_file, 'Full protein matrix', header=1)
full_protein_matrix.head(2)




In [None]:
# Second sheet of the same workbook; as for the full matrix, the sheet's second row holds the column labels.
sin_peptile_exclusion_matrix = pd.read_excel(
    main_file,
    sheet_name='Prot matrix excl single-peptide',
    header=1,
)
sin_peptile_exclusion_matrix.head(2)

In [None]:
print(full_protein_matrix.shape)
print(sin_peptile_exclusion_matrix.shape)

In [None]:
# loading in the sample info for the proteomic data

# Forward slashes keep the path valid on every platform and avoid the invalid
# escape sequences of the earlier backslash-separated literal.
info_file = pd.ExcelFile('data/proteomic-expression/goncalves-2022-cell/goncalves-2022-cell-949-sample-info.xlsx')
print(info_file.sheet_names)

# header=1: the sheet's second row (0-based index 1) supplies the column labels.
goncalve_cell_line_info = pd.read_excel(info_file, 'Cell line level sample info', header=1)

goncalve_cell_line_info.head(1)


In [None]:
print(goncalve_cell_line_info.shape)

In [None]:
# cache both protein matrices and the cell line sample info in one pickle file

import pickle

# Objects are appended back-to-back; readers must load them in this same order.
with open('data/proteomic-expression/goncalves-2022-cell/goncalve_proteome.pkl', 'wb') as f:
    for _frame in (full_protein_matrix, sin_peptile_exclusion_matrix, goncalve_cell_line_info):
        pickle.dump(_frame, f)

In [None]:
# restore both protein matrices and the cell line sample info from the cache

import pickle

# Three consecutive loads, in the order the objects were written.
# Only unpickle cache files that this notebook produced itself.
with open('data/proteomic-expression/goncalves-2022-cell/goncalve_proteome.pkl', 'rb') as f:
    full_protein_matrix, sin_peptile_exclusion_matrix, goncalve_cell_line_info = (
        pickle.load(f),
        pickle.load(f),
        pickle.load(f),
    )

In [None]:
full_protein_matrix.head(2)

Dataset Documentation
- Dataset name: full_protein_matrix

- dataset type: proteomic expression by DIA-MS

- dataset shape: (949, 8498) (n_cells, row_quantified_protein_expression). Note that each protein expression value is already log2 transformed.

- **each row** represents a proteomic measurement of a cell line

- **each column** after the first column is the specific expression of a protein in a given cell line

- the first column `Project_Identifier` is the **identifier of the cell line**; it can be translated to `model_id` using the `model_id` column of the sample info spreadsheet or of the python object `goncalve_cell_line_info`. `model_id` appears to be consistent with the Sanger model ID format. 





In [None]:
# preprocess both matrices by replacing every missing value with 0
# (each name is rebound to a new, filled frame)

full_protein_matrix, sin_peptile_exclusion_matrix = (
    full_protein_matrix.fillna(0),
    sin_peptile_exclusion_matrix.fillna(0),
)

In [None]:
import pickle

# Cache the zero-filled matrices together with the sample info; read back in this same order.
with open('data/proteomic-expression/goncalves-2022-cell/goncalve_proteome_fillna.pkl', 'wb') as f:
    for _frame in (full_protein_matrix, sin_peptile_exclusion_matrix, goncalve_cell_line_info):
        pickle.dump(_frame, f)

In [None]:
# restore the zero-filled matrices and the sample info from the cache

import pickle

# Three consecutive loads, in the order the objects were written.
# Only unpickle cache files that this notebook produced itself.
with open('data/proteomic-expression/goncalves-2022-cell/goncalve_proteome_fillna.pkl', 'rb') as f:
    full_protein_matrix, sin_peptile_exclusion_matrix, goncalve_cell_line_info = (
        pickle.load(f),
        pickle.load(f),
        pickle.load(f),
    )

### STRING Database for Protein-Protein Interactions

In [None]:
import pandas as pd

# loading in the STRING protein-protein interaction links (space-delimited, gzip-compressed)

# Forward slashes keep the path valid on every platform and avoid the invalid
# escape sequences of the earlier backslash-separated literal.
string_df = pd.read_csv("data/protein-interaction/STRING/9606.protein.links.detailed.v11.5.txt.gz", delimiter=' ')

In [None]:
string_df.head()

In [None]:
import numpy as np

# raw STRING combined scores, one value per interaction row
exp_vals = string_df['combined_score'].values

# summary of the full score distribution: mean, std, max, min
for _stat in (np.mean, np.std, np.max, np.min):
    print(_stat(exp_vals))

# keep only the scores strictly above 900
exp_vals = exp_vals[exp_vals > 900]

# mean and std of the retained scores, followed by how many remain
for _stat in (np.mean, np.std):
    print(_stat(exp_vals))

print(len(exp_vals))


In [None]:
# Tab-delimited STRING protein info table (forward slashes keep the path portable).
string_df_info = pd.read_csv("data/protein-interaction/STRING/9606.protein.info.v11.5.txt.gz", delimiter='\t')

In [None]:
string_df_info.head()

In [None]:
# Tab-delimited STRING alias table (forward slashes keep the path portable).
string_df_alias = pd.read_csv("data/protein-interaction/STRING/9606.protein.aliases.v11.5.txt.gz", delimiter='\t')

In [None]:
string_df_alias.tail(10)

In [None]:
# cache the STRING links, info and alias frames in one pickle file

import pickle

# Objects are appended back-to-back; readers must load them in this same order.
with open('data/protein-interaction/STRING/string_df.pkl', 'wb') as f:
    for _frame in (string_df, string_df_info, string_df_alias):
        pickle.dump(_frame, f)

In [None]:
# restore the STRING links, info and alias frames from the cache

import pickle

# Three consecutive loads, in the order the objects were written.
# Only unpickle cache files that this notebook produced itself.
with open('data/protein-interaction/STRING/string_df.pkl', 'rb') as f:
    string_df, string_df_info, string_df_alias = (
        pickle.load(f),
        pickle.load(f),
        pickle.load(f),
    )

In [None]:
# Reload every cached input needed for the STRING mapping below.
# Only unpickle cache files that this notebook produced itself.

import pickle

# zero-filled proteome matrices + cell line sample info (three objects, in write order)
with open('data/proteomic-expression/goncalves-2022-cell/goncalve_proteome_fillna.pkl', 'rb') as f:
    full_protein_matrix, sin_peptile_exclusion_matrix, goncalve_cell_line_info = (
        pickle.load(f),
        pickle.load(f),
        pickle.load(f),
    )

# CCLE gene-name/entrez lookup, then the expression matrix (two objects, in write order)
with open('data/gene-expression/CCLE_Public_22Q2/ccle_expression.pkl', 'rb') as f:
    gene_entrez, ccle = pickle.load(f), pickle.load(f)

# CCLE sample info (single object)
with open('data/gene-expression/CCLE_Public_22Q2/ccle_sample_info.pkl', 'rb') as f:
    ccle_sample_info = pickle.load(f)


In [None]:
sin_peptile_exclusion_matrix.head()

In [None]:
# Column labels of the single-peptide-excluded matrix; the mapping cells below
# split each label on ';' to obtain the lookup key.
columns_protein = sin_peptile_exclusion_matrix.columns

# Show one example label (index 1) to inspect its format.
columns_protein[1]

#### Core functions

In [None]:
import pandas as pd 
import numpy as np

def get_protein_id_by_name(name: str, info: pd.DataFrame, alias: pd.DataFrame, 
                           absolute_match = True,
                           edit_distance=1):
    """Resolve a protein/gene name to a single STRING protein id.

    The lookup is case-insensitive and runs in two stages:

    1. rows of ``info`` whose ``preferred_name`` equals ``name``;
    2. only if stage 1 found no row, rows of ``alias`` whose ``alias`` equals ``name``.

    In either stage the id is returned when all matching rows agree on one
    ``#string_protein_id``. Previously the ``info`` stage rejected duplicated
    rows even when they carried the same id, unlike the ``alias`` stage; both
    stages now apply the same rule.

    Args:
        name: name to look up.
        info: frame with ``preferred_name`` and ``#string_protein_id`` columns.
        alias: frame with ``alias`` and ``#string_protein_id`` columns.
        absolute_match: accepted for backward compatibility; currently unused.
        edit_distance: accepted for backward compatibility; currently unused.

    Returns:
        The matching id, or ``None`` when nothing matches or when the matching
        rows disagree on the id (a warning is printed in the latter case).
    """
    needle = name.lower()

    # Stage 1: preferred names.
    info_ids = info.loc[info['preferred_name'].str.lower() == needle, '#string_protein_id']
    if not info_ids.empty:
        distinct_ids = info_ids.unique()
        if len(distinct_ids) == 1:
            return distinct_ids[0]
        print('Warning: more than one id found for the given name')
        return None

    # Stage 2: aliases, consulted only when no preferred name matched.
    alias_ids = alias.loc[alias['alias'].str.lower() == needle, '#string_protein_id']
    if alias_ids.empty:
        return None

    distinct_ids = alias_ids.unique()
    if len(distinct_ids) == 1:
        return distinct_ids[0]

    print('Warning: more than one id found for the given name (alias)')
    print(alias_ids)
    return None



def get_protein_interactors(id: str, string_df: pd.DataFrame, score_threshold=900):
    """Return the interaction partners of a protein above a score threshold.

    Both conditions are evaluated on the full frame and combined positionally.
    The earlier version filtered the ``protein1`` subset with a mask computed on
    the whole frame, which relied on implicit index alignment and broke on
    frames with a non-unique index.

    Args:
        id: value to match in the ``protein1`` column.
        string_df: frame with ``protein1``, ``protein2`` and ``combined_score`` columns.
        score_threshold: rows are kept only when ``combined_score`` is strictly greater.

    Returns:
        list: the ``protein2`` values of the retained rows, in frame order.
    """
    is_source = (string_df['protein1'] == id).to_numpy()
    is_confident = (string_df['combined_score'] > score_threshold).to_numpy()
    # plain numpy masks: selection is by position, so index labels never matter
    interactors = string_df.loc[is_source & is_confident, 'protein2']
    return interactors.values.tolist()

def get_protein_name_by_id(id: str, relation_df: pd.DataFrame, field_name: str, check_field_name: str = '#string_protein_id'):
    """Look up ``field_name`` for the first row whose ``check_field_name`` equals ``id``.

    Args:
        id: value to search for in the ``check_field_name`` column.
        relation_df: frame holding both columns.
        field_name: column whose value is returned.
        check_field_name: column compared against ``id``.

    Returns:
        The ``field_name`` value of the first matching row, or ``None`` when no row matches.
    """
    matching_rows = relation_df[relation_df[check_field_name] == id]
    hits = matching_rows[field_name]
    if hits.empty:
        return None
    return hits.values[0]


def run_test_get_protein_id_by_name():
    """Print id lookups for a few hand-picked names, proteome columns and CCLE genes.

    Reads the notebook globals ``string_df_info``, ``string_df_alias``,
    ``columns_protein`` and ``gene_entrez``; nothing is asserted or returned.
    """
    # hand-picked spellings: exact, different casing, and a misspelling
    print(get_protein_id_by_name('HSP90AA1', string_df_info, string_df_alias))
    print(get_protein_id_by_name('HSP90Aa1', string_df_info, string_df_alias, absolute_match=False))
    print(get_protein_id_by_name('HSP90A1', string_df_info, string_df_alias, absolute_match=False))

    # first ten proteome column labels, keyed by the part before ';'
    for column_label in columns_protein[:10]:
        key = column_label.split(';')[0]
        print(key, get_protein_id_by_name(key, string_df_info, string_df_alias))

    # first ten CCLE gene names
    for gene in gene_entrez['gene_name'][:10]:
        print(gene, get_protein_id_by_name(gene, string_df_info, string_df_alias, absolute_match=False))

def run_test_get_protein_interactors():
    """Print the interactors of HSP90AA1 at the default threshold and at 700.

    Reads the notebook globals ``string_df``, ``string_df_info`` and
    ``string_df_alias``; nothing is asserted or returned.
    """
    test_id = get_protein_id_by_name('HSP90AA1', string_df_info, string_df_alias)
    print(test_id)

    # first pass uses the function's default threshold, second pass lowers it to 700
    for threshold_kwargs in ({}, {'score_threshold': 700}):
        interactors = get_protein_interactors(test_id, string_df, **threshold_kwargs)
        print(interactors)
        print(len(interactors))
        # preferred names of the first five partners
        for partner_id in interactors[:5]:
            print(get_protein_name_by_id(partner_id, string_df_info, 'preferred_name'))

def run_test_get_protein_name_by_id():
    """Print the proteome column labels mapped to CDK4 and to its STRING interactors.

    Reads the notebook globals ``string_df``, ``string_df_info`` and
    ``string_df_alias`` and loads the pickled ``goncalve_to_string_id_df``
    mapping from disk, so that file must already exist. Nothing is asserted
    or returned.
    """
    import pickle

    test_id = get_protein_id_by_name('CDK4', string_df_info, string_df_alias)
    print(test_id)

    # Forward slashes match the path the mapping is written to elsewhere in this
    # notebook and keep the lookup working outside Windows.
    # Only unpickle files that this notebook produced itself.
    with open('data/protein-interaction/STRING/goncalve_to_string_id_df.pkl', 'rb') as f:
        goncalve_to_string_id_df = pickle.load(f)

    name = get_protein_name_by_id(test_id, goncalve_to_string_id_df, 'goncalve_protein_id', check_field_name='string_protein_id')
    print(name)

    interactors = get_protein_interactors(test_id, string_df)
    print(interactors)
    print(len(interactors))

    # translate each interactor id back to a proteome column label, dropping unmapped ids
    gon_ids = []
    for interactor_id in interactors:
        label = get_protein_name_by_id(interactor_id, goncalve_to_string_id_df,
                                       'goncalve_protein_id',
                                       check_field_name='string_protein_id')
        if label is not None:
            gon_ids.append(label)

    print(gon_ids)
    print(len(gon_ids))
    



# run_test_get_protein_interactors()

# NOTE(review): this call unpickles goncalve_to_string_id_df.pkl, which is written
# by the `to_pickle` cell in the "Create link between ccle and goncalves" section
# further down; on a fresh data folder it fails until that cell has been run once.
run_test_get_protein_name_by_id()

#### Create link between ccle and goncalves

In [None]:
# build the lookup table: proteome column label -> STRING protein id

data = []
miss_count = 0 
for i, proteo_id in enumerate(columns_protein):
    protein_str_list = proteo_id.split(';')
    # labels without a ';' are not protein columns and are skipped silently
    if len(protein_str_list) <= 1:
        continue

    protein_id = protein_str_list[0]
    protein_name = protein_str_list[1].split('_')[0]
    # the lookup key is the part before ';'
    string_id = get_protein_id_by_name(protein_id, string_df_info, string_df_alias)

    if string_id is None:
        miss_count += 1
        print(f'protein_id: {proteo_id}, string_id: {string_id}, protein_name: {protein_name} not found')
        continue

    data.append([proteo_id, string_id, protein_id, protein_name])
    print(f'iteration {i} protein_id: {proteo_id}, string_id: {string_id}, protein_name: {protein_name}, missing so far {miss_count}')

goncalve_to_string_id_df = pd.DataFrame(data, columns=['goncalve_protein_id', 'string_protein_id', 'protein_id', 'protein_name'])





In [None]:
# Coverage is measured against the labels the mapping loop actually tried to map:
# it skips every label without a ';', so those must not inflate the denominator.
n_mappable_columns = sum(1 for label in columns_protein if len(label.split(';')) > 1)
print(f'Size of original proteome: {len(columns_protein)} Size of goncalve_to_string_id_df: {len(goncalve_to_string_id_df)}')
if n_mappable_columns:
    print(f'Percentage of proteins with string id: {len(goncalve_to_string_id_df)/n_mappable_columns*100:.2f}%')
else:
    # nothing was eligible for mapping; avoid dividing by zero
    print('Percentage of proteins with string id: n/a (no mappable columns)')

In [None]:
goncalve_to_string_id_df.head()

In [None]:
# Persist the proteome -> STRING id mapping; run_test_get_protein_name_by_id()
# loads a pickle with this file name.
goncalve_to_string_id_df.to_pickle('data/protein-interaction/STRING/goncalve_to_string_id_df.pkl')

In [None]:
# build the lookup table: CCLE gene name -> STRING protein id

data = []
miss_count = 0
for i, gene in enumerate(gene_entrez['gene_name']):
    # 'CELLLINE' is the identifier column, not a gene
    if gene == 'CELLLINE':
        continue

    string_id = get_protein_id_by_name(gene, string_df_info, string_df_alias, absolute_match=False)

    if string_id is None:
        miss_count += 1
        print(f'gene: {gene}, string_id: {string_id} not found')
        continue

    data.append([gene, string_id])
    print(f'iteration {i} gene: {gene}, string_id: {string_id}, missing so far {miss_count}')

ccle_to_string_id_df = pd.DataFrame(data, columns=['gene_name', 'string_protein_id'])

In [None]:
ccle_to_string_id_df.head()

In [None]:
# Persist the CCLE gene name -> STRING id mapping next to the other STRING files.
ccle_to_string_id_df.to_pickle('data/protein-interaction/STRING/ccle_to_string_id_df.pkl')

In [None]:
# `gene_entrez` also holds the 'CELLLINE' identifier entry, which the mapping loop
# skips; count genes only so the size and the percentage are not off by one.
n_ccle_genes = int((gene_entrez['gene_name'] != 'CELLLINE').sum())
print(f'Size of original ccle: {n_ccle_genes} Size of ccle_to_string_id_df: {len(ccle_to_string_id_df)}')
if n_ccle_genes:
    print(f'Percentage of genes with string id: {len(ccle_to_string_id_df)/n_ccle_genes*100:.2f}%')
else:
    # no genes present; avoid dividing by zero
    print('Percentage of genes with string id: n/a (no genes)')

### PDE Ribociclib Data (Sungyoung)

Type: Drug response dataset (single drug: ribociclib)

Source: in-house data of external collaborators

Data is stored in the `data/drug-response/PDE_Ribociclib_ExtInHouse` folder.

##### Data Description and Methods

Dataset has multiple excel files, each containing multiple excel sheets. Drug response is measured by percentage (%) decrease of Ki67 positivity versus control. Ki67 is known to play a role in cell proliferation (Soliman et al, 2016). Responders are defined as cells with a decrease of Ki67 positivity of at least 50% OR 25% versus control. Two doses of ribociclib were tested, 100 nM and 500 nM.

From a brief visual inspection of the data, it appears the `datamatrix` sheet from both `response_mimi` and `response_ml_training_data` are the same and refers to "responders" of the 100 nM ribociclib treatment with 25% Ki67 positivity decrease versus control. Proteomic expression data were analyzed using Spectronaut 8 and quantified using MaxQuant Version 1.5.2.8 (Nguyen et al, 2018).  

Soliman, N. A., & Yussif, S. M. (2016). Ki-67 as a prognostic marker according to breast cancer molecular subtype. Cancer Biology & Medicine, 13(4), 496–504. https://doi.org/10.20892/j.issn.2095-3941.2016.0066

Nguyen, E. V., Centenera, M. M., Moldovan, M., Das, R., Irani, S., Vincent, A. D., Chan, H., Horvath, L. G., Lynn, D. J., Daly, R. J., & Butler, L. M. (2018). Identification of Novel Response and Predictive Biomarkers to Hsp90 Inhibitors Through Proteomic Profiling of Patient-derived Prostate Tumor Explants *. Molecular & Cellular Proteomics, 17(8), 1470–1486. https://doi.org/10.1074/mcp.RA118.000633






In [None]:
import pandas as pd 

# Open the training-data workbook and list its sheets.
# Forward slashes are used so the path resolves on every OS and contains no
# invalid escape sequences ('\d', '\P', '\R'); this matches the pickle paths used below.
main_file = pd.ExcelFile('data/drug-response/PDE_Ribociclib_ExtInHouse/Ribociclib_Response_training_data_with_all.xlsx')
print(main_file.sheet_names)

In [None]:
# Open the response workbook; forward slashes keep the path portable and free of
# invalid escape sequences ('\d', '\P', '\R').
pde_drug_response_full = pd.ExcelFile('data/drug-response/PDE_Ribociclib_ExtInHouse/Ribociclib_Response_Mimi.xlsx')
print(pde_drug_response_full.sheet_names)

# Load the 'Response groups' sheet, taking the column labels from row index 1 (0-based)
pde_response_all = pd.read_excel(pde_drug_response_full, 'Response groups', header=1)
print(pde_response_all.head(2))

# Rows without a 'Sample ID' cannot be matched to a sample later on, so drop them
pde_response_all = pde_response_all.dropna(subset=['Sample ID'])



In [None]:
# Read the 'datamatrix' sheet with header=1 (0-based row index 1 supplies the column labels)
ribociclib_response = pd.read_excel(main_file, 'datamatrix', header=1)
print(ribociclib_response.head(2))

# The first data row, minus the two leading columns, supplies the 'pde' values
# (presumably the PDE sample identifiers -- confirm against the spreadsheet)
pde = ribociclib_response.iloc[0].tolist()[2:]

# The first two characters of every column label are kept as the response label;
# the two leading columns are skipped here as well so both lists stay aligned
response = [label[:2] for label in ribociclib_response.columns][2:]

pde_response = pd.DataFrame({'pde': pde, 'response': response})
print(pde_response.head(2))

In [None]:
# Read the same 'datamatrix' sheet again, this time with header=2 (0-based row index 2
# supplies the column labels)
ribociclib_expression = pd.read_excel(main_file, 'datamatrix', header=2)

# Keep the protein accession <-> gene name pairs before the accession column is removed
ribociclib_protein_id_to_name = ribociclib_expression[['PG.ProteinAccessions', 'GeneName']]
print(ribociclib_protein_id_to_name.head(2))

# Transpose so the former columns become rows, then promote the 'GeneName' row
# to the column labels and remove it from the data
ribociclib_expression = ribociclib_expression.drop(columns=['PG.ProteinAccessions']).T
ribociclib_expression.columns = ribociclib_expression.iloc[0]
ribociclib_expression = ribociclib_expression.drop(index=['GeneName'])
print(ribociclib_expression.head(2))

In [None]:
# pickle the ribociclib_response and ribociclib_expression, and ribociclib_protein_id_to_name, pde_response and pde_response_all

import pickle

# The five objects are written back-to-back into a single file; any reader must
# call pickle.load the same number of times and in this same order.
with open('data/drug-response/PDE_Ribociclib_ExtInHouse/ribociclib_pde_cleaned.pkl', 'wb') as f:
    pickle.dump(ribociclib_response, f)
    pickle.dump(ribociclib_expression, f)
    pickle.dump(ribociclib_protein_id_to_name, f)
    pickle.dump(pde_response, f)
    pickle.dump(pde_response_all, f)

In [None]:
# load the ribociclib_response and ribociclib_expression, and ribociclib_protein_id_to_name, pde_response and pde_response_all

import pickle

# Successive pickle.load calls return the objects in the order they were dumped,
# so the assignment order here mirrors the dump order used when the file was written.
with open('data/drug-response/PDE_Ribociclib_ExtInHouse/ribociclib_pde_cleaned.pkl', 'rb') as f:
    ribociclib_response = pickle.load(f)
    ribociclib_expression = pickle.load(f)
    ribociclib_protein_id_to_name = pickle.load(f)
    pde_response = pickle.load(f)
    pde_response_all = pickle.load(f)

In [None]:
pde_response_all.head()

In [None]:
ribociclib_expression.head()

In [None]:
# Single-column response table ('%Decrease 100') indexed by 'Sample ID', ready to be
# index-joined with ribociclib_expression
pde_decrease_100 = pde_response_all.set_index('Sample ID')[['%Decrease 100']]


In [None]:
pde_decrease_100.head()

In [None]:
ribociclib_expression.head()

In [None]:
# print(pde_response_all.index)
# print(ribociclib_expression.index)

ribociclib_pde_decrease100_expression = pde_decrease_100.join(ribociclib_expression, how='inner')

# print(ribociclib_pde_whole_dataset.head(2))

# ribociclib_pde_whole_dataset.to_pickle('data/preprocessed/ribociclib_pde_whole_dataset.pkl')

In [None]:
ribociclib_pde_decrease100_expression.head()

In [None]:
# Single-column table of the '25% decrease cutoff' values indexed by 'Sample ID',
# inner-joined on the index with the expression matrix
pde_decrease_100_cutoff25 = pde_response_all.set_index('Sample ID')[['25% decrease cutoff']]
ribociclib_pde_decrease100_cutoff25_expression = pde_decrease_100_cutoff25.join(ribociclib_expression, how='inner')


In [None]:
ribociclib_pde_decrease100_cutoff25_expression.head()

In [None]:
# Two frames are stored in one file: the '25% decrease cutoff' table first, then the
# '%Decrease 100' table. Readers must call pickle.load twice, in that order.
with open('data/preprocessed/ribociclib_pde_decrease100.pkl', 'wb') as f:
    pickle.dump(ribociclib_pde_decrease100_cutoff25_expression, f)
    pickle.dump(ribociclib_pde_decrease100_expression, f)

This dataset can now be used for drug response prediction.

# Data Integration

## Integration of GDSC2 and CCLE dataset 

### Steps 
1. GDSC2 contains drug data, each drug can be converted into a chemical structure, and the chemical structure can be converted into a SMILES string, or a fingerprint. This represents drug features.
2. CCLE contains gene expression data, each gene can be converted into a gene feature. This represents gene features.
3. The drug features and gene features can be combined to create a drug-gene interaction feature. This represents drug-gene interaction features.
4. Each drug-cell pair is a row in the new dataset (drug-gene interaction features + drug response). 
5. First set of columns should be drug features, second set of columns should be gene features. Output vector should be drug response.

In [None]:
import pandas as pd
import pickle

# Load the cached GDSC2 drug response data; the file holds two pickled objects,
# read here in order as `gdsc2` and `gdsc2_info`

with open('data/drug-response/GDSC2/cache_gdsc2.pkl', 'rb') as f:
    gdsc2 = pickle.load(f)
    gdsc2_info = pickle.load(f)

# Load the cached CCLE gene expression data; two pickled objects, read in order
# as `gene_entrez` and `ccle`

with open('data/gene-expression/CCLE_Public_22Q2/ccle_expression.pkl', 'rb') as f:
    gene_entrez = pickle.load(f)
    ccle = pickle.load(f)

# Load the cached CCLE sample info (single pickled object)

with open('data/gene-expression/CCLE_Public_22Q2/ccle_sample_info.pkl', 'rb') as f:
    ccle_sample_info = pickle.load(f)


### Drug ID/Name to Drug Target identification

In [None]:
gdsc2.head()

In [None]:
gdsc2_info.head()

Drug features are further preprocessed by retrieving the SMILES string for the PubChem ID linked in GDSC2, and converting the SMILES string into a Morgan fingerprint using RDKit.

[Refs needed]

In [None]:
# Build the drug name -> PubChem id table from gdsc2_info (one row per distinct pair)
drug_pubchem = gdsc2_info[['drug_name', 'pubchem']].drop_duplicates()

# Entries written as 'none' are mapped to the '-' placeholder, which is filtered out below
drug_pubchem.loc[drug_pubchem['pubchem'] == 'none', 'pubchem'] = '-'

# Manual overrides for two drugs:
# https://pubchem.ncbi.nlm.nih.gov/compound/44259, accessed 09-02-2023
drug_pubchem.loc[drug_pubchem['drug_name'] == 'Staurosporine', 'pubchem'] = 44259
# https://pubchem.ncbi.nlm.nih.gov/compound/457193, accessed 09-02-2023
drug_pubchem.loc[drug_pubchem['drug_name'] == 'Dactinomycin', 'pubchem'] = 457193

# Remove the drugs that have no PubChem id
drug_pubchem = drug_pubchem[drug_pubchem['pubchem'] != '-']

# Rows whose id field lists several comma-separated ids. `== True` turns the NaN that
# .str.contains yields for non-string entries (the integer overrides above) into False.
has_multiple_ids = drug_pubchem['pubchem'].str.contains(",") == True
multiples = drug_pubchem[has_multiple_ids]

# Keep only the first id of each comma-separated list
drug_pubchem.loc[has_multiple_ids, 'pubchem'] = drug_pubchem.loc[has_multiple_ids, 'pubchem'].str.split(",").str[0]

# Truncating the lists can create new duplicate rows, so de-duplicate again
drug_pubchem = drug_pubchem.drop_duplicates()

pubchem_list = list(drug_pubchem['pubchem'])

In [None]:
import pubchempy as pcp

# Use pubchempy to retrieve one SMILES string per PubChem id.
# smiles_list is kept the same length and order as pubchem_list so it can be
# paired with drug_pubchem['drug_name'] below.

smiles_list = []

for pubchem in pubchem_list:
    try: 
        compound = pcp.Compound.from_cid(pubchem)
        smiles = compound.isomeric_smiles
        smiles_list.append(smiles)
    except Exception as e:
        # Best effort: report which drug failed and why, then store an empty
        # placeholder so the list stays aligned with the rows of drug_pubchem.
        failed_names = drug_pubchem.loc[drug_pubchem['pubchem'] == pubchem, 'drug_name'].tolist()
        print(f'SMILES lookup failed for {failed_names} (pubchem id {pubchem!r}): {e!r}')
        smiles_list.append('')

# generate a dataframe between drug name and smiles string

drug_smiles = pd.DataFrame({'drug_name': drug_pubchem['drug_name'], 'smiles': smiles_list})
print(drug_smiles.head())

# compare the size of drug_smiles with the size of the original gdsc2_info table
print(drug_smiles.shape, gdsc2_info.shape)


In [None]:
# pickle the drug_smiles and drug_pubchem, both have been modified and cleaned

import pickle

with open('data/drug-response/GDSC2/gdsc2_drug_smiles.pkl', 'wb') as f:
    pickle.dump(drug_smiles, f)

with open('data/drug-response/GDSC2/gdsc2_drug_pubchem.pkl', 'wb') as f:
    pickle.dump(drug_pubchem, f)

In [None]:
# Using RDKit to generate molecular fingerprints from GDSC2 drug names

from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np

# generate a list of rdkit mol objects from the smiles string
# NOTE(review): drug_smiles['smiles'] may contain '' placeholders for failed PubChem
# lookups, and Chem.MolFromSmiles presumably returns None for strings it cannot parse,
# which would make the fingerprint call below fail -- confirm every entry parses.
mol_list = [Chem.MolFromSmiles(smiles) for smiles in drug_smiles['smiles']]
# print(mol_list[0])

# generate a list of fingerprints from the rdkit mol objects
# (called with 2 and nBits=1024; per the RDKit signature this is presumably
# radius 2 and a 1024-bit vector -- TODO confirm)
fp_list = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024) for mol in mol_list]


# only the first fingerprint is converted to a numpy array here
fpnp = np.array(fp_list[0])

# TODO: require further preprocessing documentation for the code above 

Retrieve the gene features from CCLE, and convert the gene features into a gene expression matrix.

In [None]:
print(ccle.head())

### Selecting Specific Drugs 

Matching a specific drug from GDSC2 and gathering the available genomic data from CCLE.

#### Ribociclib as an example - Playground

In [None]:
# Exploratory walk-through: build a (gene expression -> LN_IC50) dataset for one drug
# by linking GDSC2 (keyed by SANGER_MODEL_ID) to CCLE (keyed by CELLLINE, which is
# matched against DepMap_ID below) through the CCLE sample info table.

# select a specific drug

drug_name = 'Ribociclib'
drug_dataset = gdsc2.loc[gdsc2['DRUG_NAME'] == drug_name]

# keep only the cell line id and the response value
drug_response_data = drug_dataset[['SANGER_MODEL_ID', 'LN_IC50']]
# celllines = drug_dataset['SANGER_MODEL_ID']

print(drug_response_data.head())
print(drug_response_data.shape)
# id mapping table; rows missing either identifier are unusable for the link and dropped
id_ccle_info = ccle_sample_info[['Sanger_Model_ID', 'DepMap_ID']].dropna()

# find the intersection between the cell lines in drug response data and the cell lines in CCLE gene expression data using the Sanger_Model_ID

celllines = drug_response_data['SANGER_MODEL_ID'].unique()
celllines = [cellline for cellline in celllines if cellline in id_ccle_info['Sanger_Model_ID'].unique()]

print(len(celllines))
print(celllines)
# locate the DepMap_ID of the cell lines in drug response data
# (.values[0]: if a Sanger id maps to several DepMap ids, the first row wins)

depmap_id = []
for cellline in celllines:
    depmap_id.append(id_ccle_info.loc[id_ccle_info['Sanger_Model_ID'] == cellline]['DepMap_ID'].values[0])

print(depmap_id)
print(ccle.head())
# construct the gene expression dataframe by keeping the rows whose CELLLINE value is in the DepMap_ID list

matched_gene_expression_dataset = ccle.loc[ccle['CELLLINE'].isin(depmap_id)]
import numpy as np

print(matched_gene_expression_dataset.shape)
print(matched_gene_expression_dataset.head(2))
# NOTE(review): the original author recorded only 44 matched cell lines here and judged
# that insufficient for training a model -- re-check the printed shape above.
# creating matching training feature and label data, gene expressions are features, drug response ic50 is label


# extract CELLLINE column from matched_gene_expression_dataset

matched_cellline = matched_gene_expression_dataset['CELLLINE'].tolist()
matched_sanger_model_id = []

# find the Sanger_Model_ID of the matched cell lines (same row order as matched_gene_expression_dataset)

for cellline in matched_cellline:
    matched_sanger_model_id.append(id_ccle_info.loc[id_ccle_info['DepMap_ID'] == cellline]['Sanger_Model_ID'].values[0])

# print(len(matched_sanger_model_id), len(matched_cellline)) # sanity check, they should be the same

# join the drug response data and the gene expression data through sanger model id as a medium 

matched_drug_response_data = drug_response_data.loc[drug_response_data['SANGER_MODEL_ID'].isin(matched_sanger_model_id)]

# print(matched_drug_response_data.shape)

matched_drug_response_data = matched_drug_response_data.set_index('SANGER_MODEL_ID')

# add the Sanger id as the first column, then use it as the index so both frames share a join key
matched_gene_expression_dataset.insert(0, 'SANGER_MODEL_ID', matched_sanger_model_id)
matched_gene_expression_dataset = matched_gene_expression_dataset.set_index('SANGER_MODEL_ID')
# 'CELLLINE' is deliberately kept here: a later cell reads joined_dataset['CELLLINE']
# matched_gene_expression_dataset = matched_gene_expression_dataset.drop(columns=['CELLLINE'])

# print(matched_gene_expression_dataset.shape)

# join the matched_drug_response_data and the matched_gene_expression_dataset on their shared index

joined_dataset = matched_drug_response_data.join(matched_gene_expression_dataset, how='inner')

# print(joined_dataset.shape)
# print(joined_dataset.head())

# feature and label data creation

# extract the feature data from the joined dataset (everything except the label and the CELLLINE id)

feature_data = joined_dataset.drop(columns=['LN_IC50'])
feature_data.drop(columns=['CELLLINE'], inplace=True)

# extract the label data from the joined dataset

label_data = joined_dataset['LN_IC50']

# convert the feature data and label data to numpy array

feature_data_np = feature_data.to_numpy()
label_data_np = label_data.to_numpy()

# print(feature_data_np.shape, label_data_np.shape)

# print(feature_data.head())
# print(label_data.head())

In [None]:
# Map each row position of joined_dataset to the value in its 'CELLLINE' column
cellline_name = joined_dataset['CELLLINE'].tolist()
index_dict = dict(enumerate(cellline_name))

#### Streamlining and Optimization

In [None]:
import numpy as np
# select a specific drug

drug_name = 'Ribociclib'

def create_joint_dataset_from_ccle_gdsc2(drug_name: str, 
                                         drug_df: pd.DataFrame,
                                         ccle_df: pd.DataFrame,
                                         ccle_info_df: pd.DataFrame, 
                                         keep_drug_name: bool = False, separate_feature_label: bool = False): 
    """Join the response of one drug with the expression rows of the matching cell lines.

    Parameters
    ----------
    drug_name : str
        Value matched against the 'DRUG_NAME' column of ``drug_df``.
    drug_df : pd.DataFrame
        Drug response table; the columns 'DRUG_NAME', 'SANGER_MODEL_ID' and
        'LN_IC50' are read.
    ccle_df : pd.DataFrame
        Expression table with a 'CELLLINE' column that is matched against
        'DepMap_ID'. The frame is not modified.
    ccle_info_df : pd.DataFrame
        Identifier table with 'Sanger_Model_ID' and 'DepMap_ID' columns; rows
        missing either value are ignored.
    keep_drug_name : bool, default False
        If True, insert a constant 'DRUG_NAME' column at position 1.
    separate_feature_label : bool, default False
        If True, return ``(feature_data, label_data)`` instead of the joined frame.

    Returns
    -------
    pd.DataFrame or tuple
        The joined frame indexed by 'SANGER_MODEL_ID' ('LN_IC50' first, then the
        columns of ``ccle_df``), or a tuple of the frame without 'LN_IC50' and
        'CELLLINE' and the 'LN_IC50' series when ``separate_feature_label`` is True.
    """
    drug_response_data = drug_df.loc[drug_df['DRUG_NAME'] == drug_name, ['SANGER_MODEL_ID', 'LN_IC50']]
    id_ccle_info = ccle_info_df[['Sanger_Model_ID', 'DepMap_ID']].dropna()

    # First-occurrence lookup tables in both directions. Built once, they replace the
    # per-cell-line `.unique()` call and full-frame scan of the earlier implementation
    # while keeping its "first matching row wins" semantics.
    sanger_to_depmap = (id_ccle_info.drop_duplicates('Sanger_Model_ID')
                        .set_index('Sanger_Model_ID')['DepMap_ID'].to_dict())
    depmap_to_sanger = (id_ccle_info.drop_duplicates('DepMap_ID')
                        .set_index('DepMap_ID')['Sanger_Model_ID'].to_dict())

    # DepMap ids of the cell lines that have a response for this drug and a known mapping
    depmap_id = [sanger_to_depmap[cellline]
                 for cellline in drug_response_data['SANGER_MODEL_ID'].unique()
                 if cellline in sanger_to_depmap]

    # expression rows of those cell lines; copy so the insert below never touches ccle_df
    matched_gene_expression_dataset = ccle_df.loc[ccle_df['CELLLINE'].isin(depmap_id)].copy()

    # Sanger id of every matched expression row, in row order
    matched_sanger_model_id = [depmap_to_sanger[cellline]
                               for cellline in matched_gene_expression_dataset['CELLLINE']]

    # index both sides by the Sanger id so they can be joined on the index
    matched_drug_response_data = drug_response_data.loc[
        drug_response_data['SANGER_MODEL_ID'].isin(matched_sanger_model_id)
    ].set_index('SANGER_MODEL_ID')

    matched_gene_expression_dataset.insert(0, 'SANGER_MODEL_ID', matched_sanger_model_id)
    matched_gene_expression_dataset = matched_gene_expression_dataset.set_index('SANGER_MODEL_ID')

    joined_dataset = matched_drug_response_data.join(matched_gene_expression_dataset, how='inner')

    if keep_drug_name:
        joined_dataset.insert(1, 'DRUG_NAME', drug_name)
    
    if separate_feature_label:
        # features: everything except the label and the CELLLINE identifier
        feature_data = joined_dataset.drop(columns=['LN_IC50', 'CELLLINE'])
        label_data = joined_dataset['LN_IC50']

        return feature_data, label_data
    
    return joined_dataset

# The function requires the three source frames as positional arguments; without them
# the call raises TypeError. gdsc2, ccle and ccle_sample_info are the frames loaded
# from the pickle caches at the start of this section.
joined_dataset = create_joint_dataset_from_ccle_gdsc2('Ribociclib', gdsc2, ccle, ccle_sample_info,
                                                      keep_drug_name=True, separate_feature_label=False)

print(joined_dataset.head())

# convert the feature data and label data to numpy array

# feature_data_np = feature_data.to_numpy()
# label_data_np = label_data.to_numpy()

# print(feature_data_np.shape, label_data_np.shape)

# print(feature_data.head())
# print(label_data.head())

In [None]:
# Build one joined dataset per drug and stack them into a single frame.
# The three source frames (gdsc2, ccle, ccle_sample_info) are required positional
# arguments of create_joint_dataset_from_ccle_gdsc2; omitting them raises TypeError.

drug_names = ['Ribociclib', 'Palbociclib']

dfs = [create_joint_dataset_from_ccle_gdsc2(drug_name, gdsc2, ccle, ccle_sample_info,
                                            keep_drug_name=True, separate_feature_label=False)
       for drug_name in drug_names]

all_dfs = pd.concat(dfs)

# print(all_dfs.head(50))

In [None]:
# get ribociclib data (the source frames are required positional arguments;
# the call raises TypeError without them)

ribociclib_data = create_joint_dataset_from_ccle_gdsc2('Ribociclib', gdsc2, ccle, ccle_sample_info,
                                                       keep_drug_name=False, separate_feature_label=False)

In [None]:
ribociclib_data.to_pickle('data/preprocessed/ribociclib_data.pkl')

In [None]:
# get palbociclib data (the source frames are required positional arguments;
# the call raises TypeError without them)
palbociclib_data = create_joint_dataset_from_ccle_gdsc2('Palbociclib', gdsc2, ccle, ccle_sample_info,
                                                        keep_drug_name=False, separate_feature_label=False)
# palbociclib_data.to_pickle('data/preprocessed/palbociclib_data.pkl')

In [None]:
ribociclib_data.to_csv('data/preprocessed/ribociclib_data.csv')
palbociclib_data.to_csv('data/preprocessed/palbociclib_data.csv')

In [None]:
# for fun, let's try to create a dataset for all drugs

# all_drug_names = gdsc2['DRUG_NAME'].unique().tolist()

# all_dfs = [create_joint_dataset_from_ccle_gdsc2(drug_name, keep_drug_name=True, separate_feature_label=False) for drug_name in all_drug_names]

# all_dfs = pd.concat(all_dfs)

# print(all_dfs.shape)
# print(all_dfs.head(1000))
# # pickle the dataset for later use

# import pickle

# with open('data/preprocessed/all_drugs_ccle_gdsc2.pkl', 'wb') as f:
#     pickle.dump(all_dfs, f)
    

## Integration of GDSC2 and Goncalves dataset

In [None]:
# load the goncalve_proteome and goncalve_proteome_info

import pickle
import pandas as pd

# The proteome cache holds three pickled objects, read here in order: the full protein
# matrix, the `sin_peptile_exclusion_matrix` and the cell line info table.
with open('data/proteomic-expression/goncalves-2022-cell/goncalve_proteome_fillna.pkl', 'rb') as f:
    full_protein_matrix = pickle.load(f)
    sin_peptile_exclusion_matrix = pickle.load(f)
    goncalve_cell_line_info = pickle.load(f)

# import GDSC2 drug response data using pickle (two objects: gdsc2, then gdsc2_info)

with open('data/drug-response/GDSC2/cache_gdsc2.pkl', 'rb') as f:
    gdsc2 = pickle.load(f)
    gdsc2_info = pickle.load(f)

In [None]:
full_protein_matrix.head()

In [None]:
# retrieve model_id based on Project_identifier

df = goncalve_cell_line_info.loc[goncalve_cell_line_info['Project_Identifier'] == 'SIDM00018;K052']
df.head()

In [None]:
df['model_id']

In [None]:
# Lookup table from 'Project_Identifier' (index) to 'model_id'
sanger_model_ids = goncalve_cell_line_info[['model_id', 'Project_Identifier']].set_index('Project_Identifier')
sanger_model_ids.head()

In [None]:
# Index the protein matrix by 'Project_Identifier' so it can be joined with sanger_model_ids
full_protein_matrix = full_protein_matrix.set_index('Project_Identifier')

In [None]:
# join the full_protein_matrix and the sanger_model_ids by Project_Identifier

joined_full_protein_matrix = full_protein_matrix.join(sanger_model_ids, how='inner')

In [None]:
joined_full_protein_matrix.head()

In [None]:
joined_full_protein_matrix.shape

In [None]:
# Re-index the joined matrix by 'model_id'
joined_full_protein_matrix = joined_full_protein_matrix.set_index('model_id')
joined_full_protein_matrix.head()

In [None]:
# Index the matrix by 'Project_Identifier', then attach 'model_id' through an
# inner join with sanger_model_ids on that index
sin_peptile_exclusion_matrix = sin_peptile_exclusion_matrix.set_index('Project_Identifier')
joined_sin_peptile_exclusion_matrix = sin_peptile_exclusion_matrix.join(sanger_model_ids, how='inner')

joined_sin_peptile_exclusion_matrix.head()

In [None]:
joined_sin_peptile_exclusion_matrix.shape

In [None]:
# Re-index the joined matrix by 'model_id'
joined_sin_peptile_exclusion_matrix = joined_sin_peptile_exclusion_matrix.set_index('model_id')

In [None]:
joined_sin_peptile_exclusion_matrix.head()

In [None]:
# pickle the joined_full_protein_matrix and the joined_sin_peptile_exclusion_matrix
# (two objects in one file; readers must call pickle.load twice, in this order)

import pickle

with open('data/proteomic-expression/goncalves-2022-cell/goncalve_proteome_fillna_processed.pkl', 'wb') as f:
    pickle.dump(joined_full_protein_matrix, f)
    pickle.dump(joined_sin_peptile_exclusion_matrix, f)

In [None]:
def create_joint_dataset_from_proteome_gdsc(drug_name: str, proteome: pd.DataFrame, gdsc: pd.DataFrame):
    """Attach the LN_IC50 response of one drug to a proteome matrix.

    ``gdsc`` is filtered to the rows whose 'DRUG_NAME' equals ``drug_name``; the
    'LN_IC50' values, indexed by 'SANGER_MODEL_ID', are then inner-joined onto
    ``proteome`` by index. The result holds the columns of ``proteome`` followed
    by 'LN_IC50', restricted to the index values present on both sides.
    """
    is_requested_drug = gdsc['DRUG_NAME'] == drug_name
    response_by_model = (
        gdsc.loc[is_requested_drug, ['SANGER_MODEL_ID', 'LN_IC50']]
        .set_index('SANGER_MODEL_ID')
    )

    return proteome.join(response_by_model, how='inner')

ribociclib_proteome_data = create_joint_dataset_from_proteome_gdsc('Ribociclib', joined_sin_peptile_exclusion_matrix, gdsc2)

In [None]:
ribociclib_proteome_data.head()

## Integration of GDSC2 and Goncalves dataset without fillna

In [None]:
# load the goncalve_proteome and goncalve_proteome_info

import pickle

# Same three-object layout as the fillna cache, read in order: the full protein matrix,
# the `sin_peptile_exclusion_matrix` and the cell line info table.
with open('data/proteomic-expression/goncalves-2022-cell/goncalve_proteome.pkl', 'rb') as f:
    full_protein_matrix = pickle.load(f)
    sin_peptile_exclusion_matrix = pickle.load(f)
    goncalve_cell_line_info = pickle.load(f)

# import GDSC2 drug response data using pickle (two objects: gdsc2, then gdsc2_info)

with open('data/drug-response/GDSC2/cache_gdsc2.pkl', 'rb') as f:
    gdsc2 = pickle.load(f)
    gdsc2_info = pickle.load(f)

In [None]:
# Lookup table from 'Project_Identifier' (index) to 'model_id'
sanger_model_ids = goncalve_cell_line_info[['model_id', 'Project_Identifier']].set_index('Project_Identifier')
sanger_model_ids.head()

In [None]:
# Index the protein matrix by 'Project_Identifier' so it can be joined with sanger_model_ids
full_protein_matrix = full_protein_matrix.set_index('Project_Identifier')

In [None]:
# join the full_protein_matrix and the sanger_model_ids by Project_Identifier

joined_full_protein_matrix = full_protein_matrix.join(sanger_model_ids, how='inner')
joined_full_protein_matrix.head()


In [None]:
# Only the last expression of a cell is displayed, so the shape is printed explicitly
# (a bare `.shape` in the middle of the cell is silently discarded).
print(joined_full_protein_matrix.shape)
# Re-index the joined matrix by 'model_id'
joined_full_protein_matrix = joined_full_protein_matrix.set_index('model_id')
joined_full_protein_matrix.head()


In [None]:
# Index the matrix by 'Project_Identifier', then attach 'model_id' through an
# inner join with sanger_model_ids on that index
sin_peptile_exclusion_matrix = sin_peptile_exclusion_matrix.set_index('Project_Identifier')
joined_sin_peptile_exclusion_matrix = sin_peptile_exclusion_matrix.join(sanger_model_ids, how='inner')

joined_sin_peptile_exclusion_matrix.head()


In [None]:

# Only the last expression of a cell is displayed, so the shape is printed explicitly
# (a bare `.shape` in the middle of the cell is silently discarded).
print(joined_sin_peptile_exclusion_matrix.shape)
# Re-index the joined matrix by 'model_id'
joined_sin_peptile_exclusion_matrix = joined_sin_peptile_exclusion_matrix.set_index('model_id')
joined_sin_peptile_exclusion_matrix.head()


In [None]:

# pickle the joined_full_protein_matrix and the joined_sin_peptile_exclusion_matrix
# (two objects in one file; readers must call pickle.load twice, in this order)

import pickle

with open('data/proteomic-expression/goncalves-2022-cell/goncalve_proteome_processed.pkl', 'wb') as f:
    pickle.dump(joined_full_protein_matrix, f)
    pickle.dump(joined_sin_peptile_exclusion_matrix, f)