In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from sklearn.metrics import r2_score
from scipy.stats import pearsonr, spearmanr
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.patches as mpatches
from scipy.stats import ttest_ind, ttest_1samp, ttest_rel
import matplotlib as mpl
import umap
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import pickle
import matplotlib.ticker as mtick
import math
from itertools import combinations
import random
from sklearn.linear_model import LinearRegression

In [3]:
# Notebook-wide matplotlib defaults, applied in one call.
plt.rcParams.update({
    'font.family': 'Helvetica',
    'axes.labelweight': 'bold',
    'font.size': 18,
    'axes.linewidth': 2,
    'figure.figsize': (10.0, 7.0),
    # Font type 42 embeds TrueType fonts in PDF/PS output
    # (matplotlib's documented alternative to Type 3).
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})
# Default seaborn colour cycle for every subsequent plot.
sns.set_palette("Dark2")

In [4]:
# Random seed constant for the notebook.
# NOTE(review): no cell visible in this section passes `seed` to an RNG.
seed = 42

In [5]:
def calc_cellline_correlation_aux(df1, df2, cell_line, data='protein'):
    """Pearson correlation of one cell line's profile between two matrices.

    Both frames are read with cell lines as columns; rows are paired by
    their index labels via an inner merge, and pairs containing a missing
    value are dropped before correlating.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Matrices to compare. ``cell_line`` must be a column of ``df1``.
    cell_line : hashable
        Column to correlate.
    data : str, default 'protein'
        When equal to ``'rna'`` (case-insensitive), the row labels of
        ``df1`` are translated through the module-level ``name_map_dict``
        before pairing. That mapping must already exist in the namespace.

    Returns
    -------
    float or None
        Pearson r, or ``None`` when ``cell_line`` is not a column of
        ``df2`` or fewer than three paired rows remain.
    """
    if cell_line not in df2.columns:
        return None
    # rename_axis pins the key column name to 'index' regardless of what the
    # frame's index happens to be called, so the merge key always exists.
    df1_cellline = df1[[cell_line]].rename_axis('index').reset_index()
    if data.lower() == 'rna':
        df1_cellline['index'] = df1_cellline['index'].map(name_map_dict)
    df2_cellline = df2[[cell_line]].rename_axis('index').reset_index()
    tmp = pd.merge(df1_cellline, df2_cellline, on=['index'], suffixes=["_df1", "_df2"]).dropna()
    # Same minimum-overlap rule as calc_protein_correlation_aux: pearsonr
    # raises on < 2 points and is trivially +/-1 on exactly 2.
    if tmp.shape[0] < 3:
        return None
    corr = pearsonr(tmp[f"{cell_line}_df1"], tmp[f"{cell_line}_df2"])[0]
    return corr

In [6]:
def calc_cellline_correlation(df1, df2, data='protein', n_jobs=14):
    """Correlate every column (cell line) of ``df1`` with its match in ``df2``.

    Runs ``calc_cellline_correlation_aux`` once per column of ``df1`` in a
    joblib worker pool, showing a tqdm progress bar.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Matrices with cell lines as columns.
    data : str, default 'protein'
        Forwarded unchanged to ``calc_cellline_correlation_aux``.
    n_jobs : int, default 14
        Worker count handed to ``joblib.Parallel``. The default keeps the
        previously hard-coded value.

    Returns
    -------
    numpy.ndarray
        Correlations in ``df1.columns`` order with the ``None`` results
        removed, so positions no longer map back to column names.
    """
    res = Parallel(n_jobs=n_jobs)(
        delayed(calc_cellline_correlation_aux)(df1, df2, cell_line, data)
        for cell_line in tqdm(df1.columns))
    # Drop columns the helper could not score.
    res = [x for x in res if x is not None]
    return np.array(res)

In [30]:
def calc_protein_correlation_aux(df1, df2, protein, data='protein'):
    """Pearson correlation of a single protein column across two matrices.

    Rows are paired through the 'Cell_line' column that ``reset_index()``
    exposes, and pairs with a missing value are discarded.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Matrices with proteins (or genes, for ``df2`` in 'rna' mode) as
        columns. ``df1``'s index must be named 'Cell_line'; so must
        ``df2``'s, except in 'rna' mode where its key column is relabelled.
    protein : hashable
        Column of ``df1`` to correlate.
    data : str, default 'protein'
        When equal to ``'rna'`` (case-insensitive), the matching ``df2``
        column is looked up through the module-level ``name_map_dict``.

    Returns
    -------
    float or None
        Pearson r, or ``None`` when no matching column exists or fewer
        than three paired rows remain.
    """
    rna_mode = data.lower() == 'rna'

    # Work out which df2 column pairs with `protein`, bailing out early
    # when there is none.
    if rna_mode:
        if protein not in name_map_dict:
            return None
        partner = name_map_dict[protein]
    else:
        partner = protein
    if partner not in df2.columns:
        return None

    left = df1[[protein]].reset_index()
    right = df2[[partner]].reset_index()
    if rna_mode:
        # Relabel so the gene column carries the protein name and the key
        # column is called 'Cell_line'.
        right.columns = ["Cell_line", protein]

    paired = left.merge(right, on=['Cell_line'], suffixes=["_df1", "_df2"]).dropna()
    if len(paired) < 3:
        return None
    r_value = pearsonr(paired[f"{protein}_df1"], paired[f"{protein}_df2"])[0]
    return r_value

In [31]:
def calc_protein_correlation(df1, df2, data='protein', n_jobs=14):
    """Correlate every column (protein) of ``df1`` with its match in ``df2``.

    Runs ``calc_protein_correlation_aux`` once per column of ``df1`` in a
    joblib worker pool, showing a tqdm progress bar.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Matrices with proteins as columns.
    data : str, default 'protein'
        Forwarded unchanged to ``calc_protein_correlation_aux``.
    n_jobs : int, default 14
        Worker count handed to ``joblib.Parallel``. The default keeps the
        previously hard-coded value.

    Returns
    -------
    numpy.ndarray
        Correlations in ``df1.columns`` order with the ``None`` results
        removed, so positions no longer map back to column names.
    """
    res = Parallel(n_jobs=n_jobs)(
        delayed(calc_protein_correlation_aux)(df1, df2, protein, data)
        for protein in tqdm(df1.columns))
    # Drop proteins the helper could not score.
    res = [x for x in res if x is not None]
    return np.array(res)

# load data

In [9]:
# Column 0 of the header-less colour file as a plain Python list; sliced
# later to build the per-category colour dictionaries.
colour_list = pd.read_csv('../../data/colour_list.txt',header=None)[0].tolist()

In [10]:
# ProNorM protein matrix (tab-separated), indexed by its 'Cell_line' column.
protein_ruv = pd.read_csv(
    "../../data/protein/E0022_P06_Protein_Matrix_ProNorM_no_control_update.txt",
    sep='\t').set_index('Cell_line')

# Sample map, also indexed by 'Cell_line'; the lookups built from it below
# rely on this index until meta.reset_index() is called.
meta = pd.read_csv('../../data/E0022_P06_final_sample_map_no_control.txt',
                   sep='\t').set_index('Cell_line')

In [11]:
# Index labels (cell lines, while meta is indexed by 'Cell_line') of every
# row whose tissue type is not 'Haematopoietic and Lymphoid'.
cell_lines_no_blood = meta.loc[
    meta['Tissue_type'].ne('Haematopoietic and Lymphoid')
].index.values

In [12]:
# index label -> attribute lookups, taken straight from each metadata column.
tissue_type_map = meta['Tissue_type'].to_dict()
cancer_type_map = meta['Cancer_type'].to_dict()
batch_map = meta['Batch'].to_dict()
instrument_map = meta['Instrument'].to_dict()


In [13]:
# Category -> colour, pairing categories in order of first appearance with
# the leading entries of colour_list (zip stops at the shorter sequence).
cancer_colours = {
    cancer: colour
    for cancer, colour in zip(meta['Cancer_type'].unique(), colour_list)
}

tissue_colours = {
    tissue: colour
    for tissue, colour in zip(meta['Tissue_type'].unique(), colour_list)
}

# Fixed palette, one colour per instrument code.
instrument_colours = dict(
    M01='#66c2a5',
    M02='#fc8d62',
    M03='#8da0cb',
    M04='#e78ac3',
    M05='#a6d854',
    M06='#ffd92f',
)

# Fixed palette, one colour per batch code.
batch_colours = dict(
    P01='#7fc97f',
    P02='#beaed4',
    P03='#fdc086',
    P04='#386cb0',
    P05='#f0027f',
    P06='#bf5b17',
)

In [14]:
# Move 'Cell_line' from the index back to an ordinary column.
# Guarded so that re-running this cell is a no-op instead of pushing the
# fresh RangeIndex into a stray 'index' column.
if 'Cell_line' not in meta.columns:
    meta = meta.reset_index()

In [15]:
# One row per cell line: rows repeating an earlier 'Cell_line' value are
# dropped (drop_duplicates keeps the first occurrence by default).
meta_cell_lines = meta.drop_duplicates('Cell_line')

In [16]:
# Drug annotation table; its 'target_pathway' column drives drug_colours.
drug = pd.read_csv("../../data/drug/drug_info.csv")

In [17]:
# target pathway -> colour, pairing pathways in order of first appearance
# with the leading entries of colour_list (zip stops at the shorter one).
drug_colours = {
    pathway: colour
    for pathway, colour in zip(drug['target_pathway'].unique(), colour_list)
}

# Kuster trypsin vs GluC

## cell line correlation

In [70]:
# NCI trypsin matrix, transposed so that cell lines become the columns
# (calc_cellline_correlation iterates over df1.columns).
nci_trypsin = pd.read_csv("../../data/Kuster/protein_nci_trypsin_processed.csv").set_index('Cell_line').T

In [71]:
# NCI GluC matrix in the same transposed layout (cell lines as columns).
nci_gluc = pd.read_csv("../../data/Kuster/protein_nci_gluc_processed.csv").set_index('Cell_line').T

In [75]:
# Sanity check: the trypsin matrix correlated with itself.
# The recorded median is 1.0.
res = calc_cellline_correlation(nci_trypsin, nci_trypsin, data='protein')

np.median(res)

HBox(children=(FloatProgress(value=0.0, max=60.0), HTML(value='')))




1.0

In [74]:
# Per-cell-line Pearson correlation between the trypsin and GluC matrices,
# summarised by the median over cell lines.
res = calc_cellline_correlation(nci_trypsin, nci_gluc, data='protein')

np.median(res)

HBox(children=(FloatProgress(value=0.0, max=60.0), HTML(value='')))




0.826517451755397

## protein correlation

In [77]:
# Same file re-read WITHOUT the transpose: rows are indexed by 'Cell_line',
# the layout calc_protein_correlation merges on. This rebinds nci_trypsin,
# so the cell-line-correlation cells must be re-run from their own load cell.
nci_trypsin = pd.read_csv("../../data/Kuster/protein_nci_trypsin_processed.csv").set_index('Cell_line')

In [78]:
# GluC matrix re-read without the transpose (rows indexed by 'Cell_line');
# rebinds nci_gluc.
nci_gluc = pd.read_csv("../../data/Kuster/protein_nci_gluc_processed.csv").set_index('Cell_line')

In [79]:
# Columns present in both matrices (unordered, since it goes through a set).
# NOTE(review): `overlap` is not referenced again in the visible cells.
overlap = list(set(nci_trypsin.columns).intersection(nci_gluc.columns))

In [80]:
# Sanity check: the trypsin matrix correlated with itself, column by column.
# The recorded median is 1.0.
res = calc_protein_correlation(nci_trypsin, nci_trypsin, data='protein')
np.median(res)

HBox(children=(FloatProgress(value=0.0, max=7548.0), HTML(value='')))




1.0

In [57]:
# Per-column Pearson correlation between the trypsin and GluC matrices;
# columns missing from nci_gluc or with < 3 paired rows are skipped.
res = calc_protein_correlation(nci_trypsin, nci_gluc, data='protein')
np.median(res)

0.600296964824828

# Kuster vs Sanger

In [58]:
# Three versions of the E0022_P06 protein matrix (Raw, Median, ProNorM),
# each tab-separated and indexed by 'Cell_line'.
# NOTE(review): protein_raw and protein_median are not used in the visible
# cells that follow.
protein_raw = pd.read_csv(
    "../../data/protein/E0022_P06_Protein_Matrix_Raw_no_control_update.txt",
    sep='\t').set_index('Cell_line')
protein_median = pd.read_csv(
    "../../data/protein/E0022_P06_Protein_Matrix_Median_no_control_update.txt",
    sep='\t').set_index('Cell_line')
# Same path as the protein_ruv load near the top of the notebook.
protein_ruv = pd.read_csv(
    "../../data/protein/E0022_P06_Protein_Matrix_ProNorM_no_control_update.txt",
    sep='\t').set_index('Cell_line')

# The "*_common_processed" Kuster files; these rebind nci_trypsin / nci_gluc,
# replacing the frames loaded from the "*_processed" files above.
nci_trypsin = pd.read_csv("../../data/Kuster/protein_nci_trypsin_common_processed.csv").set_index('Cell_line')
nci_gluc = pd.read_csv("../../data/Kuster/protein_nci_gluc_common_processed.csv").set_index('Cell_line')
crc = pd.read_csv("../../data/Kuster/protein_crc_trypsin_common_processed.csv").set_index('Cell_line')

In [59]:
# Row-wise stack of the NCI trypsin rows and the CRC rows.
kuster_trypsin = pd.concat([nci_trypsin, crc])

In [60]:
# Row-wise stack of the NCI GluC rows and the CRC rows.
# NOTE(review): `crc` is read from the CRC *trypsin* file, so this frame
# mixes GluC and trypsin measurements — confirm that is intended.
kuster_gluc = pd.concat([nci_gluc, crc])

In [61]:
# Persist the combined trypsin matrix (index written as the first column).
# NOTE(review): kuster_gluc is not saved in the visible cells.
kuster_trypsin.to_csv("../../data/Kuster/protein_combined_common_processed.csv")

In [62]:
# Per-protein correlation of the ProNorM matrix against the combined Kuster
# trypsin matrix, paired on 'Cell_line'.
res_trypsin = calc_protein_correlation(protein_ruv, kuster_trypsin, data='protein')

HBox(children=(FloatProgress(value=0.0, max=3425.0), HTML(value='')))




In [63]:
# Median of the per-protein correlations that could be computed.
np.median(res_trypsin)

0.35066125391451675

In [64]:
# Per-protein correlation of the ProNorM matrix against kuster_gluc
# (NCI GluC rows plus the CRC rows), paired on 'Cell_line'.
res_gluc = calc_protein_correlation(protein_ruv, kuster_gluc, data='protein')

HBox(children=(FloatProgress(value=0.0, max=3425.0), HTML(value='')))




In [65]:
# Median of the per-protein correlations that could be computed.
np.median(res_gluc)

0.30565343240783266