In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from sklearn.metrics import r2_score
from scipy.stats import pearsonr, spearmanr
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.patches as mpatches
from scipy.stats import ttest_ind, ttest_1samp, ttest_rel
import matplotlib as mpl
import umap
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import pickle
import matplotlib.ticker as mtick
import math
from itertools import combinations
import random
from sklearn.linear_model import LinearRegression

plt.rcParams['font.family'] = 'Helvetica'
plt.rcParams["axes.labelweight"] = "bold"
plt.rcParams['font.size'] = 18
plt.rcParams['axes.linewidth'] = 2
plt.rcParams['figure.figsize'] = (10.0, 7.0)
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42
sns.set_palette("Dark2")

seed = 42

# load data

In [4]:
colour_list = pd.read_csv('../../data/colour_list.txt',header=None)[0].tolist()

In [5]:
protein_ruv = pd.read_csv(
    "../../data/protein/E0022_P06_Protein_Matrix_ProNorM_no_control_update.txt",
    sep='\t').set_index('Cell_line')

meta = pd.read_csv('../../data/E0022_P06_final_sample_map_no_control.txt',
                   sep='\t')

In [6]:
cell_lines_no_blood = meta[meta['Tissue_type']!='Haematopoietic and Lymphoid'].index.values

In [7]:
tissue_type_map = meta.drop_duplicates(['Cell_line']).set_index(
    ['Cell_line'])[['Tissue_type']].to_dict()['Tissue_type']
cancer_type_map = meta.drop_duplicates(['Cell_line']).set_index(
    ['Cell_line'])[['Cancer_type']].to_dict()['Cancer_type']
batch_map = meta.drop_duplicates(['Cell_line']).set_index(
    ['Cell_line'])[['Batch']].to_dict()['Batch']
instrument_map = meta.drop_duplicates(['Cell_line']).set_index(
    ['Cell_line'])[['Instrument']].to_dict()['Instrument']

In [8]:
tissue_count = meta.drop_duplicates(['Cell_line', 'Tissue_type']).groupby('Tissue_type').size()
major_tissues = tissue_count[tissue_count>30].index.values

In [9]:
cancer_colours = dict(
    zip(meta['Cancer_type'].unique(),
        colour_list[:meta['Cancer_type'].unique().size]))

tissue_colours = dict(
    zip(meta['Tissue_type'].unique(),
        colour_list[:meta['Tissue_type'].unique().size]))

instrument_colours={
    'M01':'#66c2a5', 
    'M02':'#fc8d62', 
    'M03':'#8da0cb', 
    'M04':'#e78ac3', 
    'M05':'#a6d854', 
    'M06':'#ffd92f'
}

batch_colours={
    'P01':'#7fc97f', 
    'P02':'#beaed4', 
    'P03':'#fdc086', 
    'P04':'#386cb0', 
    'P05':'#f0027f', 
    'P06':'#bf5b17'
}

In [10]:
meta = meta.reset_index()

In [11]:
meta_cell_lines = meta.drop_duplicates('Cell_line')

In [12]:
rna_raw = pd.read_csv("../../data/rna/rnaseq_voom.csv", index_col=0).T

rna_raw.index.name = 'SIDM'

rna_raw = rna_raw.reset_index()
rna_sample = pd.merge(rna_raw,
                      meta[['SIDM',
                            'Cell_line']].drop_duplicates()).drop(['SIDM'],
                                                                  axis=1)

rna_sample = rna_sample.set_index(['Cell_line'])

In [72]:
name_map = pd.read_csv("../../data/misc/HUMAN_9606_idmapping.gene_prot.dat",
                       sep='\t',
                       names=['ID', 'type', 'code'])
name_map = name_map.drop_duplicates(['ID', 'type'])
name_map = pd.pivot(name_map, index='ID', columns='type',
                    values='code').dropna()

protein2rna_map = dict(
    zip(name_map['UniProtKB-ID'].values, name_map['Gene_Name'].values))
rna2protein_map = {
    protein2rna_map[protein]: protein
    for protein in protein_ruv.columns if protein in protein2rna_map
}

In [13]:
crispr_df = pd.read_csv('../../data/crispr/final_crispr.csv.gz', low_memory=False)

In [17]:
crispr_df.shape

(16827, 515)

In [111]:
lm_res = pd.read_csv("../../result_files/lm/lm_sklearn_degr_crispr_annotated.csv.gz")

In [33]:
# lm_res_old = pd.read_csv("../../result_files/lm/lm_sklearn_degr_crispr_annotated_old.csv.gz")

In [49]:
print(lm_res_old.shape[0] - lm_res.shape[0])

588945


In [50]:
lm_res[lm_res['fdr']<0.1].shape

(6745, 15)

In [51]:
lm_res_old[lm_res_old['fdr']<0.1].shape

(130278, 17)

In [112]:
lm_res = lm_res[((lm_res['fdr'] < 0.1) | (lm_res['nc_fdr'] < 0.001))
                & (lm_res['r2'] > 0.2) & (lm_res['skew'] < -1.5)]
lm_res['x_protein'] = lm_res['x_id'].map(rna2protein_map)

In [114]:
lm_assoc_list = list(zip(lm_res['y_id'], lm_res['x_id'], lm_res['x_protein']))

In [74]:
crispr_df = crispr_df.set_index('Gene')

In [75]:
crispr_df = crispr_df.T

# calculate

In [115]:
protein_df = protein_ruv
rna_df = rna_sample

In [116]:
def calc_box(tissue_level=True, step=0.2, tissues=[]):
    res_df = []
    for gene, rna_target, protein_target in tqdm(lm_assoc_list):
        if protein_target not in protein_df.columns or rna_target not in rna_df.columns:
            continue
        crispr_map = crispr_df[[gene]].to_dict()[gene]
        tmp_data_protein = protein_df[[protein_target]]
        tmp_data_rna = rna_df[[rna_target]]
        tmp_data = pd.merge(tmp_data_protein, tmp_data_rna, on='Cell_line')
        tmp_data.loc[:, 'crispr'] = tmp_data.index.map(crispr_map).values
        tmp_data.loc[:, 'tissue'] = tmp_data.index.map(tissue_type_map).values
        tmp_data = tmp_data.dropna().reset_index(drop=True)

        if len(tissues) == 0:
            tissues = tmp_data['tissue'].unique()
        if tissue_level:
            for tissue in tissues:
                tmp_data_tissue = tmp_data[tmp_data['tissue'] ==
                                           tissue].reset_index(drop=True)
                if tmp_data_tissue[protein_target].unique().size < int(
                        1 / step) or tmp_data_tissue[rna_target].unique(
                        ).size < int(1 / step):
                    continue
                tmp_data_tissue.loc[:, 'protein_q'] = pd.qcut(
                    tmp_data_tissue[protein_target],
                    np.arange(0, 1.0001, step),
                    duplicates='drop')
                tmp_data_tissue.loc[:, 'rna_q'] = pd.qcut(
                    tmp_data_tissue[rna_target],
                    np.arange(0, 1.0001, step),
                    duplicates='drop')

                medians = tmp_data_tissue[[
                    'protein_q', 'crispr'
                ]].groupby('protein_q').median()['crispr'].values
                mono = (np.all(medians[1:] >= medians[:-1] - 0.1)
                        or np.all(medians[1:] <= medians[:-1] + 0.1))

                protein_last_q = tmp_data_tissue[[
                    'protein_q', 'crispr'
                ]].groupby('protein_q').median()['crispr'].values[-1]

                rna_last_q = tmp_data_tissue[[
                    'rna_q', 'crispr'
                ]].groupby('rna_q').median()['crispr'].values[-1]
                protein_first_q = tmp_data_tissue[[
                    'protein_q', 'crispr'
                ]].groupby('protein_q').median()['crispr'].values[0]
                protein_first_nq = tmp_data_tissue[[
                    'protein_q', 'crispr'
                ]].groupby('protein_q').median()['crispr'].values[:-1]
                rna_first_q = tmp_data_tissue[[
                    'rna_q', 'crispr'
                ]].groupby('rna_q').median()['crispr'].values[0]
                rna_first_nq = tmp_data_tissue[[
                    'rna_q', 'crispr'
                ]].groupby('rna_q').median()['crispr'].values[:-1]
                crispr_mean = tmp_data_tissue['crispr'].mean()
                p_corr_protein, p_pval_protein = pearsonr(
                    tmp_data_tissue['crispr'], tmp_data_tissue[protein_target])
                p_corr_rna, p_pval_rna = pearsonr(
                    tmp_data_tissue['crispr'], tmp_data_tissue[rna_target])
                counts = tmp_data_tissue.shape[0]
                res_df.append({
                    'Gene': gene,
                    'protein': protein_target,
                    'tissue': tissue,
                    'protein_first_q': protein_first_q - crispr_mean,
                    'protein_last_q': protein_last_q - crispr_mean,
                    'rna_first_q': rna_first_q - crispr_mean,
                    'rna_last_q': rna_last_q - crispr_mean,
                    'protein_first_nq_std': np.std(protein_first_nq),
                    'rna_first_nq_std': np.std(rna_first_nq),
                    'crispr_mean': crispr_mean,
                    'mono': mono,
                    'p_corr_protein': p_corr_protein,
                    'p_pval_protein': p_pval_protein,
                    'p_corr_rna': p_corr_rna,
                    'p_pval_rna': p_pval_rna,
                    'counts': counts
                })
    res_df = pd.DataFrame(res_df)
    return res_df

In [117]:
res_df = calc_box(tissue_level=True, step=0.2)

res_df['last_q_delta'] = np.abs(
    res_df['rna_last_q'] -
    res_df['protein_last_q'])

res_df.to_csv("../../result_files/box_plot_tissues_5q_crispr_10pc_nc.csv", index=False)

HBox(children=(FloatProgress(value=0.0, max=2978.0), HTML(value='')))




# analysis

In [2]:
res_df = pd.read_csv("../../result_files/box_plot_tissues_5q_crispr_10pc_nc.csv")

In [119]:
res_df = pd.merge(
    res_df,
    lm_res[['y_id', 'x_protein', 'x_id', 'fdr', 'nc_beta', 'nc_fdr', 'beta', 'skew', 'ppi', 'r2']],
    left_on=['Gene', 'protein'],
    right_on=['y_id', 'x_protein'])

In [120]:
corr_dict_merged = pd.read_csv("../../data/protein_rna_correlations.csv")

target_proteins = corr_dict_merged[corr_dict_merged['corr_diff'].abs(
) < 0.2].sort_values('corr_avg').head(2000)['protein'].values

In [121]:
res_df['corr_diff'] = res_df['p_corr_protein'].abs() - res_df['p_corr_rna'].abs()

In [126]:
crispr_res_filtered = res_df[(res_df['protein'].isin(target_proteins))
                             & (res_df['p_pval_protein'] < 0.05)
                             & (res_df['skew'] < -2) &
                             (res_df['corr_diff'] > 0.15) &
                             (res_df['counts'] > 20) &
                             (res_df['mono'] == True)].sort_values(
                                 'corr_diff', ascending=False)[[
                                     'Gene', 'protein', 'x_id', 'beta', 'fdr',
                                     'nc_beta', 'nc_fdr', 'ppi', 'r2',
                                     'tissue', 'p_corr_protein',
                                     'p_pval_protein', 'corr_diff', 'counts',
                                     'skew'
                                 ]].reset_index(drop=True)

In [109]:
lm_res.query("y_id == 'ERBB2' and x_id == 'MRPL58'")

Unnamed: 0,y_id,x_id,n,beta,lr,covs,pval,fdr,nc_beta,nc_lr,nc_pval,nc_fdr,r2,ppi,skew,x_protein


In [128]:
crispr_res_filtered.to_csv("../../result_files/crispr/crispr_tissue_associations_filtered_10pc_nc.csv", index=False)

In [123]:
crispr_res_filtered

Unnamed: 0,Gene,protein,x_id,beta,fdr,nc_beta,nc_fdr,ppi,r2,tissue,p_corr_protein,p_pval_protein,corr_diff,counts,skew
0,FOXA1,BASI_HUMAN,BSG,0.175898,0.000127,0.299393,9.783278e-16,3,0.477842,Breast,0.757430,5.206610e-07,0.742224,32,-4.420529
1,ZSWIM7,BAZ1B_HUMAN,BAZ1B,-0.130831,0.020890,-0.097165,8.936012e-02,3,0.237547,Breast,-0.836342,4.657935e-09,0.634648,31,-2.198937
2,MYT1,SSRA_HUMAN,SSR1,0.045142,0.827471,0.077990,9.024594e-04,3,0.339374,Peripheral Nervous System,0.615107,3.000349e-03,0.608756,21,-3.100939
3,PHOX2A,CHAP1_HUMAN,CHAMP1,-0.121986,0.727387,-0.180697,4.944509e-05,-,0.672565,Peripheral Nervous System,-0.627329,2.334525e-03,0.607119,21,-4.181047
4,UXS1,AN32A_HUMAN,ANP32A,-0.122185,0.056219,-0.094637,1.001123e-01,5+,0.446563,Esophagus,-0.703667,1.437720e-05,0.593184,30,-1.860715
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617,EGFR,CC124_HUMAN,CCDC124,0.035351,0.954396,0.241137,2.642568e-06,-,0.593320,Head and Neck,0.540291,5.301404e-03,0.152100,25,-1.779083
618,ISL1,SGTA_HUMAN,SGTA,-0.060009,0.303999,-0.115676,9.242534e-07,5+,0.783923,Ovary,-0.468800,7.809979e-03,0.151979,31,-3.899070
619,CDH2,CLIC1_HUMAN,CLIC1,-0.047665,0.452007,-0.091456,2.387451e-04,4,0.568942,Head and Neck,-0.451211,5.743601e-03,0.151474,36,-1.804825
620,FOXA1,PPIA_HUMAN,PPIA,-0.046444,0.524185,-0.108310,4.936943e-04,3,0.477842,Peripheral Nervous System,0.568033,7.224653e-03,0.150834,21,-4.420529


In [129]:
res_df.iloc[27340,:]

Gene                           EGFR
protein                  ICT1_HUMAN
tissue                       Breast
protein_first_q           0.0340941
protein_last_q          -0.00567594
rna_first_q               0.0340941
rna_last_q                0.0820041
protein_first_nq_std      0.0678297
rna_first_nq_std          0.0581552
crispr_mean               0.0720659
mono                           True
p_corr_protein           -0.0869999
p_pval_protein             0.635882
p_corr_rna                -0.123741
p_pval_rna                 0.499845
counts                           32
last_q_delta                0.08768
y_id                           EGFR
x_protein                ICT1_HUMAN
x_id                         MRPL58
fdr                        0.803743
nc_beta                   -0.381459
nc_fdr                  8.74918e-05
beta                     -0.0965041
skew                       -1.77908
ppi                               -
r2                          0.59332
corr_diff                -0.

In [125]:
res_df[(res_df['Gene']=='EGFR')&(res_df['protein']=='ICT1_HUMAN')]

Unnamed: 0,Gene,protein,tissue,protein_first_q,protein_last_q,rna_first_q,rna_last_q,protein_first_nq_std,rna_first_nq_std,crispr_mean,...,x_protein,x_id,fdr,nc_beta,nc_fdr,beta,skew,ppi,r2,corr_diff
27330,EGFR,ICT1_HUMAN,Stomach,0.13434,0.04614,-0.08941,0.05626,0.033455,0.103001,-0.133195,...,ICT1_HUMAN,MRPL58,0.803743,-0.381459,8.7e-05,-0.096504,-1.779083,-,0.59332,0.02141
27331,EGFR,ICT1_HUMAN,Central Nervous System,0.040396,-0.004684,0.003936,-0.004684,0.024147,0.000282,0.079154,...,ICT1_HUMAN,MRPL58,0.803743,-0.381459,8.7e-05,-0.096504,-1.779083,-,0.59332,0.226553
27332,EGFR,ICT1_HUMAN,Bladder,0.331965,0.299895,0.327665,-0.453105,0.250366,0.163781,-0.238745,...,ICT1_HUMAN,MRPL58,0.803743,-0.381459,8.7e-05,-0.096504,-1.779083,-,0.59332,-0.127731
27333,EGFR,ICT1_HUMAN,Kidney,0.012799,0.418059,-0.142951,0.012799,0.227147,0.127828,-0.149049,...,ICT1_HUMAN,MRPL58,0.803743,-0.381459,8.7e-05,-0.096504,-1.779083,-,0.59332,-0.250995
27334,EGFR,ICT1_HUMAN,Skin,-0.016001,-0.024766,0.091505,0.10873,0.031999,0.091681,0.086876,...,ICT1_HUMAN,MRPL58,0.803743,-0.381459,8.7e-05,-0.096504,-1.779083,-,0.59332,-0.08693
27335,EGFR,ICT1_HUMAN,Head and Neck,0.149397,-0.070633,-0.016903,0.229247,0.25793,0.123227,-0.467897,...,ICT1_HUMAN,MRPL58,0.803743,-0.381459,8.7e-05,-0.096504,-1.779083,-,0.59332,0.17733
27336,EGFR,ICT1_HUMAN,Ovary,0.112034,0.069604,0.017434,0.108954,0.03851,0.055349,-0.085094,...,ICT1_HUMAN,MRPL58,0.803743,-0.381459,8.7e-05,-0.096504,-1.779083,-,0.59332,0.208921
27337,EGFR,ICT1_HUMAN,Haematopoietic and Lymphoid,0.079786,-0.031804,0.053811,-0.047334,0.048147,0.041426,0.133984,...,ICT1_HUMAN,MRPL58,0.803743,-0.381459,8.7e-05,-0.096504,-1.779083,-,0.59332,0.110935
27338,EGFR,ICT1_HUMAN,Lung,0.16961,0.03984,0.16535,0.10641,0.031497,0.055359,-0.08599,...,ICT1_HUMAN,MRPL58,0.803743,-0.381459,8.7e-05,-0.096504,-1.779083,-,0.59332,0.119463
27339,EGFR,ICT1_HUMAN,Bone,0.045993,-0.018617,0.043158,-0.018617,0.068384,0.065862,0.092172,...,ICT1_HUMAN,MRPL58,0.803743,-0.381459,8.7e-05,-0.096504,-1.779083,-,0.59332,0.012315
