In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from sklearn.metrics import r2_score
from scipy.stats import pearsonr, spearmanr
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.patches as mpatches
from scipy.stats import ttest_ind, ttest_1samp, ttest_rel
import matplotlib as mpl
import umap
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import pickle
import matplotlib.ticker as mtick
import math
from itertools import combinations
import random
from sklearn.linear_model import LinearRegression
import qgrid

In [8]:
# Read the replicate-level sample mapping, discard rows whose Cell_line
# contains 'Control', then collapse to one row per Cell_line (first kept).
meta_sample = (
    pd.read_csv(
        "../../data/protein/e0022_diann_051021_sample_mapping_replicates.txt",
        sep='\t',
    )
    .loc[lambda mapping: ~mapping['Cell_line'].str.contains('Control')]
    .reset_index(drop=True)
)
meta = meta_sample.drop_duplicates(subset=['Cell_line']).reset_index(drop=True)

In [9]:
# --- Protein matrix, indexed by Cell_line ---
protein_ruv = pd.read_csv(
    "../../data/protein/e0022_diann_051021_working_matrix_averaged_processed.txt",
    sep='\t').set_index('Cell_line')

# --- UniProt "Entry name" <-> primary gene name lookups ---
# Plain string literal: the path has no placeholders, so no f-string is needed.
name_map_df = pd.read_csv("../../data/misc/uniprot_human_idmap.tab.gz",
                          sep='\t')
# Pick the one column that is needed before converting to a dict, rather than
# converting every column of the frame and discarding all but one.
name_map_dict = name_map_df.set_index("Entry name")['Gene names  (primary )'].to_dict()
protein2rna_map = name_map_dict
rna2protein_map = name_map_df.set_index("Gene names  (primary )")['Entry name'].to_dict()

# --- RNA matrix ---
# Transposed after loading so that the original column labels become the
# index, which is then exposed as a regular 'SIDM' column for the join below.
rna_raw = pd.read_csv("../../data/rna/rnaseq_voom.csv", index_col=0).T
rna_raw.index.name = 'SIDM'
rna_raw = rna_raw.reset_index()

# Translate SIDM identifiers to Cell_line names. The join key is spelled out
# so the merge cannot silently pick up another shared column name; rows with
# no matching SIDM in `meta` are dropped by the default inner join.
rna_sample = pd.merge(rna_raw,
                      meta[['SIDM', 'Cell_line']].drop_duplicates(),
                      on='SIDM').drop(['SIDM'], axis=1)
rna_sample = rna_sample.set_index(['Cell_line'])

In [14]:
# Apply Utils.gkn to each gene's expression values (missing values dropped
# first) and reassemble the results into a frame with one column per gene.
# NOTE(review): `Utils` is not imported in the visible import cell, so this
# cell depends on it being brought into scope elsewhere — confirm.
genes = rna_sample.columns
# Select each gene column directly: `rna_sample.T.loc[i]` yields the same
# Series but re-transposes the entire matrix once per gene.
gexp_t = pd.DataFrame(
    {i: Utils.gkn(rna_sample[i].dropna()).to_dict() for i in genes}
)

In [18]:
# gexp_t is built as pd.DataFrame({gene: {...}}), so its columns are already
# genes -- the layout the following cells rely on (`rna_sample[[gene]]` and a
# merge on the 'Cell_line' index). Transposing would move the genes onto the
# index and make every `gene not in rna_sample.columns` check skip the gene.
# Assumes Utils.gkn returns a Series indexed like its input -- TODO confirm.
rna_sample = gexp_t

In [21]:
# Label the row index so it can be used as the join key 'Cell_line'.
rna_sample.index.names = ['Cell_line']

In [22]:
# Per-protein Pearson correlation between a protein column of protein_ruv and
# the rna_sample column of the gene it maps to, over the Cell_line rows that
# are present (and non-missing) in both tables.
corr_records = []  # one dict per successfully correlated protein/gene pair
not_mapped = []    # proteins with no entry in name_map_dict
for protein in tqdm(protein_ruv.columns):
    if protein not in name_map_dict:
        not_mapped.append(protein)
        continue
    gene = name_map_dict[protein]
    # Mapped proteins whose gene has no RNA column are skipped without being
    # recorded in not_mapped.
    if gene not in rna_sample.columns:
        continue
    # Inner join on Cell_line, then drop rows where either value is missing.
    tmp_df = pd.merge(protein_ruv[[protein]],
                      rna_sample[[gene]],
                      on='Cell_line').dropna()
    # pearsonr requires at least two paired observations.
    if tmp_df.shape[0] < 2:
        continue
    corr, pvalue = pearsonr(tmp_df.iloc[:, 0], tmp_df.iloc[:, 1])
    corr_records.append({
        'protein': protein,
        'gene': gene,
        'pearsonr': corr,
        'pvalue': pvalue
    })
# Explicit columns keep the table schema even when no pair qualified, so the
# later cells that reference 'gene' / 'pearsonr' do not fail on an empty,
# column-less frame. res_df is only ever bound to a DataFrame.
res_df = pd.DataFrame(corr_records,
                      columns=['protein', 'gene', 'pearsonr', 'pvalue'])

  0%|          | 0/6692 [00:00<?, ?it/s]

In [23]:
# Display the proteins that had no entry in name_map_dict.
not_mapped

['DRB1_HUMAN',
 'SCP2_HUMAN',
 'UTRN_HUMAN',
 'GTPB4_HUMAN',
 'PALS2_HUMAN',
 'PALS1_HUMAN',
 'UTP25_HUMAN',
 'EPB41_HUMAN',
 'STEEP_HUMAN',
 'DAA10_HUMAN',
 'PBIR2_HUMAN',
 'H3PS2_HUMAN',
 'MIX23_HUMAN',
 'NOPC1_HUMAN',
 'EMC5_HUMAN',
 'SCD_HUMAN',
 'STYL2_HUMAN',
 'PBIR1_HUMAN',
 'ATPMK_HUMAN',
 'Q9HB66_HUMAN']

In [24]:
# Spot-check: show the correlation row(s) for the gene DKC1.
res_df[res_df['gene'] == "DKC1"]

Unnamed: 0,protein,gene,pearsonr,pvalue
1577,DKC1_HUMAN,DKC1,0.294594,3.5489639999999996e-20


In [9]:
# Write the correlation table to CSV without the index column.
# NOTE(review): this cell's execution count (In [9]) is lower than that of the
# cell that builds res_df (In [22]), and the same path is later read back as
# `res_df_no_norm` -- confirm which version of res_df this file should hold
# before re-running, since this call overwrites it.
res_df.to_csv("../../result_files/rna_corr_shawn.csv", index=False)

In [30]:
# Median of the numeric columns (pearsonr, pvalue) over all protein/gene pairs.
# numeric_only=True excludes the string columns explicitly; relying on pandas
# to drop them silently is deprecated and raises TypeError in newer versions.
res_df.median(numeric_only=True)


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.



pearsonr    4.216646e-01
pvalue      1.158843e-14
dtype: float64

In [2]:
# Load the correlation table stored at the path that res_df.to_csv writes to.
# NOTE(review): presumably this holds results computed without normalisation,
# going by the variable name -- confirm the file was not overwritten since.
res_df_no_norm = pd.read_csv("../../result_files/rna_corr_shawn.csv")

In [3]:
# Median of the numeric columns (pearsonr, pvalue) of the reloaded table.
# numeric_only=True excludes the string columns explicitly; relying on pandas
# to drop them silently is deprecated and raises TypeError in newer versions.
res_df_no_norm.median(numeric_only=True)

  res_df_no_norm.median()


pearsonr    4.161678e-01
pvalue      3.509480e-14
dtype: float64

In [6]:
# One row per gene: only the first occurrence of each gene value is kept.
res_df_no_norm.drop_duplicates(subset=['gene'])

Unnamed: 0,protein,gene,pearsonr,pvalue
0,SRP14_HUMAN,SRP14,0.313829,8.130545e-23
1,ZFP91_HUMAN,ZFP91,0.262407,1.988657e-11
2,IRS2_HUMAN,IRS2,0.488880,2.521380e-12
3,RL4_HUMAN,RPL4,0.304117,1.852472e-21
4,SAMD1_HUMAN,SAMD1,0.515297,2.031825e-50
...,...,...,...,...
6423,NEGR1_HUMAN,NEGR1,0.470450,4.105341e-07
6424,MOT2_HUMAN,SLC16A7,0.562501,3.539948e-06
6425,LAPM5_HUMAN,LAPTM5,0.139067,3.737976e-01
6426,ZN462_HUMAN,ZNF462,0.196250,3.554960e-02
