In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from sklearn.metrics import r2_score
from scipy.stats import pearsonr, spearmanr
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.patches as mpatches
from scipy.stats import ttest_ind, ttest_1samp, ttest_rel
import matplotlib as mpl
import umap
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import pickle
import matplotlib.ticker as mtick
import math
from itertools import combinations
import random
from sklearn.linear_model import LinearRegression

In [2]:
# Global figure styling applied to every plot drawn after this cell.
# Font type 42 embeds TrueType fonts in PDF/PS output.
plt.rcParams.update({
    'font.family': 'Helvetica',
    "axes.labelweight": "bold",
    'font.size': 18,
    'axes.linewidth': 2,
    'figure.figsize': (10.0, 7.0),
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})
# Default seaborn colour cycle.
sns.set_palette("Dark2")

In [3]:
# Seed value; no cell in this part of the notebook reads it -- presumably
# passed to stochastic steps further down (TODO confirm).
seed = 42

# load data

In [4]:
# The colour file is read without a header row; only its first column is
# kept, converted to a plain Python list.
colour_list = (
    pd.read_csv('../../data/colour_list.txt', header=None)
    .iloc[:, 0]
    .tolist()
)

In [5]:
# Both inputs are tab-separated tables, indexed by their 'Cell_line' column.
protein_ruv = pd.read_table(
    "../../data/protein/E0022_P06_Protein_Matrix_ProNorM_no_control_update.txt"
).set_index('Cell_line')

meta = pd.read_table(
    '../../data/E0022_P06_final_sample_map_no_control.txt'
).set_index('Cell_line')

In [6]:
# Index labels of every `meta` row whose tissue type is anything other than
# 'Haematopoietic and Lymphoid', as a NumPy array.
cell_lines_no_blood = meta.loc[
    meta['Tissue_type'].ne('Haematopoietic and Lymphoid')
].index.values

In [7]:
# Lookup dicts keyed by the index of `meta`, one per annotation column.
tissue_type_map = meta['Tissue_type'].to_dict()
cancer_type_map = meta['Cancer_type'].to_dict()
batch_map = meta['Batch'].to_dict()
instrument_map = meta['Instrument'].to_dict()


In [8]:
# Pair each distinct label (in order of first appearance) with the colour at
# the same position in `colour_list`. zip() stops at the shorter input, so no
# explicit slice of the colour list is needed.
cancer_colours = dict(zip(meta['Cancer_type'].unique(), colour_list))

tissue_colours = dict(zip(meta['Tissue_type'].unique(), colour_list))

# Fixed hex colour for each instrument label.
instrument_colours = dict(
    M01='#66c2a5',
    M02='#fc8d62',
    M03='#8da0cb',
    M04='#e78ac3',
    M05='#a6d854',
    M06='#ffd92f',
)

# Fixed hex colour for each batch label.
batch_colours = dict(
    P01='#7fc97f',
    P02='#beaed4',
    P03='#fdc086',
    P04='#386cb0',
    P05='#f0027f',
    P06='#bf5b17',
)

In [9]:
# Move 'Cell_line' from the index back into a regular column.
# The guard keeps this cell idempotent: an unguarded second run would call
# reset_index() on the already-flat frame and add a spurious 'index' column.
if 'Cell_line' not in meta.columns:
    meta = meta.reset_index()

In [10]:
# One row per cell line: the first row seen for each 'Cell_line' is kept.
meta_cell_lines = meta.drop_duplicates(subset=['Cell_line'], keep='first')

In [11]:
# Long-format, tab-separated mapping table; column names are supplied here.
name_map = pd.read_csv(
    "../../data/misc/HUMAN_9606_idmapping.gene_prot.dat",
    sep='\t',
    names=['ID', 'type', 'code'],
)
# Keep the first code per (ID, type) so each pivot cell is unique, reshape to
# one column per type, then drop IDs that lack a value for any type.
name_map = (
    name_map
    .drop_duplicates(['ID', 'type'])
    .pivot(index='ID', columns='type', values='code')
    .dropna()
)

# UniProtKB-ID -> Gene_Name and its inverse. Where several keys share a
# value, the pair inserted last wins in the inverted dict.
protein2rna_map = dict(zip(name_map['UniProtKB-ID'], name_map['Gene_Name']))
rna2protein_map = dict(zip(protein2rna_map.values(), protein2rna_map.keys()))

In [12]:
# Load the matrix, transpose it so the CSV's column headers become the rows,
# label that axis 'SIDM', and expose it as a regular column.
rna_raw = (
    pd.read_csv("../../data/rna/rnaseq_voom.csv", index_col=0)
    .T
    .rename_axis('SIDM')
    .reset_index()
)

# Attach the cell-line name for each SIDM. No `on=` is given, so the merge
# joins on whatever columns the two frames share. Then index by cell line.
rna_sample = (
    pd.merge(rna_raw, meta[['SIDM', 'Cell_line']].drop_duplicates())
    .drop(columns=['SIDM'])
    .set_index('Cell_line')
)

In [13]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, avoiding mixed-type columns.
drug_df = pd.read_csv(
    '../../data/drug/drug_final_processed_eg_id.csv.gz',
    low_memory=False,
)

In [14]:
# Natural log of the maximum screening concentration, computed with one
# vectorised ufunc call on the whole column instead of a per-element map.
drug_df['max_screening_conc_ln'] = np.log(drug_df['max_screening_conc'])

In [15]:
# drug_id -> log max screening concentration, built from the distinct
# (drug_id, value) pairs in `drug_df`.
max_conc_map = (
    drug_df[['drug_id', 'max_screening_conc_ln']]
    .drop_duplicates()
    .set_index('drug_id')['max_screening_conc_ln']
    .to_dict()
)

In [16]:
# Results table read from a gzip-compressed CSV under result_files/lm/.
lm_res = pd.read_csv(
    "../../result_files/lm/lm_sklearn_degr_drug_annotated.csv.gz"
)

In [17]:
# Keep rows with fdr < 0.1, r2 > 0.4 and skew < -1, then add an 'x_protein'
# column by looking up each 'x_id' in `rna2protein_map` (unmapped IDs become
# NaN). .assign() returns a new frame, so the column is never written onto a
# filtered slice and no SettingWithCopyWarning is raised.
lm_res = (
    lm_res
    .loc[(lm_res['fdr'] < 0.1) & (lm_res['r2'] > 0.4) & (lm_res['skew'] < -1)]
    .assign(x_protein=lambda df: df['x_id'].map(rna2protein_map))
)

In [18]:
# One (y_id, x_id, x_protein) tuple per row of `lm_res`, in row order.
lm_assoc_list = list(
    zip(*(lm_res[col] for col in ('y_id', 'x_id', 'x_protein')))
)

# calculation