In [1]:
import pandas as pd
import rpy2
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri, Formula
pandas2ri.activate()
from rpy2.robjects.packages import importr
from rpy2.robjects.conversion import localconverter
'''
Using DESeq2 in python
Adopted from:  https://stackoverflow.com/questions/41821100/running-deseq2-through-rpy2
and also from: https://gist.github.com/wckdouglas/3f8fb27a3d7a1eb24c598aa04f70fb25
'''

import os, sys, time,re
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib.lines as mlines
from matplotlib.gridspec import  GridSpec
import seaborn as sns

from scipy import stats
from scipy.cluster import hierarchy
from statsmodels.stats.multitest import multipletests
from collections import defaultdict
import pysam
from collections import defaultdict
import statsmodels.api as sm

REFFLAT_hg38      = '../ref/refFlat_hg38_repiso.txt'
REFFLAT_chlSab2   = '../ref/refFlat_chlSab2.txt'     # Green monkey genome, for Vero cell data.
REFFLAT_SARSCOV2  = '../ref/annot_SARSCOV2.txt'      # Not exactly refFlat, but similar format. Used ORF start-end information.

BAMDIR_hostmapped = '/extdata1/baeklab/Doyeon/SARSCOV2/data/%s_hostalign_021721/%s.bam'     #e.g. %('mRNASeq','mRNA_2h_rep1')
BAMDIR_cov2mapped = '/extdata1/baeklab/Doyeon/SARSCOV2/data/%s_SARSCOV2align_021721/%s.bam' #e.g. %('mRNASeq','mRNA_2h_rep1')
RPKMDIR           = '/extdata1/baeklab/Doyeon/SARSCOV2/data/rpkm_081820/%s.txt'             #e.g. %'RPF_2h_rep1'
'''
Sequencing data can be downloaded from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE157490
'''

GENELISTDIR  = '../ref/%s.txt' #e.g. %'Gordonetal_interacting_proteins'
RESULTDIR    = '../result/%s'  #e.g. %('mRNA_quantification.tsv')
FIGDIR       = '../fig/%s'     #e.g. %('revision_QC/021721_QTISeq_12h.pdf')

%matplotlib inline

## DESeq R package

In [2]:
'''
For install DESeq2 packages:
open R: ~/baeklab/Hyeonseong/anaconda3/envs/Doyeon-tf2/bin/R
> install.packages("BiocManager")
> BiocManager::install("DESeq2")
loooooooong time required for installing dependent packages
'''

deseq2 = importr('DESeq2')
to_dataframe = ro.r('function(x) data.frame(x)')
BiocGenerics = importr('BiocGenerics')


In [3]:
class py_DESeq2:
    '''
    DESeq2 object through rpy2
    input:
    count_matrix: should be a pandas dataframe with each column as count, and a id column for gene id
        example:
        id    sampleA    sampleB
        geneA    5    1
        geneB    4    5
        geneC    1    2
    design_matrix: an design matrix in the form of pandas dataframe, see DESeq2 manual, samplenames as rownames
                treatment
    sampleA1        A
    sampleA2        A
    sampleB1        B
    sampleB2        B
    design_formula: see DESeq2 manual, example: "~ treatment""
    gene_column: column name of gene id columns, exmplae "id"
    '''
    def __init__(self, count_matrix, design_matrix, design_formula, gene_column='id'):
        try:
            assert gene_column in count_matrix.columns, 'Wrong gene id column name'
            gene_id = count_matrix[gene_column]
        except AttributeError:
            sys.exit('Wrong Pandas dataframe?')

        self.dds = None
        self.deseq_result = None
        self.deseq_result_Robj = None
        self.resLFC = None
        self.comparison = None
        self.normalized_count_matrix = None
        self.gene_column = gene_column
        self.gene_id = count_matrix[self.gene_column]
        with localconverter(ro.default_converter + pandas2ri.converter):
            self.count_matrix = ro.conversion.py2rpy(count_matrix.drop(gene_column,axis=1))
            self.design_matrix = ro.conversion.py2rpy(design_matrix)
        self.design_formula = Formula(design_formula)

    def run_deseq(self, **kwargs):
        self.dds = deseq2.DESeqDataSetFromMatrix(countData=self.count_matrix, 
                                        colData=self.design_matrix,
                                        design=self.design_formula)        
        self.dds = deseq2.DESeq(self.dds, **kwargs)
        
    def get_deseq_result(self,lfcshrink=True, **kwargs):
        self.comparison = deseq2.resultsNames(self.dds)
        if lfcshrink:
            self.deseq_result_Robj = deseq2.lfcShrink(self.dds,coef='treatment_B_vs_A',type='apeglm',**kwargs)
        else:
            self.deseq_result_Robj = deseq2.results(self.dds, **kwargs)
        self.deseq_result = to_dataframe(self.deseq_result_Robj)
        
        with localconverter(ro.default_converter + pandas2ri.converter):
            self.deseq_result = ro.conversion.rpy2py(self.deseq_result) ## back to pandas dataframe
        
        self.deseq_result[self.gene_column] = self.gene_id.values
        

## Basic functions

In [4]:
class gene:
    def __init__(self):
        self.sGeneSym       = ''
        self.sNMID          = ''
        self.sChrID         = ''
        self.nExons         = 0
        self.nExonStartlist = []
        self.nExonEndlist   = []
        self.sStrandDir     = ''
        self.nORF_5p_gidx   = 0
        self.nORF_3p_gidx   = 0
        self.nExonlen       = 0
        self.nU5len         = 0
        self.nU3len         = 0

    def parse_refflat(self,refline):
        sInfolist = refline.strip().replace(' ','\t').split('\t')
        self.sGeneSym = sInfolist[0].upper()
        self.sNMID    = sInfolist[1]
        self.sChrID   = sInfolist[2] ##chr1,,,,chrX,chrY for human
        self.sStrandDir   = sInfolist[3]
        self.nORF_5p_gidx = int(sInfolist[6])
        self.nORF_3p_gidx = int(sInfolist[7])
        self.nExons       = int(sInfolist[8])
        self.nExonStartlist = [int(i) for i in sInfolist[9].split(',') if i != '']
        self.nExonEndlist   = [int(i) for i in sInfolist[10].split(',') if i != '']
        assert (self.nExons == len(self.nExonStartlist)) and (self.nExons == len(self.nExonEndlist))
        self.nExonlen       = sum([end-start for start,end in zip(self.nExonStartlist, self.nExonEndlist)]) 
        
        tmp_exonlen = 0
        for start,end in zip(self.nExonStartlist, self.nExonEndlist):
            if start <= self.nORF_5p_gidx <  end:
                self.nU5len  = tmp_exonlen + (self.nORF_5p_gidx - start)
            if start <= self.nORF_3p_gidx   <= end:
                self.nU3len = self.nExonlen - (tmp_exonlen + (self.nORF_3p_gidx - start))
            tmp_exonlen += (end-start)
        if self.sStrandDir == '-':
            tmp_nU3len  = self.nU5len
            self.nU5len = self.nU3len
            self.nU3len = tmp_nU3len
#########################
def load_RPKM(samplename):
    '''
    id	symbol	ef_length	nsegments1	nsegments2	nsegments3	readcount	readcount_uniq	exlev	exrnk	expct
    NM_006004	UQCRH	276	18702	18702	18494	9250.416666666679	32	27048.868164275482	1.0	0.005263989050902774
    '''
    df = pd.read_csv(RPKMDIR %samplename,sep='\t')
    df = df.rename(columns={'id':'NMID','symbol':'gsym',
                            'readcount':'nreads','exlev':'RPKM'})
    
    df['gsym'] = df['gsym'].str.upper()
    
    df['nreads'] = df['nreads'].round(0)
    df = df[['NMID','gsym','nreads','RPKM','exrnk','expct']]
    df = df.set_index('NMID')
    return df

## Figure 5a- DEG identification and plotting

### DEG identification

In [5]:
def identify_DEG(infected=[],uninfected=[], DEG_method=py_DESeq2, verbose=False,
                 OutF = ''):
    ## infected samples
    nread_df      = load_RPKM(infected[0])
    infected_nreads_df         = nread_df[['gsym','nreads']]
    infected_nreads_df.columns = ['gsym',infected[0]]
    infected_expct_df          = nread_df[['gsym','expct']]
    infected_expct_df.columns  = ['gsym',infected[0]+'_expct']
    for samplename in infected[1:]:
        nread_df = load_RPKM(samplename)
        infected_nreads_df[samplename]         = nread_df['nreads']
        infected_expct_df[samplename+'_expct'] = nread_df['expct']
    infected_expct_df = infected_expct_df.set_index(['gsym']).mean(axis=1) #average expct
    ##
    ## uninfected samples
    nread_df      = load_RPKM(uninfected[0])
    uninfected_nreads_df         = nread_df[['gsym','nreads']]
    uninfected_nreads_df.columns = ['gsym',uninfected[0]]
    uninfected_expct_df          = nread_df[['gsym','expct']]
    uninfected_expct_df.columns  = ['gsym',uninfected[0]+'_expct']
    for samplename in uninfected[1:]:
        nread_df = load_RPKM(samplename)
        uninfected_nreads_df[samplename]         = nread_df['nreads']
        uninfected_expct_df[samplename+'_expct'] = nread_df['expct']
    uninfected_expct_df = uninfected_expct_df.set_index(['gsym']).mean(axis=1) #average expct
    ##
    
    #For convenience, replace gsym to index and drop NMID
    infected_nreads_df   = infected_nreads_df.reset_index(drop=True)
    infected_nreads_df   = infected_nreads_df.set_index(['gsym'])
    uninfected_nreads_df = uninfected_nreads_df.reset_index(drop=True)
    uninfected_nreads_df = uninfected_nreads_df.set_index(['gsym'])
    
    count_mat = pd.concat([infected_nreads_df,uninfected_nreads_df],axis=1)
    count_mat = count_mat.reset_index()
    count_mat = count_mat.rename(columns={'index':'id'})
    design_mat= pd.DataFrame(['B' for i in range(len(infected))]+['A' for i in range(len(uninfected))],
                          index=infected+uninfected)
    design_mat.columns = ['treatment']
    
    DESeq = DEG_method(count_mat,design_mat,design_formula='~ treatment',gene_column='id')
    DESeq.run_deseq()
    DESeq.get_deseq_result(lfcshrink=True)
    
    result_df = DESeq.deseq_result
    if verbose:
        display(result_df.head())
    result_df = result_df.set_index(['id'])
    result_df['infected_expct']   = infected_expct_df
    result_df['uninfected_expct'] = uninfected_expct_df
    
    if verbose:
        display(result_df.head(10))
    if OutF != '':
        result_df.to_csv(RESULTDIR %OutF, sep = '\t')
    return result_df

### Plots (MA, volcano)

In [6]:
#MAplot: not used in the paper
def plotMA(sample_prefix='mRNA',hpi='0h', padj_cut=0.01, log2fc_cut=2.0, OutFigname='',
           ax = None, show_fig = False):
    InFname = f'{sample_prefix}_DESeq2_{hpi}.tsv'
    df = pd.read_csv(RESULTDIR %InFname, sep ='\t', header=0, index_col=0)
    '''
    id	baseMean	log2FoldChange	lfcSE	pvalue	padj	infection_expct	uninfection_expct
    A1BG	0.0					83.81389166710532	87.61120176870033
    A1CF	0.20868225617457992	0.06273705450357624	0.5463246183313462	0.4157015259052005		79.67310627993893	87.61120176870033
    '''
    up_df       = df[(df['padj']<padj_cut) & (df['log2FoldChange']>0)]
    up_lfc_df   = up_df[(up_df['log2FoldChange']> log2fc_cut)]
    up_x_df     = up_df[(up_df['log2FoldChange']<=log2fc_cut)]
    down_df     = df[(df['padj']<padj_cut) & (df['log2FoldChange']<0)]
    down_lfc_df = down_df[(down_df['log2FoldChange']<  -log2fc_cut)]
    down_x_df   = down_df[(down_df['log2FoldChange']>= -log2fc_cut)]
    nonsig_df   = df[df['padj']>=padj_cut]
    
    if ax == None:
        fig, ax = plt.subplots(figsize=(4,3))
    dot_size = 3
    ax.scatter(nonsig_df['baseMean']   ,nonsig_df['log2FoldChange'],   color='grey',   rasterized = True, s = dot_size)
    ax.scatter(up_x_df['baseMean']     ,up_x_df['log2FoldChange'],     color='pink',   rasterized = True, s = dot_size)
    ax.scatter(down_x_df['baseMean']   ,down_x_df['log2FoldChange'],   color='skyblue',rasterized = True, s = dot_size)
    ax.scatter(up_lfc_df['baseMean']   ,up_lfc_df['log2FoldChange'],   color='red',    rasterized = True, s = dot_size)
    ax.scatter(down_lfc_df['baseMean'] ,down_lfc_df['log2FoldChange'], color='blue',   rasterized = True, s = dot_size)
    ax.set_ylim(-5,5)
    ax.set_xscale('log')
    ax.set_xlabel('Num. reads')
    ax.set_ylabel('log2(fold change)')
    ax.set_title('Criteria: P.adj<%.02f, abs(log2fc)>%.01f\n# upregulated: %d, #downregulated: %d' %(padj_cut,
                                                                                                   log2fc_cut,
                                                                                                   len(up_lfc_df),
                                                                                                   len(down_lfc_df)
                                                                                                  ))
    if OutFigname != '':
        plt.tight_layout()
        plt.savefig(FIGDIR %OutFigname)
        plt.close()
    elif show_fig:
        plt.tight_layout()
        plt.show()
        plt.close()
    
    return None

def plotVolcano(sample_prefix='mRNA',hpi='0h',padj_cut=0.01,log2fc_cut=2.0,
                OutFigname='',ax = None, show_fig = False):
    InFname = f'{sample_prefix}_DESeq2_{hpi}.tsv'
    df = pd.read_csv(RESULTDIR %InFname, sep ='\t', header=0, index_col=0)
    '''
    id	baseMean	log2FoldChange	lfcSE	pvalue	padj	infection_expct	uninfection_expct
    A1BG	0.0					83.81389166710532	87.61120176870033
    A1CF	0.20868225617457992	0.06273705450357624	0.5463246183313462	0.4157015259052005		79.67310627993893	87.61120176870033
    '''
    up_df       = df[(df['padj']<padj_cut) & (df['log2FoldChange']>0)]
    up_lfc_df   = up_df[(up_df['log2FoldChange']> log2fc_cut)]
    up_x_df     = up_df[(up_df['log2FoldChange']<=log2fc_cut)]
    down_df     = df[(df['padj']<padj_cut) & (df['log2FoldChange']<0)]
    down_lfc_df = down_df[(down_df['log2FoldChange']<  -log2fc_cut)]
    down_x_df   = down_df[(down_df['log2FoldChange']>= -log2fc_cut)]
    nonsig_df   = df[df['padj']>=padj_cut]
    
    if ax == None:
        fig, ax = plt.subplots(figsize=(3,3))
    dot_size = 3
    ax.scatter(nonsig_df['log2FoldChange'],  -np.log10(nonsig_df['padj']),    color='k',      rasterized = True, s = dot_size)
    ax.scatter(up_x_df['log2FoldChange'],    -np.log10(up_x_df['padj']    ),  color='pink',   rasterized = True, s = dot_size)
    ax.scatter(down_x_df['log2FoldChange'],  -np.log10(down_x_df['padj']  ),  color='skyblue',rasterized = True, s = dot_size)
    ax.scatter(up_lfc_df['log2FoldChange'],  -np.log10(up_lfc_df['padj']    ),color='red',    rasterized = True, s = dot_size)
    ax.scatter(down_lfc_df['log2FoldChange'],-np.log10(down_lfc_df['padj']  ),color='blue',   rasterized = True, s = dot_size)
    
    ax.set_xlabel('log2(fold change)')
    ax.set_ylabel('-log10(q value)')
    ax.set_xlim(-10,10)
    ax.set_ylim(0,100)
    ax.set_title(hpi)
    if OutFigname != '':
        plt.tight_layout()
        plt.savefig(FIGDIR %OutFigname)
        plt.close()
    elif show_fig:
        plt.tight_layout()
        plt.show()
        plt.close()
    
    return None

def plot_volcano_multipanel(sample_prefix, hpi_list_list = [['0h','1h','2h','4h'],['12h','16h','24h','36h']],
                            padj_cut=0.01,log2fc_cut=2.0,OutFigname=''):
    
    fig, axes  = plt.subplots(nrows=len(hpi_list_list),           ncols=len(hpi_list_list[0]), 
                              figsize=(3*(len(hpi_list_list[0])), 3*len(hpi_list_list)))
    for idx_row, hpi_list in enumerate(hpi_list_list):
        for idx_col, hpi in enumerate(hpi_list):
            ax = axes[idx_row,idx_col]
            plotVolcano(sample_prefix=sample_prefix,hpi=hpi, padj_cut=padj_cut,log2fc_cut=log2fc_cut,
                        OutFigname='',ax = ax, show_fig = False)
    if OutFigname != None:
        plt.tight_layout()
        plt.savefig(FIGDIR %OutFigname)
        plt.close()
    else:
        plt.tight_layout()
        plt.show()
        plt.close()
    return None

### DEG identification and plotting for mRNA, RPF, and QTI (Fig. 5, SFig. 10)

In [None]:
#DESeq2 for mRNA, RPF, and QTI
tmp_hpi_list = ['%dh' %i for i in [0,1,2,4,12,16,24,36,48]]
tmp_sample_prefix_list = ['mRNA','RPF','QTI']

for sample_prefix in tmp_sample_prefix_list:
    if sample_prefix == 'mRNA':
        enumerated_prefix = ['RPFpaired','QTIpaired']
    else:
        enumerated_prefix = [sample_prefix]
    for hpi in tmp_hpi_list:
        print(sample_prefix,hpi,time.ctime(),sep='\t')
        if hpi == '48h':
            reps = 3
            uninfected_name = 'uninfected48h'
        else:
            reps = 2
            uninfected_name = 'uninfected'
        infected   = []
        uninfected = []
        for prefix in enumerated_prefix:
            for rep in range(1, reps+1):
                infected.append(  f'{prefix}_{hpi}_rep{rep}')
                uninfected.append(f'{prefix}_{uninfected_name}_rep{rep}')
        OutF = f'{sample_prefix}_DESeq2_{hpi}.tsv'
        _= identify_DEG(infected=infected,uninfected=uninfected, 
                        DEG_method=py_DESeq2, verbose= False, OutF = OutF)
                

In [8]:
#volcano plot
'''Fig. 5a: RPF'''
plot_volcano_multipanel('RPF', hpi_list_list = [['0h','1h','2h','4h'],['12h','16h','24h','36h']],
                        padj_cut=0.01,log2fc_cut=2.0,OutFigname='Fig5_RPF_volcano.pdf')
'''SFig. 10a: mRNA, QTI'''
plot_volcano_multipanel('mRNA', hpi_list_list = [['0h','1h','2h','4h'],['12h','16h','24h','36h']],
                        padj_cut=0.01,log2fc_cut=2.0,OutFigname='SFig10a_mRNA_volcano.pdf')
plot_volcano_multipanel('QTI', hpi_list_list = [['0h','1h','2h','4h'],['12h','16h','24h','36h']],
                        padj_cut=0.01,log2fc_cut=2.0,OutFigname='SFig10a_QTI_volcano.pdf')

## Fig. 5b-g (hierarchical clustering of DEGs, and time-course expression)

In [9]:
def timecourse_plot(DEG_df_list,hpi_list, padj_cut=0.01,log2fc_cut=2, clip = 3,
                    ylim = 8, n_cluster=5, as_average=False, 
                    load_precalc_clusters=False, clusterFname='',
                    clusters_to_show=[],clusters_priority=[], highlight_gene_info_df = pd.DataFrame(),
                    cl_method='ward',cl_metric='euclidean',
                    ax = None, OutFigname='',show_fig=True, verbose=False):
    if OutFigname != '':
        assert ('timecourse.' in OutFigname)
    if not(type(ylim) in [list,tuple]):
        ylim = (-ylim,ylim)
    up_DEG_list   = []
    down_DEG_list = []
    timecourse_df = pd.DataFrame(columns=hpi_list,     index = DEG_df_list[0].index) #index: gsym (id)
    DEG_summary_df= pd.DataFrame(columns=['up','down'],index = hpi_list)
    
    for hpi,DEG_df in zip(hpi_list,DEG_df_list):
        df = DEG_df.copy()
        up_DEG   = list(df[(df['padj']<padj_cut) & \
                           (df['log2FoldChange']>log2fc_cut) &\
                           ((df['infected_expct']<50) |(df['uninfected_expct']<50))
                          ].index)
        down_DEG = list(df[(df['padj']<padj_cut) & \
                           (df['log2FoldChange']<-log2fc_cut) &\
                           ((df['infected_expct']<50) |(df['uninfected_expct']<50))
                          ].index)
        up_DEG_list   += up_DEG
        down_DEG_list += down_DEG
        DEG_summary_df.loc[hpi] = [len(up_DEG),len(down_DEG)]
        timecourse_df[hpi]   = df['log2FoldChange']
    
    DEG_list = list(set(up_DEG_list)|set(down_DEG_list))
    if verbose:
        print('summary of DEGs')
        print('total #:', len(DEG_list), sep = '\t')
        display(DEG_summary_df.T)
        
    timecourse_df = timecourse_df.fillna(0.0)
    if load_precalc_clusters:
        ID_cl_df = pd.read_csv(RESULTDIR %clusterFname, sep = '\t', header = 0, index_col = 0)
    else:
        ID_cl_df = cluster_DEGs(timecourse_df,DEG_list, cl_method=cl_method,
                                cl_metric=cl_metric, n_cluster=n_cluster, clip=clip,
                                OutFigname=OutFigname.replace('timecourse.pdf','hclust.pdf'),show_fig=show_fig)
        ID_cl_df.to_csv(RESULTDIR %clusterFname,
                        sep = '\t')
    np.random.seed(2020)
    sampled_cloud = np.random.choice(sorted(list(set(timecourse_df.index)- (set(up_DEG_list)|set(down_DEG_list)))), 100)
    
    ####Colors for different clusters
    if ax == None:
        fig, ax = plt.subplots(figsize=(3.5,3))
    
    palette = sns.color_palette('bright',10) ###########CAUTION: HARD-CODED (max 6 colors)
    palette = [color for i,color in enumerate(palette) if not(i in [0,3,5,7])]
    
    if clusters_to_show == []:
        clusters_to_show = list(range(1,1+n_cluster))
    
    
    if as_average:
        ylim = (-5,5)
        for DEG in DEG_list:
            ax.plot(timecourse_df.loc[DEG],color='grey',alpha=0.2,linewidth=0.5)
            
        for cluster in clusters_to_show:
            color   = palette[cluster-1]
            DEGS_within_cluster  = [DEG for DEG in DEG_list if ID_cl_df.loc[DEG,'cluster'] == cluster]
            lfc_df  = timecourse_df.loc[DEGS_within_cluster]
            avg_lfc = lfc_df.mean(axis=0)
            std_lfc = lfc_df.std( axis=0)
            ax.plot(avg_lfc,color=color,linewidth=1)
            ax.fill_between(np.arange(len(avg_lfc)), 
                            avg_lfc-std_lfc,avg_lfc+std_lfc, color=color, alpha=0.7)
        
    else:
        if len(highlight_gene_info_df) == 0:
            grey_color = False
        else:
            grey_color = True
        for DEG in DEG_list:
            cluster = ID_cl_df.loc[DEG,'cluster']
            if grey_color:
                color  = 'grey'
                alpha  = 0.2
            else:
                color  = palette[cluster-1]
                alpha  = 1.0
                
            if cluster in clusters_to_show:
                if not (cluster in clusters_priority):
                    ax.plot(timecourse_df.loc[DEG],color=color,linewidth=0.5, alpha = alpha)
                    
        ## clusters to draw last
        for p_cluster in clusters_priority:
            for DEG in DEG_list:
                cluster = ID_cl_df.loc[DEG,'cluster']
                if grey_color:
                    color  = 'grey'
                    alpha  = 0.2
                else:
                    color  = palette[cluster-1]
                    alpha  = 1.0
                if cluster == p_cluster:
                    ax.plot(timecourse_df.loc[DEG],color=color,linewidth=0.5, alpha = alpha)
        
        highlight_summary_df = pd.DataFrame(columns = ['lfc_36h','lfc_maxabs']) 
        
        for genelist_name, row in highlight_gene_info_df.iterrows():
            color        = row['color']
            linewidth    = row['linewidth']
            print_cutoff = row['print_cutoff']
            highlight_genelist = row['genelist']
            
            for gsym in highlight_genelist: ##for special markup, Fig. 6
                if gsym in timecourse_df.index:
                    ax.plot(timecourse_df.loc[gsym], color=color, linewidth=linewidth)
                    lfc_maxabs = timecourse_df.loc[gsym].abs().max()
                    if lfc_maxabs>print_cutoff:
                        lfc_36h = timecourse_df.loc[gsym,str(hpi_list[-1])]
                        loc_y   = np.clip(lfc_36h,*ylim)
                        
                        ax.text(len(hpi_list)-1,loc_y,gsym)
                        highlight_summary_df.loc[gsym] = [lfc_36h, lfc_maxabs]
        
        if verbose:
            print('highlight genes with >1.5 max(|log2fc|)')
            display(highlight_summary_df)
    
    ax.set_ylim(*ylim)
    ax.set_ylabel('log2(fold change)')
    ax.set_xlabel('hpi')
    
    if OutFigname != '':
        plt.tight_layout()
        plt.savefig(FIGDIR %OutFigname)
        plt.close()
    elif show_fig:
        plt.tight_layout()
        plt.show()
        plt.close()
    
    return timecourse_df


def to_hex(r, g, b,float_scale = True):
    if float_scale:
        r = int(r*255)
        g = int(g*255)
        b = int(b*255)
    rgb = r<<16 | g<<8 | b
    return f'{rgb:06x}'

def cluster_DEGs(timecourse_df,DEG_list, cl_method='ward',cl_metric='euclidean',
                 n_cluster=5, clip=2.8,  OutFigname='', show_fig=False):
    df = timecourse_df.loc[DEG_list].clip(-clip,clip)
    linkage = hierarchy.linkage(df, method=cl_method,metric=cl_metric)
    cluster = hierarchy.fcluster(linkage,n_cluster,criterion='maxclust')
    palette = sns.color_palette('bright',10)
    palette = [color for i,color in enumerate(palette) if not(i in [0,3,5,7])]  ##CAUTION: HARD-CODED
    
    lut         = {(i+1): col for i,col in enumerate(palette)}
    ID_cl_df    = pd.DataFrame(np.array([ [cluster[i] for i,ID in enumerate(df.index)] ]).T, 
                               index=df.index, columns=['cluster'])
    
    colors      = pd.Series([lut[ID_cl_df.loc[ID,'cluster']] for ID in ID_cl_df.index],index = ID_cl_df.index)
    colors.name = 'Cluster'
    max_lfc = np.ceil(clip)
    clustergrid = sns.clustermap(df,figsize=(10,10),method=cl_method,metric=cl_metric,
                                 row_colors = colors,row_linkage= linkage,col_cluster=False,
                                 cmap='coolwarm',vmin=-max_lfc,vmax=max_lfc,
                                 cbar_kws={'label': 'log2(fold change)'},
                                 rasterized = True)
    
    clustergrid.ax_heatmap.set_xlabel('hpi')
    clustergrid.ax_heatmap.set_ylabel('')
    
    ## print list of clustered DEGs, for cytoscape
    for i in range(1,1+n_cluster):
        IDs = list(ID_cl_df[ID_cl_df['cluster'] == i].index)
        print('Cluster%d (%d)' %(i, len(IDs)),','.join(IDs))
    
    if OutFigname != '':
        plt.savefig(FIGDIR %OutFigname)
        plt.close()
    elif show_fig:
        plt.show()
        plt.close()
        
    return ID_cl_df

### Clustering and timecourse plot (Fig. 5b,c, SFig. 10b,c)

In [10]:
for tmp_sample_prefix in ['mRNA','RPF','QTI']:
    print(f'-------{tmp_sample_prefix}-------')
    tmp_DEG_df_list   = []
    tmp_hpi_list      = ['%dh' %i for i in [0,1,2,4,12,16,24,36]]
    for hpi in tmp_hpi_list:
        InFname    = f'{tmp_sample_prefix}_DESeq2_{hpi}.tsv'
        tmp_DEG_df = pd.read_csv(RESULTDIR %(InFname), sep ='\t', header=0, index_col=0)
        tmp_DEG_df_list.append(tmp_DEG_df)

    timecourse_df = timecourse_plot(tmp_DEG_df_list,tmp_hpi_list, padj_cut=0.01, log2fc_cut=2, clip = 2.8,
                                    ylim = 8, n_cluster=5, as_average=False, 
                                    load_precalc_clusters=False, clusterFname=f'{tmp_sample_prefix}_cluster.tsv',
                                    clusters_to_show=[],clusters_priority=[], 
                                    cl_method='ward',cl_metric='euclidean',ax = None, 
                                    OutFigname=f'Fig5_{tmp_sample_prefix}_individual_timecourse.pdf',
                                    show_fig=False, verbose=True)
    timecourse_df.to_csv(RESULTDIR %(f'{tmp_sample_prefix}_timecourse.tsv'),sep='\t')
    _             = timecourse_plot(tmp_DEG_df_list,tmp_hpi_list, padj_cut=0.01, log2fc_cut=2, clip = 2.8,
                                    ylim = 8, n_cluster=5, as_average=True, 
                                    load_precalc_clusters=True, clusterFname=f'{tmp_sample_prefix}_cluster.tsv',
                                    clusters_to_show=[],clusters_priority=[], 
                                    cl_method='ward',cl_metric='euclidean',ax = None, 
                                    OutFigname=f'Fig5_{tmp_sample_prefix}_average_timecourse.pdf',
                                    show_fig=False, verbose=False)

-------mRNA-------
summary of DEGs
total #:	542


Unnamed: 0,0h,1h,2h,4h,12h,16h,24h,36h
up,0,1,5,22,100,193,209,216
down,0,0,4,26,85,87,80,58


Cluster1 (23) IER5L,PIM1,PRKCD,GRB7,TOX3,SOCS3,MIDN,NOL4L,BCORL1,ZNF792,FAM110C,GSE1,BCL3,CDC42EP4,EGR1,SPRED3,FOS,SPSB2,NKX6-1,BCOR,PLEKHH3,CXXC5,TIGD7
Cluster2 (134) CRYM,SPIRE2,KRT13,TCEA3,SERPINA4,RASL11A,ZNF488,AREG,ST6GALNAC2,ANKRD1,TXNIP,AKR1C2,FAM171B,VSIG1,ACTA2,IGF2BP2,YPEL4,HLA-DOA,FSCN1,CABYR,RDM1,ZNF816-ZNF321P,ASS1,ADSSL1,KIF5C,RTN4RL2,SCARA3,UNC5A,AQP3,GCNT4,ARHGAP23,PHF7,CPM,SMARCD3,NDRG4,MOK,TNFAIP2,ARRDC4,WNT7B,SULF2,VSIR,TMPRSS3,AKR1C1,MCAM,CCDC110,KLK11,MYLK,ARHGEF25,ALDH1L1,TMCC3,PPP1R1B,TP53I11,ERP27,UGT1A1,TTYH1,SYT7,AQP4,BBOF1,GOLT1A,ACTBL2,GNAZ,KREMEN1,SH3BP2,TMEM176A,TRIM15,AKR1B15,TCEA2,HLF,PSG4,CYP1B1,PLAC8,NANOS1,PLXND1,CLDN16,TUBB4A,TLE2,FOSL1,FBLN1,IQCH,RCAN1,PTGES,SLC29A4,SLC16A14,TIMP3,CDKN3,UGT1A6,FSTL1,ZNF285,APOL3,BCAS1,GSDMB,BCAM,GMCL1,DBP,SCNN1A,MYZAP,BPIFB1,KLF9,CCL2,C9ORF135,FAM131B,ALDH1A1,MUC5B,TSPAN7,COLCA2,MAOA,WIPF3,AKR1B10,RASSF5,NCCRP1,REEP2,NMT2,RARRES1,ADGRB2,CYBRD1,CACNG4,TNNC1,ALDH1A3,ATP2B4,FGD3,ID1,IFITM2,ANKRD33B,NR1H4,ABI3BP,BCL2L1

Unnamed: 0,lfc_36h,lfc_maxabs


-------RPF-------
summary of DEGs
total #:	677


Unnamed: 0,0h,1h,2h,4h,12h,16h,24h,36h
up,1,0,3,24,91,150,164,191
down,0,2,8,35,146,201,168,100


Cluster1 (37) CHAC1,IER5L,CDKN1C,PRKD2,PLEKHG3,LIPE,TFEB,KLF2,GRB7,CDC42EP2,RTN4R,ADM2,YPEL3,NOL4L,ZSWIM4,TRIM15,BCORL1,GSE1,MAFF,PLEKHG6,BCL3,PIK3IP1,SHF,MYZAP,IRF5,KLF9,CDK18,SESN2,NFATC4,FBXL8,TRIM31,ZFP36,CTTNBP2NL,BCOR,GLIS2,TFAP2C,TBC1D10A
Cluster2 (292) SLC40A1,EPHX1,PAQR8,ERO1B,MFSD4A,PPM1L,SYTL3,SECTM1,KRT13,UGT1A5,TCEA3,BTD,HLA-DMB,SERPINA4,ITM2A,ALDH1L2,DEFB1,ODAM,HLA-DQA2,IGF1R,CTSE,AREG,ZNF226,PRTG,ADAM10,ST6GALNAC2,FTL,ANKRD1,ARHGDIB,MGLL,SLC26A9,TXNIP,PLAT,AKR1C2,NRP2,ERAP2,HLA-DQA1,RBM12,TM4SF4,VSIG1,TMEM173,IFNAR2,ACE2,IFITM10,DHDH,ASS1,NID1,TMEM176B,ADSSL1,VWA8,LMF1,CXORF38,LAPTM4A,SRPX2,CD24,UNC5A,AQP3,HSPG2,MANSC1,UPK3B,CPM,LCN2,PLA2G10,PDPR,HIST1H2BI,CYP1A1,ENPP5,UGT1A10,PODXL,NDRG4,HLA-DRB5,SLC16A5,SEC22A,VASN,WNT7B,TMC5,SULF2,SLC22A5,ACAD10,CNIH4,VSIR,FAM234B,CTSH,FBP1,HAVCR1,AKR1C1,C3,AGT,TMEM45A,TSPAN8,NEK2,MYSM1,MCAM,SLC16A7,LAMB2,UGGT1,ALG10,HACL1,PRCP,PPP1R1B,ANG,WLS,SLC7A11,UGT1A1,TTYH1,UGGT2,ZSCAN18,AKR1C3,SLC34A2,CGRRF1,WFDC2,AASS,TLR5,EMP2,RHBDD1,NRGN,HL

Unnamed: 0,lfc_36h,lfc_maxabs


-------QTI-------
summary of DEGs
total #:	861


Unnamed: 0,0h,1h,2h,4h,12h,16h,24h,36h
up,0,1,6,104,126,232,197,148
down,0,0,16,54,149,216,185,112


Cluster1 (38) STYK1,GRB7,TGFBR3,KLHL22,HOXB4,PLSCR4,BCL6,IFRD1,ARRDC3,TLDC1,TFAP2C,TBC1D10A,PEX11B,TAF7,B3GALT5,IER5L,IER2,MYORG,CDC42EP2,IRF2,VRK3,VPS25,SOCS3,SRGAP2,NOL4L,PEF1,RALB,SERTAD2,MID1IP1,BTN3A3,ING4,BLCAP,ARID5B,PRR15L,ANKMY2,RARA,TMEM139,TTC33
Cluster2 (349) GSTM3,EIF3F,SECTM1,UGT1A5,ITM2A,ODAM,HLA-DQA2,KRTCAP3,AREG,HSD17B8,MGLL,TXNIP,RDH13,AKR1C2,RNF19A,RBM12,PIGR,PSAP,GCOM1,FUT3,ACE2,HEXA,IFITM10,DHDH,ASS1,METTL15,EHHADH,DNASE1L1,STX8,KREMEN2,UNC5A,C15ORF39,NAB2,SGK2,PEX6,SLC16A5,IFFO2,RPL18,KDELR1,POTEF,VASN,TMEM59,TMC5,TMEM37,SULF2,TAPBP,LITAF,VSIR,CTSH,CLN3,TSPAN8,SLPI,DUSP5,KLK11,CYR61,SLC16A7,CPT2,ALDH1L1,RHPN2,CLDN1,RELA,PPP1R1B,SLC7A11,MAPK8IP3,MIDN,B3GNT7,ADAP1,PRAME,DAZAP2,HLA-DPA1,NDRG2,ZC3H12A,MRPL41,FCGBP,NECTIN2,GBP2,MAPT,ZMIZ1,FAM234A,ZNF274,ICAM5,TRIM15,MYADM,AKR1B15,GPR37,COL1A1,ICAM1,CXORF38,SHISA5,CYP1B1,PPARG,SPINK1,RND3,LRG1,PLAC8,CHST4,ATXN10,PEX11A,TMPRSS4,HLA-F,SAT1,TMEM161A,MAFF,SLC9A3R2,KRT15,FUCA1,DUSP1,BCL3,PTGES,GALK1,ACADS,RARB,TIMP3,CHCHD10,

Unnamed: 0,lfc_36h,lfc_maxabs


### Timscourse plots for wach cluster (Fig. 5d-g, SFigs. 10d, 11a-e)

In [11]:
for tmp_sample_prefix in ['mRNA','RPF','QTI']:
    print(f'-------{tmp_sample_prefix}-------')
    tmp_DEG_df_list   = []
    tmp_hpi_list      = ['%dh' %i for i in [0,1,2,4,12,16,24,36]]
    for hpi in tmp_hpi_list:
        InFname    = f'{tmp_sample_prefix}_DESeq2_{hpi}.tsv'
        tmp_DEG_df = pd.read_csv(RESULTDIR %(InFname), sep ='\t', header=0, index_col=0)
        tmp_DEG_df_list.append(tmp_DEG_df)

    for cluster in range(1,1+5):
        _             = timecourse_plot(tmp_DEG_df_list,tmp_hpi_list, padj_cut=0.01, log2fc_cut=2, clip = 2.8,
                                        ylim = 8, n_cluster=5, as_average=True, 
                                        load_precalc_clusters=True, clusterFname=f'{tmp_sample_prefix}_cluster.tsv',
                                        clusters_to_show=[cluster],clusters_priority=[], 
                                        cl_method='ward',cl_metric='euclidean',ax = None, 
                                        OutFigname=f'Fig5_{tmp_sample_prefix}_average_cluster{cluster}_timecourse.pdf',
                                        show_fig=False, verbose=False)

-------mRNA-------
-------RPF-------
-------QTI-------


## Figure 6- Inspection of the genes of interest
### Timecourse expression of the highlighted genes (Fig. 6a,b,d,e)

In [12]:
def plot_Fig6abde_multipanel(OutFigname=''):
    #Fig.6a: Gordon et al + ACE2, TMPRSS2
    Fig6a_df = pd.DataFrame(columns = ['color','linewidth','print_cutoff','genelist'])
    Fig6a_df.loc['Gordonetal']  = ['#a52a2a', 0.5, 100.0, [i.strip().upper() for i in open(GENELISTDIR %'Gordonetal_interacting_proteins')]]
    Fig6a_df.loc['ACE2+TMPRSS2']= ['blue',    1.0, 0.0,   ['ACE2', 'TMPRSS2']]
    
    #Fig. 6b: Daniloski et al. MOI0.3,CRISPR log2fc>1 host factors (drugtargets: subgroup)
    Fig6b_df = pd.DataFrame(columns = ['color','linewidth','print_cutoff','genelist'])
    hostfactors_drugtargets = pd.read_csv(GENELISTDIR %'Daniloski_MOI03_dgidb',sep='\t').iloc[:,0].str.upper().tolist()
    hostfactors             = pd.read_csv(GENELISTDIR %'Daniloski_MOI03_rank',sep='\t').iloc[:,0].str.upper().tolist()
    hostfactors_nontargets  = list(set(hostfactors)-set(hostfactors_drugtargets))
    Fig6b_df.loc['Daniloski_hostfactors_nontargets'] = ['#a52a2a', 0.5, 1.5, hostfactors_nontargets]
    Fig6b_df.loc['Daniloski_hostfactors_drugtargets']= ['magenta', 1.0, 1.5, hostfactors_drugtargets]
    
    #Fig. 6d: IFN_typeI_III+Blancomelo_IFN1response_DEGs_sig
    Fig6d_df = pd.DataFrame(columns = ['color','linewidth','print_cutoff','genelist'])
    IFN_response_list      = [i.strip().upper() for i in open(GENELISTDIR %'IFN_typeI_III+Blancomelo_IFN1response_DEGs_sig')]
    IFN_list               = [gsym for gsym in IFN_response_list if 'IFN' in gsym]
    others_list            = [gsym for gsym in IFN_response_list if not('IFN' in gsym)]
    Fig6d_df.loc['others'] = ['#a52a2a',    0.5, 100.0, others_list]
    Fig6d_df.loc['IFN']    = ['green',    1.0,   100.0, IFN_list]
    
    #Fig. 6e: Blancomelo_cytokineDEGs_sig_Fig2+4
    Fig6e_df = pd.DataFrame(columns = ['color','linewidth','print_cutoff','genelist'])
    Fig6e_df.loc['cytokine']  = ['#a52a2a', 0.5, 100.0, [i.strip().upper() for i in open(GENELISTDIR %'Blancomelo_cytokineDEGs_sig_Fig2+4')]]
    
    highlight_gene_info_df_list = [Fig6a_df,Fig6b_df,Fig6d_df, Fig6e_df]
    tmp_sample_prefix_list = ['mRNA','RPF','QTI']
    
    fig, axes  = plt.subplots(nrows=len(highlight_gene_info_df_list), ncols=len(tmp_sample_prefix_list), 
                              figsize=(3.5*len(tmp_sample_prefix_list), 3*len(highlight_gene_info_df_list)))
    
    for idx_col, tmp_sample_prefix in enumerate(tmp_sample_prefix_list):
        print(f'------{tmp_sample_prefix}------')
        tmp_DEG_df_list   = []
        tmp_hpi_list      = ['%dh' %i for i in [0,1,2,4,12,16,24,36]]
        for hpi in tmp_hpi_list:
            InFname    = f'{tmp_sample_prefix}_DESeq2_{hpi}.tsv'
            tmp_DEG_df = pd.read_csv(RESULTDIR %(InFname), sep ='\t', header=0, index_col=0)
            tmp_DEG_df_list.append(tmp_DEG_df)
        
        for idx_row, highlight_gene_info_df in enumerate(highlight_gene_info_df_list):
            if idx_row == 1:
                ylim = (2.5,-5.0)
            else:
                ylim = 5.0
            display(highlight_gene_info_df)
            ax = axes[idx_row,idx_col]
            _             = timecourse_plot(tmp_DEG_df_list,tmp_hpi_list, padj_cut=0.01, log2fc_cut=2, clip = 2.8,
                                        ylim = ylim, n_cluster=5, as_average=False, 
                                        load_precalc_clusters=True, clusterFname=f'{tmp_sample_prefix}_cluster.tsv',
                                        clusters_to_show=[], clusters_priority=[], highlight_gene_info_df = highlight_gene_info_df,
                                        cl_method='ward',cl_metric='euclidean',ax = ax, 
                                        OutFigname='', show_fig=False, verbose=False)
    if OutFigname != '':
        plt.savefig(FIGDIR %OutFigname)
        plt.close()
    else:
        plt.show()
        plt.close()
    return None



In [13]:
plot_Fig6abde_multipanel(OutFigname='Fig6abde_reported_genes_timecourse.pdf')

------mRNA------


Unnamed: 0,color,linewidth,print_cutoff,genelist
Gordonetal,#a52a2a,0.5,100.0,"[PCNT, PVR, POLA1, FASTKD5, PRIM2, ITGB1, CNTR..."
ACE2+TMPRSS2,blue,1.0,0.0,"[ACE2, TMPRSS2]"


Unnamed: 0,color,linewidth,print_cutoff,genelist
Daniloski_hostfactors_nontargets,#a52a2a,0.5,1.5,"[ACTR3, ATP6V1G1, KIAA1033, CCDC93, UVRAG, POD..."
Daniloski_hostfactors_drugtargets,magenta,1.0,1.5,"[VPS35, ACE2, ATP6V1C1, ATP6V1B2, CTSL, HDAC9,..."


Unnamed: 0,color,linewidth,print_cutoff,genelist
others,#a52a2a,0.5,100.0,"[MX1, XAF1, OAS1, IFITM1, OAS3, IRF9, OAS2, IR..."
IFN,green,1.0,100.0,"[IFNA1, IFNA2, IFNA4, IFNA5, IFNA6, IFNA7, IFN..."


Unnamed: 0,color,linewidth,print_cutoff,genelist
cytokine,#a52a2a,0.5,100.0,"[CCL2, CCL20, CCL8, CSF2, CSF3, CXCL1, CXCL14,..."


------RPF------


Unnamed: 0,color,linewidth,print_cutoff,genelist
Gordonetal,#a52a2a,0.5,100.0,"[PCNT, PVR, POLA1, FASTKD5, PRIM2, ITGB1, CNTR..."
ACE2+TMPRSS2,blue,1.0,0.0,"[ACE2, TMPRSS2]"


Unnamed: 0,color,linewidth,print_cutoff,genelist
Daniloski_hostfactors_nontargets,#a52a2a,0.5,1.5,"[ACTR3, ATP6V1G1, KIAA1033, CCDC93, UVRAG, POD..."
Daniloski_hostfactors_drugtargets,magenta,1.0,1.5,"[VPS35, ACE2, ATP6V1C1, ATP6V1B2, CTSL, HDAC9,..."


Unnamed: 0,color,linewidth,print_cutoff,genelist
others,#a52a2a,0.5,100.0,"[MX1, XAF1, OAS1, IFITM1, OAS3, IRF9, OAS2, IR..."
IFN,green,1.0,100.0,"[IFNA1, IFNA2, IFNA4, IFNA5, IFNA6, IFNA7, IFN..."


Unnamed: 0,color,linewidth,print_cutoff,genelist
cytokine,#a52a2a,0.5,100.0,"[CCL2, CCL20, CCL8, CSF2, CSF3, CXCL1, CXCL14,..."


------QTI------


Unnamed: 0,color,linewidth,print_cutoff,genelist
Gordonetal,#a52a2a,0.5,100.0,"[PCNT, PVR, POLA1, FASTKD5, PRIM2, ITGB1, CNTR..."
ACE2+TMPRSS2,blue,1.0,0.0,"[ACE2, TMPRSS2]"


Unnamed: 0,color,linewidth,print_cutoff,genelist
Daniloski_hostfactors_nontargets,#a52a2a,0.5,1.5,"[ACTR3, ATP6V1G1, KIAA1033, CCDC93, UVRAG, POD..."
Daniloski_hostfactors_drugtargets,magenta,1.0,1.5,"[VPS35, ACE2, ATP6V1C1, ATP6V1B2, CTSL, HDAC9,..."


Unnamed: 0,color,linewidth,print_cutoff,genelist
others,#a52a2a,0.5,100.0,"[MX1, XAF1, OAS1, IFITM1, OAS3, IRF9, OAS2, IR..."
IFN,green,1.0,100.0,"[IFNA1, IFNA2, IFNA4, IFNA5, IFNA6, IFNA7, IFN..."


Unnamed: 0,color,linewidth,print_cutoff,genelist
cytokine,#a52a2a,0.5,100.0,"[CCL2, CCL20, CCL8, CSF2, CSF3, CXCL1, CXCL14,..."


### Correlation b/w max(|log2fc|) ~ Daniloski et al. physiological impact (Fig. 6c)

In [14]:
def calc_plot_assoc_lfc_physioimpact(sample_prefix='RPF', OutFigname=''):
    tmp_DEG_df_list   = []
    tmp_hpi_list      = ['%dh' %i for i in [0,1,2,4,12,16,24,36]]
    for hpi in tmp_hpi_list:
        InFname    = f'{sample_prefix}_DESeq2_{hpi}.tsv'
        tmp_DEG_df = pd.read_csv(RESULTDIR %(InFname), sep ='\t', header=0, index_col=0)
        tmp_DEG_df_list.append(tmp_DEG_df)
        
    
    hostfactors_df = pd.read_csv(GENELISTDIR %'Daniloski_MOI03_rank',sep='\t')
    hostfactors_df['gsym'] = hostfactors_df['gsym'].str.upper()
    hostfactors    = hostfactors_df['gsym']
    hostfactors_df = hostfactors_df.set_index('gsym')
    
    hostfactors_drugtargets = pd.read_csv(GENELISTDIR %'Daniloski_MOI03_dgidb',sep='\t').iloc[:,0].str.upper().tolist()
    
    timecourse_df = pd.read_csv(RESULTDIR %(f'{sample_prefix}_timecourse.tsv'), 
                                sep = '\t', index_col = 0)
    
    timecourse_df = timecourse_df.loc[hostfactors]
    timecourse_df['max'] = timecourse_df.abs().max(axis=1)
    timecourse_df['rank']= hostfactors_df['rank']
    timecourse_df['lfc'] = hostfactors_df['lfc']
    timecourse_df = timecourse_df.dropna(how='any')
    colors = ['magenta' if i in hostfactors_drugtargets else '#a52a2a' for i in timecourse_df.index]
    print('vs. maxabs',stats.spearmanr(timecourse_df['lfc'],timecourse_df['max']))
        
    xvar = 'max'
    yvar = 'rank'
    fig  = plt.figure(figsize=(3.5,3.5))
    display(timecourse_df.sort_values(xvar,ascending=False).head(5))
    plt.scatter(timecourse_df[xvar],timecourse_df[yvar], c = colors)
    plt.ylabel('CRISPR screening rank')
    plt.xlabel('max(|log2fc|)')
    plt.ylim(bottom=0)
    timecourse_df[[xvar,yvar]].to_csv(RESULTDIR %(f'Fig6c_correlation.tsv'), 
                                      sep = '\t')
    z = np.polyfit( timecourse_df[xvar], timecourse_df[yvar],1)
    p = np.poly1d(z)
    plt.plot(timecourse_df[xvar].sort_values(),p(timecourse_df[xvar].sort_values()),"k--")
    sprho, spp = stats.spearmanr(timecourse_df[xvar],timecourse_df[yvar])
    plt.text(timecourse_df[xvar].max()/2,timecourse_df[yvar].max()/1.1,f'Spearman rho={sprho:.03f}\nP={spp:.02e}')
    
    plt.tight_layout()
    if OutFigname != '':
        plt.savefig(FIGDIR %OutFigname)
    else:
        plt.show()
    plt.close()
    
    return None

In [15]:
calc_plot_assoc_lfc_physioimpact(sample_prefix = 'RPF', OutFigname='Fig6c_physiological_assoc.pdf')

vs. maxabs SpearmanrResult(correlation=0.387003406631181, pvalue=0.0007844012325431527)


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike


Unnamed: 0_level_0,0h,1h,2h,4h,12h,16h,24h,36h,max,rank,lfc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ACE2,-6.607765e-06,-0.000702,-0.055994,-0.319392,-2.566465,-3.545599,-3.782672,-4.10104,4.10104,12,1.6266
ATP6AP2,-8.08625e-06,0.001427,0.116543,0.13018,-1.579491,-2.076438,-2.049512,-1.855499,2.076438,5,2.0384
SRSF1,1.806937e-06,0.004399,0.659602,1.374653,1.379408,1.851016,1.163534,0.166559,1.851016,30,1.27
COMMD4,-9.434869e-07,-0.000646,-0.057539,-0.217448,-1.021387,-0.324847,-0.372262,-1.518002,1.518002,38,1.1835
ATP6V1G1,-2.726768e-06,-0.000727,0.044275,0.437669,1.441861,1.31066,1.383131,1.24599,1.441861,15,1.5741


### Fig. 6i: Inspection of eIFs

In [16]:
def plot_Fig6i_eIF(OutFigname=''):
    Fig6i_df = pd.DataFrame(columns = ['color','linewidth','print_cutoff','genelist'])
    Fig6i_df.loc['nonAUG_enhancers']       = ['red', 0.5, 1.5, ['EIF5B','EIF2A','EIF2D']]
    Fig6i_df.loc['nonAUG_enhancers_eIF5']  = ['red', 1.0, 1.5, ['EIF5']] #eIF5 alone for thicker line
    Fig6i_df.loc['nonAUG_repressors']      = ['blue', 0.5, 1.5, ['EIF1','EIF1AX','EIF1AY']]
    
    highlight_gene_info_df = Fig6i_df
    tmp_sample_prefix      = 'RPF'
    
    fig, ax  = plt.subplots(figsize=(3.5, 3))
    
    print('------'+tmp_sample_prefix+'------')
    tmp_DEG_df_list   = []
    tmp_hpi_list      = ['%dh' %i for i in [0,1,2,4,12,16,24,36]]
    for hpi in tmp_hpi_list:
        InFname    = f'{tmp_sample_prefix}_DESeq2_{hpi}.tsv'
        tmp_DEG_df = pd.read_csv(RESULTDIR %(InFname), sep ='\t', header=0, index_col=0)
        tmp_DEG_df_list.append(tmp_DEG_df)
    ylim = 2.5
    display(highlight_gene_info_df)
    _             = timecourse_plot(tmp_DEG_df_list,tmp_hpi_list, padj_cut=0.01, log2fc_cut=2, clip = 2.8,
                                ylim = ylim, n_cluster=5, as_average=False, 
                                load_precalc_clusters=True, clusterFname=f'{tmp_sample_prefix}_cluster.tsv',
                                clusters_to_show=[], clusters_priority=[], highlight_gene_info_df = highlight_gene_info_df,
                                cl_method='ward',cl_metric='euclidean',ax = ax, 
                                OutFigname='', show_fig=False, verbose=False)
    if OutFigname != '':
        #plt.tight_layout()
        plt.savefig(FIGDIR %OutFigname)
        plt.close()
    else:
        #plt.tight_layout()
        plt.show()
        plt.close()
    return None


In [17]:
plot_Fig6i_eIF(OutFigname='Fig6i_eIF_timecourse.pdf')

------RPF------


Unnamed: 0,color,linewidth,print_cutoff,genelist
nonAUG_enhancers,red,0.5,1.5,"[EIF5B, EIF2A, EIF2D]"
nonAUG_enhancers_eIF5,red,1.0,1.5,[EIF5]
nonAUG_repressors,blue,0.5,1.5,"[EIF1, EIF1AX, EIF1AY]"
