In [1]:
def TPMcalculator(pseudobulktable, genelengthtable, cellcounts = None, cellpops = None, blob = False, cellcountnorm = True, outdir = None):
    """
    Calculate TPM values per cell population and write them to TSV files for ANANSE.

    Relies on `pd` (pandas) and `norm` (bioinfokit.analys) being imported in the
    notebook before the function is called.

    Parameters
    ----------
    pseudobulktable : pandas.DataFrame
        Pseudobulk counts with an 'ID' column and one column per cell population.
        The table is not modified.
    genelengthtable : pandas.DataFrame
        Table with 'ID' and 'LEN' columns. When an ID occurs more than once the
        largest LEN is used.
    cellcounts : pandas.DataFrame, optional
        Table with 'population' and 'number_of_cells' columns. Required when
        cellcountnorm is True.
    cellpops : list of str, optional
        Cell populations (column names) to write. Defaults to every column of the
        merged table except 'ID' and 'LEN'.
    blob : bool
        If False, write one "<population>tpm.tsv" per population. If True, write
        one "<population>blobnormtpm.tsv" per population holding the row-wise
        median TPM of all OTHER selected populations.
    cellcountnorm : bool
        If True, counts of each population are divided by
        max(1, number_of_cells / median number_of_cells) before the TPM step.
    outdir : str, optional
        Prefix that is concatenated in front of every output file name (include
        the trailing "/" for a directory). None writes to the working directory.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If cellcountnorm is True but no cellcounts table is given, or if a
        selected population occurs more than once in cellcounts.
    KeyError
        If a requested population is missing from the pseudobulk table or from
        the cellcounts table.
    """
    if cellcountnorm and cellcounts is None:
        raise ValueError('cellcounts must be provided when cellcountnorm is True')

    # Accept a single population name as well as a list of names
    if isinstance(cellpops, str):
        cellpops = [cellpops]
    elif cellpops is not None:
        cellpops = list(cellpops)

    # Take the highest number of the genelengths
    max_lengths = genelengthtable.groupby('ID')['LEN'].max().reset_index()

    # Merge the dataframes to add lengths to the IDs (inner join: genes without a length entry drop out)
    print("Merging the gene lengths to the pseudobulk count dataframe")
    merged = pd.merge(pseudobulktable, max_lengths, on='ID')

    # Checking if cellpopulations are given
    if cellpops is None:
        cellpops = [col for col in merged.columns if col not in ('ID', 'LEN')]
    elif any(pop not in pseudobulktable.columns for pop in cellpops):
        raise KeyError('Your specified cell population(s) can not be found in the pseudobulk table')

    # Exclude genes where no length is known
    merged = merged.replace("", float("NaN")).dropna(subset=["LEN"])

    if cellcountnorm:
        print ("Adding the ratio for median cellnumber normalisation")
        selected = cellcounts[cellcounts['population'].isin(cellpops)]
        if selected['population'].duplicated().any():
            raise ValueError('The cellcount table contains duplicated cell populations')
        cells_per_pop = selected.set_index('population')['number_of_cells']

        # Checking if the pseudobulk table and cellcount table have equal cellpopulations
        if any(pop not in cells_per_pop.index for pop in cellpops):
            raise KeyError('Your specified cell population(s) are not equal in the pseudobulk table and the cellcount table')

        # Ratio against the median cell number; populations below the median keep their original counts
        ratio = (cells_per_pop / cells_per_pop.median()).clip(lower=1)

        # Divide by label, not by position: the row order of the cellcount table
        # does not have to match the column order of the pseudobulk table
        ratio = ratio.reindex(cellpops)
        merged[cellpops] = merged[cellpops].div(ratio, axis='columns')

    # Calculating the TPM
    print ("Calculating TPM values for all cellpopulations")
    tpm_input = merged.rename(columns={'ID': 'gene', 'LEN': 'length'}).set_index('gene')
    nm = norm()
    nm.tpm(df=tpm_input, gl='length')

    # Get TPM normalized dataframe for the selected populations
    tpmdata = nm.tpm_norm[cellpops].rename_axis("ID")

    prefix = "" if outdir is None else outdir

    if blob:
        # One file per population: median TPM over all other populations
        for pop in tpmdata.columns:
            print (f"writing TPM files for blob against cell population {pop}")
            others = tpmdata.loc[:, (tpmdata.columns != pop) & (tpmdata.columns != "length")]
            blob_tpm = others.median(axis=1).rename(pop)
            blob_tpm.to_csv(prefix + str(pop) + "blobnormtpm.tsv", sep="\t", index=True)
    else:
        for pop in tpmdata.columns:
            if pop not in ("ID", "MIL", "transcripts", "length"):
                print (f"Writing TPM files for cell population {pop}")
                tpmdata[[pop]].to_csv(prefix + str(pop) + "tpm.tsv", sep="\t", index=True)

    return

In [2]:
def CPMcalculator(pseudobulktable, cellcounts = None, cellpops = None, blob = False, cellcountnorm = False, outdir = "/"):
    """
    Calculate CPM values per cell population and write them to TSV files for ANANSE.

    Relies on `pd` (pandas) and `norm` (bioinfokit.analys) being imported in the
    notebook before the function is called.

    Parameters
    ----------
    pseudobulktable : pandas.DataFrame
        Pseudobulk counts with an 'ID' column and one column per cell population.
        The table is not modified, so the same frame can be passed again.
    cellcounts : pandas.DataFrame, optional
        Table with 'population' and 'number_of_cells' columns. Required when
        cellcountnorm is True.
    cellpops : list of str, optional
        Cell populations (column names) to use. Defaults to every column except 'ID'.
    blob : bool
        If False, write one "<population>_cpm.tsv" per population. If True, write
        one "<population>blobnormcpm.tsv" per population holding the row-wise
        median CPM of all OTHER selected populations.
    cellcountnorm : bool
        If True, counts of each population are divided by
        max(1, number_of_cells / median number_of_cells) before the CPM step.
    outdir : str
        Prefix that is concatenated in front of every output file name, so a
        directory needs its trailing "/". NOTE: the default "/" targets the
        filesystem root; pass an explicit directory. None writes to the working
        directory.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If cellcountnorm is True but no cellcounts table is given, or if a
        selected population occurs more than once in cellcounts.
    KeyError
        If the 'ID' column or a requested population is missing from the
        pseudobulk table, or a population is missing from the cellcounts table.
    """
    if cellcountnorm and cellcounts is None:
        raise ValueError('cellcounts must be provided when cellcountnorm is True')
    if 'ID' not in pseudobulktable.columns:
        raise KeyError("The pseudobulk table must contain an 'ID' column")

    # Accept a single population name as well as a list of names
    if isinstance(cellpops, str):
        cellpops = [cellpops]
    elif cellpops is not None:
        cellpops = list(cellpops)

    # Checking if cellpopulations are given
    if cellpops is None:
        cellpops = [col for col in pseudobulktable.columns if col != 'ID']
    elif any(pop not in pseudobulktable.columns for pop in cellpops):
        raise KeyError('Your specified cell population(s) can not be found in the pseudobulk table')

    # Work on an ID-indexed copy restricted to the selected populations; selecting
    # after set_index keeps the 'ID' information that a plain column subset would drop
    counts = pseudobulktable.set_index('ID')[cellpops]

    if cellcountnorm:
        print ("Adding the ratio for median cellnumber normalisation")
        selected = cellcounts[cellcounts['population'].isin(cellpops)]
        if selected['population'].duplicated().any():
            raise ValueError('The cellcount table contains duplicated cell populations')
        cells_per_pop = selected.set_index('population')['number_of_cells']

        # Checking if the pseudobulk table and cellcount table have equal cellpopulations
        if any(pop not in cells_per_pop.index for pop in cellpops):
            raise KeyError('Your specified cell population(s) are not equal in the pseudobulk table and the cellcount table')

        # Ratio against the median cell number; populations below the median keep their original counts
        ratio = (cells_per_pop / cells_per_pop.median()).clip(lower=1)

        # Divide by label, not by position: the row order of the cellcount table
        # does not have to match the column order of the pseudobulk table
        counts = counts.div(ratio.reindex(cellpops), axis='columns')

    # Calculating the CPM
    print ("Calculating CPM values for all cellpopulations")
    nm = norm()
    nm.cpm(df=counts)

    # Get CPM normalized dataframe
    cpmdata = nm.cpm_norm[cellpops].rename_axis("ID")

    prefix = "" if outdir is None else outdir

    if blob:
        # One file per population (the population name is part of the file name so
        # the files do not overwrite each other): median CPM over all other populations
        for pop in cpmdata.columns:
            print (f"writing CPM files for blob against cell population {pop}")
            others = cpmdata.loc[:, (cpmdata.columns != pop) & (cpmdata.columns != "length")]
            blob_cpm = others.median(axis=1).rename(pop)
            blob_cpm.to_csv(prefix + str(pop) + "blobnormcpm.tsv", sep="\t", index=True)
    else:
        for pop in cpmdata.columns:
            if pop not in ("ID", "MIL", "transcripts", "length"):
                print (f"Writing CPM files for cell population {pop}")
                cpmdata[[pop]].to_csv(prefix + str(pop) + "_cpm.tsv", sep="\t", index=True)

    return

In [3]:
# Importing important libraries
import pandas as pd
from bioinfokit.analys import norm, get_data

# Pseudobulk count table for the ESC data and the genes shared between cornea and ESC
# (tab-separated, first row is the header). No cell count table is loaded here:
# CPMcalculator is called with its default cellcountnorm=False.
data = pd.read_csv('/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_ESC_cpm/20220401/cornea_ESC_pseudobulk.tsv', sep='\t', header=0)

# Output prefix: CPMcalculator concatenates it directly with the file name,
# so the trailing "/" is required.
outdir = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/RNA_CPM/"

# Writes one "<population>_cpm.tsv" file per cell population column (blob=False by default).
CPMcalculator(data,outdir = outdir)

Calculating CPM values for all cellpopulations
Writing CPM files for cell population ESC
Writing CPM files for cell population Cj
Writing CPM files for cell population CSSC
Writing CPM files for cell population LESC
Writing CPM files for cell population CE
Writing CPM files for cell population LSC
Writing CPM files for cell population Mel
Writing CPM files for cell population CF
Writing CPM files for cell population Ves
Writing CPM files for cell population MF
Writing CPM files for cell population SK
Writing CPM files for cell population IC
Writing CPM files for cell population CDH19.
Writing CPM files for cell population LE
Writing CPM files for cell population EC
Writing CPM files for cell population TSK


In [6]:
# Importing important libraries
import pandas as pd
from bioinfokit.analys import norm, get_data

# Pseudobulk count table of the intra comparison (tab-separated, first row is the header).
# No cell count table is loaded here: CPMcalculator is called with its default cellcountnorm=False.
data = pd.read_csv('/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_intra_cpm/20220421/intra_comp_pseudobulk.tsv', sep='\t', header=0)

# Output prefix: CPMcalculator concatenates it directly with the file name,
# so the trailing "/" is required.
outdir = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_intra_cpm/20220421/RNA_CPM/"

# Writes one "<population>_cpm.tsv" file per cell population column (blob=False by default).
CPMcalculator(data,outdir = outdir)

Calculating CPM values for all cellpopulations
Writing CPM files for cell population epi
Writing CPM files for cell population stromal


In [3]:
# Importing important libraries
import pandas as pd
from bioinfokit.analys import norm, get_data

# Importing the data files necessary for TPM calculation
# Data and cellcounts for the four cells (both tab-separated, first row is the header).
# TPMcalculator reads the 'population' and 'number_of_cells' columns of the cellcounts table.
data = pd.read_csv('/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/R/scRNA-seq/20210804/pseudobulk_4cells_TPM.tsv', sep='\t', header=0)
cellcounts = pd.read_csv('/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/R/scRNA-seq/20210802/cellcounts_4cells.tsv', sep='\t', header=0)

# Data and cellcounts for blob (alternative inputs; uncomment to replace the tables above)
#data = pd.read_csv('/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/R/scRNA-seq/20210617/pseudobulk.tsv', sep='\t', header=0)
#cellcounts = pd.read_csv('/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/R/scRNA-seq/20210730/cellcounts.tsv', sep='\t', header=0)

# Load in the gene lengths (TPMcalculator reads the 'ID' and 'LEN' columns)
data2 = pd.read_csv('/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/jupyter_notebooks/genelengths/idgl.tsv', sep='\t', header=0)

# Specify the cellpopulations you want from your dataframe
# (optional; when left out, every column except 'ID' and 'LEN' is used)
#cellpops = ['CjS','LNPCs','CSB','CB','StC','CSSCs','MEC','LPCs']

# Per-population TPM with median cell-number normalisation; the output files are
# written to the current working directory as "<population>tpm.tsv".
TPMcalculator(data,data2,cellcounts,blob=False,cellcountnorm=True)

Merging the gene lengths to the pseudobulk count dataframe
Adding the ratio for median cellnumber normalisation
Calculating TPM values for all cellpopulations
Writing TPM files for cell population LiCo
Writing TPM files for cell population FCVes
Writing TPM files for cell population StCSC
Writing TPM files for cell population MECIC


###############################################################################################################################
Code for quantile normalization of the counts
###############################################################################################################################

In [12]:
def qquantlogcalc(joinedcovtable, sigregionstable = None, cellpops = None, outdir = None, ncpus = 4):
    """
    Log-transform and quantile-normalize a joined coverage table, optionally
    keeping only the significant regions.

    The joined covtable is the output from the Combine_peaks_V2 script and the
    sigregionstable is the output from significance_calc_peaks_gimme.R.
    Relies on `np` (numpy), `pd` (pandas) and `qnorm` being imported in the
    notebook before the function is called.

    Parameters
    ----------
    joinedcovtable : pandas.DataFrame
        Coverage table with a 'loc' column (region identifier) and one column
        per cell population. The table is not modified.
    sigregionstable : pandas.DataFrame, optional
        Table whose column 0 holds the identifiers of the significant regions
        (e.g. read with header=None). The table is not modified.
    cellpops : list of str, optional
        Cell populations (column names) to use. Defaults to every column except 'loc'.
    outdir : str, optional
        Prefix that is concatenated in front of the output file names (include
        the trailing "/" for a directory). None writes to the working directory.
    ncpus : int
        Number of CPUs handed to qnorm.quantile_normalize.

    Returns
    -------
    pandas.DataFrame or None
        The quantile-normalized values of the significant regions, or None when
        no sigregionstable is given (only "quantileall.tsv" is written then).

    Raises
    ------
    KeyError
        If a requested population is missing from the joined coverage table.
    """
    # Accept a single population name as well as a list of names
    if isinstance(cellpops, str):
        cellpops = [cellpops]
    elif cellpops is not None:
        cellpops = list(cellpops)

    # Checking if cellpopulations are given
    if cellpops is None:
        cellpops = [col for col in joinedcovtable.columns if col != 'loc']
    elif any(pop not in joinedcovtable.columns for pop in cellpops):
        raise KeyError('Your specified cell population(s) can not be found in the joined coverage table')

    # Use the region identifiers as row names for the quantile normalization
    coverage = joinedcovtable.set_index('loc')[cellpops]

    # Perform the log transformation, log(x + 1). The values are kept as floats:
    # casting the log values to int would truncate them before normalization.
    log_coverage = np.log1p(coverage.astype(float))

    # Perform the q-quantile normalization
    numericdf_quant = qnorm.quantile_normalize(log_coverage, ncpus=ncpus)

    prefix = "" if outdir is None else outdir

    # Saving the file (not significant)
    print("Writing the non-significant quantile file")
    numericdf_quant.to_csv(prefix + "quantileall.tsv", sep="\t", index=True)

    # If no significant regions have been specified there is nothing left to do.
    # `is None` is required: comparing a DataFrame with `== None` is element-wise
    # and cannot be used as a condition.
    if sigregionstable is None:
        return None

    # If significant regions have been specified
    print("Merging the significant regions and the q-quantile dataframe")
    significant = sigregionstable.set_index(0)
    sigmerge = pd.merge(numericdf_quant, significant, left_index=True, right_index=True)
    sigmerge = sigmerge.rename_axis("loc")

    # Saving the file (significant)
    print("Writing the significant quantile file")
    sigmerge.to_csv(prefix + "quantilesig.tsv", sep="\t", index=True)
    return sigmerge

In [13]:
import pandas as pd
import numpy as np
import qnorm

# Joined coverage table (tab-separated, first row is the header); qquantlogcalc
# expects a 'loc' column plus one column per cell population.
data = pd.read_csv('/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/ATAC_qquant/tmp/joinedcovtable_no_zero.tsv', sep='\t', header=0)
# Optional inputs, currently disabled: significant regions table and an explicit population list
#sigregions = pd.read_csv('/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/ATAC_bam/2022-01-18_peaks_all/tmp/sigregions.txt', sep=' ', header=None)
#cellpops = ['CjS','LNPCs','CSB','CB','StC','CSSCs','MEC','LPCs']
# Output prefix: qquantlogcalc concatenates it directly with the file name,
# so the trailing "/" is required.
outdir = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/ATAC_qquant/"

# Without a significant regions table only "quantileall.tsv" is written.
qquantlogcalc(data, outdir = outdir)

Writing the non-significant quantile file
