In [1]:
def TPMcalculator(pseudobulktable, genelengthtable, cellcounts, cellpops = None, blob = False, cellcountnorm = True):
    """
    This function calculates the TPM and writes TPM files for ANANSE of single cell populations. 
    This can be done for the cellpopulations as well as the blob of all other cellpopulations. 
    For blob the parameter blob = True must be specified. For the blob approach the median of 
    the row values are taken after cellcount normalization unless the cellcountnorm is specified as False.
    The input requires a pseudobulk count table, a gene length table and a cellcount table.
    """
# Take the highest number of the genelengths
    data3 = genelengthtable.groupby('ID')['LEN'].max().reset_index()

# Merge the dataframes to add lengths to the IDs
    print("Merging the gene lengths to the pseudobulk count dataframe")
    data4 = pd.merge(pseudobulktable, data3, on='ID')

# Checking if cellpopulations are given
    if cellpops == None:
        cellpops = list(data4.columns[(data4.columns != 'ID') & (data4.columns != 'LEN')])
    else:
        try:
            test = pseudobulktable[cellpops]
        except KeyError as e:
            raise KeyError('Your specified cell population(s) can not be found in the pseudobulk table')
            
# Checking if the pseudobulk table and cellcount table have equal cellpopulations
    try:
        test = pseudobulktable[cellpops]
    except KeyError as e:
        raise KeyError('Your specified cell population(s) are not equal in the pseudobulk table and the cellcount table')
        
# Display values not as scientific number but as normal float
    #pd.set_option('display.float_format', lambda x: '%.30f' % x)

# Exclude genes where no length is known
    nan_value = float("NaN")
    data4.replace("", nan_value, inplace=True)
    data4.dropna(subset = ["LEN"], inplace=True)

    if cellcountnorm == True:
# Selecting populations >100 cell in scATAC-seq + normalizing against cell number
        print ("Adding the ratio for median cellnumber normalisation")
        cellcounts = cellcounts[cellcounts['population'].isin(cellpops)]
        cellcounts

# Determining the median of single cell populations
        MED = cellcounts["number_of_cells"].median()
        cellcounts['ratio'] = cellcounts["number_of_cells"]/MED

# If cellcount is below median, then use original value or ratio
        cellcounts.loc[cellcounts['ratio'] < 1, 'ratio'] = 1

# Divide upon the ratio numpy array for each column
        lst = cellcounts['ratio'].to_numpy()
        data4[cellpops] = data4[cellpops].div(lst)
# Calculating the TPM
    print ("Calculating TPM values for all cellpopulations")
    df = data4.rename(columns={'ID': 'gene', 'LEN': 'length'})
    df.index = df['gene']
    del df['gene']
    nm = norm()
    nm.tpm(df=df, gl='length')
    
# Get TPM normalized dataframe
    tpm_df = nm.tpm_norm
    tpmdata = tpm_df
    tpmdata = tpmdata[cellpops]
    #tpmdata = tpmdata.rename(index={0: 'ID'})
    tpmdata.index.names = ["ID"]
# subselecting the files to generate tpm value for each condition BLOB
    if blob == True:
        for i in tpmdata.columns:
            print ("writing TPM files for blob against cell population " + i)
            data5 = tpmdata
            tpmdata2 = data5.loc[ : , (data5.columns != i) & (data5.columns != "length")]
            tpmdata2cols = list(tpmdata2.columns)
            tpmdata2 = tpmdata2[tpmdata2cols].median(axis=1)
            final_df = pd.Series(tpmdata2, name=i)
            final_df.to_csv(str(i)+"blobnormtpm.tsv", sep="\t", index=True)
    else:        
        for i in tpmdata.columns:
            if i != "ID" and i != "MIL" and i != "transcripts" and i != "length":
                print ("Writing TPM files for cell population " + i)
                final_df = tpmdata[[i]]
                final_df.to_csv(str(i)+"tpm.tsv", sep="\t", index=True)
    
    return

In [3]:
# Importing important libraries
import pandas as pd
from bioinfokit.analys import norm, get_data

# Importing the data files nessesary for TPM calculation
# Data and cellcounts for the four cells
data = pd.read_csv('/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/R/scRNA-seq/20210804/pseudobulk_4cells_TPM.tsv', sep='\t', header=0)
cellcounts = pd.read_csv('/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/R/scRNA-seq/20210802/cellcounts_4cells.tsv', sep='\t', header=0)

# Data and cellcounts for blob
#data = pd.read_csv('/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/R/scRNA-seq/20210617/pseudobulk.tsv', sep='\t', header=0)
#cellcounts = pd.read_csv('/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/R/scRNA-seq/20210730/cellcounts.tsv', sep='\t', header=0)

# Load in the gene lengths
data2 = pd.read_csv('/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/jupyter_notebooks/genelengths/idgl.tsv', sep='\t', header=0)

# Specify the cellpopulations you want from your dataframe
#cellpops = ['CjS','LNPCs','CSB','CB','StC','CSSCs','MEC','LPCs']

TPMcalculator(data,data2,cellcounts,blob=False,cellcountnorm=True)

Merging the gene lengths to the pseudobulk count dataframe
Adding the ratio for median cellnumber normalisation
Calculating TPM values for all cellpopulations
Writing TPM files for cell population LiCo
Writing TPM files for cell population FCVes
Writing TPM files for cell population StCSC
Writing TPM files for cell population MECIC


###############################################################################################################################
Code for quantile normalization of the counts
###############################################################################################################################

In [35]:
def qquantlogcalc(joinedcovtable, sigregionstable, cellpops = None):
    """
    Function that performs log transformation, q-quantile normalization and filters out significant regions. 
    The joined covtable is the output from Combine_peaks_V2 script and the sigregionstable is the output from
    significance_calc_peaks_gimme.R. You can specify your own cell populations with the cellpops argument in a list.
    """
    # Perform the logtransformation
    numeric_df = joinedcovtable.apply(lambda x: np.log(x+1) if np.issubdtype(x.dtype, np.number) and np.number != 0 else x)
    numeric_df

    # Add the rownames for quantile normalization
    numeric_df.index = numeric_df['loc']
    del numeric_df["loc"]

    # Checking if cellpopulations are given
    if cellpops == None:
        cellpops = list(joinedcovtable.columns[joinedcovtable.columns != 'loc'])
    else:
        try:
            test = joinedcovtable[cellpops]
        except KeyError as e:
            raise KeyError('Your specified cell population(s) can not be found in the joined coverage table')
    
    # Convert dataframe back to numeric (nessesary for the logtransformation)
    numeric_df = numeric_df[cellpops]
    numeric_df = numeric_df.astype(int)
    
    # Perform the q-quantile normalization
    numericdf_quant = qnorm.quantile_normalize(numeric_df, ncpus=4)
    
    # Saving the file (not significant)
    print("Writing the non-significant quantile file")
    numericdf_quant.to_csv("quantileall.tsv", sep="\t", index=True)
    
    # Merge the dataframes to add lengths to the IDs
    print("Merging the significant regions and the q-quantile dataframe")
    sigregionstable.index = sigregionstable[0]
    sigmerge = pd.merge(numericdf_quant, sigregionstable, left_index = True, right_index = True)
    del sigmerge[0]
    sigmerge.index.names = ["loc"]
    
    # Saving the file (significant)
    print("Writing the significant quantile file")
    sigmerge.to_csv("quantilesig.tsv", sep="\t", index=True)
    
    return sigmerge

In [36]:
import pandas as pd
import numpy as np
import qnorm

data = pd.read_csv('/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/lako2021/merge_peaks/2021-08-25/tmp/joinedcovtable.tsv', sep='\t', header=0)
sigregions = pd.read_csv('/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/lako2021/merge_peaks/2021-08-25_4cellreps/tmp/sigregions.txt', sep=' ', header=None)
#cellpops = ['CjS','LNPCs','CSB','CB','StC','CSSCs','MEC','LPCs']
qquantlogcalc(data,sigregions)

Writing the non-significant quantile file
Merging the significant regions and the q-quantile dataframe
Writing the significant quantile file


Unnamed: 0_level_0,LiCo,StCSC
loc,Unnamed: 1_level_1,Unnamed: 2_level_1
chr1:1000214-1000414,4.189347,3.890701
chr1:1021726-1021926,2.880095,1.146961
chr1:1050621-1050821,2.880095,1.146961
chr1:1064076-1064276,4.189347,3.890701
chr1:1218125-1218325,2.880095,2.205948
...,...,...
chr9:137745631-137745831,2.880095,1.146961
chr9:137805027-137805227,2.880095,2.205948
chr9:138013844-138014044,2.880095,1.146961
chr9:138116844-138117044,4.189347,1.146961


In [None]:
##################################################################################################

In [2]:
# for arm calculation only!
import pandas as pd
import numpy as np
import qnorm
#data = pd.read_csv('/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/lako2021/merge_peaks/2021-27-06/tmp/newtable.tsv', sep='\t', header=0)
data = pd.read_csv('/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/lako2021/merge_peaks/2021-08-25/tmp/joinedcovtable.tsv', sep='\t', header=0)
data

# removing low quality cellpopulations
#data = data[['loc','LPCs','LNPCs','CjS','CSB','CB','StC','MEC','CSSCs']]
#data
# joining the columns based on the arms and then doing the logtransform
data[['arm1']] = data[['LPCs','LNPCs','CjS','CSB','CB']].median(axis=1)
data[['arm2']] = data[['StC','CSSCs']].median(axis=1)
data


Unnamed: 0,loc,LPCs,LNPCs,CjS,CSB,CB,StC,MEC,CSSCs,arm1,arm2
0,chr1:10092-10292,25.0,26.0,69.0,60.0,70.0,14.0,2.0,19.0,60.0,16.5
1,chr1:11188-11388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chr1:29060-29260,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,chr1:29426-29626,1.0,0.0,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0
4,chr1:31143-31343,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
307561,chr9:138276905-138277105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
307562,chr9:138277202-138277402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
307563,chr9:138287670-138287870,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
307564,chr9:138330142-138330342,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Do the logtransform ARMS

numeric_df = data.apply(lambda x: np.log(x+1) if np.issubdtype(x.dtype, np.number) and np.number != 0 else x)
numeric_df

# convert back to numeric (nessesary for the logtransform)
numeric_df[['arm1','arm2']] = numeric_df[['arm1','arm2']].astype(int)

# add the rownames for quantile normalization
numeric_df.index = numeric_df['loc']
numeric_df = numeric_df[['arm1','arm2']]
numeric_df

Unnamed: 0_level_0,arm1,arm2
loc,Unnamed: 1_level_1,Unnamed: 2_level_1
chr1:10092-10292,4,2
chr1:11188-11388,0,0
chr1:29060-29260,0,0
chr1:29426-29626,0,0
chr1:31143-31343,0,0
...,...,...
chr9:138276905-138277105,0,0
chr9:138277202-138277402,0,0
chr9:138287670-138287870,0,0
chr9:138330142-138330342,0,0


In [4]:
# perform the quantile normalization
numericdf_quant = qnorm.quantile_normalize(numeric_df[['arm1','arm2']], ncpus=4)
numericdf_quant

Unnamed: 0_level_0,arm1,arm2
loc,Unnamed: 1_level_1,Unnamed: 2_level_1
chr1:10092-10292,3.370986,2.610270
chr1:11188-11388,0.000000,0.622947
chr1:29060-29260,0.000000,0.622947
chr1:29426-29626,0.000000,0.622947
chr1:31143-31343,0.000000,0.622947
...,...,...
chr9:138276905-138277105,0.000000,0.622947
chr9:138277202-138277402,0.000000,0.622947
chr9:138287670-138287870,0.000000,0.622947
chr9:138330142-138330342,0.000000,0.622947


In [5]:
# saving the file
numericdf_quant.to_csv("quantilearms.tsv", sep="\t", index=True)

#############################################################