In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import math

from celescope.tools import utils
from celescope.__init__ import HELP_DICT
from celescope.rna.mkref import Mkref_rna
from celescope.tools.step import Step, s_common

In [2]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [39]:
adata = sc.read('/SGRNJ06/randd/USER/cjj/celedev/vdj10x/20220803/c3rna/LC38CA/06.analysis/LC38CA.h5ad')

In [None]:
adata

In [None]:
adata.var.sort_values(by='total_counts',ascending=False)

In [42]:
def find_anomalies(data_list, coef=3):
    """
    Return the values of ``data_list`` that lie above ``mean + coef * std``.

    Only the upper tail is tested; unusually small values are never reported.
    The cut-off is printed before returning so it shows up in the cell output.

    Args:
        data_list: sequence of numbers (e.g. per-gene total counts).
        coef: number of standard deviations above the mean at which a value
            is considered an outlier. Defaults to 3 (three-sigma limit).
    Returns:
        list with the elements of ``data_list`` strictly greater than the
        cut-off, in their original order. Empty input gives ``[]``.
    """
    if len(data_list) == 0:
        # mean/std of an empty sequence are NaN (with RuntimeWarnings);
        # there is nothing to report, so return early.
        return []

    # set upper limit
    upper_limit = np.mean(data_list) + np.std(data_list) * coef
    print(upper_limit)

    # collect outliers
    return [value for value in data_list if value > upper_limit]

In [43]:
data_list = adata.var.sort_values(by='total_counts',ascending=False).total_counts.tolist()

In [44]:
anomalies=find_anomalies(data_list)

109890.24291829795


In [None]:
anomalies

In [2]:
count_file = pd.read_csv('/SGRNJ06/randd/USER/cjj/TESTDATA/test_rna/20220708crmixed/220417014/05.count/220417014_count_detail.txt',sep='\t')

In [None]:
count_file

In [3]:
count_file['total_counts'] = count_file.groupby('geneID')['count'].transform('sum')

In [4]:
count_file.sort_values('total_counts',ascending=False, inplace=True)

In [5]:
df_for_calling = count_file

In [None]:
df_for_calling

In [8]:
        # Attach to every row the summed 'count' of its gene, then order rows so the
        # most abundant genes come first. Both statements modify df_for_calling in place.
        # NOTE(review): df_for_calling was bound with `df_for_calling = count_file`
        # (no copy), so these in-place changes presumably also show up in count_file.
        df_for_calling['total_counts'] = df_for_calling.groupby('geneID')['count'].transform('sum')
        df_for_calling.sort_values('total_counts', ascending=False, inplace=True)

        # upper limit
        # NOTE(review): total_counts is repeated on every row of a gene, so the std
        # and mean below are taken over rows, not over genes -- genes with many rows
        # weigh more. Later cells use drop_duplicates('geneID') to get one value per gene.
        data_std = np.std(df_for_calling.total_counts)
        data_mean = np.mean(df_for_calling.total_counts)
        upper_limit = data_mean + data_std * 3

In [9]:
upper_limit

36778911.06755154

In [10]:
# Keep only the rows whose gene total does not exceed the upper limit.
# A vectorised comparison builds the same boolean mask as a per-row lambda,
# without calling Python once per row.
df_for_calling = df_for_calling[df_for_calling['total_counts'] <= upper_limit]

In [None]:
df_for_calling

In [12]:
del df_for_calling['total_counts']

In [None]:
df_for_calling

In [7]:
import copy
import sys

In [3]:
        df = pd.read_table('/SGRNJ06/randd/USER/cjj/celedev/vdj10x/20220803/c3rna/LC38CA/05.count/LC38CA_count_detail.txt', header=0)

In [10]:
        # Attach to every row the summed 'count' of its gene and put the most
        # abundant genes first (both steps modify df in place).
        df['total_counts'] = df.groupby('geneID')['count'].transform('sum')
        df.sort_values('total_counts', ascending=False, inplace=True)

        # upper limit: statistics over one total per gene, so every gene counts once.
        # The de-duplicated column is built a single time and reused.
        gene_totals = df.drop_duplicates('geneID').total_counts
        data_std = np.std(gene_totals)
        data_mean = np.mean(gene_totals)

In [59]:
        upper_limit = data_mean + data_std * 3

In [60]:
        upper_limit

357863.7320139976

In [None]:
        df.drop_duplicates('geneID')

In [None]:
        # delete outliers and get high-exp genes
        # (vectorised comparisons: same masks as a per-row lambda, without the Python loop)
        df_for_calling = df[df['total_counts'] <= upper_limit]
        df_high_exp = df[df['total_counts'] > upper_limit]

In [None]:
        del df_for_calling['total_counts']

In [37]:
    def pre_calling(df, coef=6):
        """
        Split a count-detail table into ordinary genes and anomalously
        high-expressed genes, so the latter can be left out of cell calling.

        A sigma-limit rule is applied to the per-gene total counts: a gene is
        an outlier when its total exceeds ``mean + coef * std``, where mean and
        population std (ddof=0) are computed over one total per gene.
        ``coef=3`` gives the classic three-sigma limit; the default is 6.

        Args:
            df: count detail table with at least ``geneID`` and ``count``
                columns. The caller's frame is not modified.
            coef: number of standard deviations above the mean used as the
                upper limit.
        Returns:
            Tuple ``(df_for_calling, df_high_exp)``:
            df_for_calling holds the rows of genes at or below the upper limit,
            without the helper ``total_counts`` column;
            df_high_exp holds the rows of genes above the upper limit, with
            ``total_counts`` kept.
            Rows in both are ordered by descending per-gene total.
        """
        # assign() returns a new frame, so the input is left untouched.
        df = df.assign(
            total_counts=df.groupby('geneID')['count'].transform('sum')
        ).sort_values('total_counts', ascending=False)

        # upper limit: one total per gene so every gene contributes once
        gene_totals = df.drop_duplicates('geneID')['total_counts']
        upper_limit = gene_totals.mean() + gene_totals.std(ddof=0) * coef

        # delete outliers and get high-exp genes.
        # Vectorised masks replace the per-row lambda; drop() returns a fresh
        # frame instead of deleting a column from a filtered slice.
        total_counts = df['total_counts']
        df_for_calling = df[total_counts <= upper_limit].drop(columns='total_counts')
        df_high_exp = df[total_counts > upper_limit]

        return df_for_calling, df_high_exp

In [35]:
        df = pd.read_table('/SGRNJ06/randd/USER/cjj/celedev/cell_calling/20220826auto_compare/Clu-mes/05.count/Clu-mes_count_detail.txt', header=0)

In [None]:
        df

In [38]:
        df_for_calling, df_high_exp = pre_calling(df)

In [46]:
        df_for_calling

Unnamed: 0,Barcode,geneID,UMI,count
5817121,ACATCGGACCTCCTCAATTGCTGCAGT,ENSMUSG00000098178,CCACGCCCCCGG,3
66679639,TGTAGTGTGCGTCATACCGGCCATGTT,ENSMUSG00000098178,CGAATGCGCCCA,4
31935779,CTGGTACTTACACCAACGCCATATCTC,ENSMUSG00000098178,TTTCACCCTGGC,12
42356964,GCGAGTAACAACGTCCAATCGAACTCC,ENSMUSG00000098178,ATACCCTTGAGA,1
31684701,CTCAGAACTGTATCCTTCCTATGACCA,ENSMUSG00000098178,TACAATCTACAC,2
...,...,...,...,...
42720395,GCGAGTAACGCTGCGATAGTCTCGAAG,ENSMUSG00000113086,GCAGTCCCGGTT,1
32868423,CTTACGCAGCTGTGGTATGACATGGCT,ENSMUSG00000115507,CGGGTAAGCGCC,1
768035,AACACCGTTAACGTCCAAACTGGTTCC,ENSMUSG00000065604,CTTTGTAACCGC,1
21218890,CAGTCTTCGAACGTCCAAGACTATTCC,ENSMUSG00000065836,TCCTCGAGTGTT,1


In [47]:
        df_high_exp

Unnamed: 0,Barcode,geneID,UMI,count,total_counts
0,AACACACAGAACACACAGGTGGCAACT,ENSMUSG00000052305,ATCAGAAACCGG,1,35016187
58948068,TCGGTTCGTCACTAGGCACACACCTCA,ENSMUSG00000052305,GACTCCGCATTT,1,35016187
44152781,GCTCTCACTGCAACTTCACCGAGATGT,ENSMUSG00000052305,GCAACCGTGGGT,2,35016187
44152782,GCTCTCACTGCAACTTCACCGAGATGT,ENSMUSG00000052305,GCAAGATTATCC,3,35016187
44152783,GCTCTCACTGCAACTTCACCGAGATGT,ENSMUSG00000052305,GCAAGCACGTTT,1,35016187
...,...,...,...,...,...
62563085,TGCATCAAGTAGCGATGAGTCTCGAAG,ENSMUSG00000002985,ATGCGGCTGCGA,1,2290550
62563084,TGCATCAAGTAGCGATGAGTCTCGAAG,ENSMUSG00000002985,ATCGAGCGTAGT,1,2290550
7072629,ACGAATGGAACCTCGACTAGTGCTACA,ENSMUSG00000002985,GTGCCCGTTATA,1,2290550
39839570,GATGTTACGGGTGATCAGCCATATCTC,ENSMUSG00000002985,TCCGTTATCATC,3,2290550


In [43]:
        df

Unnamed: 0,Barcode,geneID,UMI,count
0,AACACACAGAACACACAGGTGGCAACT,ENSMUSG00000052305,ATCAGAAACCGG,1
1,AACACACAGAACGCTAGTAACAAGTGG,ENSMUSG00000069919,ATCTCTGATTGT,1
2,AACACACAGAACGCTAGTAACAAGTGG,ENSMUSG00000069919,TAAATCTAGCCA,1
3,AACACACAGAACGCTAGTAACAAGTGG,ENSMUSG00000069919,TTGGCAAGTGGC,3
4,AACACACAGAACGCTAGTAACAAGTGG,ENSMUSG00000052305,CAAGAGCTATAA,1
...,...,...,...,...
68445631,TTCCAATCGGATCAGGACCAAGAGTAG,ENSMUSG00000091269,AAAGCACGCTAA,1
68445632,TTGGTGACCAAGGTGGTAACTCGGATT,ENSMUSG00000006360,CCTGCGGTAATT,9
68445633,TTGGTGACCCCATAATCGAACAGGAAC,ENSMUSG00000069919,TTTTGTCCCTGT,2
68445634,TTGGTGACCTTACACGACCCAATTGGC,ENSMUSG00000069919,AAAGTCCTATCA,9


In [45]:
df_high_exp.geneID.unique()

array(['ENSMUSG00000052305', 'ENSMUSG00000069919', 'ENSMUSG00000069917',
       'ENSMUSG00000073940', 'ENSMUSG00000076609', 'ENSMUSG00000064339',
       'ENSMUSG00000002985'], dtype=object)

In [48]:
df_high_exp.drop_duplicates('geneID')

Unnamed: 0,Barcode,geneID,UMI,count,total_counts
0,AACACACAGAACACACAGGTGGCAACT,ENSMUSG00000052305,ATCAGAAACCGG,1,35016187
1216119,AACACCGTTGGCATGCAATGGCAATTC,ENSMUSG00000069919,GCTGAATTTGAC,3,33818268
22613813,CCAATGTCTCATACGACCGACATGGCT,ENSMUSG00000069917,GCCGTGAATGTG,5,23991231
45533651,GGATACCACCTGTGGTATACGTGTGTT,ENSMUSG00000073940,TCCATGTACAAC,1,6818469
18430113,CACCTGTAAAGGCTGTTGGAACCTGTA,ENSMUSG00000076609,GAGCGTTCACGG,1,3080977
11377868,AGGAGCAATCCTACAAGGCCATATCTC,ENSMUSG00000064339,TAAGCCAAGTTT,5,3080459
2527710,AACTGGCGACGTCATACCTCCAGTTAG,ENSMUSG00000002985,AGACGATGGCAC,4,2290550


In [49]:
        df1 = pd.read_table('/SGRNJ06/randd/USER/cjj/celedev/vdj10x/20220803/c3rna/LC38CA/05.count/LC38CA_count_detail.txt', header=0)

In [50]:
        df1

Unnamed: 0,Barcode,geneID,UMI,count
0,AAACATCGAAACATCGAAACATCG,ENSG00000204287,AAGGCATTC,1
1,AAACATCGAAACATCGAAACATCG,ENSG00000204592,ACGTAGGGT,1
2,AAACATCGAAACATCGAAACATCG,ENSG00000198712,AGAGCGCCG,1
3,AAACATCGAAACATCGAAACATCG,ENSG00000198712,TGGGTTGTT,1
4,AAACATCGAAACATCGAAACATCG,ENSG00000158874,AGGAGAGAT,1
...,...,...,...,...
142541468,TTCACGCATTCACGCATTCACGCA,ENSG00000170315,TATTTGGCA,1
142541469,TTCACGCATTCACGCATTCACGCA,ENSG00000149806,TCCGCATAC,1
142541470,TTCACGCATTCACGCATTCACGCA,ENSG00000158874,TCTGGGGTA,1
142541471,TTCACGCATTCACGCATTCACGCA,ENSG00000177954,TGTATGGCT,1


In [51]:
        df_for_calling1, df_high_exp1 = pre_calling(df1)

In [52]:
        df_for_calling1

Unnamed: 0,Barcode,geneID,UMI,count
83072316,CCTCCTGATCCGTCTACCAGTTCA,ENSG00000071082,GAGAGTTAA,1
132937339,TCCGTCTAACACAGAATTCACGCA,ENSG00000071082,TACCAACTC,6
11181631,AAGAGATCCTCAATGACTCAATGA,ENSG00000071082,TTTCACGAG,4
46671027,AGTCACTAACGTATCAAGCAGGAA,ENSG00000071082,GGGAGCTGT,1
17234045,AATGTTGCCTGGCATAGCCAAGAC,ENSG00000071082,GCCAAAATA,3
...,...,...,...,...
32684777,ACTATGCAAACAACCACAAGGAGC,ENSG00000212145,AAAGCATCG,1
106314214,GACTAGTATGGAACAAGAATCTGA,ENSG00000243064,GGGGACTCC,1
59884473,CAAGACTAACAGCAGAGTGTTCTA,ENSG00000233778,GGGCTCGTA,1
103546682,GACAGTGCAGAGTCAAAAGAGATC,ENSG00000214797,ACGACGACT,1


In [53]:
        df_high_exp1

Unnamed: 0,Barcode,geneID,UMI,count,total_counts
67513841,CAGATCTGACTATGCAGAACAGGC,ENSG00000158874,TAGCGAGCC,1,14617744
27142194,ACCACTGTAGCCATGCCCTCTATC,ENSG00000158874,CTGTGGCAT,1,14617744
27142200,ACCACTGTAGCCATGCCCTCTATC,ENSG00000158874,GACGGAACC,3,14617744
27142199,ACCACTGTAGCCATGCCCTCTATC,ENSG00000158874,GAATGGCGT,2,14617744
90301467,CGCATACATCCGTCTAACCACTGT,ENSG00000158874,AAGAGAGTC,1,14617744
...,...,...,...,...,...
19638641,ACACAGAAAGCCATGCGAGTTAGC,ENSG00000106927,CATACGGTG,7,709227
25089821,ACAGCAGAGATAGACAACAGCAGA,ENSG00000106927,ACTTACGTA,1,709227
109655201,GAGTTAGCTGGAACAAACATTGGC,ENSG00000106927,TTCCACGCG,1,709227
97284472,CTGGCATAAAGACGGAGTCTGTCA,ENSG00000106927,TGTCCGCGA,1,709227


In [54]:
        df1

Unnamed: 0,Barcode,geneID,UMI,count
0,AAACATCGAAACATCGAAACATCG,ENSG00000204287,AAGGCATTC,1
1,AAACATCGAAACATCGAAACATCG,ENSG00000204592,ACGTAGGGT,1
2,AAACATCGAAACATCGAAACATCG,ENSG00000198712,AGAGCGCCG,1
3,AAACATCGAAACATCGAAACATCG,ENSG00000198712,TGGGTTGTT,1
4,AAACATCGAAACATCGAAACATCG,ENSG00000158874,AGGAGAGAT,1
...,...,...,...,...
142541468,TTCACGCATTCACGCATTCACGCA,ENSG00000170315,TATTTGGCA,1
142541469,TTCACGCATTCACGCATTCACGCA,ENSG00000149806,TCCGCATAC,1
142541470,TTCACGCATTCACGCATTCACGCA,ENSG00000158874,TCTGGGGTA,1
142541471,TTCACGCATTCACGCATTCACGCA,ENSG00000177954,TGTATGGCT,1


In [55]:
df_high_exp1.geneID.unique()

array(['ENSG00000158874', 'ENSG00000198804', 'ENSG00000251562',
       'ENSG00000087086', 'ENSG00000198938', 'ENSG00000166710',
       'ENSG00000198727', 'ENSG00000198899', 'ENSG00000198712',
       'ENSG00000198886', 'ENSG00000210082', 'ENSG00000211459',
       'ENSG00000019582', 'ENSG00000198763', 'ENSG00000118271',
       'ENSG00000167996', 'ENSG00000205542', 'ENSG00000198786',
       'ENSG00000163631', 'ENSG00000075624', 'ENSG00000130208',
       'ENSG00000177954', 'ENSG00000101439', 'ENSG00000204287',
       'ENSG00000137818', 'ENSG00000198888', 'ENSG00000198840',
       'ENSG00000130203', 'ENSG00000034510', 'ENSG00000211592',
       'ENSG00000187514', 'ENSG00000106927'], dtype=object)

In [56]:
df_high_exp1.drop_duplicates('geneID')

Unnamed: 0,Barcode,geneID,UMI,count,total_counts
67513841,CAGATCTGACTATGCAGAACAGGC,ENSG00000158874,TAGCGAGCC,1,14617744
43512018,AGCCATGCTCTTCACACAAGGAGC,ENSG00000198804,AACAGGTCT,1,7077833
67442294,CAGATCTGACATTGGCAGATCGCA,ENSG00000251562,ATAGTGTTA,1,5801404
139300874,TGGCTTCACCGTGAGAACAGCAGA,ENSG00000087086,AATGTACCC,3,5097303
102746794,GAATCTGAGACAGTGCAAGGACAC,ENSG00000198938,CAGGCGTGT,1,4620966
36565608,AGATCGCACGAACTTACGACACAC,ENSG00000166710,TGGTCCCGT,1,4270788
111560566,GATGAATCAAGGACACTAGGATGA,ENSG00000198727,TTCTGGCCA,2,3590443
25029058,ACAGCAGAGACTAGTACCAGTTCA,ENSG00000198899,GATAGCGTG,1,3273124
104170153,GACAGTGCCGACTGGAAGGCTAAC,ENSG00000198712,CTGTTGAGC,1,3179381
115435881,GCCACATACCTCCTGACATACCAA,ENSG00000198886,TAGAGCTGG,3,3166008


In [57]:
df_high_exp.drop_duplicates('geneID')

Unnamed: 0,Barcode,geneID,UMI,count,total_counts
0,AACACACAGAACACACAGGTGGCAACT,ENSMUSG00000052305,ATCAGAAACCGG,1,35016187
1216119,AACACCGTTGGCATGCAATGGCAATTC,ENSMUSG00000069919,GCTGAATTTGAC,3,33818268
22613813,CCAATGTCTCATACGACCGACATGGCT,ENSMUSG00000069917,GCCGTGAATGTG,5,23991231
45533651,GGATACCACCTGTGGTATACGTGTGTT,ENSMUSG00000073940,TCCATGTACAAC,1,6818469
18430113,CACCTGTAAAGGCTGTTGGAACCTGTA,ENSMUSG00000076609,GAGCGTTCACGG,1,3080977
11377868,AGGAGCAATCCTACAAGGCCATATCTC,ENSMUSG00000064339,TAAGCCAAGTTT,5,3080459
2527710,AACTGGCGACGTCATACCTCCAGTTAG,ENSMUSG00000002985,AGACGATGGCAC,4,2290550


In [60]:
alist = df_high_exp.drop_duplicates('geneID').total_counts.tolist()
alist1 = df_high_exp1.drop_duplicates('geneID').total_counts.tolist()

In [61]:
alist

[35016187, 33818268, 23991231, 6818469, 3080977, 3080459, 2290550]

In [None]:
alist1

In [65]:
np.median(alist)

6818469.0

In [66]:
np.median(alist1)

1472301.0

In [82]:
from sklearn import preprocessing

In [69]:
df_temp = df.copy()

In [70]:
df_temp

In [None]:
        df_temp['total_counts'] = df_temp.groupby('geneID')['count'].transform('sum')
        df_temp.sort_values('total_counts', ascending=False, inplace=True)

In [72]:
        data_std = np.std(df_temp.drop_duplicates('geneID').total_counts)
        data_mean = np.mean(df_temp.drop_duplicates('geneID').total_counts)

In [73]:
        # z-score of each row's gene total relative to the per-gene mean/std.
        # Vectorised arithmetic gives the same values as a per-row lambda.
        df_temp['zscore'] = (df_temp['total_counts'] - data_mean) / data_std

In [74]:
        df_temp

Unnamed: 0,Barcode,geneID,UMI,count,total_counts,zscore
0,AACACACAGAACACACAGGTGGCAACT,ENSMUSG00000052305,ATCAGAAACCGG,1,35016187,105.483428
58948068,TCGGTTCGTCACTAGGCACACACCTCA,ENSMUSG00000052305,GACTCCGCATTT,1,35016187,105.483428
44152781,GCTCTCACTGCAACTTCACCGAGATGT,ENSMUSG00000052305,GCAACCGTGGGT,2,35016187,105.483428
44152782,GCTCTCACTGCAACTTCACCGAGATGT,ENSMUSG00000052305,GCAAGATTATCC,3,35016187,105.483428
44152783,GCTCTCACTGCAACTTCACCGAGATGT,ENSMUSG00000052305,GCAAGCACGTTT,1,35016187,105.483428
...,...,...,...,...,...,...
42720395,GCGAGTAACGCTGCGATAGTCTCGAAG,ENSMUSG00000113086,GCAGTCCCGGTT,1,1,-0.027150
32868423,CTTACGCAGCTGTGGTATGACATGGCT,ENSMUSG00000115507,CGGGTAAGCGCC,1,1,-0.027150
768035,AACACCGTTAACGTCCAAACTGGTTCC,ENSMUSG00000065604,CTTTGTAACCGC,1,1,-0.027150
21218890,CAGTCTTCGAACGTCCAAGACTATTCC,ENSMUSG00000065836,TCCTCGAGTGTT,1,1,-0.027150


In [78]:
df_temp.drop_duplicates('geneID').zscore.tolist()[:10]

[105.48342824747625,
 101.87386531884874,
 72.26309159086644,
 20.51821977787811,
 9.256429482172075,
 9.254868647426964,
 6.874719202106183,
 5.054026564295209,
 4.9486822717781465,
 3.786484117929335]

In [83]:
df_temp = df1.copy()

In [80]:
        # Attach to every row the summed 'count' of its gene, most abundant genes first.
        df_temp['total_counts'] = df_temp.groupby('geneID')['count'].transform('sum')
        df_temp.sort_values('total_counts', ascending=False, inplace=True)
        # One total per gene, so every gene contributes once to mean and std.
        gene_totals = df_temp.drop_duplicates('geneID').total_counts
        data_std = np.std(gene_totals)
        data_mean = np.mean(gene_totals)
        # Vectorised z-score of each row's gene total (same values as a per-row lambda).
        df_temp['zscore'] = (df_temp['total_counts'] - data_mean) / data_std
        df_temp

Unnamed: 0,Barcode,geneID,UMI,count,total_counts,zscore
67513841,CAGATCTGACTATGCAGAACAGGC,ENSG00000158874,TAGCGAGCC,1,14617744,125.033097
27142194,ACCACTGTAGCCATGCCCTCTATC,ENSG00000158874,CTGTGGCAT,1,14617744,125.033097
27142200,ACCACTGTAGCCATGCCCTCTATC,ENSG00000158874,GACGGAACC,3,14617744,125.033097
27142199,ACCACTGTAGCCATGCCCTCTATC,ENSG00000158874,GAATGGCGT,2,14617744,125.033097
90301467,CGCATACATCCGTCTAACCACTGT,ENSG00000158874,AAGAGAGTC,1,14617744,125.033097
...,...,...,...,...,...,...
32684777,ACTATGCAAACAACCACAAGGAGC,ENSG00000212145,AAAGCATCG,1,1,-0.062515
106314214,GACTAGTATGGAACAAGAATCTGA,ENSG00000243064,GGGGACTCC,1,1,-0.062515
59884473,CAAGACTAACAGCAGAGTGTTCTA,ENSG00000233778,GGGCTCGTA,1,1,-0.062515
103546682,GACAGTGCAGAGTCAAAAGAGATC,ENSG00000214797,ACGACGACT,1,1,-0.062515


In [81]:
df_temp.drop_duplicates('geneID').zscore.tolist()[:10]

[125.03309732932877,
 60.50810303585075,
 49.5846884304224,
 43.559138127901306,
 39.482744891061714,
 36.48599420637836,
 30.663742824021686,
 27.948192645218228,
 27.145959496835772,
 27.0315161368695]

In [3]:
df2 = pd.read_table('/SGRNJ06/randd/PROJECT/RD20073101_ScRNA_VDJ/LZL_2022/20220804/H_0727PBMC1_Nlib/05.count/H_0727PBMC1_Nlib_count_detail.txt', header=0)

In [4]:
df_temp = df2.copy()

In [10]:
df_temp

Unnamed: 0,Barcode,geneID,UMI,count
0,AAACATCGAAACATCGAAACATCG,ENSG00000000938,GACGGGCGT,4
1,AAACATCGAAACATCGAACAACCA,ENSG00000197971,AAAAACGTC,1
2,AAACATCGAAACATCGAACAACCA,ENSG00000197971,CACGGAGGC,1
3,AAACATCGAAACATCGAACAACCA,ENSG00000162704,AAAACGAGG,3
4,AAACATCGAAACATCGAACAACCA,ENSG00000162704,AAAGGCCGA,3
...,...,...,...,...
20550198,TTCACGCATTCACGCATGGAACAA,ENSG00000080371,CAAACCCAT,3
20550199,TTCACGCATTCACGCATGGAACAA,ENSG00000163220,GCAATGATG,1
20550200,TTCACGCATTCACGCATGGCTTCA,ENSG00000115271,GAGAGCTGA,1
20550201,TTCACGCATTCACGCATGGCTTCA,ENSG00000166710,GCGCAGACC,1


In [91]:
        # Attach to every row the summed 'count' of its gene, most abundant genes first.
        df_temp['total_counts'] = df_temp.groupby('geneID')['count'].transform('sum')
        df_temp.sort_values('total_counts', ascending=False, inplace=True)
        # One total per gene, so every gene contributes once to mean and std.
        gene_totals = df_temp.drop_duplicates('geneID').total_counts
        data_std = np.std(gene_totals)
        data_mean = np.mean(gene_totals)
        # Vectorised z-score of each row's gene total (same values as a per-row lambda).
        df_temp['zscore'] = (df_temp['total_counts'] - data_mean) / data_std
        df_temp

Unnamed: 0,Barcode,geneID,UMI,count,total_counts,zscore
9703519,CAGCGTTAAACTCACCACACGACC,ENSG00000251562,CGAAACGTA,2,1423264,72.017549
6592407,AGTACAAGTCTTCACACGGATTGC,ENSG00000251562,GGTGAACAT,1,1423264,72.017549
19046894,TATCAGCAGAATCTGATGGTGGTA,ENSG00000251562,CTAAGCCTT,1,1423264,72.017549
6592456,AGTACAAGTCTTCACACTGAGCCA,ENSG00000251562,CGGCAGTCC,1,1423264,72.017549
6592457,AGTACAAGTCTTCACACTGAGCCA,ENSG00000251562,CGGCAGTCT,4,1423264,72.017549
...,...,...,...,...,...,...
2986251,ACACGACCAGTCACTAAATCCGTC,ENSG00000278147,GCAGTCAAC,1,1,-0.089042
2522554,ACAAGCTAAAGGACACAGATCGCA,ENSG00000221263,TCCGTGACA,1,1,-0.089042
16886122,GCGAGTAACGAACTTACACCTTAC,ENSG00000226823,TGGGAAATC,1,1,-0.089042
10981760,CCGAAGTATCTTCACAAGTGGTCA,ENSG00000252397,TGGTTCCTC,1,1,-0.089042


In [92]:
df_temp.drop_duplicates('geneID').zscore.tolist()[:10]

[72.01754877045545,
 70.93620044410547,
 65.48127842169028,
 54.232601093191,
 29.007506779192376,
 25.681235959921917,
 20.89703969146791,
 15.111593096287194,
 14.979869630746059,
 14.057906697700842]

In [93]:
df3 = pd.read_table('/SGRNJ03/PROJ03/PROJ_20.SC/P21100502_SCOPEv2/temp/GC-NA014_auto/GC-NA014/05.count/GC-NA014_count_detail.txt', header=0)
df_temp = df3.copy()

In [98]:
        # Attach to every row the summed 'count' of its gene, most abundant genes first.
        df_temp['total_counts'] = df_temp.groupby('geneID')['count'].transform('sum')
        df_temp.sort_values('total_counts', ascending=False, inplace=True)
        # One total per gene, so every gene contributes once to mean and std.
        gene_totals = df_temp.drop_duplicates('geneID').total_counts
        data_std = np.std(gene_totals)
        data_mean = np.mean(gene_totals)
        # Vectorised z-score of each row's gene total (same values as a per-row lambda).
        df_temp['zscore'] = (df_temp['total_counts'] - data_mean) / data_std
        df_temp

Unnamed: 0,Barcode,geneID,UMI,count,total_counts,zscore
67356326,GAATCTGACTGGCATAAACTCACC,ENSG00000182333,GGCACGTGT,2,14892466,99.863067
7319914,AAGAGATCCAAGGAGCGAGCTGAA,ENSG00000182333,CCTATCATG,3,14892466,99.863067
7319928,AAGAGATCCAAGGAGCGAGCTGAA,ENSG00000182333,CCTTTATAA,5,14892466,99.863067
7319927,AAGAGATCCAAGGAGCGAGCTGAA,ENSG00000182333,CCTTGCTGG,5,14892466,99.863067
7319926,AAGAGATCCAAGGAGCGAGCTGAA,ENSG00000182333,CCTTGCATT,1,14892466,99.863067
...,...,...,...,...,...,...
81311581,GTACGCAAGTCGTAGAAACAACCA,ENSG00000223640,AAACTGTCG,1,1,-0.036655
45468901,CATACCAACCTCCTGAAGCCATGC,ENSG00000279685,AGCCGGTCA,1,1,-0.036655
52336484,CCTAATCCCCTCTATCTAGGATGA,ENSG00000231355,TTGATGCGC,1,1,-0.036655
54626326,CCTCTATCTCTTCACAAGCCATGC,ENSG00000279340,GGCCTAAAG,1,1,-0.036655


In [99]:
df_temp

Unnamed: 0,Barcode,geneID,UMI,count,total_counts,zscore
67356326,GAATCTGACTGGCATAAACTCACC,ENSG00000182333,GGCACGTGT,2,14892466,99.863067
7319914,AAGAGATCCAAGGAGCGAGCTGAA,ENSG00000182333,CCTATCATG,3,14892466,99.863067
7319928,AAGAGATCCAAGGAGCGAGCTGAA,ENSG00000182333,CCTTTATAA,5,14892466,99.863067
7319927,AAGAGATCCAAGGAGCGAGCTGAA,ENSG00000182333,CCTTGCTGG,5,14892466,99.863067
7319926,AAGAGATCCAAGGAGCGAGCTGAA,ENSG00000182333,CCTTGCATT,1,14892466,99.863067
...,...,...,...,...,...,...
81311581,GTACGCAAGTCGTAGAAACAACCA,ENSG00000223640,AAACTGTCG,1,1,-0.036655
45468901,CATACCAACCTCCTGAAGCCATGC,ENSG00000279685,AGCCGGTCA,1,1,-0.036655
52336484,CCTAATCCCCTCTATCTAGGATGA,ENSG00000231355,TTGATGCGC,1,1,-0.036655
54626326,CCTCTATCTCTTCACAAGCCATGC,ENSG00000279340,GGCCTAAAG,1,1,-0.036655


In [100]:
df_temp.drop_duplicates('geneID').zscore.tolist()[:10]

[99.86306738999997,
 83.60269505881809,
 68.09954326900085,
 55.774300163900925,
 45.71945085735753,
 43.74701618431557,
 33.56889935356001,
 31.09883309496326,
 29.97619035173436,
 28.410727676212396]

In [101]:
df4 = pd.read_table('/SGRNJ03/PROJ03/PROJ_20.SC/P21100502_SCOPEv2/temp/NA016CA_auto/NA016CA/05.count/NA016CA_count_detail.txt', header=0)
df_temp = df4.copy()

In [102]:
        # Attach to every row the summed 'count' of its gene, most abundant genes first.
        df_temp['total_counts'] = df_temp.groupby('geneID')['count'].transform('sum')
        df_temp.sort_values('total_counts', ascending=False, inplace=True)
        # One total per gene, so every gene contributes once to mean and std.
        gene_totals = df_temp.drop_duplicates('geneID').total_counts
        data_std = np.std(gene_totals)
        data_mean = np.mean(gene_totals)
        # Vectorised z-score of each row's gene total (same values as a per-row lambda).
        df_temp['zscore'] = (df_temp['total_counts'] - data_mean) / data_std
        df_temp

Unnamed: 0,Barcode,geneID,UMI,count,total_counts,zscore
36503770,CCTCCTGACAATGGAAAACAACCA,ENSG00000210082,GAACGATTG,3,16102117,164.011099
38973941,CGACTGGAAACGTGATACACGACC,ENSG00000210082,CATGTAGGT,1,16102117,164.011099
22855536,ATAGCGACGAACAGGCAACTCACC,ENSG00000210082,GGAAGGAAC,1,16102117,164.011099
22855535,ATAGCGACGAACAGGCAACTCACC,ENSG00000210082,GATCCCTGA,3,16102117,164.011099
22855534,ATAGCGACGAACAGGCAACTCACC,ENSG00000210082,GATCATGTG,2,16102117,164.011099
...,...,...,...,...,...,...
46796331,GACTAGTAACAAGCTATGGTGGTA,ENSG00000274469,TACCCTGTC,1,1,-0.027399
52684520,GCTCGGTACAGATCTGGAGTTAGC,ENSG00000265136,CCGGGTAAA,1,1,-0.027399
47972421,GAGTTAGCAACAACCAAGATGTAC,ENSG00000255313,ACGCCTGAT,1,1,-0.027399
54043223,GGTGCGAAATCATTCCGTACGCAA,ENSG00000234325,GTATATGCA,1,1,-0.027399


In [104]:
df_temp.drop_duplicates('geneID').zscore.tolist()[:10]

[164.01109886964917,
 59.981764548363586,
 56.324910059461786,
 55.67890743283802,
 37.85270068730572,
 37.513745924910175,
 36.91113138235237,
 27.205250595176512,
 26.003627845294368,
 24.953022930160703]