In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import math

from celescope.tools import utils
from celescope.__init__ import HELP_DICT
from celescope.rna.mkref import Mkref_rna
from celescope.tools.step import Step, s_common

In [10]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [11]:
adata = sc.read('/SGRNJ06/randd/USER/cjj/celedev/cell_calling/20220826/Clu-mes/06.analysis/Clu-mes.h5ad')

In [12]:
adata

AnnData object with n_obs × n_vars = 13367 × 53715
    obs: 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'cluster'
    var: 'gene_ids', 'mito', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'hvg', 'leiden', 'neighbors', 'pca', 'rank_genes_groups', 'tsne', 'umap'
    obsm: 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
    layers: 'normalised'
    obsp: 'connectivities', 'distances'

In [37]:
adata.var.sort_values(by='total_counts',ascending=False)

Unnamed: 0,gene_ids,mito,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,highly_variable,means,dispersions,dispersions_norm,mean,std
Hba-a1,ENSMUSG00000069919,False,13286,419.027832,0.605970,5601145.0,False,7.132112e+00,7.642726,1.062879,5.398033,2.159774
Hbb-bs,ENSMUSG00000052305,False,13326,348.342194,0.306726,4656290.0,False,6.955199e+00,7.440139,-0.140650,5.441536,1.952531
Hba-a2,ENSMUSG00000069917,False,13143,297.801453,1.675769,3980712.0,False,6.791392e+00,7.308577,-0.922229,5.026884,2.206951
Hbb-bt,ENSMUSG00000073940,False,11542,76.227722,13.653026,1018936.0,False,5.424318e+00,5.987933,1.000000,3.596482,2.316263
Igkc,ENSMUSG00000076609,False,8073,36.730232,39.604997,490973.0,False,3.980299e+00,7.955221,1.079033,1.398347,1.521519
...,...,...,...,...,...,...,...,...,...,...,...,...
Gm18060,ENSMUSG00000109043,False,0,0.000000,100.000000,0.0,False,1.000000e-12,,0.000000,0.000000,1.000000
Gm4565,ENSMUSG00000109396,False,0,0.000000,100.000000,0.0,False,1.000000e-12,,0.000000,0.000000,1.000000
Gm18467,ENSMUSG00000109567,False,0,0.000000,100.000000,0.0,False,1.000000e-12,,0.000000,0.000000,1.000000
Gm8685,ENSMUSG00000109247,False,0,0.000000,100.000000,0.0,False,1.000000e-12,,0.000000,0.000000,1.000000


In [42]:
def find_anomalies(data_list):
    anomalies = []
    # set upper limit
    data_std = np.std(data_list)
    data_mean = np.mean(data_list)
    anomaly_cut_off = data_std * 3
    
    upper_limit = data_mean + data_std * 3
    print(upper_limit)
    
    # Generate outliers
    for outlier in data_list:
        if outlier > upper_limit:
            anomalies.append(outlier)
    return anomalies

In [43]:
data_list = adata.var.sort_values(by='total_counts',ascending=False).total_counts.tolist()

In [44]:
anomalies=find_anomalies(data_list)

109890.24291829795


In [45]:
anomalies

[5601145.0,
 4656290.0,
 3980712.0,
 1018936.0,
 490973.0,
 435471.0,
 236164.0,
 206412.0,
 200470.0,
 186348.0,
 158239.0,
 150868.0,
 138895.0,
 138766.0,
 132535.0,
 124719.0,
 112644.0]

In [2]:
count_file = pd.read_csv('/SGRNJ06/randd/USER/cjj/TESTDATA/test_rna/20220708crmixed/220417014/05.count/220417014_count_detail.txt',sep='\t')

In [72]:
count_file

Unnamed: 0,Barcode,geneID,UMI,count
0,AAACATCGAAACATCGAAACATCG,ENSMUSG00000067713,AAAATGTTC,1
1,AAACATCGAAACATCGAAACATCG,ENSMUSG00000079614,AACGTGGTC,3
2,AAACATCGAAACATCGAAACATCG,ENSMUSG00000026377,AAGTGGCGT,6
3,AAACATCGAAACATCGAAACATCG,ENSMUSG00000027649,AATAAGTAT,1
4,AAACATCGAAACATCGAAACATCG,ENSMUSG00000098178,ACCAATAGC,3
...,...,...,...,...
69255608,TTCACGCATTCACGCATTCACGCA,ENSMUSG00000012848,TTACGATGG,2
69255609,TTCACGCATTCACGCATTCACGCA,ENSMUSG00000098923,TTCATCGAT,7
69255610,TTCACGCATTCACGCATTCACGCA,ENSMUSG00000058558,TTTAGGACA,3
69255611,TTCACGCATTCACGCATTCACGCA,ENSMUSG00000032525,TTTCCGGGG,1


In [3]:
count_file['total_counts'] = count_file.groupby('geneID')['count'].transform('sum')

In [4]:
count_file.sort_values('total_counts',ascending=False, inplace=True)

In [5]:
df_for_calling = count_file

In [6]:
df_for_calling

Unnamed: 0,Barcode,geneID,UMI,count,total_counts
19861624,AGCAGGAAACGCTCGAGACTAGTA,ENSMUSG00000076609,TATGGAAAC,3,29953458
13320265,ACCACTGTAGTACAAGGACAGTGC,ENSMUSG00000076609,ACTCGAGCT,1,29953458
13320254,ACCACTGTAGTACAAGGACAGTGC,ENSMUSG00000076609,ACTAGGCAG,2,29953458
13320255,ACCACTGTAGTACAAGGACAGTGC,ENSMUSG00000076609,ACTAGGGAG,2,29953458
13320256,ACCACTGTAGTACAAGGACAGTGC,ENSMUSG00000076609,ACTAGGGTG,4,29953458
...,...,...,...,...,...
13959880,ACCTCCAAAGCAGGAAATCCTGTA,ENSMUSG00000096149,ATCTACGTC,1,1
11174955,ACAGATTCATCATTCCCTGTAGCC,ENSMUSG00000064987,CAGTATAAG,1,1
11175714,ACAGATTCATCATTCCGGTGCGAA,ENSMUSG00000041138,AGAATGATA,1,1
35972020,CCGAAGTAAGATGTACGATAGACA,ENSMUSG00000029720,CCGGCCCGC,1,1


In [8]:
        df_for_calling['total_counts'] = df_for_calling.groupby('geneID')['count'].transform('sum')
        df_for_calling.sort_values('total_counts', ascending=False, inplace=True)

        # upper limit
        data_std = np.std(df_for_calling.total_counts)
        data_mean = np.mean(df_for_calling.total_counts)
        upper_limit = data_mean + data_std * 3

In [9]:
upper_limit

36778911.06755154

In [10]:
df_for_calling = df_for_calling[df_for_calling['total_counts'].apply(lambda x: x <= upper_limit)]

In [11]:
df_for_calling

Unnamed: 0,Barcode,geneID,UMI,count,total_counts
19861624,AGCAGGAAACGCTCGAGACTAGTA,ENSMUSG00000076609,TATGGAAAC,3,29953458
20491323,AGCCATGCAGATCGCAAGATGTAC,ENSMUSG00000076609,GACGTGCAG,3,29953458
20490847,AGCCATGCAGATCGCAAGATGTAC,ENSMUSG00000076609,CTCAGGACA,10,29953458
20491329,AGCCATGCAGATCGCAAGATGTAC,ENSMUSG00000076609,GACTAGACG,3,29953458
20491318,AGCCATGCAGATCGCAAGATGTAC,ENSMUSG00000076609,GACGGTCGG,5,29953458
...,...,...,...,...,...
10500908,ACACGACCCATACCAAAAGACGGA,ENSMUSG00000097943,GATTCCCTC,1,1
34702449,CCAGTTCACACTTCGAACTATGCA,ENSMUSG00000106940,AGCACAACC,1,1
39306536,CCTCCTGACCTCTATCCAGCGTTA,ENSMUSG00000083771,CATGAGGAG,1,1
9179954,ACAAGCTACGACTGGACAAGACTA,ENSMUSG00000083903,AGTTCGAGG,1,1


In [12]:
del df_for_calling['total_counts']

In [13]:
df_for_calling

Unnamed: 0,Barcode,geneID,UMI,count
19861624,AGCAGGAAACGCTCGAGACTAGTA,ENSMUSG00000076609,TATGGAAAC,3
20491323,AGCCATGCAGATCGCAAGATGTAC,ENSMUSG00000076609,GACGTGCAG,3
20490847,AGCCATGCAGATCGCAAGATGTAC,ENSMUSG00000076609,CTCAGGACA,10
20491329,AGCCATGCAGATCGCAAGATGTAC,ENSMUSG00000076609,GACTAGACG,3
20491318,AGCCATGCAGATCGCAAGATGTAC,ENSMUSG00000076609,GACGGTCGG,5
...,...,...,...,...
10500908,ACACGACCCATACCAAAAGACGGA,ENSMUSG00000097943,GATTCCCTC,1
34702449,CCAGTTCACACTTCGAACTATGCA,ENSMUSG00000106940,AGCACAACC,1
39306536,CCTCCTGACCTCTATCCAGCGTTA,ENSMUSG00000083771,CATGAGGAG,1
9179954,ACAAGCTACGACTGGACAAGACTA,ENSMUSG00000083903,AGTTCGAGG,1
