In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import anndata as ad
import scvelo as scv
import scipy as sp
from scipy import special
from scipy.stats import poisson,norm
from scipy.special import j_roots
from scipy.special import beta as beta_fun 
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
import seaborn as sns

import test.utils as test

In [2]:
scv.set_figure_params(dpi=100, color_map = 'viridis_r')
scv.settings.verbosity = 1
sc.logging.print_header()

scanpy==1.7.2 anndata==0.8.0 umap==0.4.1 numpy==1.19.5 scipy==1.8.0 pandas==1.4.1 scikit-learn==1.0.2 statsmodels==0.13.2 python-igraph==0.9.9


In [3]:
# read UMI counts and params for c57 and cast viariants
UMI_c57 = pd.read_csv('data_arme/F1_ref/umiSplit_c57.qc.csv', index_col=0)
UMI_cast = pd.read_csv('data_arme/F1_ref/umiSplit_cast.qc.csv', index_col=0)

In [4]:
UMI = UMI_c57 + UMI_cast

In [5]:
# transpose to ge n_cells x n_genes shape
UMI_c57_T = UMI_c57.transpose(copy=True)
UMI_cast_T = UMI_cast.transpose(copy=True)
UMI_T = UMI.transpose(copy=True)

In [6]:
adata = ad.AnnData(UMI_T)
adata.layers['spliced'] = UMI_c57_T
adata.layers['unspliced'] = UMI_cast_T
adata

AnnData object with n_obs × n_vars = 682 × 23270
    layers: 'spliced', 'unspliced'

In [7]:
gtf = pd.read_csv("data/Mus_musculus.GRCm38.102.chr.gtf", sep='\t', skiprows=5, header=None)
gtf.columns = ['chr', 'database', 'type', 'start', 'end', '.', 'strand', '.', 'other']
gtf = gtf[gtf['type'] == "gene"]
gtf = gtf.reset_index()
gtf

Unnamed: 0,index,chr,database,type,start,end,.,strand,..1,other
0,0,1,havana,gene,3073253,3074322,.,+,.,"gene_id ""ENSMUSG00000102693""; gene_version ""1""..."
1,3,1,ensembl,gene,3102016,3102125,.,+,.,"gene_id ""ENSMUSG00000064842""; gene_version ""1""..."
2,6,1,ensembl_havana,gene,3205901,3671498,.,-,.,"gene_id ""ENSMUSG00000051951""; gene_version ""5""..."
3,24,1,havana,gene,3252757,3253236,.,+,.,"gene_id ""ENSMUSG00000102851""; gene_version ""1""..."
4,27,1,havana,gene,3365731,3368549,.,-,.,"gene_id ""ENSMUSG00000103377""; gene_version ""1""..."
...,...,...,...,...,...,...,...,...,...,...
55396,1868643,MT,insdc,gene,13552,14070,.,-,.,"gene_id ""ENSMUSG00000064368""; gene_version ""1""..."
55397,1868649,MT,insdc,gene,14071,14139,.,-,.,"gene_id ""ENSMUSG00000064369""; gene_version ""1""..."
55398,1868652,MT,insdc,gene,14145,15288,.,+,.,"gene_id ""ENSMUSG00000064370""; gene_version ""1""..."
55399,1868657,MT,insdc,gene,15289,15355,.,+,.,"gene_id ""ENSMUSG00000064371""; gene_version ""1""..."


In [8]:
dic_gene_names = {}
index = 0
for line in gtf['other'].tolist():
    line = line.split(';')
    dic_gene_names[line[0][9:-1]]= [gtf['chr'][index], line[2][12:-1]]
    index += 1

In [9]:
adata.var['gene_name'] = [dic_gene_names[x][1] if x in dic_gene_names.keys() else 'NA' for x in adata.var.index]
adata.var['chromosome'] = [dic_gene_names[x][0] if x in dic_gene_names.keys() else 'NA' for x in adata.var.index]
adata.var

Unnamed: 0,gene_name,chromosome
ENSMUSG00000000001,Gnai3,3
ENSMUSG00000000028,Cdc45,16
ENSMUSG00000000037,Scml2,X
ENSMUSG00000000049,Apoh,11
ENSMUSG00000000056,Narf,11
...,...,...
ENSMUSG00000115595,Rps12-ps2,14
ENSMUSG00000115637,Gm30970,14
ENSMUSG00000115643,Gm49011,14
ENSMUSG00000115725,4930572G02Rik,14


In [10]:
label = []
for chrom in adata.var['chromosome']:
    if chrom == "X":
        label.append('X')
    elif chrom == "Y":
        label.append('Y')
    else:
        label.append('autosome')
adata.var['autosomes'] = label

In [11]:
label = []
for chrom in adata.var['chromosome']:
    if chrom == "MT":
        label.append('MT')
    else:
        label.append('nuclear')
adata.var['nuclear'] = label

In [12]:
label = []
for genes in adata.var['gene_name']:
    if genes[:2]=='Rp':
        label.append('Ribosomal protein')
    else:
        label.append('other protein')
adata.var['Ribosomal_prot'] = label

In [13]:
test.remove_na(adata)
test.find_ratios_sum(adata)
test.find_ratios_std(adata)
adata

AnnData object with n_obs × n_vars = 682 × 3134
    var: 'gene_name', 'chromosome', 'autosomes', 'nuclear', 'Ribosomal_prot', 'sum_allele_1', 'sum_allele_2', 'ratio_allele_1', 'ratio_allele_2', 'ratio_sum_allele_1', 'ratio_sum_allele_2', 'ratio_mean_allele_1', 'ratio_mean_allele_2', 'ratio_std_allele_1', 'ratio_std_allele_2'
    layers: 'spliced', 'unspliced', 'ratio_allele_1', 'ratio_allele_2'

In [22]:
sc.pp.neighbors(adata, n_neighbors=8, n_pcs=31)
sc.tl.umap(adata)
sc.tl.leiden(adata)
adata.obs['clusters'] = adata.obs['leiden'].copy()

         Falling back to preprocessing with `sc.pp.pca` and default params.


In [19]:
adata.var_names

Index(['ENSMUSG00000000001', 'ENSMUSG00000000049', 'ENSMUSG00000000088',
       'ENSMUSG00000000120', 'ENSMUSG00000000184', 'ENSMUSG00000000295',
       'ENSMUSG00000000303', 'ENSMUSG00000000326', 'ENSMUSG00000000339',
       'ENSMUSG00000000346',
       ...
       'ENSMUSG00000084146', 'ENSMUSG00000085051', 'ENSMUSG00000087278',
       'ENSMUSG00000089617', 'ENSMUSG00000097824', 'ENSMUSG00000100620',
       'ENSMUSG00000103811', 'ENSMUSG00000112500', 'ENSMUSG00000114922',
       'ENSMUSG00000115141'],
      dtype='object', length=3134)

In [23]:
sc.tl.rank_genes_groups(adata, groupby="clusters")



In [24]:
adata

AnnData object with n_obs × n_vars = 682 × 3134
    obs: 'leiden', 'clusters'
    var: 'gene_name', 'chromosome', 'autosomes', 'nuclear', 'Ribosomal_prot', 'sum_allele_1', 'sum_allele_2', 'ratio_allele_1', 'ratio_allele_2', 'ratio_sum_allele_1', 'ratio_sum_allele_2', 'ratio_mean_allele_1', 'ratio_mean_allele_2', 'ratio_std_allele_1', 'ratio_std_allele_2'
    uns: 'rank_genes_groups', 'neighbors', 'umap', 'leiden'
    obsm: 'X_pca', 'X_umap'
    layers: 'spliced', 'unspliced', 'ratio_allele_1', 'ratio_allele_2'
    obsp: 'distances', 'connectivities'

In [28]:
adata.uns['rank_genes_groups']

{'params': {'groupby': 'clusters',
  'reference': 'rest',
  'method': 't-test',
  'use_raw': True,
  'layer': None,
  'corr_method': 'benjamini-hochberg'},
 'names': rec.array([('ENSMUSG00000063316', 'ENSMUSG00000023944', 'ENSMUSG00000018593', 'ENSMUSG00000035095', 'ENSMUSG00000020415', 'ENSMUSG00000037411', 'ENSMUSG00000092341', 'ENSMUSG00000004665', 'ENSMUSG00000021477', 'ENSMUSG00000092341', 'ENSMUSG00000029447'),
            ('ENSMUSG00000068220', 'ENSMUSG00000020267', 'ENSMUSG00000026042', 'ENSMUSG00000021539', 'ENSMUSG00000094530', 'ENSMUSG00000026837', 'ENSMUSG00000027656', 'ENSMUSG00000062825', 'ENSMUSG00000018567', 'ENSMUSG00000031328', 'ENSMUSG00000059796'),
            ('ENSMUSG00000017404', 'ENSMUSG00000059208', 'ENSMUSG00000028618', 'ENSMUSG00000087128', 'ENSMUSG00000015837', 'ENSMUSG00000024909', 'ENSMUSG00000037266', 'ENSMUSG00000035783', 'ENSMUSG00000027204', 'ENSMUSG00000090841', 'ENSMUSG00000040824'),
            ...,
            ('ENSMUSG00000001348', 'ENSMUSG0000002