In [7]:
import os
import scanpy as sc
import pandas as pd
import json
from pathlib import Path
import scipy
from scipy.sparse import csr_matrix as csr
import numpy as np

In [8]:
def _scale_to_dense(adata, zero_center):
    """Run ``sc.pp.scale`` on ``adata`` and make sure ``adata.X`` ends up dense."""
    sc.pp.scale(adata, zero_center=zero_center)
    # NOTE(review): per the scanpy docs, scale keeps sparse input sparse when
    # zero_center is False; densify so X has the same container in every case.
    if scipy.sparse.issparse(adata.X):
        adata.X = adata.X.toarray()


def _regress_out_total_counts(adata):
    """Store per-cell raw totals in ``obs['total_counts']`` and regress them out of ``adata.X``."""
    # Sum the sparse matrix directly instead of densifying the whole raw layer first.
    adata.obs['total_counts'] = np.asarray(adata.layers['raw'].sum(axis=1)).ravel()
    sc.pp.regress_out(adata, ['total_counts'])


def preprocess_fast(sdata1, mode = 'customized', target_sum=1e4,base = 2,zero_center = True,regressout = False):
    """Normalize, (optionally) log-transform and scale an AnnData object in place.

    The function always restarts from ``sdata1.layers['raw']``: that layer is
    converted to a CSR matrix if it is not one already, copied into
    ``sdata1.X``, and the pipeline selected by ``mode`` is applied to ``X``.

    Parameters
    ----------
    sdata1
        AnnData-like object; must already contain ``layers['raw']``.
    mode
        ``'default'``: ``normalize_total`` (scanpy's default target) -> scale.
        No log transform is applied in this mode.
        ``'customized'``: ``normalize_total(target_sum)`` -> ``log1p(base)`` -> scale.
    target_sum
        Per-cell target passed to ``sc.pp.normalize_total`` (``'customized'`` only).
        It is also embedded in the layer names (``1e4`` is spelled ``'1e4'``,
        any other value uses ``str(target_sum)``).
    base
        Logarithm base passed to ``sc.pp.log1p`` (``'customized'`` only).
    zero_center
        Forwarded to ``sc.pp.scale``.
    regressout
        If true, ``obs['total_counts']`` is computed from the raw layer and
        regressed out of ``X`` after scaling.

    Layers written
    --------------
    ``'default'``: ``'norm'``; and ``'log1p_norm_scaled'`` only when
    ``regressout`` is true (the key name is historical -- no log1p is applied).
    ``'customized'``: ``'norm{T}'``, ``'log{base}_norm{T}'`` and
    ``'log{base}_norm{T}_scaled'`` (the last one is overwritten with the
    regressed values when ``regressout`` is true).

    Returns
    -------
    The same ``sdata1`` object, with ``X`` holding the final dense matrix.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'default'`` nor ``'customized'``. The check
        happens before ``sdata1`` is modified.
    """
    # Fail loudly (and before mutating anything) instead of printing and
    # returning None, which would silently rebind the caller's variable to None.
    if mode not in ('default', 'customized'):
        raise ValueError('Please set the `mode` as one of the {"default", "customized"}.')

    if not isinstance(sdata1.layers['raw'], scipy.sparse.csr_matrix):
        sdata1.layers['raw'] = csr(sdata1.layers['raw'])
    sdata1.X = sdata1.layers['raw'].copy()

    if mode == 'default':
        sc.pp.normalize_total(sdata1)  # normalize counts per cell
        sdata1.layers['norm'] = csr(sdata1.X.copy())
        _scale_to_dense(sdata1, zero_center)
        if regressout:
            _regress_out_total_counts(sdata1)
            sdata1.layers['log1p_norm_scaled'] = sdata1.X.copy()
        # X is the scaled (and, if requested, regressed) matrix. It equals
        # layers['log1p_norm_scaled'] only when regressout is true.
        return sdata1

    # mode == 'customized'
    target_sum_str = '1e4' if target_sum == 1e4 else str(target_sum)
    norm_key = f'norm{target_sum_str}'
    log_key = f'log{str(base)}_norm{target_sum_str}'
    scaled_key = f'log{str(base)}_norm{target_sum_str}_scaled'

    sc.pp.normalize_total(sdata1,target_sum=target_sum)
    sdata1.layers[norm_key] = csr(sdata1.X.copy())
    sc.pp.log1p(sdata1,base = base)
    sdata1.layers[log_key] = csr(sdata1.X.copy())
    _scale_to_dense(sdata1, zero_center)
    sdata1.layers[scaled_key] = sdata1.X.copy()
    if regressout:
        _regress_out_total_counts(sdata1)
        sdata1.layers[scaled_key] = sdata1.X.copy()
    return sdata1  # X is identical to layers[scaled_key]

# Set the base path

In [9]:
# Dataset identifier; it is embedded in the input file name below.
tag = "RIBO_STAR_rep23"

# Project root and the `data` sub-folder that holds the input file.
base_path = Path("/home/unix/wangyanz/codon_usage/star_ribo")
data_pool = base_path / "data"

# Input AnnData file for the dataset identified by `tag`.
star_ribo_data = data_pool / (tag + ".h5ad")

# Process the data from `raw` to `norm`

In [10]:
# Where the processed AnnData is written (same folder as the input).
processed_h5ad = data_pool / f"processed_{tag}.h5ad"

# Load the input file and register its X matrix as the 'raw' layer, which
# preprocess_fast uses as its starting point.
# NOTE(review): this assumes X in the input file holds raw counts -- confirm.
sdata = sc.read_h5ad(star_ribo_data)
sdata.layers["raw"] = sdata.X.copy()

# 'default' mode: normalize per cell, then scale (no log transform).
sdata = preprocess_fast(sdata, mode="default")
sdata.write_h5ad(processed_h5ad)



In [11]:
# Display the summary of the processed AnnData (dimensions, obs/var columns, layers).
sdata

AnnData object with n_obs × n_vars = 223089 × 5413
    obs: 'sample', 'cell_type', 'x', 'y'
    var: 'highly_variable', 'mean', 'std'
    obsm: 'X_pca_harmony', 'X_umap'
    layers: 'log1p_norm_scaled', 'raw', 'norm'

In [22]:
# Gene names are the index of sdata.var; save them one per line
# (no trailing newline) next to the data files.
gene_name = sdata.var.index
gene_name_file = data_pool / f"gene_name_{tag}.txt"
gene_name_file.write_text("\n".join(gene_name.tolist()))

# Group data by cell type

In [24]:
# Mean of the "norm" layer per cell type (one row per cell type, one column per gene).
#
# Iterating over sdata.obs.groupby('cell_type') yields (cell_type, obs_rows) pairs;
# obs_rows.index holds the cell names of that group, so sdata[obs_rows.index] is the
# sub-AnnData for the group and .layers["norm"] gives access to its data.
#
# observed=True: only cell types that actually contain cells produce a row, so an
# unused category can never yield an empty group (and hence a NaN row).
cell_type_means = {}
for cell_type, obs_rows in sdata.obs.groupby('cell_type', observed=True):
    group_norm = sdata[obs_rows.index].layers["norm"]
    # mean(axis=0) on a sparse matrix returns a 1 x n_vars np.matrix; flatten it to 1-D.
    cell_type_means[cell_type] = np.asarray(group_norm.mean(axis=0)).ravel()

# Series of per-cell-type mean vectors, indexed by cell type.
grouped_adata = pd.Series(
    list(cell_type_means.values()),
    index=pd.Index(list(cell_type_means.keys()), name='cell_type'),
    dtype=object,
)

# Stack into a (n_cell_types, n_vars) array; the reshape also covers the zero-group case.
value_list = np.array(grouped_adata.tolist()).reshape((len(grouped_adata.index), sdata.n_vars))

df = pd.DataFrame(value_list, index=grouped_adata.index, columns=gene_name)
df.to_csv(data_pool.joinpath(f"cell_type_norm_mean_{tag}.csv"))
df

Unnamed: 0_level_0,AA467197,AI467606,AI593442,AU018091,AU021092,AW551984,Aacs,Aadac,Aadat,Aard,...,Zic1,Zim1,Zmat4,Zmynd15,Znrf1,Zp2,Zrsr2,Zswim2,Zwint,Zyx
cell_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AC,0.004974,0.021652,0.003429,0.006541,0.008714,0.033846,0.008141,0.015048,0.002065,0.022009,...,0.007314,0.001613,0.076049,0.018378,0.01949,0.002751,0.071045,0.006576,0.010589,0.002898
CHOR_EPEN,0.002372,0.021247,0.010584,0.003644,0.007064,0.031658,0.006634,0.017251,0.002257,0.013433,...,0.00827,0.003837,0.067792,0.029904,0.005049,0.001222,0.090173,0.005516,0.00619,0.00258
CHO_PEP,0.005829,0.022585,0.005885,0.008008,0.008448,0.176376,0.006344,0.018906,0.005077,0.016327,...,0.017398,0.004436,0.107026,0.025355,0.017371,0.002426,0.142867,0.005992,0.023486,0.004667
DE_MEN,0.006315,0.019926,0.010486,0.007549,0.00715,0.026438,0.009572,0.017506,0.004084,0.019389,...,0.025175,0.004109,0.28161,0.023393,0.023067,0.002525,0.111507,0.003753,0.015768,0.003734
INH,0.006134,0.020109,0.012345,0.006809,0.008486,0.025976,0.006466,0.015839,0.004475,0.017694,...,0.009985,0.003806,0.144495,0.021782,0.020431,0.00211,0.108107,0.005087,0.024374,0.004397
MLG,0.005648,0.026817,0.003215,0.005662,0.01097,0.033661,0.00737,0.016552,0.002198,0.023243,...,0.005431,0.001468,0.089068,0.018023,0.019509,0.003545,0.076461,0.008158,0.011541,0.004171
OLG,0.006343,0.025956,0.003031,0.009427,0.010651,0.039371,0.008963,0.018761,0.004067,0.026039,...,0.00604,0.002046,0.078917,0.021544,0.011402,0.003645,0.086244,0.006709,0.008945,0.003009
OPC,0.003806,0.0112,0.001252,0.011281,0.007511,0.061534,0.004423,0.012915,0.001982,0.031582,...,0.008852,0.0,0.072825,0.016684,0.021168,0.001702,0.078769,0.005852,0.015454,0.007176
Other,0.010182,0.020232,0.004836,0.005111,0.011736,0.025239,0.008993,0.016096,0.00299,0.018901,...,0.008364,0.003342,0.108979,0.015421,0.027872,0.004785,0.062074,0.002926,0.019149,0.003459
PVM,0.001773,0.03094,0.008894,0.005782,0.015852,0.051141,0.010424,0.019184,0.002745,0.024517,...,0.010912,0.0063,0.121653,0.023797,0.01994,0.004018,0.111415,0.009218,0.010515,0.003648
