## scFEA
Create the input matrix needed to run scFEA on the web server at http://scflux.org/.
The input of scFEA is scRNA-seq or general transcriptomics data, in which each row is one gene and each column is one sample. TPM (or CPM/FPKM) normalized data is recommended. The scFEA web server accepts comma- (.csv), space- (.txt), or tab- (.txt) delimited input files. Please make sure the input data is in matrix form and contains row/column names.

In [1]:
import decoupler as dc
import pandas as pd
import scanpy as sc

In [2]:
# Read the AnnData object stored in the .h5ad file into memory.
adata = sc.read_h5ad(filename="adata_solo_annotated_all.h5ad")

In [3]:
# Display the AnnData summary (dimensions and the obs/var/uns fields) to check what was loaded.
adata

AnnData object with n_obs × n_vars = 60459 × 11945
    obs: '_scvi_batch', '_scvi_labels', 'batch', 'batch_id', 'cell_type', 'condition', 'is_doublet', 'is_outlier', 'is_outlier_counts', 'is_outlier_genes', 'is_outlier_mito', 'is_outlier_top_20', 'leiden', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'log1p_total_counts_mt', 'log1p_total_counts_ribo', 'n_counts', 'n_genes', 'n_genes_by_counts', 'origin', 'pct_counts_in_top_20_genes', 'pct_counts_mt', 'pct_counts_ribo', 'sample_id', 'total_counts', 'total_counts_mt', 'total_counts_ribo', 'value', 'outlier', 'mt_outlier', 'ribo_outlier'
    var: 'ensembl_id', 'feature_types', 'gene_name', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'n_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'cell_type_colors', 'hvg', 'is_doublet_colors', 'leiden', 'leiden_colors', 

In [None]:
# Per-cell CPM normalisation; scFEA recommends TPM/CPM/FPKM-normalised input.
adata_cpm = adata.copy()  # apply this to a copy so we can compare methods
adata_cpm.raw = adata_cpm  # keep the pre-normalisation values accessible via .raw
# `sc.pp.normalize_per_cell` is deprecated in favour of `sc.pp.normalize_total`;
# target_sum=1e6 scales every cell to one million total counts (CPM).
# Unlike the deprecated call, this neither drops zero-count cells nor rewrites obs["n_counts"].
# NOTE(review): this normalises `.X`; the pseudobulk step reads layers["counts"], so
# confirm that `.X` still holds raw counts here, otherwise the result is not a true CPM.
sc.pp.normalize_total(adata_cpm, target_sum=1e6)

In [5]:
# Collapse single cells into pseudobulk profiles: values from the "counts" layer
# are summed for each sample_id / cell_type combination.
pseudobulk_options = {
    "sample_col": "sample_id",
    "groups_col": "cell_type",
    "layer": "counts",
    "mode": "sum",
    # Filtering thresholds forwarded to decoupler (see its docs for the exact semantics).
    "min_cells": 100,
    "min_counts": 1000,
}
pdata = dc.get_pseudobulk(adata, **pseudobulk_options)

In [6]:
# Normalise the pseudobulk profiles by library size. With inplace=False the
# result is returned as a dict instead of being written back into `pdata`.
scales_counts = sc.pp.normalize_total(pdata, target_sum=None, inplace=False)
normalized_counts = scales_counts["X"]
# Keep the log1p-transformed values in a dedicated layer.
pdata.layers["log1p_norm"] = sc.pp.log1p(normalized_counts, copy=True)

In [11]:
# Step 1: Extract the log1p_norm layer from the pseudobulk object
log1p_norm_matrix = pdata.layers["log1p_norm"]

# Step 2: Convert to a pandas DataFrame.
# Rows are labelled with pdata.obs_names (pseudobulk profiles) and columns with
# the gene names in pdata.var["gene_name"]. The frame is transposed to
# genes x samples further down, before it is filtered and exported.
log1p_norm_df = pd.DataFrame(
    log1p_norm_matrix,
    index=pdata.obs_names,           # pseudobulk profile names as row indices
    columns=pdata.var["gene_name"],  # gene names as column headers
)

In [12]:
# Step 1: Read the scFEA mouse gene list (assumed to hold one gene name per line)
with open('scFEA.mouse.genes.txt', 'r') as gene_file:
    gene_list = gene_file.read().splitlines()

# Step 2: Transpose to genes x samples and keep only the rows whose
# gene name appears in the list
genes_by_samples = log1p_norm_df.T
in_gene_list = genes_by_samples.index.isin(gene_list)
filtered_log1p_norm_df = genes_by_samples.loc[in_gene_list]


In [13]:
# Preview the filtered genes x samples matrix before exporting it.
filtered_log1p_norm_df

Unnamed: 0_level_0,10mix1_COLON_Exhausted,10mix2_COLON_Exhausted,11mix1_COLON_Exhausted,11mix2_COLON_Exhausted,10mix1_COLON_Infg,10mix2_COLON_Infg,11mix1_COLON_Infg,11mix2_COLON_Infg,GF1_COLON_Infg,GF2_COLON_Infg,...,GF_ICI2_SLEC_Progenitor,GF_ICI2_plus_SLEC_Progenitor,10mix_ICI1_SLEC_Terminal,10mix_ICI2_SLEC_Terminal,11mix_ICI1_SLEC_Terminal,11mix_ICI2_SLEC_Terminal,GF_ICI1_SLEC_Terminal,GF_ICI1_plus_SLEC_Terminal,GF_ICI2_SLEC_Terminal,GF_ICI2_plus_SLEC_Terminal
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Dlat,3.998096,3.994420,4.015038,3.762799,4.264687,4.013443,3.873187,4.084481,4.330668,4.036375,...,4.452443,4.882694,4.759019,4.686661,4.674407,4.568219,4.789472,4.703448,4.684748,4.647597
Comt,4.364779,4.399291,4.139816,4.280936,3.938837,4.277446,4.165078,3.817793,4.561538,3.839621,...,4.972683,5.105197,5.085173,5.107409,5.181208,5.072165,5.067990,4.897975,4.943543,4.959501
Dbt,3.692427,3.589845,3.429869,3.547163,3.827842,3.338206,3.123965,3.754719,3.483602,3.724522,...,3.859846,3.488495,3.399422,3.655519,3.485469,3.599457,3.470862,3.577276,3.624781,3.488300
Hk2,3.346372,3.661979,3.665019,3.653081,4.383716,4.248013,4.246089,4.215906,4.475476,4.121913,...,3.980014,3.865758,3.903101,4.017314,3.290555,3.701058,3.505365,3.702725,3.977479,3.820938
Cyp51,4.931952,4.868849,5.255898,5.097463,5.036140,4.753106,5.153751,5.089606,5.017214,5.061007,...,5.027122,5.245385,4.925780,5.081336,5.097423,5.070996,5.073660,5.150333,5.009543,5.080003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cpt1b,1.084212,1.859381,2.302855,1.584875,2.103416,1.983710,2.109662,2.495346,1.598803,1.801120,...,1.105228,1.823463,1.249915,1.405982,1.140862,1.591632,0.856530,1.341702,1.016268,1.133333
Ak6,4.560105,4.727462,4.657283,4.356413,4.656953,4.658658,4.646770,4.313604,4.429498,4.923795,...,4.379231,4.091959,4.308120,4.668933,4.549068,4.658709,4.197059,4.321927,4.262834,4.277251
Uckl1,3.828935,4.043125,4.036946,4.105483,3.973256,4.013443,4.296666,3.615213,4.220989,3.055213,...,4.272449,4.139034,4.308120,4.152883,4.293873,4.137671,4.300234,4.162763,4.204417,4.191854
Hmgcs1,4.231878,4.230433,4.542140,4.485462,4.448877,3.934905,4.646770,4.769176,4.640777,4.121913,...,4.624931,4.560517,4.330899,4.413214,4.371746,4.360596,4.484377,4.446547,4.389260,4.370234


In [14]:
# Write the filtered genes x samples matrix (with row and column names) to CSV for upload to scFEA.
filtered_log1p_norm_df.to_csv(path_or_buf="log1p_norm_matrix.csv")

In [None]:
# Show the exported matrix again (the same frame that was displayed before the CSV export).
filtered_log1p_norm_df