## SCFEA 
This notebook creates the input matrix for running scFEA on the web server at http://scflux.org/.
The input to scFEA is scRNA-seq or general transcriptomics data in which each row is one gene and each column is one sample. TPM (or CPM/FPKM) normalized data is recommended. The scFEA web server accepts comma- (.csv), space- (.txt), or tab- (.txt) delimited input files. Please make sure the input data is in matrix form and contains row/column names. 

In [15]:
import scanpy as sc
import decoupler as dc

In [4]:
# Read the previously saved AnnData object from its .h5ad file.
H5AD_PATH = "adata_solo_annotated_all.h5ad"
adata = sc.read_h5ad(H5AD_PATH)

In [6]:
# Show the AnnData summary: dimensions (n_obs x n_vars) and the available obs/var/uns keys.
adata

AnnData object with n_obs × n_vars = 60459 × 11945
    obs: '_scvi_batch', '_scvi_labels', 'batch', 'batch_id', 'cell_type', 'condition', 'is_doublet', 'is_outlier', 'is_outlier_counts', 'is_outlier_genes', 'is_outlier_mito', 'is_outlier_top_20', 'leiden', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'log1p_total_counts_mt', 'log1p_total_counts_ribo', 'n_counts', 'n_genes', 'n_genes_by_counts', 'origin', 'pct_counts_in_top_20_genes', 'pct_counts_mt', 'pct_counts_ribo', 'sample_id', 'total_counts', 'total_counts_mt', 'total_counts_ribo', 'value', 'outlier', 'mt_outlier', 'ribo_outlier'
    var: 'ensembl_id', 'feature_types', 'gene_name', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'n_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'cell_type_colors', 'hvg', 'is_doublet_colors', 'leiden', 'leiden_colors', 

In [46]:
# Library-size normalization. With target_sum=None scanpy scales every cell to the
# median total count; inplace=False returns the result as a dict and leaves
# adata.X unmodified.
scales_counts = sc.pp.normalize_total(adata, inplace=False, target_sum=None)

# Keep log(1 + x) of the normalized matrix as its own layer.
normalized_counts = scales_counts["X"]
adata.layers["log1p_norm"] = sc.pp.log1p(normalized_counts, copy=True)

In [50]:
# Gene symbols per feature (the var index holds Ensembl IDs).
# NOTE: the output reports 11945 entries but only 11943 categories, so gene symbols
# are not unique; anything keyed on gene_name must cope with duplicate labels.
adata.var.gene_name

ensembl_id
ENSMUSG00000033845            Mrpl15
ENSMUSG00000025903            Lypla1
ENSMUSG00000033813             Tcea1
ENSMUSG00000033793           Atp6v1h
ENSMUSG00000025907            Rb1cc1
                           ...      
ENSMUSG00000063897    CAAA01118383.1
ENSMUSG00000051412             Vamp7
ENSMUSG00000079834             Tmlhe
ENSMUSG00000095742    CAAA01147332.1
ENSMUSG00000095041        AC149090.1
Name: gene_name, Length: 11945, dtype: category
Categories (11943, object): ['0610009B22Rik', '0610009L18Rik', '0610010F05Rik', '0610010K14Rik', ..., 'Zyx', 'Zzef1', 'Zzz3', 'a']

In [51]:
import pandas as pd

# Step 1: Extract the log1p_norm layer (rows = cells, columns = genes).
log1p_norm_matrix = adata.layers["log1p_norm"]

# AnnData layers are often scipy sparse matrices, which the DataFrame constructor
# cannot consume directly. Densify when the object offers .toarray(); a dense
# ndarray passes through unchanged.
if hasattr(log1p_norm_matrix, "toarray"):
    log1p_norm_matrix = log1p_norm_matrix.toarray()

# Step 2: Convert to a pandas DataFrame.
# Rows (index) are cells (adata.obs_names); columns are gene symbols.
# The symbols are stored as a categorical Series indexed by Ensembl ID, so convert
# them to plain strings to get an ordinary string column index. The axis name
# "gene_name" is kept so the exported header is unchanged.
gene_symbols = pd.Index(adata.var["gene_name"].astype(str).to_numpy(), name="gene_name")
log1p_norm_df = pd.DataFrame(
    log1p_norm_matrix,
    index=adata.obs_names,   # Cell barcodes as row indices
    columns=gene_symbols,    # Gene symbols as column headers (may contain duplicates)
)

In [53]:
# Preview the matrix transposed so that rows are genes and columns are cells.
log1p_norm_df.T

Unnamed: 0_level_0,AAACCTGAGTTAAGTG-1_10mix1,AAACCTGCAGGATCGA-1_10mix1,AAACCTGCATAACCTG-1_10mix1,AAACCTGCATACCATG-1_10mix1,AAACCTGGTGTGCCTG-1_10mix1,AAACCTGTCTTAACCT-1_10mix1,AAACGGGCAAGGTTCT-1_10mix1,AAACGGGGTATTACCG-1_10mix1,AAACGGGGTCGAGATG-1_10mix1,AAAGATGAGATCCCAT-1_10mix1,...,TTTGTCAGTTCGCTAA-1_GF_ICI2_plus,TTTGTCATCAATAAGG-1_GF_ICI2_plus,TTTGTCATCACGCATA-1_GF_ICI2_plus,TTTGTCATCATGGTCA-1_GF_ICI2_plus,TTTGTCATCCAGAGGA-1_GF_ICI2_plus,TTTGTCATCCCTTGCA-1_GF_ICI2_plus,TTTGTCATCGTTGCCT-1_GF_ICI2_plus,TTTGTCATCTAACTGG-1_GF_ICI2_plus,TTTGTCATCTCTGTCG-1_GF_ICI2_plus,TTTGTCATCTGTTGAG-1_GF_ICI2_plus
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mrpl15,0.000000,0.0,2.244224,0.000000,0.0,0.000000,1.28739,0.000000,1.737694,0.0,...,0.000000,0.000000,1.757347,0.000000,0.0,0.00000,1.193816,0.000000,2.530077,0.000000
Lypla1,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,1.737694,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,1.097816,0.000000,2.530077,0.000000
Tcea1,0.000000,0.0,2.244224,1.950369,0.0,1.812609,1.28739,0.000000,1.737694,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,1.193816,0.000000,2.530077,0.000000
Atp6v1h,2.581758,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,0.0,...,0.000000,0.000000,1.757347,0.000000,0.0,0.00000,1.097816,0.000000,0.000000,1.693097
Rb1cc1,2.581758,0.0,0.000000,0.000000,0.0,1.812609,0.00000,2.032687,0.000000,0.0,...,2.669523,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,2.197122,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CAAA01118383.1,0.000000,0.0,0.000000,0.000000,0.0,0.000000,1.28739,0.000000,1.737694,0.0,...,0.000000,0.000000,1.757347,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000
Vamp7,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000
Tmlhe,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000
CAAA01147332.1,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.00000,2.032687,1.737694,0.0,...,0.000000,0.000000,1.757347,2.364621,0.0,0.00000,1.281724,0.000000,0.000000,0.000000


In [54]:
# Step 1: Read the scFEA mouse gene list (one gene symbol per line).
# Surrounding whitespace is stripped and blank lines are skipped so that stray
# spaces or a trailing empty line cannot prevent a match.
with open('scFEA.mouse.genes.txt', 'r') as f:
    gene_list = [line.strip() for line in f if line.strip()]

# Step 2: Keep only the genes in the list.
# Select the matching gene columns first and transpose only that subset; this
# avoids transposing the full cells x genes matrix twice. The result has
# rows = genes and columns = cells, as before.
scfea_gene_mask = log1p_norm_df.columns.isin(gene_list)
filtered_log1p_norm_df = log1p_norm_df.loc[:, scfea_gene_mask].T

# Gene symbols are not unique in adata.var.gene_name (more entries than categories),
# while the exported matrix needs unambiguous row names. Keep the first occurrence
# of each symbol and report what was dropped.
# NOTE(review): "keep first" is a pragmatic choice - confirm it suits the analysis.
duplicated_rows = filtered_log1p_norm_df.index.duplicated(keep="first")
if duplicated_rows.any():
    dropped = sorted(set(filtered_log1p_norm_df.index[duplicated_rows].astype(str)))
    print(f"Dropping {int(duplicated_rows.sum())} duplicated gene row(s): {dropped}")
    filtered_log1p_norm_df = filtered_log1p_norm_df.loc[~duplicated_rows]


In [55]:
# Preview the matrix restricted to the scFEA gene list (rows = genes, columns = cells).
filtered_log1p_norm_df

Unnamed: 0_level_0,AAACCTGAGTTAAGTG-1_10mix1,AAACCTGCAGGATCGA-1_10mix1,AAACCTGCATAACCTG-1_10mix1,AAACCTGCATACCATG-1_10mix1,AAACCTGGTGTGCCTG-1_10mix1,AAACCTGTCTTAACCT-1_10mix1,AAACGGGCAAGGTTCT-1_10mix1,AAACGGGGTATTACCG-1_10mix1,AAACGGGGTCGAGATG-1_10mix1,AAAGATGAGATCCCAT-1_10mix1,...,TTTGTCAGTTCGCTAA-1_GF_ICI2_plus,TTTGTCATCAATAAGG-1_GF_ICI2_plus,TTTGTCATCACGCATA-1_GF_ICI2_plus,TTTGTCATCATGGTCA-1_GF_ICI2_plus,TTTGTCATCCAGAGGA-1_GF_ICI2_plus,TTTGTCATCCCTTGCA-1_GF_ICI2_plus,TTTGTCATCGTTGCCT-1_GF_ICI2_plus,TTTGTCATCTAACTGG-1_GF_ICI2_plus,TTTGTCATCTCTGTCG-1_GF_ICI2_plus,TTTGTCATCTGTTGAG-1_GF_ICI2_plus
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Rdh10,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000
B3gat2,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000
Mgat4a,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000
Chst10,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000
Gls,0.0,0.0,2.244224,0.0,0.000000,1.812609,0.00000,0.0,0.000000,0.0,...,2.669523,0.000000,0.0,2.364621,0.00000,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Hsd17b10,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,1.693097
Sms,0.0,0.0,0.000000,0.0,2.027083,0.000000,1.28739,0.0,1.737694,0.0,...,0.000000,1.777059,0.0,0.000000,2.10226,0.0,1.246145,2.197122,0.000000,0.000000
Pdha1,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000,0.0,...,2.669523,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.000000,2.530077,0.000000
Ctps2,0.0,0.0,0.000000,0.0,0.000000,1.812609,0.00000,0.0,1.737694,0.0,...,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000


In [56]:
# Export for scFEA. filtered_log1p_norm_df already has rows = genes and
# columns = cells, which is the orientation scFEA expects (one gene per row, one
# sample per column). Transposing again here would write cells x genes instead.
filtered_log1p_norm_df.to_csv("log1p_norm_matrix.csv")