# Set-up

In [None]:
import os
import sys
import scanpy as sc
import muon as mu

sys.path.append("/cellar/users/aklie/opt/gene_program_evaluation/src/inference/program_models")
from factor_analysis import run_factor_analysis

In [None]:
# Input file to load; the loading cell accepts a .h5ad or .h5mu path
path_data = "/cellar/users/aklie/opt/gene_program_evaluation/dashapp/example_data/iPSC_EC/cNMF_60_0.2_gene_names.h5mu"
# Directory holding the per-K config files (K{n}.gin) written and read further down
path_configs = "/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/bin/data_analysis/factor_analysis/configs"
# Directory the preprocessed MuData (fa.h5mu) is written to
path_out = "/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/analysis/condition/factor_analysis"

# Load data

In [None]:
# Pick the reader that matches the file extension of path_data
if path_data.endswith(".h5ad"):
    data = sc.read_h5ad(path_data)
elif path_data.endswith(".h5mu"):
    data = mu.read_h5mu(path_data)
else:
    # Fail here with a clear message rather than with a NameError on `data` later
    raise ValueError(
        f"Unsupported file extension for path_data: {path_data!r} "
        "(expected .h5ad or .h5mu)"
    )
data

In [None]:
# Remove the "cNMF" modality from the loaded object (assumes `data` is a MuData with a `.mod` mapping)
del data.mod["cNMF"]

In [None]:
# Save the current `data` object to a hard-coded path in the dashapp example_data folder
data.write("/cellar/users/aklie/opt/gene_program_evaluation/dashapp/example_data/iPSC_EC/iPSC_EC.h5mu")

# Preprocess

In [None]:
# Grab RNA as an AnnData.
# A MuData container has no `.X` of its own, so when `data` is a MuData take
# its "rna" modality; a plain AnnData is simply copied.
if isinstance(data, mu.MuData):
    rna = data.mod["rna"].copy()
else:
    rna = data.copy()
# Keep a copy of the current X under the "counts" layer before X is transformed
rna.layers["counts"] = rna.X.copy()
rna

In [None]:
# Remove genes expressed in fewer than 50 cells (the return value is unused; `rna` itself is filtered)
sc.pp.filter_genes(rna, min_cells=50)
rna

In [None]:
# Drop genes whose names start with "LINC" or with two capital letters followed by six digits
gene_names = rna.var.index.str
is_linc = gene_names.startswith("LINC")
is_accession_like = gene_names.match(r'^[A-Z]{2}\d{6}')
# A gene is kept only when neither pattern applies
rna = rna[:, ~(is_linc | is_accession_like)].copy()
rna

In [None]:
# Normalize the data: scale to a target sum of 1e4, then apply log1p (both calls update `rna` itself)
sc.pp.normalize_total(rna, target_sum=1e4)
sc.pp.log1p(rna)

In [None]:
# Set layers: keep the normalized, log-transformed matrix under "log1p_norm" ...
rna.layers["log1p_norm"] = rna.X.copy()
# ... and put the matrix saved under "counts" back into X
rna.X = rna.layers["counts"].copy()

In [None]:
# Store the preprocessed RNA in a MuData container.
# The test is on `data` (the loaded container), not on `rna`: the question is
# whether an existing MuData should be updated or a new one built around `rna`.
if isinstance(data, mu.MuData):
    # Put back into rna, keeping every other modality in `data`
    data.mod["rna"] = rna
    # Re-sync the container-level obs/var with the replaced modality
    data.update()
else:
    # Make mudata
    data = mu.MuData({"rna": rna})

In [None]:
# Write the assembled MuData to fa.h5mu inside path_out
path_fa = os.path.join(path_out, "fa.h5mu")
data.write(path_fa)

# Make configs
```yaml
run_factor_analysis_:
    n_components = 5
    random_state = 1234
```

In [None]:
# Write one config file per number of components. Each file looks like:
#
#   run_factor_analysis_:
#       n_components = 5
#       random_state = 1234
#
# 2 is included because the cell that runs the factor analysis also asks for K2.gin.
os.makedirs(path_configs, exist_ok=True)  # make sure the target directory exists
n_components = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
for n in n_components:
    path_config = os.path.join(path_configs, f"K{n}.gin")
    config = f"run_factor_analysis_:\n\tn_components = {n}\n\trandom_state = 1234"
    with open(path_config, "w") as f:
        f.write(config)
    print(f"Saved config to {path_config}")

In [None]:
# Run the factor analysis for each K, using the K{n}.gin config from path_configs
# and the "log1p_norm" layer of the "rna" modality.
# Presumably each call adds a "factor_analysis_k{n}" entry to `data` (inplace=True);
# confirm against run_factor_analysis.
# NOTE(review): the `break` at the end stops the loop after the first K (2), so no
# other K is run here, while a later cell reads "factor_analysis_k10" — confirm
# whether the break is a debugging leftover.
n_components = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
for n in n_components:
    path_config = os.path.join(path_configs, f"K{n}.gin")
    run_factor_analysis(
        mdata=data,
        prog_key=f"factor_analysis_k{n}",
        data_key="rna",
        layer="log1p_norm",
        config_path=path_config,
        inplace=True
    )
    break

# Quick eval

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl

In [None]:
# Copy the "factor_analysis_k10" modality and attach the RNA modality's UMAP coordinates to it
# NOTE(review): requires data.mod["factor_analysis_k10"] and data.mod["rna"].obsm["X_umap"] to exist — confirm
adata = data.mod["factor_analysis_k10"].copy()
adata.obsm["X_umap"] = data.mod["rna"].obsm["X_umap"].copy()
adata

In [None]:
# Color the UMAP by every variable in `adata` plus the "condition_annotation" key,
# using the sequential "Reds" colormap.
# plt.get_cmap is used because plt.cm.get_cmap was deprecated in Matplotlib 3.7
# and removed in 3.9.
cmap = plt.get_cmap("Reds")
sc.pl.umap(
    adata,
    color=adata.var_names.tolist() + ["condition_annotation"],
    ncols=2,
    cmap=cmap,
    show=False,
    vmax="p99.5",
)

# DONE!

---

In [None]:
# Scratch: load a previously saved K=2 result file (fa_K2.h5mu) for inspection
mload = mu.read_h5mu("/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/analysis/condition/factor_analysis/fa_K2.h5mu")

In [None]:
# Display the loaded object
mload

In [None]:
# NOTE(review): indexes with an empty key — looks like unfinished scratch; confirm the intended modality name
mload[""]