In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import yaml

In [None]:
import matplotlib.pyplot as plt
# Render all matplotlib text in Arial ('Times New Roman' was the alternative tried).
plt.rcParams['font.family'] = 'Arial'

import anndata as ad
import scanpy as sc

# Create the output folders up front so later writes cannot fail on a missing directory.
for output_dir in ("results/figures", "results/data"):
    Path(output_dir).mkdir(parents=True, exist_ok=True)

# Presumably the file format for saved figures; not referenced in the visible cells.
figure_type = 'svg'

# Send scanpy's saved figures to results/figures and set the shared plot style.
sc.settings.figdir = "results/figures"
sc.settings.set_figure_params(
    dpi=80,
    dpi_save=1000,
    fontsize=12,
    color_map='RdYlGn',
)
# Debugging aids, disabled by default:
#   sc.settings.verbosity = 3
#   sc.logging.print_versions()

In [None]:
import sys
# Add the helper-library folder to the module search path. The entry is a
# relative path, so it is resolved against the current working directory.
sys.path.extend(['../../mylibs'])

# Project-local helper module, presumably located in ../../mylibs — TODO confirm.
# NOTE(review): not referenced by any of the cells visible here.
import scAnalysis_util

In [None]:
# Sample sheet: run ID -> absolute data directory plus the condition label that
# becomes the value of the "sample" column after concatenation.
samples = {
    run_id: {
        "path": Path(f"../../data/{run_id}/").absolute(),
        "name": condition,
    }
    for run_id, condition in (
        ("ZT-410", "no-infection"),
        ("ZT-487", "3h-infection"),
        ("ZT-488", "6h-infection"),
        ("ZT-490", "9h-infection"),
        ("ZT-491", "12h-infection"),
    )
}

# Load one AnnData per sample, keyed by condition label.
adatas = {}
for sample_id, sample_info in samples.items():
    sample_path, sample_name = sample_info["path"], sample_info["name"]
    solo_out = sample_path / "starsolo_outputs/Solo.out/GeneFull/filtered"
    solo_out_raw = sample_path / "starsolo_outputs/Solo.out/GeneFull/raw"  # not read in this cell

    sample_adata = sc.read_h5ad(solo_out / "matrix.stats.h5ad")
    sample_adata.X = sample_adata.X.astype('float64')

    # Use the gene symbol as the variable name; empty or whitespace-only symbols
    # are turned into missing values and then replaced by the gene ID.
    gene_symbols = sample_adata.var['gene_name'].apply(
        lambda symbol: symbol if symbol and str(symbol).strip() else None
    )
    sample_adata.var_names = gene_symbols.fillna(sample_adata.var['gene_ids'])
    sample_adata.var_names_make_unique()

    adatas[sample_name] = sample_adata

# Stack all samples; the dict keys are recorded in obs["sample"].
adata = ad.concat(adatas, label="sample")
# Barcodes can repeat across samples, so de-duplicate the cell names.
adata.obs_names_make_unique()
print(adata.obs["sample"].value_counts())
adata

In [None]:
#### Drop genes whose summed count over all cells is zero
# Column sums of X, flattened to a plain 1-D array (a sparse X returns a 2-D matrix here).
count = np.asarray(adata.X.sum(axis=0)).ravel()
# Column positions of the genes that have at least one count.
index = np.flatnonzero(count > 0)
adata = adata[:, index].copy()

In [None]:
# Plot the 40 most highly expressed genes.
sc.pl.highest_expr_genes(
    adata,
    n_top=40,
)

In [None]:
# Remove the MT-RNR1 and MT-RNR2 genes from adata.
genes_to_remove = ['MT-RNR1', 'MT-RNR2']
# True for every gene that is kept.
mask = ~adata.var_names.isin(genes_to_remove)
# Count what is actually dropped: a requested gene may be absent from the data
# (e.g. already filtered out), so len(genes_to_remove) can over-report.
n_removed = int((~mask).sum())
adata = adata[:, mask].copy()
print(f"Removed {n_removed} of {len(genes_to_remove)} requested genes. New shape: {adata.shape}")

In [None]:
#### Quality Control
# Match on lower-cased gene names so the flags are case-insensitive.
gene_names_lower = adata.var_names.str.lower()

# mitochondrial genes
adata.var["mt"] = gene_names_lower.str.startswith("mt-")
# ribosomal genes
adata.var["ribo"] = gene_names_lower.str.startswith(("rps", "rpl"))
# hemoglobin genes: "hb" + one subunit letter, optionally followed by a paralog
# number. Without the optional digits the anchored pattern only matches
# three-letter symbols (HBB, HBD, HBZ) and misses HBA1, HBA2, HBG1, HBG2, HBE1.
# Anchoring at the end still excludes look-alikes such as HBEGF or HBBP1.
adata.var["hb"] = gene_names_lower.str.contains(r"^hb[abgdez]\d*$", regex=True)

In [None]:
# Compute QC metrics in place, including the share of counts in the
# "mt", "ribo" and "hb" gene sets flagged in adata.var.
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=["mt", "ribo", "hb"],
    log1p=True,
    inplace=True,
)

In [None]:
# One violin panel per QC metric: genes detected, total counts, mitochondrial percentage.
qc_violin_keys = ["n_genes_by_counts", "total_counts", "pct_counts_mt"]
sc.pl.violin(adata, qc_violin_keys, jitter=0.4, multi_panel=True)

In [None]:
# Keep cells with at least 100 detected genes, then genes detected in at least 3 cells.
sc.pp.filter_cells(
    adata,
    min_genes=100,
)
sc.pp.filter_genes(
    adata,
    min_cells=3,
)
adata  # show the filtered object

In [None]:
# Total counts against mitochondrial percentage and against genes detected,
# both coloured by mitochondrial percentage.
for y_metric in ("pct_counts_mt", "n_genes_by_counts"):
    sc.pl.scatter(adata, x="total_counts", y=y_metric, color="pct_counts_mt")

In [None]:
#### Normalization
# Keep a copy of X in the "counts" layer before X is transformed in place.
adata.layers["counts"] = adata.X.copy()
# No target_sum is passed, so each cell is scaled to the median total count.
sc.pp.normalize_total(adata)
# Log-transform the normalized values: X = log(X + 1).
sc.pp.log1p(adata)

In [None]:
#### Highly Variable Genes
# Select 2000 genes with the 'seurat' flavor, treating each sample as a batch.
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=2000,
    flavor='seurat',
    batch_key="sample",
)
sc.pl.highly_variable_genes(adata)

In [None]:
# Snapshot the current expression matrix (all genes) in adata.raw before any
# variable-gene subsetting.
# NOTE(review): X has already been normalized and log1p-transformed by the
# earlier cells, so .raw holds log-normalized values rather than raw counts
# (those are in adata.layers["counts"]). The regress_out / scale steps this
# snapshot was meant for are not in the visible cells — confirm they are still needed.
adata.raw = adata

In [None]:
# Compute the first 50 principal components.
sc.tl.pca(
    adata,
    n_comps=50,
)

In [None]:
# Variance explained by each of the 50 PCs (log scale).
sc.pl.pca_variance_ratio(adata, n_pcs=50, log=True, show=True)

# 2x2 grid: (colour key, PC pair) for each panel, in display order.
pca_panels = [
    ("sample", (0, 1)),
    ("sample", (2, 3)),
    ("pct_counts_mt", (0, 1)),
    ("pct_counts_mt", (2, 3)),
]
sc.pl.pca(
    adata,
    color=[color_key for color_key, _ in pca_panels],
    dimensions=[pc_pair for _, pc_pair in pca_panels],
    ncols=2,
    size=2,
)

In [None]:
# Batch-correct the PCA embedding across samples with Harmony. The basis names
# below are scanpy's defaults, spelled out so it is clear that the corrected
# embedding is stored in adata.obsm["X_pca_harmony"].
sc.external.pp.harmony_integrate(
    adata,
    "sample",
    basis="X_pca",
    adjusted_basis="X_pca_harmony",
)
# Checkpoint of the integrated object; the next cell restarts from this copy.
adata_raw = adata.copy()

In [None]:
# Restart from the post-Harmony checkpoint so this cell can be re-run safely.
adata = adata_raw.copy()
# Build the neighbor graph on the Harmony-corrected embedding. Without use_rep,
# sc.pp.neighbors uses the uncorrected "X_pca" and the integration is ignored.
# n_pcs=20 keeps the first 20 dimensions of the chosen representation.
sc.pp.neighbors(
    adata,
    use_rep="X_pca_harmony",
    n_pcs=20,
    n_neighbors=15,
    metric="euclidean",
)
sc.tl.umap(adata, min_dist=0.5)
sc.pl.umap(adata, color=["sample"])

In [None]:
# Leiden clustering on the neighbor graph, then show the clusters on the UMAP.
sc.tl.leiden(
    adata,
    resolution=0.8,
    flavor="igraph",
)
sc.pl.umap(adata, color=["leiden"])

In [None]:
import h5py
import os
from scipy import sparse

# Output location: ./data under the current working directory.
ROOT_DIR = os.path.abspath(".")
os.makedirs(os.path.join(ROOT_DIR, "data"), exist_ok=True)
RAW_PATH = os.path.join(ROOT_DIR, "data", "raw_data.h5")  # defined but not written in this cell
DATA_PATH = os.path.join(ROOT_DIR, "data", "processed_data.h5")

# The "Louvain" dataset is not a real clustering: each cell gets an integer
# code for its sample (time point), in infection-time order.
sample_to_louvain = {
    'no-infection': 1,
    '3h-infection': 2,
    '6h-infection': 3,
    '9h-infection': 4,
    '12h-infection': 5
}
# Fail early with a readable message: an unmapped sample would otherwise become
# NaN and only surface as an opaque error in the integer cast below.
unmapped_samples = sorted(set(adata.obs['sample'].astype(str)) - set(sample_to_louvain))
if unmapped_samples:
    raise ValueError(f"No Louvain code defined for sample(s): {unmapped_samples}")
fake_louvain = adata.obs['sample'].map(sample_to_louvain)
# To export the real clusters instead: fake_louvain = adata.obs['leiden']

# Coerce X to CSR so data/indices/indptr always exist and always describe a
# cells x genes CSR layout, even if X is dense or stored in another sparse format.
expression_csr = sparse.csr_matrix(adata.X)

# Barcodes and gene names are object arrays of Python str; h5py needs an
# explicit variable-length string dtype to store them.
utf8_str = h5py.string_dtype(encoding="utf-8")

with h5py.File(DATA_PATH, "w") as f:
    # Write expression matrix (CSR components plus row/column labels)
    exp_matrix = f.create_group("expression_matrix")
    exp_matrix.create_dataset("barcodes", data=adata.obs_names.to_numpy(dtype=object), dtype=utf8_str)
    exp_matrix.create_dataset("features", data=adata.var_names.to_numpy(dtype=object), dtype=utf8_str)
    exp_matrix.create_dataset("data", data=expression_csr.data)
    exp_matrix.create_dataset("indices", data=expression_csr.indices)
    exp_matrix.create_dataset("indptr", data=expression_csr.indptr)

    # Write UMAP coordinates and the per-cell sample code
    f.create_dataset("UMAP", data=adata.obsm["X_umap"])
    f.create_dataset("Louvain", data=fake_louvain.to_numpy(dtype=int))