In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import yaml

In [None]:
import matplotlib.pyplot as plt
# Use Arial for all matplotlib text (swap in 'Times New Roman' here if a serif face is wanted).
plt.rcParams.update({'font.family': 'Arial'})

import anndata as ad
import scanpy as sc
# Make sure both output directories exist before anything is written to them.
for output_dir in ("results/figures", "results/data"):
    Path(output_dir).mkdir(parents=True, exist_ok=True)

figure_type = 'svg'

# Point scanpy at the figure directory and apply the shared plotting defaults.
# (For debugging, set sc.settings.verbosity = 3 or call sc.logging.print_versions() here.)
sc.settings.figdir = "results/figures"
sc.settings.set_figure_params(
    dpi=80,
    dpi_save=1000,
    fontsize=12,
    color_map='RdYlGn',
)

import scvelo as scv

In [None]:
import sys
# Add the helper-module directory to the import path.
# The path is relative, so it resolves against the notebook's working directory.
sys.path.append('../../mylibs')

import scAnalysis_util

In [None]:
# Sample registry: sample ID -> velocyto output directory plus the condition label
# that is later used as the per-sample key when the datasets are concatenated.
# NOTE(review): the trailing comments are labels carried over from the original cell and
# differ from the active values — presumably an earlier assignment. Confirm the
# ID -> time-point mapping against the sample sheet.
_condition_by_id = {
    "ZT-410": "no-infection",
    "ZT-485": "9h-infection",    # no-infection-1
    "ZT-486": "12h-infection",   # 1h-infection
    "ZT-487": "1h-infection",    # 3h-infection
    "ZT-488": "3h-infection",    # 6h-infection
    "ZT-490": "6h-infection",    # 9h-infection
    "ZT-491": "7.5h-infection",  # 12h-infection
}

samples = {
    sample_id: {
        "path": Path(f"../../data/{sample_id}-velocyto/").absolute(),
        "name": condition,
    }
    for sample_id, condition in _condition_by_id.items()
}

# Load the filtered GeneFull matrix of every sample and merge them into one AnnData.
adatas = {}
for sample_id, sample_info in samples.items():
    sample_name = sample_info["name"]
    if sample_name in adatas:
        # Names become the concat keys; a repeated name would silently replace a sample.
        raise ValueError(f"Duplicate sample name {sample_name!r} (sample {sample_id})")

    solo_out = sample_info["path"] / "starsolo_outputs/Solo.out/GeneFull/filtered"
    sample_adata = sc.read_h5ad(solo_out / "matrix.stats.velocyto.h5ad")
    sample_adata.X = sample_adata.X.astype('float64')

    # Prefer the gene symbol; fall back to the gene ID when the symbol is missing or blank.
    # Casting to object first keeps the fallback working if the columns were read back as
    # categoricals (a categorical cannot be filled with values outside its categories).
    gene_name = sample_adata.var['gene_name'].astype(object)
    gene_id = sample_adata.var['gene_ids'].astype(object)
    has_symbol = gene_name.notna() & gene_name.astype(str).str.strip().ne("")
    sample_adata.var_names = gene_name.where(has_symbol, gene_id)
    sample_adata.var_names_make_unique()
    adatas[sample_name] = sample_adata

# Outer join keeps the union of genes; the dict keys are written to obs["sample"].
adata = ad.concat(adatas, label="sample", join="outer", merge="first")
# Barcodes can repeat across samples, so de-duplicate the cell index.
adata.obs_names_make_unique()
print(adata.obs["sample"].value_counts())
adata

In [None]:
# scVelo proportions plot for the merged object, run before any filtering.
scv.pl.proportions(adata)

In [None]:
#### Drop genes whose summed count over all cells is zero
# Summing a sparse matrix returns a 2-D matrix, so flatten to one value per gene.
count = np.asarray(adata.X.sum(axis=0)).ravel()
index = np.flatnonzero(count > 0)
adata = adata[:, index].copy()

In [None]:
# Show the 40 genes reported by scanpy's highest-expressed-genes plot.
sc.pl.highest_expr_genes(adata, n_top=40)

In [None]:
# Remove MT-RNR1 and MT-RNR2 genes from adata
genes_to_remove = ['MT-RNR1', 'MT-RNR2']
mask = ~adata.var_names.isin(genes_to_remove)
# Count what is actually dropped: a listed gene may be absent from this dataset.
n_removed = int((~mask).sum())
adata = adata[:, mask].copy()
print(f"Removed {n_removed} genes. New shape: {adata.shape}")

In [None]:
#### Quality Control
# Match on lower-cased symbols so the flags are case-insensitive.
gene_symbols = adata.var_names.str.lower()
# mitochondrial genes
adata.var["mt"] = gene_symbols.str.startswith("mt-")
# ribosomal genes
adata.var["ribo"] = gene_symbols.str.startswith(("rps", "rpl"))
# hemoglobin genes: subunit letter plus an optional paralog number, so that symbols such as
# HBA1, HBA2, HBG1, HBG2 and HBE1 are flagged alongside HBB, HBD and HBZ. The pattern stays
# anchored so unrelated genes that merely start with "HB" (e.g. HBEGF) are not matched.
adata.var["hb"] = gene_symbols.str.contains(r'^hb[abgdez]\d*$', regex=True)

In [None]:
# Compute per-cell and per-gene QC metrics in place, including the share of counts
# in the "mt", "ribo" and "hb" gene groups flagged in adata.var.
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt", "ribo", "hb"], log1p=True, inplace=True)

In [None]:
# Violin plots of the three QC metrics that are thresholded further down.
qc_violin_keys = ["n_genes_by_counts", "total_counts", "pct_counts_mt"]
sc.pl.violin(adata, qc_violin_keys, jitter=0.4, multi_panel=True)

In [None]:
# Coarse filters: drop cells with fewer than 100 detected genes, then genes seen in fewer than 3 cells.
# NOTE(review): the QC metrics in obs/var were computed before this step and are not recomputed afterwards.
sc.pp.filter_cells(adata, min_genes=100)
sc.pp.filter_genes(adata, min_cells=3)
adata

In [None]:
# Checkpoint the object so the threshold filter in the next cell can be re-run from the same state.
adata_raw = adata.copy()

In [None]:
# Restart from the checkpoint so this cell gives the same result on every run.
adata = adata_raw.copy()

# Keep cells that pass every QC threshold (all bounds are strict).
qc_obs = adata.obs
keep_cells = (
    qc_obs["n_genes_by_counts"].gt(0)
    & qc_obs["n_genes_by_counts"].lt(6000)
    & qc_obs["total_counts"].lt(30000)
    & qc_obs["pct_counts_mt"].lt(30)
)
adata = adata[keep_cells, :].copy()

In [None]:
# Post-filter QC scatter plots against total counts, both coloured by mitochondrial fraction.
for y_key in ("pct_counts_mt", "n_genes_by_counts"):
    sc.pl.scatter(adata, x="total_counts", y=y_key, color="pct_counts_mt")

In [None]:
#### Normalization
# After this cell adata.X holds log1p-normalized values; the unnormalized matrix stays in layers["counts"].
adata.layers["counts"] = adata.X.copy()  # Keep the pre-normalization matrix in a layer
sc.pp.normalize_total(adata)  # Normalizing to median total counts
sc.pp.log1p(adata)  # Logarithmize the data

In [None]:
#### Highly Variable Genes
# Select 2000 highly variable genes with the 'cell_ranger' flavor, using "sample" as the batch key.
# This runs on the log-normalized adata.X produced by the normalization cell.
sc.pp.highly_variable_genes(adata, flavor='cell_ranger', n_top_genes=2000, batch_key="sample")
sc.pl.highly_variable_genes(adata)

In [None]:
# Store the current all-gene, log-normalized state in .raw before any downstream processing.
# NOTE(review): the original comment mentioned regress_out/scale, but neither is run in the
# cells visible here — confirm whether they are still planned.
adata.raw = adata

In [None]:
# Display the AnnData summary to check its contents before dimensionality reduction.
adata

In [None]:
# Compute 80 principal components; later cells use only the first 15 (n_pcs=15).
sc.tl.pca(adata, n_comps=80)

In [None]:
# Variance explained per component (log scale) across all 80 PCs.
sc.pl.pca_variance_ratio(adata, n_pcs=80, log=True, show=True)

# 2x2 grid: PC1/PC2 and PC3/PC4, coloured first by sample and then by mitochondrial fraction.
pc_pairs = [(0, 1), (2, 3)]
pca_colors = ["sample"] * 2 + ["pct_counts_mt"] * 2
sc.pl.pca(adata, color=pca_colors, dimensions=pc_pairs * 2, ncols=2, size=2)

In [None]:
# Run two batch-integration methods with "sample" as the batch key.
# NOTE(review): per the scanpy docs each method stores its corrected embedding under its own
# adata.obsm key (defaults: X_pca_harmony / X_scanorama) — a downstream step only uses one of
# them if it is selected explicitly (e.g. via use_rep).
sc.external.pp.harmony_integrate(adata, "sample")
sc.external.pp.scanorama_integrate(adata, "sample")
# Checkpoint after integration. This overwrites the earlier adata_raw checkpoint taken before
# the QC threshold filter, so re-running that earlier cell after this one starts from this state.
adata_raw = adata.copy()

In [None]:
# Restart from the post-integration checkpoint so this cell can be re-run with new parameters.
adata = adata_raw.copy()

# Build the kNN graph on the Harmony-corrected PCs (first 15 dimensions). Without use_rep,
# scanpy falls back to the uncorrected X_pca and the batch integration computed in the
# previous cell would have no effect on the graph, the clustering or the UMAP.
# Switch to use_rep="X_scanorama" to evaluate the Scanorama embedding instead.
sc.pp.neighbors(
    adata,
    use_rep="X_pca_harmony",
    n_pcs=15,
    n_neighbors=20,
    metric="manhattan",
)
sc.tl.leiden(adata, flavor="igraph", key_added="leiden", resolution=0.3)
sc.tl.umap(adata, min_dist=0.3, spread=3.0)
sc.pl.umap(adata, color=["sample", "leiden"])

In [None]:
# Compute the scVelo moments used by the velocity estimation below.
# n_neighbors and n_pcs match the values passed to sc.pp.neighbors above —
# presumably so the existing neighbor graph is reused; TODO confirm.
scv.pp.moments(
    adata,
    n_neighbors=20,
    n_pcs=15,
    mode="connectivities",
    method="umap",
    use_highly_variable=True
)

In [None]:
# Estimate RNA velocities with the stochastic model, restricted to the highly variable genes.
scv.tl.velocity(
    adata,
    mode='stochastic',
    use_highly_variable=True
)

In [None]:
# Build the velocity graph; the embedding plots, transitions and pseudotime below depend on it.
scv.tl.velocity_graph(adata)

In [None]:

# Velocity field on the UMAP as a grid of arrows, with cells coloured by sample.
scv.pl.velocity_embedding_grid(
    adata,
    basis='umap',
    color='sample',
    arrow_length=3.0,
    scale=2.0,
    alpha=0.4,
    dpi=200,
)

In [None]:

# Same grid plot with fainter points and the background grid lines removed.
grid_plot_kwargs = dict(
    basis='umap',
    color='sample',
    arrow_length=3.0,
    scale=2.0,
    alpha=0.3,
    dpi=200,
)
# show=False defers rendering so the returned axes can be adjusted first.
ax = scv.pl.velocity_embedding_grid(adata, show=False, **grid_plot_kwargs)
ax.grid(False)  # remove the grey grid lines
plt.show()      # render the figure manually


In [None]:
# Plot the velocity graph with the threshold argument set to 0.1.
scv.pl.velocity_graph(adata, threshold=.1)

In [None]:
# Trace simulated cell transitions on the UMAP, starting from the cell at index 70.
# The walk is sampled at random, so random_state is fixed to make the plot reproducible
# across kernel restarts.
x, y = scv.utils.get_cell_transitions(adata, basis='umap', starting_cell=70, random_state=0)
# Draw the velocity graph faintly as background, then overlay the visited positions
# coloured in visiting order.
ax = scv.pl.velocity_graph(adata, c='lightgrey', edge_width=.05, show=False)
ax = scv.pl.scatter(adata, x=x, y=y, s=120, c='ascending', cmap='gnuplot', ax=ax)

In [None]:
# Compute velocity pseudotime and plot it on the embedding, coloured by the 'velocity_pseudotime' value.
scv.tl.velocity_pseudotime(adata)
scv.pl.scatter(adata, color='velocity_pseudotime', cmap='gnuplot')