In [None]:
# Attach Google Drive to the Colab filesystem; the data paths used further
# down in this notebook live under this mount point.
from google.colab import drive
drive.mount("/content/drive", force_remount=False)

Mounted at /content/drive


In [None]:
! pip install scanpy python-igraph leidenalg
!pip install --upgrade ipywidgets
!pip install --upgrade session_info
!pip install scanpy scipy numpy umap-learn leidenalg
!pip install matplotlib scikit-learn
!pip install pybiomart

Collecting scanpy
  Downloading scanpy-1.11.1-py3-none-any.whl.metadata (9.9 kB)
Collecting python-igraph
  Downloading python_igraph-0.11.8-py3-none-any.whl.metadata (2.8 kB)
Collecting leidenalg
  Downloading leidenalg-0.10.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting anndata>=0.8 (from scanpy)
  Downloading anndata-0.11.4-py3-none-any.whl.metadata (9.3 kB)
Collecting legacy-api-wrap>=1.4 (from scanpy)
  Downloading legacy_api_wrap-1.4.1-py3-none-any.whl.metadata (2.1 kB)
Collecting scikit-learn<1.6.0,>=1.1 (from scanpy)
  Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting session-info2 (from scanpy)
  Downloading session_info2-0.1.2-py3-none-any.whl.metadata (2.5 kB)
Collecting igraph==0.11.8 (from python-igraph)
  Downloading igraph-0.11.8-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting texttable>=1.6.2 (from igraph==0.11.8->python-igrap

In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import scipy.sparse as sp
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Probe for the optional pybiomart dependency (nothing is installed here).
# BIOMART_AVAILABLE records whether `Server` could be imported.
try:
    from pybiomart import Server
except ImportError:
    BIOMART_AVAILABLE = False
else:
    BIOMART_AVAILABLE = True

In [None]:
import os
import scanpy as sc
import pandas as pd
from scipy.io import mmread

# 1. Root directory of the raw GSE263152 files on the mounted Drive.
base_dir = "/content/drive/MyDrive/JHU/CSC/GSE263152_RAW"

# 2. Load the gene table. The file is read without a header row and is
#    assumed to have two columns: gene ID first, gene name second.
genes_path = os.path.join(base_dir, "GSE263152_genes.tsv.gz")
genes_df = pd.read_csv(genes_path, sep="\t", header=None)
gene_ids, gene_names = genes_df.iloc[:, 0].values, genes_df.iloc[:, 1].values

# 3. Sample manifest: (file prefix, time point, assay).
samples = [
    ("GSM8186670_D0-CON", "D0", "scRNA-seq"),
    ("GSM8186671_D2-CON", "D2", "scRNA-seq"),
    ("GSM8186672_D4-CON", "D4", "scRNA-seq"),
    ("GSM8186673_D6-CON", "D6", "scRNA-seq"),
    ("GSM8186674_D8-CON", "D8", "scRNA-seq"),
    ("GSM8186675_D10-CON", "D10", "scRNA-seq"),
    ("GSM8186676_D0-DYN", "D0", "DynaSCOPE"),
    ("GSM8186677_D2-DYN", "D2", "DynaSCOPE"),
    ("GSM8186678_D4-DYN", "D4", "DynaSCOPE"),
    ("GSM8186679_D6-DYN", "D6", "DynaSCOPE"),
    ("GSM8186680_D8-DYN", "D8", "DynaSCOPE"),
    ("GSM8186681_D10-DYN", "D10", "DynaSCOPE"),
]

adatas = []
# 4. Load the conventional (scRNA-seq) and dynamic (DynaSCOPE) transcriptomes,
#    one AnnData per sample, all sharing the gene table loaded above.
for prefix, timepoint, assay in samples:
    mtx_path = os.path.join(base_dir, prefix + "_matrix.mtx.gz")
    bc_path = os.path.join(base_dir, prefix + "_barcodes.tsv.gz")

    # Transpose to cells x genes first and convert to CSR afterwards:
    # converting to CSR before transposing would leave a CSC matrix.
    # (The on-disk matrix is taken to be genes x cells, as the transpose implies.)
    X = mmread(mtx_path).T.tocsr()
    # One barcode per line, no header row; column 0 holds the barcodes.
    barcodes = pd.read_csv(bc_path, header=None)[0].values

    # Build the AnnData: barcodes index the cells, gene names index the variables.
    adata = sc.AnnData(X=X, obs=pd.DataFrame(index=barcodes))
    adata.var_names = gene_names.copy()
    adata.var["gene_id"] = gene_ids.copy()
    # Gene names may repeat; de-duplicate them so var_names are unique.
    adata.var_names_make_unique()
    adata.obs["timepoint"] = timepoint
    adata.obs["assay"] = assay
    adatas.append(adata)

# 5. Load the glycosylation-tag data: (file prefix, time point).
glyco_samples = [
    ("GSM8186682_D0C", "D0"),
    ("GSM8186683_D2C", "D2"),
    ("GSM8186684_D4C", "D4"),
    ("GSM8186685_D6C", "D6"),
    ("GSM8186686_D8C", "D8"),
    ("GSM8186687_D10C", "D10"),
]
# Each sample becomes a one-feature AnnData ("glyco_tag") appended to `adatas`.
# The matching `<prefix>_tsne_tag.tsv.gz` t-SNE files are not loaded here.
for prefix, timepoint in glyco_samples:
    umi_path = os.path.join(base_dir, prefix + "_umi_tag.tsv.gz")

    # NOTE(review): read_csv treats the first row as a header here; if the
    # file has no header row, the first cell is silently dropped -- confirm.
    df = pd.read_csv(umi_path, sep="\t")
    if df.shape[1] != 2:
        raise ValueError(
            f"{umi_path}: expected 2 columns (barcode, UMI count), found {df.shape[1]}"
        )
    # Columns are assumed to be (cell barcode, UMI count), in that order.
    df.columns = ["CellBarcode", "UMI_count"]
    cell_ids = df["CellBarcode"].values
    # Reshape the counts to a (n_cells, 1) column so they form a one-variable X.
    counts = df["UMI_count"].astype(int).values[:, None]

    # AnnData carrying only the glycosylation feature.
    adata = sc.AnnData(X=counts, obs=pd.DataFrame(index=cell_ids))
    adata.var_names = ["glyco_tag"]
    adata.obs["timepoint"] = timepoint
    adata.obs["assay"] = "Glycosylation-seq"

    adatas.append(adata)

# 6. Stack every AnnData along the cell axis, keeping the union of variables.
#    Each batch key is "<timepoint>_<assay>", read from the first cell of each
#    object (so every object in `adatas` must contain at least one cell).
batch_keys = [
    f"{a.obs['timepoint'].iat[0]}_{a.obs['assay'].iat[0]}" for a in adatas
]
combined_adata = sc.concat(
    adatas,
    join="outer",      # union of variables across objects
    label="batch",     # obs column recording which object each cell came from
    keys=batch_keys,
    # The same barcodes recur across samples; appending "-<batch key>" keeps
    # obs_names unique. The raw barcode is the obs name with that suffix removed.
    index_unique="-",
    fill_value=0,      # value used for variables an object does not have
)
assert combined_adata.obs_names.is_unique, "obs_names must be unique after concatenation"

# 7. Inspect the result.
print(combined_adata)


AnnData object with n_obs × n_vars = 275190 × 37494
    obs: 'timepoint', 'assay', 'batch'


  utils.warn_names_duplicates("obs")


In [None]:
# Persist the merged object twice: once on the mounted Drive and once in the
# current working directory.
for h5ad_path in (
    "/content/drive/MyDrive/JHU/CSC/CSCB/num3_adata.h5ad",
    "num3_adata.h5ad",
):
    combined_adata.write(h5ad_path)