In [1]:
# Optional one-time environment setup, intentionally left commented out so that
# "Run All" does not reinstall packages. If you enable it, prefer the `%pip`
# magic over `!pip` so the packages land in the running kernel's environment.
#! pip install scanpy anndata h5py scvi-tools umap-learn


In [2]:
import anndata as ad
import pandas as pd
import scanpy as sc
import scvi


  from .autonotebook import tqdm as notebook_tqdm
  doc = func(self, args[0].__doc__, *args[1:], **kwargs)
  doc = func(self, args[0].__doc__, *args[1:], **kwargs)


In [3]:

# Read the processed, labelled scRNA-seq object from the working directory and
# print its summary (dimensions plus the obs / uns / layers entries).
h5ad_path = "scRNA-seqProcessedLabelledObject.h5ad"
adata = ad.read_h5ad(h5ad_path)
print(adata)


AnnData object with n_obs × n_vars = 538266 × 22091
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'percent.HB', 'percent.RPS', 'S.Score', 'G2M.Score', 'Phase', 'umis_TCR', 'umis_BCR', 'primary_type', 'secondary_type', 'sequential_group', 'eight_group', 'sampleName', 'age'
    uns: 'version'
    layers: 'RNA_counts', 'RNA_data'


In [4]:
# Drop the "RNA_data" layer so that only "RNA_counts" is carried forward.
# The membership check keeps this cell safe to re-run: a bare `del` raises
# KeyError on the second execution because the layer is already gone.
if "RNA_data" in adata.layers:
    del adata.layers["RNA_data"]


In [5]:
# Peek at the first five rows of the gene-level annotation table.
adata.var.head(n=5)

AL627309.1
AL669831.5
LINC00115
AL645608.3
NOC2L


In [6]:
# Peek at the first five rows of the per-cell annotation table.
adata.obs.head(n=5)

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,percent.HB,percent.RPS,S.Score,G2M.Score,Phase,umis_TCR,umis_BCR,primary_type,secondary_type,sequential_group,eight_group,sampleName,age
AAACCTGAGGCGTACA-1-sample42,seurat_object,7685.0,1889.0,2.446324,0.0,47.690306,0.008262,0.003594,S,26,-2147483648,CD4T,CD4_TEM_ANXA1,60y,50-60y,sample42,60
AAACCTGAGGGAGTAA-1-sample42,seurat_object,6206.0,1911.0,3.04544,0.0,36.545279,0.008496,-0.04266,S,11,-2147483648,CD4T,CD4_TEM_ANXA1,60y,50-60y,sample42,60
AAACCTGCAGCGATCC-1-sample42,seurat_object,4726.0,1764.0,4.485823,0.0,26.195514,0.089389,0.009794,S,-2147483648,-2147483648,CD8T,CD8_TEM_GNLY,60y,50-60y,sample42,60
AAACCTGCATATGGTC-1-sample42,seurat_object,7870.0,2112.0,3.481576,0.0,42.757306,-0.002148,0.009421,G2M,-2147483648,35,B,B_Memory,60y,50-60y,sample42,60
AAACCTGCATTGAGCT-1-sample42,seurat_object,9081.0,2095.0,5.241713,0.0,46.558749,0.000482,0.006435,G2M,-2147483648,14,B,B_Memory,60y,50-60y,sample42,60


# Описание колонок в аннотированных данных (AnnData.obs)

Данный датасет представляет собой результаты одноклеточного РНК-секвенирования (scRNA-seq), содержащие аннотации клеток. Ниже приведены описания колонок:

- **`orig.ident`** – Идентификатор исходного образца или объекта, откуда была получена клетка (например, `seurat_object`).
- **`nCount_RNA`** – Общее количество молекул РНК (UMI), детектированных в клетке. Отражает размер библиотеки (глубину захвата/секвенирования), а не экспрессию отдельного гена.
- **`nFeature_RNA`** – Количество уникальных генов, детектированных в клетке. Отражает транскрипционное разнообразие.
- **`percent.mt`** – Процент UMI клетки, приходящихся на митохондриальные гены. Высокое значение может указывать на стресс или гибель клетки.
- **`percent.HB`** – Процент UMI клетки, приходящихся на гены гемоглобина. Полезно для выявления примеси эритроцитов.
- **`percent.RPS`** – Процент UMI клетки, приходящихся на гены рибосомных белков (RPS). Высокие значения могут указывать на активный биосинтез белков.
- **`S.Score`** – Оценка активности фазы S клеточного цикла (синтез ДНК). Более высокое (положительное) значение указывает на то, что клетка, вероятно, находится в S-фазе, то есть активно реплицирует ДНК.
- **`G2M.Score`** – Оценка активности фазы G2/M клеточного цикла (подготовка к митозу).
- **`Phase`** – Определённая фаза клеточного цикла: $G1$, $S$ или $G2M$.
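- **`umis_TCR`** – Количество UMI, относящихся к генам T-клеточного рецептора (TCR). Используется для идентификации Т-клеток.
- **`umis_BCR`** – Количество UMI, относящихся к генам B-клеточного рецептора (BCR). Позволяет выделить B-клетки.
  - **Примечание:** значение `-2147483648`, встречающееся в `umis_TCR` и `umis_BCR`, — это минимальное 32-битное целое; скорее всего, так закодировано пропущенное значение (`NA_integer_` в R), оставшееся после экспорта из Seurat, а не реальное число UMI. Перед использованием этих колонок такие значения следует заменить на пропуски.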
- **`primary_type`** – Основной тип клетки (например, CD4+ T-клетки, CD8+ T-клетки, B-клетки).
- **`secondary_type`** – Более детальная аннотация клеточного типа, например, `CD4_TEM_ANXA1` (эффекторные CD4+ Т-клетки памяти, экспрессирующие ANXA1) или `B_Memory` (B-клетки памяти).
- **`sequential_group`** – Категориальный признак возрастной группы, например, `60y`.
- **`eight_group`** – Альтернативная возрастная группировка, например, `50-60y`.
- **`sampleName`** – Идентификатор образца (например, `sample42`), указывающий, откуда была получена клетка.
- **`age`** – Возраст донора в годах (например, `60`).

Данные представлены в формате AnnData и могут использоваться для анализа клеточных популяций, идентификации клеточных состояний и сравнения между разными возрастными группами.


# Column Descriptions in Annotated Data (AnnData.obs)

This dataset represents the results of single-cell RNA sequencing (scRNA-seq) with cell annotations. Below is a description of the columns:

- **`orig.ident`** – Identifier of the original sample or dataset from which the cell was obtained (e.g., `seurat_object`).
- **`nCount_RNA`** – Total number of RNA molecules (UMIs) detected in the cell. Reflects library size (capture/sequencing depth) rather than the expression of any single gene.
- **`nFeature_RNA`** – Number of unique genes detected in each cell. Reflects transcriptional diversity.
- **`percent.mt`** – Percentage of the cell's UMIs that map to mitochondrial genes. High values may indicate stressed or dying cells.
- **`percent.HB`** – Percentage of the cell's UMIs that map to hemoglobin genes. Useful for flagging red-blood-cell contamination.
- **`percent.RPS`** – Percentage of the cell's UMIs that map to ribosomal protein (RPS) genes. High values may indicate increased protein biosynthesis.
- **`S.Score`** – Score for the cell cycle **S phase** (DNA synthesis). Higher (positive) values suggest the cell is in S phase, i.e. actively replicating DNA.
- **`G2M.Score`** – Score representing the **G2/M phase** of the cell cycle (preparation for mitosis).
- **`Phase`** – Assigned cell cycle phase: $G1$, $S$, or $G2M$.
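- **`umis_TCR`** – Number of UMIs mapped to T-cell receptor (TCR) genes. Used to identify T-cell populations.
- **`umis_BCR`** – Number of UMIs mapped to B-cell receptor (BCR) genes. Used to identify B-cell populations.
  - **Note:** the value `-2147483648` that appears in `umis_TCR` and `umis_BCR` is the minimum 32-bit integer; it most likely encodes a missing value (R's `NA_integer_`) carried over from the Seurat export rather than a real UMI count. Replace it with a proper missing value before using these columns.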
- **`primary_type`** – Broad classification of the cell type (e.g., CD4+ T cells, CD8+ T cells, B cells).
- **`secondary_type`** – More detailed subtype of the cell, such as `CD4_TEM_ANXA1` (effector memory CD4+ T cells expressing ANXA1) or `B_Memory` (memory B cells).
- **`sequential_group`** – Categorical age group label, such as `60y`.
- **`eight_group`** – Alternative age grouping format, such as `50-60y`.
- **`sampleName`** – Sample or dataset identifier (e.g., `sample42`), indicating the source of the cell.
- **`age`** – Age of the donor in years (e.g., `60`).

The data is stored in the AnnData format and can be used for analyzing cellular populations, identifying cell states, and comparing different age groups.


In [7]:
# Build a small dense preview of the raw count matrix: the first 10 cells
# across all genes, labelled with cell barcodes (rows) and gene names (columns).
raw_counts = adata.layers["RNA_counts"]

# Only the 10-row slice is densified via .toarray(); the full layer is left as is.
first_rows_dense = raw_counts[:10, :].toarray()
preview_barcodes = adata.obs_names[:10]

df_counts = pd.DataFrame(
    data=first_rows_dense,
    index=preview_barcodes,
    columns=adata.var_names,
)

df_counts.head()


Unnamed: 0,AL627309.1,AL669831.5,LINC00115,AL645608.3,NOC2L,KLHL17,PLEKHN1,HES4,ISG15,AL645608.2,...,AC084398.2,OR4E1,PRR35,AC018553.2,AC005180.1,AC025048.1,AC015802.1,AP005137.2,AL121917.2,AC016590.3
AAACCTGAGGCGTACA-1-sample42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGAGGGAGTAA-1-sample42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGCAGCGATCC-1-sample42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGCATATGGTC-1-sample42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGCATTGAGCT-1-sample42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Display the gene-level annotation table; the notebook rendering elides the
# middle rows, so only the first and last entries are shown.
adata.var

AL627309.1
AL669831.5
LINC00115
AL645608.3
NOC2L
...
AC025048.1
AC015802.1
AP005137.2
AL121917.2
AC016590.3


In [9]:
# Display the per-cell annotation table; the notebook rendering elides the
# middle rows, so only the first and last cells are shown.
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,percent.HB,percent.RPS,S.Score,G2M.Score,Phase,umis_TCR,umis_BCR,primary_type,secondary_type,sequential_group,eight_group,sampleName,age
AAACCTGAGGCGTACA-1-sample42,seurat_object,7685.0,1889.0,2.446324,0.000000,47.690306,0.008262,0.003594,S,26,-2147483648,CD4T,CD4_TEM_ANXA1,60y,50-60y,sample42,60
AAACCTGAGGGAGTAA-1-sample42,seurat_object,6206.0,1911.0,3.045440,0.000000,36.545279,0.008496,-0.042660,S,11,-2147483648,CD4T,CD4_TEM_ANXA1,60y,50-60y,sample42,60
AAACCTGCAGCGATCC-1-sample42,seurat_object,4726.0,1764.0,4.485823,0.000000,26.195514,0.089389,0.009794,S,-2147483648,-2147483648,CD8T,CD8_TEM_GNLY,60y,50-60y,sample42,60
AAACCTGCATATGGTC-1-sample42,seurat_object,7870.0,2112.0,3.481576,0.000000,42.757306,-0.002148,0.009421,G2M,-2147483648,35,B,B_Memory,60y,50-60y,sample42,60
AAACCTGCATTGAGCT-1-sample42,seurat_object,9081.0,2095.0,5.241713,0.000000,46.558749,0.000482,0.006435,G2M,-2147483648,14,B,B_Memory,60y,50-60y,sample42,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TGAGAGGAGTGTTGAA-1-sample52,seurat_object,5295.0,1577.0,2.134089,0.000000,42.662890,0.054058,0.006361,S,18,-2147483648,CD4T,CD4_Naive_CCR7,80y,70-80y,sample52,80
TGATTTCCAACTAGAC-1-sample52,seurat_object,3292.0,1499.0,3.948967,0.000000,24.119077,-0.004835,0.076335,G2M,3,-2147483648,CD8T,CD8_TEM_GNLY,80y,70-80y,sample52,80
TGGCCAGGTGTGGTTT-1-sample52,seurat_object,4137.0,1723.0,4.423495,0.000000,27.121102,-0.033304,-0.035709,G1,8,-2147483648,CD4T,CD4_TEM_ANXA1,80y,70-80y,sample52,80
TGTGTTTTCGGAAACG-1-sample52,seurat_object,5642.0,1737.0,6.097129,0.017724,37.238568,0.021595,-0.015182,S,18,-2147483648,CD4T,CD4_TEM_ANXA1,80y,70-80y,sample52,80


In [10]:
# Report the matrix size, drop genes detected in fewer than MIN_CELLS_PER_GENE
# cells (applied in place to `adata`), then report the size again.
MIN_CELLS_PER_GENE = 3

print(f"✅ Загружено: {adata.shape[0]} клеток и {adata.shape[1]} генов")
sc.pp.filter_genes(adata, min_cells=MIN_CELLS_PER_GENE, inplace=True)
print(f"✅ После фильтрации: {adata.shape[0]} клеток и {adata.shape[1]} генов")

✅ Загружено: 538266 клеток и 22091 генов
✅ После фильтрации: 538266 клеток и 21269 генов


In [11]:
# Restrict the object to its N_TOP_GENES most variable genes.
N_TOP_GENES = 2000

print(f"✅ Загружено: {adata.shape[0]} клеток и {adata.shape[1]} генов")

# Annotates adata.var with 'highly_variable' (plus means / dispersions).
# NOTE(review): with the default flavor, scanpy documents that this function
# expects log-normalised values in adata.X — confirm that X (and not only the
# "RNA_counts" layer) holds such data for this object.
sc.pp.highly_variable_genes(adata, n_top_genes=N_TOP_GENES)

# Indexing with the boolean mask yields a *view* onto the full object.
# .copy() materialises it as an independent AnnData, so the original
# 21k-gene object can be released and later steps that modify or register
# `adata` do not operate on (or implicitly copy) a view.
adata = adata[:, adata.var["highly_variable"]].copy()

print(f"✅ После фильтрации: {adata.shape[0]} клеток и {adata.shape[1]} генов")

✅ Загружено: 538266 клеток и 21269 генов
✅ После фильтрации: 538266 клеток и 2000 генов


In [16]:
# Persist the gene-filtered object to the working directory in .h5ad format.
filtered_h5ad_path = "filtered_scRNA-seq_dataLite.h5ad"
adata.write_h5ad(filtered_h5ad_path)

In [17]:
# Show the summary of the object after gene filtering and HVG selection
# (dimensions plus the obs / var / uns / layers entries it now carries).
adata

View of AnnData object with n_obs × n_vars = 538266 × 2000
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'percent.HB', 'percent.RPS', 'S.Score', 'G2M.Score', 'Phase', 'umis_TCR', 'umis_BCR', 'primary_type', 'secondary_type', 'sequential_group', 'eight_group', 'sampleName', 'age'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'version', 'hvg'
    layers: 'RNA_counts'