In [5]:
import scanpy as sc

# Load PBMC AnnData object
adata_pbmc = sc.read_h5ad("pbmc_10k_preprocessed_with_obs.h5ad")

# Load TCGA AnnData object
adata_brca = sc.read_h5ad("../data/preprocessed/TCGA-BRCA_final.h5ad")
adata_luad = sc.read_h5ad("../data/preprocessed/TCGA-LUAD_final.h5ad")
adata_lusc = sc.read_h5ad("../data/preprocessed/TCGA-LUSC_final.h5ad")

datasets = {"BRCA": adata_brca, "LUAD": adata_luad, "LUSC": adata_lusc}
# Assume these TCGA datasets are already loaded and harmonized

# Step 1: Extract shared gene set across all TCGA datasets
tcga_gene_set = set.intersection(*[
    set(ds.var["gene_id_clean"]) for ds in datasets.values()
])

# Step 2: Extract gene set from PBMC
pbmc_gene_set = set(adata_pbmc.var["gene_id_clean"])

# Step 3: Compute intersection of TCGA and PBMC gene sets
common_genes = tcga_gene_set.intersection(pbmc_gene_set)
print(f"Number of common genes between PBMC and TCGA: {len(common_genes)}")

# Step 4: Subset each TCGA dataset to only the common genes
for name in datasets:
    adata = datasets[name]
    adata = adata[:, adata.var["gene_id_clean"].isin(common_genes)].copy()
    datasets[name] = adata
    print(f"{name} shape after filtering: {adata.shape}")

# Step 5: Subset PBMC to only the common genes
adata_pbmc = adata_pbmc[:, adata_pbmc.var["gene_id_clean"].isin(common_genes)].copy()
print(f"PBMC shape after filtering: {adata_pbmc.shape}")


Number of common genes between PBMC and TCGA: 19618
BRCA shape after filtering: (644, 19618)
LUAD shape after filtering: (418, 19618)
LUSC shape after filtering: (374, 19618)
PBMC shape after filtering: (2099284, 19618)


In [6]:
import os

# Save filtered TCGA datasets
os.makedirs("../data/preprocessed/filtered_common/", exist_ok=True)

for name, adata in datasets.items():
    adata.write(f"../data/preprocessed/filtered_common/TCGA-{name}_filtered_common_genes.h5ad")
    print(f"Saved TCGA-{name}")

# Save filtered PBMC dataset
adata_pbmc.write("../data/preprocessed/filtered_common/PBMC_10k_filtered_common_genes.h5ad")
print("Saved PBMC")

Saved TCGA-BRCA
Saved TCGA-LUAD
Saved TCGA-LUSC
Saved PBMC


In [4]:
import scanpy as sc
import pandas as pd
import numpy as np
from scipy.sparse import issparse

adata_brca = sc.read_h5ad("../data/preprocessed/TCGA-BRCA_final.h5ad")
adata_luad = sc.read_h5ad("../data/preprocessed/TCGA-LUAD_final.h5ad")
adata_lusc = sc.read_h5ad("../data/preprocessed/TCGA-LUSC_final.h5ad")
def inspect_dataset(name, adata):
    print(f"\n Dataset: {name}")
    print("Shape:", adata.shape)

    # X matrix
    if issparse(adata.X):
        nonzeros = adata.X.nnz
        print(f".X is sparse. Non-zero elements: {nonzeros}")
    else:
        print(".X is dense.")

    # .var
    required_var_keys = {"gene_ids", "gene_id_clean", "sequence"}
    var_keys = set(adata.var.columns)
    print(".var keys:", var_keys)
    missing_keys = required_var_keys - var_keys
    if missing_keys:
        print(f"Missing .var keys: {missing_keys}")
    else:
        missing_seq = adata.var["sequence"].isna().sum() + (adata.var["sequence"] == "").sum()
        print(f"sequence column present. Missing sequences: {missing_seq} / {adata.var.shape[0]}")

    # .obs
    print("\n.obs shape:", adata.obs.shape)
    obs_preview = adata.obs.head()
    print(obs_preview)

    # check for useful .obs fields
    for field in ["batch", "age", "gender", "percent_mito", "cell_type"]:
        if field in adata.obs.columns:
            print(f".obs['{field}'] present.")
        else:
            print(f".obs['{field}'] missing.")


for name, adata in datasets.items():
    inspect_dataset(name, adata)


 Dataset: BRCA
Shape: (644, 60664)
.X is sparse. Non-zero elements: 20968018
.var keys: {'sequence', 'gene_id_x', 'gene_symbol', 'gene_symbol_from_fasta', 'transcript_id', 'gene_id_y'}
Missing .var keys: {'gene_id_clean', 'gene_ids'}

.obs shape: (644, 5)
              gender   age vital_status tumor_stage batch
TCGA-3C-AAAU  FEMALE  55.0        Alive     Stage X    3C
TCGA-3C-AALI  FEMALE  50.0        Alive   Stage IIB    3C
TCGA-3C-AALJ  FEMALE  62.0        Alive   Stage IIB    3C
TCGA-3C-AALK  FEMALE  52.0        Alive    Stage IA    3C
TCGA-4H-AAAK  FEMALE  50.0        Alive  Stage IIIA    4H
.obs['batch'] present.
.obs['age'] present.
.obs['gender'] present.
.obs['percent_mito'] missing.
.obs['cell_type'] missing.

 Dataset: LUAD
Shape: (418, 60664)
.X is sparse. Non-zero elements: 13715130
.var keys: {'gene_id', 'sequence', 'gene_symbol', 'gene_symbol_from_fasta', 'transcript_id'}
Missing .var keys: {'gene_id_clean', 'gene_ids'}

.obs shape: (418, 5)
              gender   age v