In [2]:
import scanpy as sc
from pathlib import Path
import tarfile

base_dir = Path(r"C:\Users\clark\OneDrive\Documents\GitHub\CNA_tool\data\data7\GSE195467_RAW")

for tar_file in base_dir.glob("*.tar.gz"):
    extract_dir = base_dir / tar_file.stem  # e.g., GSM5837936_sample1_t1
    extract_dir.mkdir(exist_ok=True)
    print(f"Extracting: {tar_file.name} → {extract_dir.name}")
    with tarfile.open(tar_file, mode="r:gz") as tar:
        tar.extractall(path=extract_dir)


Extracting: GSM5837936_sample1_t1.tar.gz → GSM5837936_sample1_t1.tar
Extracting: GSM5837937_sample1_t2.tar.gz → GSM5837937_sample1_t2.tar
Extracting: GSM5837938_sample1_t3.tar.gz → GSM5837938_sample1_t3.tar
Extracting: GSM5837939_sample1_t4.tar.gz → GSM5837939_sample1_t4.tar
Extracting: GSM5837940_sample2_t1.tar.gz → GSM5837940_sample2_t1.tar
Extracting: GSM5837941_sample2_t2.tar.gz → GSM5837941_sample2_t2.tar
Extracting: GSM5837942_sample2_t3.tar.gz → GSM5837942_sample2_t3.tar
Extracting: GSM5837943_sample3_t1.tar.gz → GSM5837943_sample3_t1.tar
Extracting: GSM5837944_sample3_t2.tar.gz → GSM5837944_sample3_t2.tar


In [None]:

adatas = []

for outer in base_dir.iterdir():
    if outer.is_dir():
        # Look for exactly one subdirectory inside
        subdirs = [d for d in outer.iterdir() if d.is_dir()]
        if len(subdirs) == 1 and (subdirs[0] / "matrix.mtx.gz").exists():
            print(f"Loading: {subdirs[0]}")
            adata = sc.read_10x_mtx(subdirs[0], var_names="gene_symbols")
            adata.obs["sample"] = outer.name
            adatas.append(adata)
        else:
            print(f"Skipping: {outer} (no valid matrix.mtx.gz found)")

# Merge and save
if adatas:
    adata_all = adatas[0].concatenate(
        *adatas[1:], 
        batch_key="sample_id",
        batch_categories=[a.obs["sample"][0] for a in adatas]
    )
    out_path = base_dir / "GSE195467_merged.h5ad"
    adata_all.write(out_path)
    print(f"Merged .h5ad saved to: {out_path}")
else:
    print("Still no valid samples found.")
