# In [None]:
import os
import scanpy as sc
import pandas as pd
import scipy.io
import numpy as np

# Build an AnnData object from 10X-style barcode/feature/matrix files plus a
# per-cell metadata table, write it to .h5ad, then reload it and print a
# short summary as a sanity check.

# Define file paths (all inputs are resolved relative to `base_path`)
base_path = "."
barcode_file = os.path.join(base_path, "Hs10X.data.barcodes.tsv.gz")
gene_file = os.path.join(base_path, "Hs10X.data.features.tsv.gz")
matrix_file = os.path.join(base_path, "Hs10X.data.mtx.gz")
metadata_file = os.path.join(base_path, "Hs.metadata.tsv")

# Verify file existence up front so a missing input fails before any loading
for file in [barcode_file, gene_file, matrix_file, metadata_file]:
    if not os.path.exists(file):
        raise FileNotFoundError(f"File not found: {file}")

# Load barcodes (first column of a header-less TSV)
barcodes = pd.read_csv(barcode_file, header=None, sep='\t')[0].values

# Load gene names: use the second column when the file has one, otherwise
# fall back to the only column present.
genes = pd.read_csv(gene_file, header=None, sep='\t')
if genes.shape[1] >= 2:
    genes = genes.iloc[:, [1]]  # Retain gene names
genes.columns = ["gene_name"]
genes.set_index("gene_name", inplace=True)

# Load expression matrix (rows are matched to genes, columns to barcodes below)
matrix = scipy.io.mmread(matrix_file).tocsc()
n_matrix_genes, n_matrix_cells = matrix.shape

# Validate gene count
if genes.shape[0] < n_matrix_genes:
    # Truncation cannot repair this case; fail with an explicit message
    # instead of a length-mismatch error when assigning `adata.var`.
    raise ValueError(
        f"Gene file has {genes.shape[0]} rows but the matrix has "
        f"{n_matrix_genes} rows"
    )
if genes.shape[0] > n_matrix_genes:
    # Keep the original best-effort truncation, but make it visible.
    print(
        f"Warning: gene file has {genes.shape[0]} rows but the matrix has "
        f"{n_matrix_genes}; keeping the first {n_matrix_genes} gene names"
    )
    genes = genes.iloc[:n_matrix_genes, :]

# Validate barcode count
if len(barcodes) != n_matrix_cells:
    raise ValueError(
        f"Barcode file has {len(barcodes)} rows but the matrix has "
        f"{n_matrix_cells} columns"
    )

# Construct AnnData object (transposed so that rows are cells, columns genes)
adata = sc.AnnData(X=matrix.transpose())
adata.var = genes
adata.var_names = genes.index
adata.obs_names = barcodes
# Gene symbols may repeat; make them unique so name-based lookups are
# unambiguous (repeated names get a numeric suffix).
adata.var_names_make_unique()

# Load metadata (first column is used as the index and matched to barcodes)
metadata = pd.read_csv(metadata_file, sep='\t', index_col=0)

# Ensure metadata matches the barcode indices: keep only the cells that have
# a metadata row, so `adata` and the table assigned to `adata.obs` always
# have the same length and order.
has_metadata = adata.obs_names.isin(metadata.index)
if not has_metadata.any():
    raise ValueError(
        "None of the cell barcodes were found in the metadata index"
    )
if not has_metadata.all():
    n_dropped = int((~has_metadata).sum())
    print(f"Warning: dropping {n_dropped} cells that have no metadata entry")
    adata = adata[has_metadata].copy()

# Assign metadata to adata.obs, reindexed to the order of the cells in adata
adata.obs = metadata.loc[adata.obs_names]

# Save AnnData object
h5ad_output_file = os.path.join(base_path, "Hs10X_processed.h5ad")
adata.write_h5ad(h5ad_output_file)

# Reload and validate data (round-trip check that the written file is readable)
adata = sc.read_h5ad(h5ad_output_file)

# Output dataset properties
print(f"Dataset shape: {adata.shape}")
print(f"First 5 gene names: {adata.var_names[:5].tolist()}")
print(f"First 5 cell barcodes: {adata.obs_names[:5].tolist()}")
print(f"First 5 metadata entries:\n{adata.obs.head()}")
print(f"Matrix sparsity: {adata.X.nnz / (adata.shape[0] * adata.shape[1]) * 100:.2f}% non-zero values")