In [2]:
from pathlib import Path
import warnings
import scanpy as sc
import scib 
import numpy as np
import pandas as pd
import sys 

import scgpt as scg
import matplotlib.pyplot as plt

# Apply matplotlib's default style globally. `plt.style.context(...)` only
# builds a context manager and changes nothing unless it is entered with
# `with`, so the original call was a no-op.
plt.style.use('default')
warnings.simplefilter('ignore', ResourceWarning)

# NOTE(review): machine-specific absolute path; this cell only works on a
# machine that has this exact directory. Consider deriving it from a
# configurable base directory.
model_dir = r"C:\Users\annel\OneDrive\Documenten\Machine Learning\scGPT_data\scGPT_CP"
print(model_dir)

C:\Users\annel\OneDrive\Documenten\Machine Learning\scGPT_data\scGPT_CP


In [3]:
import warnings

# Filter all warnings
warnings.filterwarnings('ignore')



In [4]:
import sys

# The repository root is taken to be the parent of the current working
# directory; put it on the import path.
repo_dir = Path.cwd().parent.absolute()
# Guard the append so re-running this cell does not add duplicate entries.
if str(repo_dir) not in sys.path:
    sys.path.append(str(repo_dir))

In [5]:
# Make the GenePT-tools checkout importable (presumably where the `src`
# package imported below lives - confirm). The membership check keeps a
# re-run of this cell from adding the same entry again.
if str(repo_dir / "GenePT-tools") not in sys.path:
    sys.path.append(str(repo_dir / "GenePT-tools"))
from src.utils import setup_data_dir

# NOTE(review): setup_data_dir is defined in src.utils; presumably it prepares
# the data directory used below - confirm against its implementation.
setup_data_dir()
data_dir = repo_dir / "data"


In [6]:
import requests

# URL of the .h5ad file to download.
dataset = "https://datasets.cellxgene.cziscience.com/10df7690-6d10-4029-a47e-0f071bb2df83.h5ad"
# The file name in the URL is this id: dataset_id = "10df7690-6d10-4029-a47e-0f071bb2df83"

# Local destination of the download, inside data_dir.
file_path = data_dir / "1m_cells.h5ad"  # adjust this path as needed


In [7]:
import requests
from tqdm import tqdm

def download_with_progress(url, file_path, timeout=60):
    """
    Stream ``url`` to ``file_path`` while showing a tqdm progress bar.

    The payload is first written to a sibling ``<name>.part`` file and only
    renamed to ``file_path`` after the transfer completed. A failed or
    interrupted download therefore never leaves a truncated file at the final
    location, which matters because callers skip the download when
    ``file_path`` already exists. A leftover ``.part`` file is overwritten by
    the next attempt.

    Args:
        url: HTTP(S) URL to fetch.
        file_path: Destination path (``str`` or ``pathlib.Path``).
        timeout: Timeout in seconds handed to ``requests.get`` so a stalled
            connection cannot hang the notebook forever.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    file_path = Path(file_path)
    partial_path = file_path.with_name(file_path.name + ".part")

    # Using the response as a context manager releases the connection even
    # when the transfer fails half-way.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly instead of saving an HTML error page as the dataset.
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))

        with open(partial_path, 'wb') as file, tqdm(
            desc=file_path.name,
            # Unknown size: let tqdm show a plain counter instead of "0 of 0".
            total=total_size or None,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as pbar:
            for data in response.iter_content(chunk_size=8192):
                size = file.write(data)
                pbar.update(size)

    # Publish the file only after every chunk was written.
    partial_path.replace(file_path)

# Download the dataset only when no file exists at file_path yet; an existing
# file is reused without any check of its contents.
if not file_path.exists():
    download_with_progress(dataset, file_path)

In [17]:
import h5py

# Peek at the on-disk layout of the file without loading the matrix itself.
with h5py.File(file_path, 'r') as f:
    x_group = f['X']

    # Members of X first, then the field names stored under obs and var.
    print("Contents of X group:", list(x_group.keys()))
    print("\nContents of obs group:", list(f['obs'].keys()))
    print("Contents of var group:", list(f['var'].keys()))

    # A sparse X is expected to expose 'data', 'indices' and 'indptr'; the
    # shapes are only reported when the 'data' member is present.
    if 'data' in x_group:
        for label, member in (
            ("\nShape of X/data:", 'data'),
            ("Shape of X/indices:", 'indices'),
            ("Shape of X/indptr:", 'indptr'),
        ):
            print(label, x_group[member].shape)

Contents of X group: ['data', 'indices', 'indptr']

Contents of obs group: ['10X_run', '_index', '_scvi_batch', '_scvi_labels', 'ambient_removal', 'anatomical_position', 'assay', 'assay_ontology_term_id', 'broad_cell_class', 'cdna_plate', 'cdna_well', 'cell_type', 'cell_type_ontology_term_id', 'compartment', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_assay', 'donor_id', 'donor_method', 'donor_tissue', 'donor_tissue_assay', 'ethnicity_original', 'free_annotation', 'is_primary_data', 'library_plate', 'manually_annotated', 'method', 'n_genes_by_counts', 'notes', 'observation_joinid', 'organism', 'organism_ontology_term_id', 'pct_counts_ercc', 'pct_counts_mt', 'published_2022', 'replicate', 'sample_id', 'sample_number', 'scvi_leiden_donorassay_full', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_in_publication', 'tissue_ontology_term_id'

In [18]:
import numpy as np
from scipy import sparse


def load_subset_sparse(h5py_file, start_row=0, n_rows=None):
    """
    Load a contiguous block of rows from the sparse matrix stored under ``X``.

    The ``X`` group is read as row-compressed (CSR) data, i.e. ``indptr`` is
    interpreted as one offset per row plus a trailing end offset.

    Args:
        h5py_file: Path to the h5ad file; handed straight to ``h5py.File``.
        start_row: Index of the first row to load.
        n_rows: Number of rows to load. ``None`` loads every row from
            ``start_row`` to the end; a request that runs past the last row
            is truncated to the rows that exist.

    Returns:
        scipy.sparse.csr_matrix with the requested rows

    Raises:
        IndexError: If ``start_row`` is negative or beyond the last row.
        ValueError: If ``n_rows`` is negative.
    """
    if n_rows is not None and n_rows < 0:
        raise ValueError(f"n_rows must be non-negative, got {n_rows}")

    # Open the file that was passed in rather than whatever global
    # `file_path` happens to exist in the notebook.
    with h5py.File(h5py_file, 'r') as f:
        x_group = f['X']

        # indptr holds one offset per row plus the end offset of the last row.
        total_rows = len(x_group['indptr']) - 1
        if not 0 <= start_row <= total_rows:
            raise IndexError(
                f"start_row={start_row} is outside the {total_rows} rows of X"
            )

        # Clamp the request so the declared shape always matches the rows
        # that are actually read.
        available_rows = total_rows - start_row
        n_rows = available_rows if n_rows is None else min(n_rows, available_rows)

        # Row offsets for the requested block (n_rows + 1 entries).
        indptr = x_group['indptr'][start_row:start_row + n_rows + 1]
        start_idx = indptr[0]
        end_idx = indptr[-1]

        # Only the values and column indices of the requested rows are read.
        data = x_group['data'][start_idx:end_idx]
        indices = x_group['indices'][start_idx:end_idx]

        # Re-base the offsets so the first loaded row starts at 0.
        indptr = indptr - start_idx

        # Prefer the matrix shape recorded on the X group; fall back to the
        # number of feature_name categories when no shape attribute exists.
        shape_attr = x_group.attrs.get('shape')
        if shape_attr is None:
            shape_attr = x_group.attrs.get('h5sparse_shape')
        if shape_attr is not None:
            n_cols = int(shape_attr[1])
        else:
            n_cols = len(f['var']['feature_name']['categories'])

        return sparse.csr_matrix((data, indices, indptr), shape=(n_rows, n_cols))

# Load the first 100,000 rows, then report the matrix shape and the fraction
# of its entries that are explicitly stored.
cell_gene_matrix = load_subset_sparse(file_path, start_row=0, n_rows=100000)
print("Matrix shape:", cell_gene_matrix.shape)
print("Matrix density:", cell_gene_matrix.nnz / (cell_gene_matrix.shape[0] * cell_gene_matrix.shape[1]))

Matrix shape: (100000, 61759)
Matrix density: 0.045159779789180524


In [21]:
import h5py
import scanpy as sc
import numpy as np
import pandas as pd
from scipy import sparse

def load_subset_scanpy_chunked(file_path, start_row=0, n_rows=10000):
    """
    Build an AnnData object from a contiguous block of rows of an h5ad file.

    Only the requested slice of the sparse ``X`` matrix is read (as one
    contiguous slice, not in chunks), together with a few var and obs
    annotations. A short summary of the result is printed.

    Args:
        file_path: Path to the h5ad file.
        start_row: Index of the first row (cell) to load.
        n_rows: Number of rows to load; truncated to the rows that exist when
            the request runs past the end of the matrix.

    Returns:
        AnnData whose var index holds the ``ensembl_id`` values and whose obs
        index holds the original row numbers as strings.

    Raises:
        IndexError: If ``start_row`` is negative or beyond the last row.
        ValueError: If ``n_rows`` is negative.
    """
    def _to_str(values):
        """Decode byte strings to ``str``; other values pass through unchanged."""
        return [v.decode('utf-8') if isinstance(v, bytes) else v for v in values]

    if n_rows < 0:
        raise ValueError(f"n_rows must be non-negative, got {n_rows}")

    with h5py.File(file_path, 'r') as f:
        # indptr holds one offset per row plus the end offset of the last row.
        total_rows = len(f['X']['indptr']) - 1
        if not 0 <= start_row <= total_rows:
            raise IndexError(
                f"start_row={start_row} is outside the {total_rows} rows of X"
            )
        # Clamp so the matrix shape and the obs slices agree with what is read.
        n_rows = min(n_rows, total_rows - start_row)
        stop_row = start_row + n_rows

        # Use ensembl_id as var names; decoded so the index holds str, not bytes.
        var_names = _to_str(f['var']['ensembl_id'][:])

        # Slice of the sparse matrix that belongs to the requested rows.
        indptr = f['X']['indptr'][start_row:stop_row + 1]
        start_idx = indptr[0]
        end_idx = indptr[-1]

        data = f['X']['data'][start_idx:end_idx]
        col_indices = f['X']['indices'][start_idx:end_idx]
        # Re-base the offsets so the first loaded row starts at 0.
        indptr = indptr - start_idx

        X = sparse.csr_matrix(
            (data, col_indices, indptr),
            shape=(n_rows, len(var_names))
        )

        # var annotations
        var_df = pd.DataFrame(index=var_names)
        var_df['mean_counts'] = f['var']['mean_counts'][:]
        var_df['n_cells'] = f['var']['n_cells_by_counts'][:]

        # obs annotations, indexed by the original row number (as str, the
        # index type AnnData works with).
        obs_df = pd.DataFrame(index=pd.RangeIndex(start_row, stop_row).astype(str))

        # Numeric obs fields are optional: copy only those stored as datasets.
        obs_fields = ['total_counts', 'n_genes_by_counts', 'pct_counts_mt']
        for field in obs_fields:
            if field in f['obs'] and isinstance(f['obs'][field], h5py.Dataset):
                obs_df[field] = f['obs'][field][start_row:stop_row]

        # cell_type is rebuilt from its categories/codes pair when stored as
        # a group with both members.
        cell_type = f['obs']['cell_type'] if 'cell_type' in f['obs'] else None
        if isinstance(cell_type, h5py.Group) and 'categories' in cell_type:
            categories = _to_str(cell_type['categories'][:])
            codes = cell_type['codes'][start_row:stop_row]
            obs_df['cell_type'] = pd.Categorical.from_codes(codes, categories)

        adata = sc.AnnData(X=X, obs=obs_df, var=var_df)

    # Avoid dividing by zero when the selection is empty.
    n_entries = adata.shape[0] * adata.shape[1]
    density = adata.X.nnz / n_entries if n_entries else 0.0

    print("Matrix shape:", adata.shape)
    print("Matrix density:", density)
    print("\nAvailable obs annotations:", list(adata.obs.columns))
    print("Available var annotations:", list(adata.var.columns))

    return adata

# Load first 100,000 cells
# NOTE(review): the recorded run of this cell ended in a MemoryError while
# allocating an int64 array of 278902284 elements - the same number as the
# stored elements of `cell_gene_matrix`, which covers the same rows and is
# still held in memory at this point. Consider a smaller n_rows or releasing
# that matrix before running this cell.
adata_subset = load_subset_scanpy_chunked(file_path, start_row=0, n_rows=100000)

MemoryError: Unable to allocate 2.08 GiB for an array with shape (278902284,) and data type int64

In [22]:
# Print the HDF5 dataset object that stores the categories of
# var/feature_name; its repr reports the dataset's shape and type.
with h5py.File(file_path, 'r') as f:
    print(f['var']['feature_name']["categories"])
    # print(f['X']['indices'][:10])

<HDF5 dataset "categories": shape (61759,), type "|O">


In [23]:
# Display the repr of the sparse matrix loaded earlier with load_subset_sparse.
cell_gene_matrix

<100000x61759 sparse matrix of type '<class 'numpy.float32'>'
	with 278902284 stored elements in Compressed Sparse Row format>

In [24]:
with h5py.File(file_path, 'r') as f:
    # Print how many codes are stored for obs/scvi_leiden_donorassay_full
    # (presumably one per cell in the file - confirm).
    print(len(f['obs']['scvi_leiden_donorassay_full']['codes']))
    

    

1136218


with h5py.File(file_path, 'r') as f:
    # Get the indptr for the rows we want
    gene_names = f['var']['feature_name']
    ensembl_ids = f['var']['ensembl_id']
    scvi_leiden_donorassay_full = f['obs']['scvi_leiden_donorassay_full']['codes']
    major_ensembl_ids = pd.Series(
        ensembl_id.decode('utf-8').split('.')[0]
        for ensembl_id in ensembl_ids
    )