<a href="https://colab.research.google.com/github/DPariser/DataScience/blob/main/Preprocessing/Creating_H5AD_and_Loom_Files_ID_missing_patients.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creating H5AD and Loom Files

The following command will generate an RNA count matrix of cells (rows) by genes (columns) in H5AD format, which is a binary format used to store AnnData (https://anndata.readthedocs.io/en/stable/) objects. Because the reads were generated with the 10x Genomics Chromium Single Cell v2 and v3 chemistries, the `-x 10xv2` or `-x 10xv3` argument is used as appropriate for each sample (see the alignment file). To view other supported technologies, run `kb --list`.

Note: To output a Loom (https://linnarssonlab.org/loompy/format/index.html) file instead, replace the --h5ad flag with --loom. To obtain the raw matrix output by kb instead of the H5AD or Loom converted files, omit these flags.

https://colab.research.google.com/github/pachterlab/kallistobustools/blob/master/docs/tutorials/kb_getting_started/python/kb_intro_2_python.ipynb#scrollTo=ijU_u6uj3Sio

https://www.kallistobus.tools/tutorials/kb_getting_started/python/kb_intro_2_python/

In [None]:
%run Commonimports.ipynb

In [None]:
import os
import numpy as np
import anndata

In [None]:
cd

/home/dpariser


In [None]:
!pip install anndata loompy

Our matrix is a `scipy.sparse._coo.coo_matrix`, i.e. a COO (coordinate list) sparse matrix. While AnnData can generally work with COO matrices, it often works more efficiently with CSR (Compressed Sparse Row) matrices. Converting the COO matrix to a CSR matrix before creating the AnnData object could potentially resolve the IORegistryError we have been encountering.

H5AD Files: AnnData, used for handling H5AD files, typically expects the data in a cell x gene format. Because our code works without transposing the matrix for H5AD file creation, the original matrix must already be in the cell x gene format (rows represent cells, columns represent genes).

Loom Files: Loom files conventionally expect data in a gene x cell format. If transposing the matrix is necessary for the Loom file creation to work correctly, this further suggests that the original matrix is in the cell x gene format. By transposing it, we are converting it to the gene x cell format, which is expected by Loom.

# H5AD File for One Patient

In [None]:
# print(matrix.shape)  # Should be cells x genes
# print(len(genes))    # Should match the number of genes
# print(len(barcodes)) # Should match the number of cells

In [None]:
# import anndata

# # Paths to your files
# matrix_path = "Datafiles/HRR339728/HRR339728_output/counts_unfiltered/cells_x_genes.mtx"
# genes_path = "Datafiles/HRR339728/HRR339728_output/counts_unfiltered/cells_x_genes.genes.txt"
# barcodes_path = "Datafiles/HRR339728/HRR339728_output/counts_unfiltered/cells_x_genes.barcodes.txt"

# # Load data
# matrix = scipy.io.mmread(matrix_path)

# # Convert the COO matrix to CSR format
# matrix_csr = csr_matrix(matrix)

# genes = pd.read_csv(genes_path, header=None)
# barcodes = pd.read_csv(barcodes_path, header=None)

# # Create an Anndata object with the CSR matrix
# adata = anndata.AnnData(X=matrix_csr)
# adata.var_names = genes[0]
# adata.obs_names = barcodes[0]

# # Ensure uniqueness
# adata.var_names_make_unique()
# adata.obs_names_make_unique()

# # Save as H5AD
# adata.write("Datafiles/HRR339728/HRR339728_output/HRR339728_H5AD.h5ad")

In [None]:
# adata = anndata.read_h5ad("Datafiles/HRR339728/HRR339728_output/HRR339728_H5AD.h5ad")

In [None]:
# adata.shape

In [None]:
# adata.obs.head()

In [None]:
# adata.var.head()

# Loom for one Patient

In [None]:
!pip install --upgrade loompy numba

Defaulting to user installation because normal site-packages is not writeable


In [None]:
# # Paths to your files
# matrix_path = "Datafiles/HRR339728/HRR339728_output/counts_unfiltered/cells_x_genes.mtx"
# genes_path = "Datafiles/HRR339728/HRR339728_output/counts_unfiltered/cells_x_genes.genes.txt"
# barcodes_path = "Datafiles/HRR339728/HRR339728_output/counts_unfiltered/cells_x_genes.barcodes.txt"

# # Load data
# matrix = scipy.io.mmread(matrix_path)

# # Convert the COO matrix to CSR format and transpose
# matrix_csr = csr_matrix(matrix).T

# # Load and convert genes and barcodes to numpy arrays
# genes = np.array(pd.read_csv(genes_path, header=None)[0])
# barcodes = np.array(pd.read_csv(barcodes_path, header=None)[0])

# # Create a Loom file with the transposed matrix
# loompy.create("Datafiles/HRR339728/HRR339728_output/HRR339728.loom", matrix_csr, {"Gene": genes}, {"CellID": barcodes})

In [None]:
# # Open the Loom file
# with loompy.connect("Datafiles/HRR339728/HRR339728_output/HRR339728.loom") as ds:
#     # Print the shape of the dataset (genes x cells)
#     print("Shape of the dataset:", ds.shape)

#     # Print row attributes keys (usually genes)
#     print("Row attributes (genes):", ds.ra.keys())

#     # Print column attributes keys (usually cells)
#     print("Column attributes (cells):", ds.ca.keys())

#     # Optionally, view a subset of the data
#     if ds.shape[0] > 0 and ds.shape[1] > 0:
#         subset = ds[:, :5]  # First 5 cells
#         print("Subset of data (first 5 cells):", subset)

In [None]:
# import os
# import pandas as pd
# import scipy.io
# from scipy.sparse import csr_matrix
# import anndata
# import loompy
# import numpy as np
# import datetime
# import gc
# import concurrent.futures

# def print_with_timestamp(message):
#     timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
#     print(f"[{timestamp}] {message}")

# def get_file_size(file_path):
#     # Get the file size in bytes
#     size_bytes = os.path.getsize(file_path)
#     # Convert bytes to megabytes
#     size_megabytes = size_bytes / (1024 * 1024)
#     return size_megabytes

# # Base directory where patient folders are located
# base_dir = "Datafiles"

# # List all patient folders
# patient_folders = [folder for folder in os.listdir(base_dir) if folder.startswith("HRR")]
# patient_folders.sort()

# # Specify the start patient number
# start_patient = "HRR339754"
# start_index = patient_folders.index(start_patient) if start_patient in patient_folders else 0

# def process_patient(folder):
#     print_with_timestamp(f"Processing {folder}...")

#     # Define paths to the files
#     matrix_path = os.path.join(base_dir, folder, folder + "_output/counts_unfiltered/cells_x_genes.mtx")
#     genes_path = os.path.join(base_dir, folder, folder + "_output/counts_unfiltered/cells_x_genes.genes.txt")
#     barcodes_path = os.path.join(base_dir, folder, folder + "_output/counts_unfiltered/cells_x_genes.barcodes.txt")

#     # Check if the necessary files exist
#     if os.path.exists(matrix_path) and os.path.exists(genes_path) and os.path.exists(barcodes_path):
#         print_with_timestamp("Required files found, loading data...")

#         # Load data
#         matrix = scipy.io.mmread(matrix_path)
#         genes = pd.read_csv(genes_path, header=None)
#         barcodes = pd.read_csv(barcodes_path, header=None)

#         # H5AD File Creation
#         matrix_csr = csr_matrix(matrix)
#         adata = anndata.AnnData(X=matrix_csr)
#         adata.var_names = genes[0]
#         adata.obs_names = barcodes[0]
#         adata.var_names_make_unique()
#         adata.obs_names_make_unique()
#         h5ad_path = os.path.join(base_dir, folder, folder + "_H5AD.h5ad")
#         adata.write(h5ad_path)
#         h5ad_size = get_file_size(h5ad_path)
#         print_with_timestamp(f"H5AD file created successfully. Size: {h5ad_size:.2f} MB")

#         # Loom File Creation
#         matrix_csr_transposed = csr_matrix(matrix).T
#         genes_array = np.array(genes[0])
#         barcodes_array = np.array(barcodes[0])
#         loom_path = os.path.join(base_dir, folder, folder + ".loom")
#         loompy.create(loom_path, matrix_csr_transposed, {"Gene": genes_array}, {"CellID": barcodes_array})
#         loom_size = get_file_size(loom_path)
#         print_with_timestamp(f"Loom file created successfully. Size: {loom_size:.2f} MB")

#         # Clear memory
#         del matrix, genes, barcodes, matrix_csr, adata, matrix_csr_transposed, genes_array, barcodes_array
#         gc.collect()

#         print_with_timestamp(f"Finished processing {folder}. H5AD and Loom files created.")
#     else:
#         print_with_timestamp(f"Required files for {folder} are missing. Skipping.")

# # Using ThreadPoolExecutor with a controlled number of workers
# with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
#     futures = [executor.submit(process_patient, folder) for folder in patient_folders[start_index:]]
#     concurrent.futures.wait(futures)

# Files for all patients

In [None]:
# # check if all loom and h5ad files are created

# import os
# import datetime

# def print_with_timestamp(message):
#     timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
#     print(f"[{timestamp}] {message}")

# def check_files_exist_and_non_empty(folder):
#     h5ad_path = os.path.join(base_dir, folder, folder + "_H5AD.h5ad")
#     loom_path = os.path.join(base_dir, folder, folder + ".loom")

#     h5ad_exists = os.path.exists(h5ad_path) and os.path.getsize(h5ad_path) > 0
#     loom_exists = os.path.exists(loom_path) and os.path.getsize(loom_path) > 0

#     if h5ad_exists and loom_exists:
#         return True
#     else:
#         if not h5ad_exists:
#             print_with_timestamp(f"H5AD file missing or empty for {folder}")
#         if not loom_exists:
#             print_with_timestamp(f"Loom file missing or empty for {folder}")
#         return False

# base_dir = "Datafiles"
# patient_folders = [folder for folder in os.listdir(base_dir) if folder.startswith("HRR")]
# patient_folders.sort()

# all_files_valid = True
# for folder in patient_folders:
#     if not check_files_exist_and_non_empty(folder):
#         all_files_valid = False

# if all_files_valid:
#     print_with_timestamp("All patient folders have valid H5AD and Loom files.")
# else:
#     print_with_timestamp("Some patient folders are missing valid H5AD or Loom files.")

[2024-01-22 09:13:35] Loom file missing or empty for HRR339754
[2024-01-22 09:13:36] H5AD file missing or empty for HRR340077
[2024-01-22 09:13:36] Loom file missing or empty for HRR340077
[2024-01-22 09:13:36] H5AD file missing or empty for HRR340078
[2024-01-22 09:13:36] Loom file missing or empty for HRR340078
[2024-01-22 09:13:36] H5AD file missing or empty for HRR340113
[2024-01-22 09:13:36] Loom file missing or empty for HRR340113
[2024-01-22 09:13:36] H5AD file missing or empty for HRR340265
[2024-01-22 09:13:36] Loom file missing or empty for HRR340265
[2024-01-22 09:13:36] H5AD file missing or empty for HRR340267
[2024-01-22 09:13:36] Loom file missing or empty for HRR340267
[2024-01-22 09:13:36] H5AD file missing or empty for HRR340269
[2024-01-22 09:13:36] Loom file missing or empty for HRR340269
[2024-01-22 09:13:36] H5AD file missing or empty for HRR340271
[2024-01-22 09:13:36] Loom file missing or empty for HRR340271
[2024-01-22 09:13:36] H5AD file missing or empty for HR

In [None]:
# # Edited for loading only necessary data at the moment it is being processed rather than
# # the whole database, reuse variables, and clear memory aggressively.
# import os
# import pandas as pd
# import scipy.io
# from scipy.sparse import csr_matrix
# import anndata
# import loompy
# import numpy as np
# import datetime
# import gc
# import concurrent.futures

# def print_with_timestamp(message):
#     timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
#     print(f"[{timestamp}] {message}")

# def get_file_size(file_path):
#     size_bytes = os.path.getsize(file_path)
#     size_megabytes = size_bytes / (1024 * 1024)
#     return size_megabytes

# base_dir = "Datafiles"
# patient_folders = [folder for folder in os.listdir(base_dir) if folder.startswith("HRR")]
# patient_folders.sort()

# start_patient = "HRR341172"
# start_index = patient_folders.index(start_patient) if start_patient in patient_folders else 0

# def process_patient(folder):
#     print_with_timestamp(f"Processing {folder}...")

#     matrix_path = os.path.join(base_dir, folder, folder + "_output/counts_unfiltered/cells_x_genes.mtx")
#     genes_path = os.path.join(base_dir, folder, folder + "_output/counts_unfiltered/cells_x_genes.genes.txt")
#     barcodes_path = os.path.join(base_dir, folder, folder + "_output/counts_unfiltered/cells_x_genes.barcodes.txt")

#     if os.path.exists(matrix_path) and os.path.exists(genes_path) and os.path.exists(barcodes_path):
#         print_with_timestamp("Required files found, loading data...")

#         matrix = scipy.io.mmread(matrix_path)
#         genes = pd.read_csv(genes_path, header=None)
#         barcodes = pd.read_csv(barcodes_path, header=None)

#         # Using csr_matrix to keep the data sparse
#         matrix_csr = csr_matrix(matrix)

#         adata = anndata.AnnData(X=matrix_csr)
#         adata.var_names = genes[0]
#         adata.obs_names = barcodes[0]
#         adata.var_names_make_unique()
#         adata.obs_names_make_unique()

#         h5ad_path = os.path.join(base_dir, folder, folder + "_H5AD.h5ad")
#         adata.write(h5ad_path)
#         print_with_timestamp(f"H5AD file for {folder} created. Size: {get_file_size(h5ad_path):.2f} MB")

#         matrix_csr_transposed = matrix_csr.T
#         loom_path = os.path.join(base_dir, folder, folder + ".loom")
#         loompy.create(loom_path, matrix_csr_transposed, {"Gene": genes[0].values}, {"CellID": barcodes[0].values})
#         print_with_timestamp(f"Loom file for {folder} created. Size: {get_file_size(loom_path):.2f} MB")

#         del matrix, genes, barcodes, matrix_csr, adata, matrix_csr_transposed
#         gc.collect()

#         print_with_timestamp(f"Finished processing {folder}. H5AD and Loom files for {folder} created.")
#     else:
#         print_with_timestamp(f"Required files for {folder} are missing. Skipping.")

# def process_batch(folders):
#     with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
#         futures = [executor.submit(process_patient, folder) for folder in folders]
#         concurrent.futures.wait(futures)
#     gc.collect()

# # Define batch size
# batch_size = 2

# # Process in batches
# for i in range(start_index, len(patient_folders), batch_size):
#     batch_folders = patient_folders[i:i+batch_size]
#     print_with_timestamp(f"Processing batch {i//batch_size + 1}")
#     process_batch(batch_folders)

In [None]:
import os
import pandas as pd
import scipy.io
from scipy.sparse import csr_matrix
import anndata
import loompy
import numpy as np
import datetime
import gc

def print_with_timestamp(message):
    """Print ``message`` prefixed with the current local time.

    The prefix is the current time formatted as ``YYYY-MM-DD HH:MM:SS`` and
    wrapped in square brackets, followed by a space and the message.
    """
    now = datetime.datetime.now()
    stamp = now.strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{stamp}] {message}")

def get_file_size(file_path):
    """Return the size of the file at ``file_path`` in megabytes.

    One megabyte is taken as 1024 * 1024 bytes; the result is a float.
    """
    bytes_per_megabyte = 1024 * 1024
    return os.path.getsize(file_path) / bytes_per_megabyte

# Root directory containing one sub-folder per patient. This is a relative
# path, so it is resolved against the notebook's current working directory.
base_dir = "Datafiles"

# List of patient folders for which files are missing.
# Each entry is a folder name directly under base_dir; every entry is passed
# to process_patient() by the loop at the end of this cell.
# NOTE(review): presumably copied from the "missing or empty" report printed
# by the file check earlier in this notebook -- regenerate if that changes.
missing_patients = [
    "HRR339754", "HRR340077", "HRR340078", "HRR340113", "HRR340265", "HRR340267",
    "HRR340269", "HRR340271", "HRR340273", "HRR340299", "HRR340301", "HRR340303",
    "HRR340305", "HRR340307", "HRR340309", "HRR340311", "HRR340313", "HRR340315",
    "HRR340317", "HRR340319", "HRR340321", "HRR340323", "HRR340325", "HRR340622",
    "HRR340623", "HRR340635", "HRR340661", "HRR340683", "HRR340685", "HRR340708",
    "HRR340717", "HRR340718", "HRR340719", "HRR340720", "HRR340729", "HRR340730",
    "HRR340731", "HRR340732", "HRR340744", "HRR340800", "HRR340845", "HRR340846",
    "HRR340847", "HRR340848", "HRR340885", "HRR340888", "HRR340891", "HRR340894",
    "HRR340897", "HRR340900", "HRR340902", "HRR340905", "HRR340907", "HRR340910",
    "HRR340913", "HRR340915", "HRR340918", "HRR340920", "HRR340923", "HRR340926",
    "HRR340929", "HRR340931", "HRR340932", "HRR340935", "HRR340965", "HRR340966",
    "HRR340967", "HRR340968", "HRR340969", "HRR340970", "HRR340972", "HRR340973",
    "HRR340974", "HRR340975", "HRR340976", "HRR340977", "HRR340978", "HRR340979",
    "HRR341220", "HRR341221", "HRR341222", "HRR341223", "HRR341228", "HRR341229",
    "HRR341230", "HRR341231", "HRR341236", "HRR341237", "HRR341238", "HRR341239",
    "HRR341244", "HRR341245", "HRR341246", "HRR341247", "HRR341252", "HRR341253",
    "HRR341254", "HRR341255", "HRR341260", "HRR341261", "HRR341262", "HRR341263",
    "HRR341268", "HRR341269", "HRR341270", "HRR341271", "HRR341276", "HRR341277",
    "HRR341278", "HRR341279", "HRR341284", "HRR341285", "HRR341286", "HRR341287",
    "HRR341292", "HRR341293", "HRR341294", "HRR341295", "HRR341300", "HRR341301",
    "HRR341302", "HRR341303", "HRR341308", "HRR341309", "HRR341310", "HRR341311",
    "HRR341316", "HRR341317", "HRR341318", "HRR341319"
]

def process_patient(folder):
    """Create the H5AD and Loom files for one patient folder.

    Reads the ``cells_x_genes`` matrix, gene list and barcode list from
    ``<base_dir>/<folder>/<folder>_output/counts_unfiltered`` and writes
    ``<folder>_H5AD.h5ad`` and ``<folder>.loom`` into ``<base_dir>/<folder>``.
    If any of the three input files is absent, the folder is skipped with a
    log message and nothing is written.

    Each output is first written to a temporary sibling file and only moved
    to its final name once the write has returned, so an interrupted write
    does not leave a truncated file under the final name. A temporary file
    left behind by a failed run is overwritten on the next run.

    Args:
        folder: Name of the patient folder inside ``base_dir``.

    Returns:
        None. Progress is reported through ``print_with_timestamp``.

    Raises:
        Any exception raised while reading the inputs or writing the outputs
        propagates to the caller; nothing is caught here.
    """
    print_with_timestamp(f"Processing {folder}...")

    patient_dir = os.path.join(base_dir, folder)
    counts_prefix = os.path.join(patient_dir, folder + "_output/counts_unfiltered/cells_x_genes")
    matrix_path = counts_prefix + ".mtx"
    genes_path = counts_prefix + ".genes.txt"
    barcodes_path = counts_prefix + ".barcodes.txt"

    # Guard clause: bail out early when any input is missing.
    if not (os.path.exists(matrix_path) and os.path.exists(genes_path) and os.path.exists(barcodes_path)):
        print_with_timestamp(f"Required files for {folder} are missing. Skipping.")
        return

    print_with_timestamp("Required files found, loading data...")

    # Convert to CSR without binding the matrix returned by mmread to a name,
    # so that intermediate copy can be released as soon as the conversion is
    # done instead of staying alive through both file writes.
    matrix_csr = csr_matrix(scipy.io.mmread(matrix_path))
    genes = pd.read_csv(genes_path, header=None)
    barcodes = pd.read_csv(barcodes_path, header=None)

    # The matrix is used untransposed here: per the file name it is
    # cells x genes, so barcodes label the rows and genes the columns.
    adata = anndata.AnnData(X=matrix_csr)
    adata.var_names = genes[0]
    adata.obs_names = barcodes[0]
    adata.var_names_make_unique()
    adata.obs_names_make_unique()

    h5ad_path = os.path.join(patient_dir, folder + "_H5AD.h5ad")
    h5ad_tmp_path = os.path.join(patient_dir, folder + "_H5AD.tmp.h5ad")
    adata.write(h5ad_tmp_path)
    # Same directory, so this is a rename that replaces any previous file.
    os.replace(h5ad_tmp_path, h5ad_path)
    print_with_timestamp(f"H5AD file for {folder} created. Size: {get_file_size(h5ad_path):.2f} MB")
    del adata

    # Loom gets the transposed (genes x cells) matrix, with genes as row
    # attributes and barcodes as column attributes.
    loom_path = os.path.join(patient_dir, folder + ".loom")
    loom_tmp_path = os.path.join(patient_dir, folder + ".tmp.loom")
    loompy.create(loom_tmp_path, matrix_csr.T, {"Gene": genes[0].values}, {"CellID": barcodes[0].values})
    os.replace(loom_tmp_path, loom_path)
    print_with_timestamp(f"Loom file for {folder} created. Size: {get_file_size(loom_path):.2f} MB")

    # Drop the large objects and collect before the next patient is loaded.
    del genes, barcodes, matrix_csr
    gc.collect()

    print_with_timestamp(f"Finished processing {folder}. H5AD and Loom files for {folder} created.")

# Process each missing patient. A failure for one patient is logged and the
# loop moves on, so a single bad folder does not abort the rest of the batch.
failed_patients = []
for folder in missing_patients:
    try:
        process_patient(folder)
    except Exception as exc:  # batch boundary: record the failure and continue
        failed_patients.append(folder)
        print_with_timestamp(f"Failed to process {folder}: {exc!r}")

# Summarise the failures so they can be re-run or investigated.
if failed_patients:
    print_with_timestamp(
        f"{len(failed_patients)} patient(s) failed: {', '.join(failed_patients)}"
    )