# Code for converting H5AD files to TDB
### At the beginning of this file, please indicate whether you want to convert the raw files, the log2 files, or both to TDB.
### At the end of this file, the terminal output is saved to a log file.

In [1]:
# Specify the file type to convert: 'both', 'raw', or 'log2'
file_type_to_convert = 'log2' #'both'  or 'raw' or 'log2'

# Fail fast on a typo here: otherwise the mistake only surfaces later as the
# same "Invalid file_type" error logged once per input file, with nothing converted.
if file_type_to_convert not in ('both', 'raw', 'log2'):
    raise ValueError(
        f"file_type_to_convert must be 'both', 'raw', or 'log2', got {file_type_to_convert!r}"
    )

## Preliminary Steps
Import Libraries: Necessary Python packages like anndata, tiledb, logging, etc., are imported.
Print Versions: TileDB versions are printed for debugging and traceability.
Specify Directories: input_dir and output_dir are specified.
List Files: All files in the input_dir are printed for a visual display.

In [2]:
# Import libraries
import logging
import io
import sys
import os
import numpy as np
import anndata as ad
import pandas as pd
import tiledb
import tiledbsoma
import tiledbsoma.io
import tiledbsoma.logging
import logging
# from concurrent.futures import ThreadPoolExecutor # For later for parallel conversion
from tqdm import tqdm

In [3]:
# Report the TileDB core, TileDB-Py and TileDB-SOMA versions so that a run
# can be traced back to the exact library stack that produced it.
print(f"TileDB core version: {tiledb.libtiledb.version()}")
print("TileDB-Py v{} TileDB-SOMA v{}".format(tiledb.version(), tiledbsoma.__version__))

TileDB core version: (2, 15, 2)
TileDB-Py v(0, 21, 3) TileDB-SOMA v1.2.2


In [4]:
# Root of the downloaded data; the input and output directories hang off it.
download_base = '/shared/Data/abc_download_root'
# H5AD files are read from input_dir; converted output is written under output_dir.
input_dir = f"{download_base}/expression_matrices/WMB-10Xv3/20230630"
output_dir = f"{download_base}/expression_matrices/WMB-10Xv3-TDB/20230630"
print("Input directory: " + input_dir)
print("Output directory: " + output_dir)

Input directory: /shared/Data/abc_download_root/expression_matrices/WMB-10Xv3/20230630
Output directory: /shared/Data/abc_download_root/expression_matrices/WMB-10Xv3-TDB/20230630


## File Listing with Sizes
Initialize Lists: Two lists raw_files and log2_files are initialized to store filenames with sizes.
Check File Sizes: For each file in the input_dir, if it's a raw or log2 file, its size in GB is calculated and stored alongside the filename in the corresponding list.

In [5]:
# Show every entry of the input directory, one name per line, in the order
# os.listdir returns them.
directory_entries = os.listdir(input_dir)
for entry_name in directory_entries:
    print(entry_name)

WMB-10Xv3-PAL-log2.h5ad
WMB-10Xv3-CB-log2.h5ad
WMB-10Xv3-OLF-log2.h5ad
WMB-10Xv3-HY-log2.h5ad
WMB-10Xv3-STR-log2.h5ad
WMB-10Xv3-Isocortex-2-log2.h5ad
WMB-10Xv3-MB-log2.h5ad
WMB-10Xv3-HPF-log2.h5ad
WMB-10Xv3-Isocortex-1-log2.h5ad
WMB-10Xv3-MY-log2.h5ad
WMB-10Xv3-CTXsp-log2.h5ad
WMB-10Xv3-P-log2.h5ad
WMB-10Xv3-TH-log2.h5ad


In [6]:
# Two independent, initially empty lists that will hold "<filename> (<size>)"
# display strings: one for raw files, one for log2 files.
raw_files, log2_files = [], []

# Function to get file size in GB
def get_size_in_gb(file_path):
    """Return the size of ``file_path`` formatted as a string such as ``"3.79 GB"``.

    The size reported by ``os.path.getsize`` is divided by 1024**3 and shown
    with two decimal places.
    """
    bytes_per_gb = 1024 ** 3
    return "{:.2f} GB".format(os.path.getsize(file_path) / bytes_per_gb)

# Sort each entry of the input directory into the raw or log2 list, stored as
# "<filename> (<size>)"; entries with any other suffix are ignored.
for entry_name in os.listdir(input_dir):
    if entry_name.endswith("raw.h5ad"):
        bucket = raw_files
    elif entry_name.endswith("log2.h5ad"):
        bucket = log2_files
    else:
        continue
    entry_size = get_size_in_gb(os.path.join(input_dir, entry_name))
    bucket.append(f"{entry_name} ({entry_size})")

# Table header: two left-aligned columns of width 50, then a rule.
print("Raw Files".ljust(50) + "Log2 Files".ljust(50))
print("=" * 100)

# The table needs as many rows as the longer of the two lists.
max_rows = max(len(raw_files), len(log2_files))

# Pad the shorter column with empty strings so both can be walked in lockstep.
raw_column = raw_files + [''] * (max_rows - len(raw_files))
log2_column = log2_files + [''] * (max_rows - len(log2_files))
for raw_cell, log2_cell in zip(raw_column, log2_column):
    print(f"{raw_cell:<50}{log2_cell:<50}")

Raw Files                                         Log2 Files                                        
                                                  WMB-10Xv3-PAL-log2.h5ad (3.79 GB)                 
                                                  WMB-10Xv3-CB-log2.h5ad (5.23 GB)                  
                                                  WMB-10Xv3-OLF-log2.h5ad (2.90 GB)                 
                                                  WMB-10Xv3-HY-log2.h5ad (6.75 GB)                  
                                                  WMB-10Xv3-STR-log2.h5ad (11.10 GB)                
                                                  WMB-10Xv3-Isocortex-2-log2.h5ad (7.78 GB)         
                                                  WMB-10Xv3-MB-log2.h5ad (12.78 GB)                 
                                                  WMB-10Xv3-HPF-log2.h5ad (6.90 GB)                 
                                                  WMB-10Xv3-Isocortex-1-log2.h5ad (10.96 GB

## Logging Setup
Initialize Logging: Logging is set up to capture messages at the INFO level.
Log File: A conversion_all.log file is created in a conversionLog directory under output_dir. This file will capture all the logging messages.

In [7]:
# Initialize logging
conversion_log_dir = os.path.join(output_dir, "conversionLog")

# Create the conversionLog directory (and any missing parents). exist_ok=True
# replaces the separate exists() check, so there is no window between the
# check and the creation.
os.makedirs(conversion_log_dir, exist_ok=True)

# Setup Python logging: INFO and above go both to conversion_all.log (the file
# is truncated on each setup because of mode 'w') and to a stream handler.
# NOTE: logging.basicConfig does nothing if the root logger already has
# handlers, e.g. when this cell is re-run in the same kernel.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[logging.FileHandler(os.path.join(conversion_log_dir, 'conversion_all.log'),
                                                  mode='w', encoding='utf-8'),
                              logging.StreamHandler()])

logging.info("Starting conversion...")

2023-09-23 18:33:34,610 - INFO - Starting conversion...


## Counters Initialization
Initialize Counters: Counters for completed_files and total_files are initialized to keep track of conversion progress.

In [8]:
# Progress bookkeeping, keyed by file kind ('raw' / 'log2'):
#   completed_files - how many files of that kind have been converted so far
#   total_files     - how many files of that kind exist in input_dir
def _count_with_suffix(suffix):
    """Count the entries of input_dir whose name ends with ``suffix``."""
    return sum(1 for name in os.listdir(input_dir) if name.endswith(suffix))

completed_files = dict.fromkeys(('raw', 'log2'), 0)
total_files = {
    'raw': _count_with_suffix("raw.h5ad"),
    'log2': _count_with_suffix("log2.h5ad"),
}

## The Conversion Function (convert_file)
File Type Check: Validates the file_type argument and logs an error if invalid.
Conversion: Checks if the file should be converted based on its suffix and the file_type argument.
Buffer Size: Dynamically sets the TileDB buffer size based on the size of the AnnData object.
Column Check: Checks the categorical columns of the AnnData object's obs table and removes any column whose values contain non-ASCII characters.
TileDB Conversion: Calls tiledbsoma.io.from_anndata() to convert the AnnData object to a TileDB array.
Progress Logging: Logs the conversion progress to both the terminal and the conversion_all.log file.
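TileDB Logging: Intended to capture TileDB-specific logging into individual log files, one for each converted file. This step is currently commented out in the code because tiledbsoma.logging.info() returns None.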

In [9]:
# Function to convert a single file
def _has_non_ascii(values):
    """Return True if any non-missing string value in ``values`` has a non-ASCII character.

    Non-string values are ignored, so a column whose categories are not
    strings is reported as clean instead of raising.
    """
    # Scan each distinct value once rather than every row.
    return any(
        isinstance(value, str) and any(ord(ch) > 0x7F for ch in value)
        for value in values.dropna().unique()
    )


def convert_file(filename, file_type='both'):
    """Convert one H5AD file from ``input_dir`` into ``output_dir`` via tiledbsoma.

    Parameters
    ----------
    filename : str
        Name of a file inside ``input_dir``. Only names ending in ``raw.h5ad``
        or ``log2.h5ad`` are considered; anything else is silently ignored.
    file_type : str
        Which kind of file to convert: ``'both'``, ``'raw'`` or ``'log2'``.
        Any other value logs an error and nothing is converted.

    Returns
    -------
    None
        The outcome is reported through the ``logging`` module and the
        module-level ``completed_files`` counter.

    Notes
    -----
    Exceptions raised while processing are caught and logged (with traceback)
    so that one bad file does not stop a batch. The output location is the
    input file name without its extension, under ``output_dir``; if it already
    exists the file is skipped.
    """
    adata = None
    try:
        if file_type not in ['both', 'raw', 'log2']:
            logging.error("Invalid file_type. Use 'both', 'raw', or 'log2'")
            return

        # Classify the file by suffix; other files are not ours to convert.
        if filename.endswith("raw.h5ad"):
            key = 'raw'
        elif filename.endswith("log2.h5ad"):
            key = 'log2'
        else:
            return
        if file_type not in ('both', key):
            return

        fname = os.path.join(input_dir, filename)
        uri = os.path.join(output_dir, os.path.splitext(filename)[0])

        logging.info(f"Starting conversion: {fname} to {uri}")

        # NOTE(review): a conversion that failed part-way may already have
        # created something at `uri`, in which case it would be skipped here
        # on the next run. Verify and clean up manually if needed.
        if os.path.exists(uri):
            logging.info(f"Output file {uri} already exists. Skipping.")
            return

        # Open in backed mode ('r'); the handle is released in `finally`.
        adata = ad.read_h5ad(fname, 'r')

        # Update buffer size dynamically based on the AnnData object size
        # Calculate the size of the AnnData object's X attribute in bytes
        adata_nbytes = adata.X.nbytes if hasattr(adata.X, 'nbytes') else 0
        # Calculate the dynamic buffer size (capped at 50 MiB)
        buffer_size = min(50 * 1024**2, adata_nbytes * 2)
        # NOTE(review): this Config comes from a throwaway tiledb.Ctx() and is
        # not handed to from_anndata below, so the update appears to have no
        # effect on the ingestion. Confirm whether it should be supplied
        # through a context argument.
        cfg = tiledb.Ctx().config()
        cfg.update({'py.init_buffer_bytes': buffer_size})

        # Drop categorical obs columns that contain non-ASCII text. The list of
        # candidate columns is materialised first so that dropping does not
        # interfere with the iteration.
        categorical_columns = list(adata.obs.select_dtypes(include='category').columns)
        for col in categorical_columns:
            if _has_non_ascii(adata.obs[col]):
                logging.warning(f"Column '{col}' in file {filename} contains records with non-ASCII characters. Removing column.")
                adata.obs.drop(col, axis=1, inplace=True)

        tiledbsoma.io.from_anndata(uri, adata, measurement_name="RNA")

        # Increment and display the counter
        completed_files[key] += 1
        logging.info(f"Successfully converted {filename} to {uri}. Progress: {completed_files[key]}/{total_files[key]}")

        # Save tiledb-specific logging output to individual log file
        # log_file = os.path.join(conversion_log_dir, f"{os.path.splitext(filename)[0]}.log")
        # with open(log_file, 'w') as f:
        #     f.write(tiledbsoma.logging.info())   # currently spits out an error since the value returned by tiledbsoma.logging.info() is None

    except Exception as e:
        # Best effort: log with traceback and let the caller continue with the next file.
        logging.exception(f"An error occurred while processing {filename}: {e}")
    finally:
        # Release the HDF5 handle held by the backed AnnData object so that
        # handles do not accumulate over a batch of large files.
        if adata is not None and adata.isbacked:
            adata.file.close()

## Running & Capturing Terminal Output
Capture stdout: The standard output is captured into a StringIO object.
Run Conversion: The convert_file function is called for each file in the input_dir while displaying a progress bar.
Restore stdout: The standard output is restored.
Save Captured Output: The captured terminal output is saved into a log file named conversion_terminal_output.log in the output_dir.
### Output & Logging
Terminal: You will see logging messages and a progress bar.
conversion_all.log: This file will contain all the logging messages.
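Individual Log Files: TileDB-specific logging for each file is intended to be saved in individual .log files in the conversionLog directory. (This step is currently commented out in convert_file, so these files are not produced yet.)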
conversion_terminal_output.log: This will contain everything that was written to standard output (stdout) while the conversion ran.

In [10]:
# Capture stdout
# Only text written to sys.stdout is captured. The progress bar and the log
# messages shown in the notebook are not expected to end up here, since tqdm
# and logging.StreamHandler() write to stderr by default.
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout

try:
    # Run conversion and display progress bar
    for filename in tqdm(os.listdir(input_dir)):
        convert_file(filename, file_type=file_type_to_convert)
finally:
    # Restore stdout even if the loop is interrupted (e.g. KeyboardInterrupt
    # from stopping the kernel is not caught inside convert_file); otherwise
    # every later print in this kernel would keep going into the buffer.
    sys.stdout = old_stdout

    # Get the captured content and write to file
    captured_output = new_stdout.getvalue()

    # Save captured terminal output to a log file, including whatever was
    # captured before an interruption.
    log_file = os.path.join(output_dir, "conversion_terminal_output.log")
    with open(log_file, 'w', encoding='utf-8') as f:
        f.write(captured_output)

  0%|          | 0/13 [00:00<?, ?it/s]2023-09-23 18:33:34,854 - INFO - Starting conversion: /shared/Data/abc_download_root/expression_matrices/WMB-10Xv3/20230630/WMB-10Xv3-PAL-log2.h5ad to /shared/Data/abc_download_root/expression_matrices/WMB-10Xv3-TDB/20230630/WMB-10Xv3-PAL-log2
2023-09-23 18:37:08,642 - INFO - Successfully converted WMB-10Xv3-PAL-log2.h5ad to /shared/Data/abc_download_root/expression_matrices/WMB-10Xv3-TDB/20230630/WMB-10Xv3-PAL-log2. Progress: 1/13
  8%|▊         | 1/13 [03:33<42:45, 213.79s/it]2023-09-23 18:37:08,647 - INFO - Starting conversion: /shared/Data/abc_download_root/expression_matrices/WMB-10Xv3/20230630/WMB-10Xv3-CB-log2.h5ad to /shared/Data/abc_download_root/expression_matrices/WMB-10Xv3-TDB/20230630/WMB-10Xv3-CB-log2
2023-09-23 18:41:30,332 - INFO - Successfully converted WMB-10Xv3-CB-log2.h5ad to /shared/Data/abc_download_root/expression_matrices/WMB-10Xv3-TDB/20230630/WMB-10Xv3-CB-log2. Progress: 2/13
 15%|█▌        | 2/13 [07:55<44:21, 241.97s/it]

In [11]:
# # Parallel conversion of files
# with ThreadPoolExecutor() as executor:
#     # executor.map(convert_file, os.listdir(input_dir))
#     list(tqdm(executor.map(convert_file, os.listdir(input_dir)), total=len(os.listdir(input_dir))))
# # Save terminal output to log file (if needed in addition to logging)
# log_file = os.path.join(output_dir, "conversion_terminal_output.log")
# with open(log_file, 'w') as f:
#     f.write(tiledbsoma.logging.info())