In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import os
import glob


In [2]:
# Allocate the base directory of the data
base_dir = "/scratch/bvdberg/SoloTE/"

# Use glob to find directories matching the pattern
directories = glob.glob(os.path.join(base_dir,"run_*/", "*_SoloTE_output/", "*_locustes_MATRIX"))
anndata = {}


In [21]:
for directory in directories:

    # Set the wildcard per directory
    wildcard = os.path.basename(os.path.dirname(directory))

    # Check if the required files exist in the directory
    matrix_file = os.path.join(directory, "matrix.mtx")
    barcodes_file = os.path.join(directory, "barcodes.tsv")
    features_file = os.path.join(directory, "features.tsv")

    if os.path.exists(matrix_file) and os.path.exists(barcodes_file) and os.path.exists(features_file):
        # Create a new AnnData object for each directory
        adata = sc.AnnData()

        # Read the matrix using scanpy
        adata.X = sc.read_mtx(matrix_file)
        print(adata)
        
        # Read barcodes and features using pandas
        barcodes = pd.read_csv(barcodes_file, sep='\t', header=None, names=["barcode"])
        features = pd.read_csv(features_file, sep='\t', header=None, names=["gene_name"])
        print (barcodes)
        print (features)
        
        # Set obs_names and var_names
        adata.obs_names = barcodes[0].values
        adata.var_names = features[0].values

        anndata[wildcard] = adata

    else:
        print(f"Required files not found in directory: {directory}")


In [7]:
# Give each dataset their own label to keep track of where they came from
adatas = {"A018": adata1, "A019": adata2, "A020": adata3}

# Combine the adata sets to one data set with the Concatenate function, we use 'outer' to preserve as much data as possible. Missing variablles will become NaN values
adatas = ad.concat(adatas, label="dataset_origin",  join="outer")


  utils.warn_names_duplicates("obs")


In [None]:
adatas.write_h5ad(filename="bonemarrow_collection.h5ad")
