In [11]:
import scanpy as sc
import scrublet as scr
import numpy as np
import h5py

def process_and_filter_data(adata):
    # Define the filter criteria
    min_genes = 500
    max_genes = 6000
    min_umis = 500
    max_umis = 40000
    max_mito_ratio = 0.10  # 10% mitochondrial genes ratio
    max_ribo_ratio = 0.40  # 40% ribosomal genes ratio

    # Filter cells based on gene counts and UMI counts
    sc.pp.filter_cells(adata, min_genes=min_genes)
    sc.pp.filter_cells(adata, max_genes=max_genes)
    sc.pp.filter_cells(adata, min_counts=min_umis)
    sc.pp.filter_cells(adata, max_counts=max_umis)

    # Calculate the mitochondrial and ribosomal gene ratios
    mito_genes = adata.var_names.str.startswith('MT-')
    ribo_genes = adata.var_names.str.startswith('RPL') | adata.var_names.str.startswith('RPS')

    # Calculate the mitochondrial and ribosomal gene sums for each cell
    mito_counts = adata[:, mito_genes].X.sum(axis=1)
    ribo_counts = adata[:, ribo_genes].X.sum(axis=1)
    
    # Calculate the mitochondrial and ribosomal gene ratios
    mito_ratio = mito_counts / len(adata.obs['n_counts'])
    ribo_ratio = ribo_counts / len(adata.obs['n_counts'])

    # Filter cells based on mitochondrial and ribosomal gene ratios
    adata = adata[(mito_ratio <= max_mito_ratio) & (ribo_ratio <= max_ribo_ratio), :]

    # Remove the temporary columns used for calculations
    adata.obs.drop(columns=['n_counts'], inplace=True)

    # Scrublet doublet detection
    scrub = scr.Scrublet(adata.X)
    doublet_scores, predicted_doublets = scrub.scrub_doublets()

    # Define a threshold for doublet prediction
    doublet_threshold = 0.25

    # Create a boolean mask for doublets
    is_doublet = predicted_doublets > doublet_threshold

    # Filter out doublets from the dataset
    adata = adata[~is_doublet, :]

    return adata


In [12]:
#load data as adata objects
#path is the directory with the `.mtx` file

path = '/home/dwk681/workspace/CRA004660/CRR403690_Liver-Iso-Y/CRR403690/outs/filtered_feature_bc_matrix'
adata_Iso_Y = sc.read_10x_mtx(
    path,  
    cache=True)

path = '/home/dwk681/workspace/CRA004660/CRR403692_Liver-Het-Y/CRR403692/outs/filtered_feature_bc_matrix'
adata_Het_Y = sc.read_10x_mtx(
    path,  
    cache=True)

path = '/home/dwk681/workspace/CRA004660/CRR403691_Liver-Iso-O/CRR403691/outs/filtered_feature_bc_matrix'
adata_Iso_O = sc.read_10x_mtx(
    path,  
    cache=True)

path = '/home/dwk681/workspace/CRA004660/CRR403693_Liver-Het-O/CRR403693/outs/filtered_feature_bc_matrix/' 
adata_Het_O = sc.read_10x_mtx(
    path,  
    cache=True)



In [14]:
#adata_list = [adata_Iso_Y, adata_Het_Y, adata_Iso_O, adata_Het_O]
#processed_data_list = [process_and_filter_data(adata) for adata in adata_list]

adata_Iso_Y_filtered = process_and_filter_data(adata_Iso_Y)
adata_Iso_O_filtered = process_and_filter_data(adata_Iso_O)
adata_Het_Y_filtered = process_and_filter_data(adata_Het_Y)
adata_Het_O_filtered = process_and_filter_data(adata_Het_O)

Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.63
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.1%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 16.7%
Elapsed time: 3.4 seconds
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.60
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.2%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 20.7%
Elapsed time: 8.6 seconds
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.81
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.4%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 0.0%
Elapsed time: 15.0 seconds
Preprocessing...
Simulating double

In [24]:
count_matrix_Iso_Y_filtered = adata_Iso_Y_filtered.X.toarray()
count_matrix_Iso_O_filtered = adata_Iso_O_filtered.X.toarray()
count_matrix_Het_Y_filtered = adata_Het_Y_filtered.X.toarray()
count_matrix_Het_O_filtered = adata_Het_O_filtered.X.toarray()



(11518, 68886)
(14029, 68886)
(9052, 68886)
(4273, 68886)


In [28]:
max_rows = max(count_matrix_Iso_Y_filtered.shape[0], count_matrix_Iso_O_filtered.shape[0], count_matrix_Het_Y_filtered.shape[0], count_matrix_Het_O_filtered.shape[0])

# Create new matrices with the maximum number of rows and fill in missing rows with NaN values
def pad_matrix(matrix, max_rows):
    num_rows, num_cols = matrix.shape
    if num_rows < max_rows:
        padding = np.full((max_rows - num_rows, num_cols), np.nan)
        return np.vstack((matrix, padding))
    else:
        return matrix

count_matrix_Iso_Y_padded = pad_matrix(count_matrix_Iso_Y_filtered, max_rows)
count_matrix_Iso_O_padded = pad_matrix(count_matrix_Iso_O_filtered, max_rows)
count_matrix_Het_Y_padded = pad_matrix(count_matrix_Het_Y_filtered, max_rows)
count_matrix_Het_O_padded = pad_matrix(count_matrix_Het_O_filtered, max_rows)

(14029, 68886)
(14029, 68886)
(14029, 68886)
(14029, 68886)


In [30]:
output_directory = '/home/dwk681/workspace/CRA004660/'
filename = "all_four_padded_filtered_count_matrices"  # Update this with the desired filename

# Define the count matrices (e.g., count_matrix_Iso_Y_padded, count_matrix_Iso_O_padded, etc.)

# Create an HDF5 file and save the count matrices
with h5py.File(output_directory + f'/{filename}.mat', 'w') as f:
    f.create_dataset('Iso_Y', data=count_matrix_Iso_Y_padded)
    f.create_dataset('Iso_O', data=count_matrix_Iso_O_padded)
    f.create_dataset('Het_Y', data=count_matrix_Het_Y_padded)
    f.create_dataset('Het_O', data=count_matrix_Het_O_padded)

