In [None]:
#This notebook processes the feature-barcode matrix produced by `cellranger count` and prepares it for hypermatrix analysis
#TODO: not sure yet how to make this code filter cells in the same way that the AnnData (adata) code does

In [51]:
#This cell imports modules and defines functions for loading, saving and processing a gene-barcode matrix
#This code was taken from https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/python
#Also taken from https://github.com/mousepixels/sanbomics_scripts/blob/main/doublet_removal_SOLO_scVI.ipynb

%matplotlib inline
import collections
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sp_sparse
import h5py
import scvi
import scanpy as sc
import scrublet as scr
import pandas as pd
from matplotlib.pyplot import rc_context
# Scanpy figure settings for the notebook (100 dpi).
sc.set_figure_params(dpi=100)

# Seed NumPy's global random number generator.
np.random.seed(0)
# Container returned by get_matrix_from_h5: feature IDs, feature names,
# barcodes and the sparse count matrix, in that order.
FeatureBCMatrix = collections.namedtuple('FeatureBCMatrix', ['feature_ids', 'feature_names', 'barcodes', 'matrix'])

def get_matrix_from_h5(filename):
    """Load a feature-barcode matrix from an HDF5 file.

    Parameters
    ----------
    filename : str or path-like
        Path to the ``.h5`` file to read.

    Returns
    -------
    FeatureBCMatrix
        ``feature_ids`` and ``feature_names`` as lists of ``str``,
        ``barcodes`` as a list of the raw values stored in the file
        (left undecoded, as before), and ``matrix`` as a
        ``scipy.sparse.csc_matrix`` with the shape stored in the file.

    Raises
    ------
    ValueError
        If the file has no ``version`` attribute, or if its version is
        greater than 2.
    """
    # Open read-only explicitly; this function never writes to the file.
    with h5py.File(filename, 'r') as f:
        # A missing attribute has no version number to report, so the message
        # must not try to format one.
        if 'version' not in f.attrs:
            raise ValueError('Matrix HDF5 file has no format version attribute; it is an older version that is not supported by this function.')

        version = f.attrs['version']
        if version > 2:
            raise ValueError('Matrix HDF5 file format version (%d) is a newer version that is not supported by this function.' % version)

        group = f['matrix']
        features = group['features']

        feature_ids = [x.decode('ascii', 'ignore') for x in features['id']]
        feature_names = [x.decode('ascii', 'ignore') for x in features['name']]
        # NOTE: barcodes are intentionally left as stored (not decoded) so the
        # returned values keep the same type as before.
        barcodes = list(group['barcodes'][:])

        # Read the datasets into memory before the file is closed, then build
        # the sparse matrix from plain arrays.
        matrix = sp_sparse.csc_matrix(
            (group['data'][:], group['indices'][:], group['indptr'][:]),
            shape=tuple(group['shape'][:]),
        )
        return FeatureBCMatrix(feature_ids, feature_names, barcodes, matrix)

    
def get_df_from_tuple(feature_bc_matrix):
    """Turn a feature-barcode matrix tuple into a dense pandas DataFrame.

    The sparse ``matrix`` is converted to a dense array and transposed, so
    the resulting frame has one row per barcode and one column per feature
    name.

    Parameters
    ----------
    feature_bc_matrix : object with ``feature_names``, ``barcodes``, ``matrix``
        ``matrix`` must provide ``toarray()``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``barcodes`` with ``feature_names`` as columns.
    """
    # Densify first, then flip so that barcodes run along the rows.
    cells_by_features = feature_bc_matrix.matrix.toarray().T
    return pd.DataFrame(
        cells_by_features,
        index=feature_bc_matrix.barcodes,
        columns=feature_bc_matrix.feature_names,
    )

def get_expression(fbm, gene_name):
    """Return the counts of one gene across all barcodes.

    Parameters
    ----------
    fbm : object with ``feature_names`` (list) and ``matrix`` (sparse matrix)
        The matrix rows are indexed in the same order as ``feature_names``.
    gene_name : str
        Feature name to look up. The first occurrence in ``feature_names``
        is used.

    Returns
    -------
    numpy.ndarray
        The selected matrix row as a dense, squeezed array.

    Raises
    ------
    ValueError
        If ``gene_name`` is not present in ``fbm.feature_names``.
    """
    try:
        # Look the gene up in the matrix that was passed in, not in a
        # notebook-level global that may not exist or may be another sample.
        gene_index = fbm.feature_names.index(gene_name)
    except ValueError as err:
        raise ValueError("%s was not found in list of gene names." % gene_name) from err
    return fbm.matrix[gene_index, :].toarray().squeeze()

  IPython.display.set_matplotlib_formats(*ipython_format)


In [52]:
#load data into variables

# Each sample is loaded the same way: read the filtered feature-barcode matrix
# from its .h5 file, then convert it to a dense barcodes-by-features DataFrame.
# NOTE: `path` is reused for every sample and ends up holding the Iso-O path.

# Liver Het-O (run CRR403693)
path = '/home/dwk681/workspace/CRA004660/CRR403693_Liver-Het-O/CRR403693/outs/filtered_feature_bc_matrix.h5'
feature_bc_matrix_Het_O = get_matrix_from_h5(path)
df_Het_O = get_df_from_tuple(feature_bc_matrix_Het_O)

# Liver Iso-Y (run CRR403690)
path = '/home/dwk681/workspace/CRA004660/CRR403690_Liver-Iso-Y/CRR403690/outs/filtered_feature_bc_matrix.h5'
feature_bc_matrix_Iso_Y = get_matrix_from_h5(path)
df_Iso_Y = get_df_from_tuple(feature_bc_matrix_Iso_Y)

# Liver Het-Y (run CRR403692)
path = '/home/dwk681/workspace/CRA004660/CRR403692_Liver-Het-Y/CRR403692/outs/filtered_feature_bc_matrix.h5'
feature_bc_matrix_Het_Y = get_matrix_from_h5(path)
df_Het_Y = get_df_from_tuple(feature_bc_matrix_Het_Y)

# Liver Iso-O (run CRR403691)
path = '/home/dwk681/workspace/CRA004660/CRR403691_Liver-Iso-O/CRR403691/outs/filtered_feature_bc_matrix.h5'
feature_bc_matrix_Iso_O = get_matrix_from_h5(path)
df_Iso_O = get_df_from_tuple(feature_bc_matrix_Iso_O)


In [53]:
df_Het_O 

Unnamed: 0,GRCh38_MIR1302-2HG,GRCh38_FAM138A,GRCh38_OR4F5,GRCh38_AL627309.1,GRCh38_AL627309.3,GRCh38_AL627309.2,GRCh38_AL627309.5,GRCh38_AL627309.4,GRCh38_AP006222.2,GRCh38_AL732372.1,...,mm10___Gm16367,mm10___AC163611.1,mm10___AC163611.2,mm10___AC140365.1,mm10___AC124606.2,mm10___AC124606.1,mm10___AC133095.2,mm10___AC133095.1,mm10___AC234645.1,mm10___AC149090.1
b'AAACCCAAGAGCCGAT-1',0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,15
b'AAACCCAAGCCAGAGT-1',0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
b'AAACCCAAGCGCTGCT-1',0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
b'AAACCCAAGGAGAGTA-1',0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
b'AAACCCAAGTTTGTCG-1',0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
b'TTTGTTGTCCTGTAGA-1',0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
b'TTTGTTGTCCTTGAAG-1',0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
b'TTTGTTGTCGATACAC-1',0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
b'TTTGTTGTCTCATTTG-1',0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3


In [48]:
#Filter out low-quality cells (rows) according to the paper's parameters; genes (columns) are left untouched

def process_dataframe(df, min_genes=500, max_genes=6000, min_umis=500, max_umis=40000, max_mito_ratio=0.10, max_ribo_ratio=0.40, doublet_threshold=0.25):
    """Drop low-quality cells from a cells-by-genes count DataFrame.

    A cell (row) is kept only if all of the following hold:

    * the number of genes with a non-zero count is within
      ``[min_genes, max_genes]``;
    * the total count (UMIs) is within ``[min_umis, max_umis]``;
    * the fraction of counts in mitochondrial genes is ``<= max_mito_ratio``;
    * the fraction of counts in ribosomal genes is ``<= max_ribo_ratio``.

    Mitochondrial genes are columns whose name starts with ``MT-`` and
    ribosomal genes those starting with ``RPL`` / ``RPS``. The match is
    case-insensitive and tolerates a leading genome prefix followed by one or
    more underscores (e.g. ``GRCh38_MT-...`` or ``mm10___mt-...``), as found
    in matrices built against a multi-genome reference.

    Parameters
    ----------
    df : pandas.DataFrame
        Counts with one row per cell and one column per gene.
    min_genes, max_genes : int
        Inclusive bounds on the number of detected genes per cell.
    min_umis, max_umis : int
        Inclusive bounds on the total count per cell.
    max_mito_ratio, max_ribo_ratio : float
        Inclusive upper bounds on the mitochondrial / ribosomal fractions.
    doublet_threshold : float
        Accepted for backward compatibility only; doublet removal is not
        implemented here and this value is ignored.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that pass every filter, all columns retained.
    """
    # Optional "<genome>_" prefix, then the gene-family prefix at the start of
    # the actual gene name.
    mito_pattern = r'(?:[^_]+_+)?MT-'
    ribo_pattern = r'(?:[^_]+_+)?RP[LS]'

    # Per-cell QC metrics, each computed once on the unfiltered frame. Every
    # metric depends only on its own row, so the filters can be combined into
    # a single mask instead of being applied one after another.
    genes_per_cell = (df > 0).sum(axis=1)
    umis_per_cell = df.sum(axis=1)

    gene_names = df.columns.astype(str)
    is_mito = np.asarray(gene_names.str.match(mito_pattern, case=False), dtype=bool)
    is_ribo = np.asarray(gene_names.str.match(ribo_pattern, case=False), dtype=bool)

    # Positional boolean selection: a duplicated gene name is counted once per
    # column rather than once per label match.
    mito_counts = df.loc[:, is_mito].sum(axis=1)
    ribo_counts = df.loc[:, is_ribo].sum(axis=1)

    # Avoid 0/0 for cells without any counts: their ratios are defined as 0 so
    # that only the gene/UMI bounds decide whether they are kept.
    nonzero_umis = umis_per_cell.where(umis_per_cell > 0)
    mito_ratio = (mito_counts / nonzero_umis).fillna(0.0)
    ribo_ratio = (ribo_counts / nonzero_umis).fillna(0.0)

    keep = (
        genes_per_cell.between(min_genes, max_genes)
        & umis_per_cell.between(min_umis, max_umis)
        & (mito_ratio <= max_mito_ratio)
        & (ribo_ratio <= max_ribo_ratio)
    )

    # NOTE: doublet removal (e.g. with scrublet) is not performed here;
    # `doublet_threshold` is currently unused.

    # Select by position so that duplicated barcodes in the index cannot cause
    # alignment problems.
    return df.loc[keep.to_numpy()]



In [49]:
# Apply the QC filter (default thresholds) to every sample.
# NOTE: each name is rebound to its filtered frame, so the unfiltered frames
# are no longer available under these names after this cell has run.
df_Het_O = process_dataframe(df_Het_O)
df_Iso_Y = process_dataframe(df_Iso_Y)
df_Het_Y = process_dataframe(df_Het_Y)
df_Iso_O = process_dataframe(df_Iso_O)


In [50]:
#original
#Het_O     19,362 × 68,886


#second run, filtering without scrublet, with annData Object
#df_Het_O         #11,519 × 68,886
#df_Iso_Y         #261 rows × 68886 columns
#df_Het_Y         #938 rows × 68886 columns
#df_Iso_O    

#first run, seems to eliminate too much
#df_Het_O      #1,444 rows × 68,886 columns
#df_Iso_Y      #247 rows × 68,886 columns
#df_Het_Y      #929 rows × 68,886 columns
#df_Iso_O      #755 rows × 68886 columns


Unnamed: 0,GRCh38_MIR1302-2HG,GRCh38_FAM138A,GRCh38_OR4F5,GRCh38_AL627309.1,GRCh38_AL627309.3,GRCh38_AL627309.2,GRCh38_AL627309.5,GRCh38_AL627309.4,GRCh38_AP006222.2,GRCh38_AL732372.1,...,mm10___Gm16367,mm10___AC163611.1,mm10___AC163611.2,mm10___AC140365.1,mm10___AC124606.2,mm10___AC124606.1,mm10___AC133095.2,mm10___AC133095.1,mm10___AC234645.1,mm10___AC149090.1
b'AAACCCAAGTTTGTCG-1',0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6
b'AAACCCATCCTCGCAT-1',0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,14
b'AAACGAAGTCTCACAA-1',0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
b'AAAGAACGTGCCTATA-1',0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,21
b'AAAGGATCAATAACCC-1',0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
b'TTTGGAGCACTACCCT-1',0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,15
b'TTTGGAGGTCGAATGG-1',0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,15
b'TTTGGTTAGCTTAGTC-1',0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
b'TTTGTTGCAGTCCGTG-1',0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,23


In [None]:
#notes

# the sparse matrix contains 68,886 features x 19,362 barcodes (cells)
# in the DataFrames it is transposed: rows are cells, columns are features (genes)
# feature_bc_matrix[0] are the feature IDs                                     length 68,886
# feature_bc_matrix[1] are the feature_names                                   length 68,886
# feature_bc_matrix[2] are the barcodes (stored as bytes)                      length 19,362
# feature_bc_matrix[3] is the 68,886x19,362 matrix in sparse (CSC) matrix format

# calculate UMIs and genes per cell
#umis_per_cell = np.asarray(feature_bc_matrix.matrix.sum(axis=0)).squeeze()
#genes_per_cell = np.asarray((feature_bc_matrix.matrix > 0).sum(axis=0)).squeeze()

# You can access expression values for a specific gene, for example, 'MY_GENE', like this:
#gene_expression = df['MY_GENE']

# You can also access the expression values for a specific barcode. Barcodes are kept as bytes
# (they are not decoded when the matrix is loaded), so use a bytes key, for example b'MY_BARCODE':
#barcode_expression = df.loc[b'MY_BARCODE']