In [1]:
%matplotlib inline
import scrublet as scr
import scipy.io
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

In [2]:
# Figure typography: sans-serif family resolved to Arial, and PDF font type 42
# (the value matplotlib documents as TrueType embedding, which keeps text
# editable in exported PDFs).
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': 'Arial',
    'pdf.fonttype': 42,
})
# Base font size for all text elements.
plt.rc('font', size=14)

In [8]:
# Sample sheet: the 'SampleID' column is what the per-sample loop indexes into.
meta_sampleID_path = '/project/gca/yuzhao1/work/final_RC2rna/metadata/SampleIDs_pppou.csv'
meta_sampleID = pd.read_csv(meta_sampleID_path)

# Preprocessing root (per-sample matrices and gene lists are read from
# sub-folders of it) and the folder that receives one doublet-score CSV per sample.
input_dir = '/project/gca/yuzhao1/work/final_RC2rna/preprocessing/'
output_doubletscore_path = '/project/gca/yuzhao1/work/final_RC2rna/preprocessing/doublet_scores/'

In [12]:
# Run Scrublet on each sample's ambient-RNA-corrected count matrix and write the
# per-cell doublet scores to one CSV per sample (named after the sample ID).
#
# After the loop, counts_matrix / genes / scrub / doublet_scores /
# predicted_doublets hold the values of the LAST processed sample.
#
# NOTE(review): the sample count (14) is hard-coded; presumably it matches the
# number of rows in the sample sheet -- confirm, or derive it from
# len(meta_sampleID).

# np.savetxt does not create missing directories, so make sure the output
# folder exists up front instead of failing after the first (slow) Scrublet run.
os.makedirs(output_doubletscore_path, exist_ok=True)

# loop index i for samples
for i in range(0,14):
    # Look the sample ID up once; it names the input files and the output CSV.
    sample_id = meta_sampleID['SampleID'][i]
    counts_matrix_path = input_dir + 'matrix_removedAmbientRNA/' + sample_id + '.mtx'
    gene_path = input_dir + 'gene_removedAmbientRNA/' + sample_id + '.tsv'

    # Load the raw counts matrix as a scipy sparse matrix with cells as rows and genes as columns.
    # (The Matrix Market file is transposed on load and converted to CSC.)
    counts_matrix = scipy.io.mmread(counts_matrix_path).T.tocsc()
    genes = np.array(scr.load_genes(gene_path, delimiter='\t', column=0))

    # expected_doublet_rate: the expected fraction of transcriptomes that are doublets, typically 0.05-0.1.
    # Results are not particularly sensitive to this parameter.
    scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=0.06)

    # Run the Scrublet pipeline with explicit gene-filtering / PCA settings.
    doublet_scores, predicted_doublets = scrub.scrub_doublets(min_counts=2,
                                                              min_cells=3,
                                                              min_gene_variability_pctl=85,
                                                              n_prin_comps=30)

    # save doublet score to a folder, named by each sample
    # (only the scores are written; predicted_doublets is not persisted here)
    i_doubletscore_path = output_doubletscore_path + sample_id + '.csv'
    np.savetxt(i_doubletscore_path, doublet_scores, delimiter=",")



Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.51
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.1%
Overall doublet rate:
	Expected   = 6.0%
	Estimated  = 0.0%
Elapsed time: 9.4 seconds
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.44
Detected doublet rate = 0.5%
Estimated detectable doublet fraction = 12.7%
Overall doublet rate:
	Expected   = 6.0%
	Estimated  = 3.9%
Elapsed time: 4.7 seconds
Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.27
Detected doublet rate = 1.6%
Estimated detectable doublet fraction = 29.9%
Overall doublet rate:
	Expected   = 6.0%
	Estimated  = 5.5%
Elapsed time: 7.1 seconds
Preprocessing...
Simulating doublets..

In [5]:
# Sanity check: the number of matrix columns should equal the length of the
# gene list. Both variables are whatever the most recent loop iteration left behind.
n_rows, n_cols = counts_matrix.shape[0], counts_matrix.shape[1]
print(f'Counts matrix shape: {n_rows} rows, {n_cols} columns')
print(f'Number of genes in gene list: {len(genes)}')

Counts matrix shape: 9207 rows, 36601 columns
Number of genes in gene list: 36601
