# Find the downsampling target for RNA

In [1]:
suppressPackageStartupMessages({
    library(Seurat)
    library(repr)
    library(patchwork)
    library(ggplot2)
    library(Signac)
    library(tidyverse)
    library(GenomicRanges)
    library(edgeR)
    library(SingleCellExperiment)
    library(Matrix)
    library(scran)
    library(scater)
    library(ggrepel)
    library(fs)
})
# Session-wide options for future, Seurat and ggrepel, set in a single call.
options(
    future.globals.maxSize = Inf,
    Seurat.object.assay.version = "v5",
    ggrepel.max.overlaps = Inf
)

In [4]:
# Data root for this analysis; the relative paths used in later cells
# (metadata sheet, matrix directories, output directory) resolve against it.
root_dir <- "/tscc/projects/ps-epigen/users/biy022/biccn/data/SNAREdata"
# Make root_dir the working directory for the rest of the session.
setwd(root_dir)

In [3]:
# Load the metadata sheet. Despite the .xls extension it is parsed as
# tab-separated text with a header row; the first column becomes the row names.
meta_sheet <- "20230313_RNA_metadata.xls"
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
meta_df <- read.table(meta_sheet, sep = "\t", header = TRUE, row.names = 1)

In [6]:
# Collect, over every subclass in the metadata, the barcodes that appear as
# column names in both the cell_gene_matrix and the cell_peak_matrix file of
# that subclass.
#
# The per-subclass work runs inside a function so the loaded matrices go out of
# scope after each subclass instead of staying in the global environment, and
# the pieces are concatenated once rather than grown with c() on every pass.
subclass_labels <- as.character(unique(meta_df$Subclass_F))
all_barcodes <- unlist(
    lapply(subclass_labels, function(subclass) {
        # File names use "_" wherever the label has a space or a slash.
        subclass_name <- gsub("[ /]", "_", subclass)
        # This one label is looked up under the shorter file name "Micro".
        if (subclass_name == "Micro_PVM") {
            subclass_name <- "Micro"
        }
        rds_name <- sprintf("%s.rds", subclass_name)
        rna_mat <- readRDS(file.path("cell_gene_matrix/", rds_name))
        atac_mat <- readRDS(file.path("cell_peak_matrix/", rds_name))
        # Keep only the barcodes present in both matrices.
        intersect(colnames(rna_mat), colnames(atac_mat))
    }),
    use.names = FALSE
)

In [10]:
# Restrict the metadata to the rows named in `all_barcodes`, in that order.
# Indexing a data frame by a row name that does not exist silently returns an
# all-NA row, so report such barcodes instead of letting them pass unnoticed.
missing_barcodes <- setdiff(all_barcodes, rownames(meta_df))
if (length(missing_barcodes) > 0) {
    warning(
        sprintf(
            "%d barcode(s) have no row in the metadata sheet",
            length(missing_barcodes)
        ),
        call. = FALSE
    )
}
# drop = FALSE guarantees a data frame even if the sheet had a single column.
meta_df <- meta_df[all_barcodes, , drop = FALSE]

In [11]:
# Cell counts cross-tabulated as subclass (rows) by region (columns).
table(meta_df[["Subclass_F"]], meta_df[["Region"]])

            
                A1C    A24     A9    AnG     FI    M1C    MTG    S1C    V1C
  Astro       14649  13226  10920  29533  19811  23533   9592  18082  10270
  Chandelier    714    641    929   1485   1558   1890    768   1062    630
  Endo          312    459    473    312    448    931    415    430    364
  L2/3 IT     27167  13243  21608  45307  32170  51090  26804  32751  48555
  L4 IT        4676   1592   4342  10747   6671   5580   7610   3791  13865
  L5 ET         179    489    329    278    623    920    161    656    137
  L5 IT        6365   2947   6942  12085   9173   7754   8126   5468   8984
  L5/6 NP       942    929   1268   2008   1834   1785   1258   1513    815
  L6 CT        3139   2546   2644   2650   3850   7478   2311   4162   6337
  L6 IT        4924   2586   4892   5537   7489   7400   3861   6407   4813
  L6 IT Car3   3969    507   1266   2439   4207   2693   1915   2019   2218
  L6B          1542   1581   2480   1691   4206   3041   1235   1996   1157

In [12]:
# TRUE marks subclass/region combinations that have fewer than 1000 cells.
table(meta_df[["Subclass_F"]], meta_df[["Region"]]) < 1000

Unnamed: 0,A1C,A24,A9,AnG,FI,M1C,MTG,S1C,V1C
Astro,False,False,False,False,False,False,False,False,False
Chandelier,True,True,True,False,False,False,True,False,True
Endo,True,True,True,True,True,True,True,True,True
L2/3 IT,False,False,False,False,False,False,False,False,False
L4 IT,False,False,False,False,False,False,False,False,False
L5 ET,True,True,True,True,True,True,True,True,True
L5 IT,False,False,False,False,False,False,False,False,False
L5/6 NP,True,True,False,False,False,False,False,False,True
L6 CT,False,False,False,False,False,False,False,False,False
L6 IT,False,False,False,False,False,False,False,False,False


In [14]:
# Downsample every subclass to at most `target` cells per region and write the
# chosen barcodes to one tab-separated file per subclass, with a single
# `barcode` column and a header row.
target <- 1000
output_dir <- "downsample_analysis/sample_barcodes/"
# write.table() does not create missing directories, so make sure it exists.
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
# Fix the RNG state so re-running this cell draws the same barcodes. The seed
# value itself is arbitrary.
set.seed(42)

# NA labels are skipped: they would otherwise select all-NA rows and produce
# an "NA.tsv" file.
subclass_labels <- as.character(unique(meta_df$Subclass_F))
subclass_labels <- subclass_labels[!is.na(subclass_labels)]

for (subclass in subclass_labels) {
    # File names use "_" wherever the label has a space or a slash.
    # NOTE(review): the cell that reads the matrices maps "Micro_PVM" to
    # "Micro", but this cell does not, so that subclass would be written as
    # "Micro_PVM.tsv" while its matrices are "Micro.rds" - confirm which name
    # downstream steps expect.
    subclass_name <- gsub("[ /]", "_", subclass)
    # which() ignores NA comparisons instead of turning them into NA rows.
    tmp_df <- meta_df[which(meta_df$Subclass_F == subclass), , drop = FALSE]

    regions <- as.character(unique(tmp_df$Region))
    regions <- regions[!is.na(regions)]
    # Draw per region, then concatenate once instead of growing a vector.
    per_region <- lapply(regions, function(region) {
        tmp_barcodes <- rownames(tmp_df)[which(tmp_df$Region == region)]
        # Regions with fewer than `target` cells are kept in full.
        n <- min(target, length(tmp_barcodes))
        sample(tmp_barcodes, n, replace = FALSE)
    })
    # as.character() turns an empty result into character(0) rather than NULL,
    # so the output file still gets its `barcode` header.
    sel_barcodes <- as.character(unlist(per_region, use.names = FALSE))

    out_df <- data.frame(barcode = sel_barcodes)
    out_file <- file.path(output_dir, sprintf("%s.tsv", subclass_name))
    write.table(
        out_df, out_file,
        col.names = TRUE, sep = "\t", quote = FALSE, row.names = FALSE
    )
}

In [16]:
# Sanity check: for one subclass, load its gene matrix and the barcode file
# written for it, so the following cells can compare the two.
subclass <- "L5_IT"
object <- readRDS(sprintf("cell_gene_matrix/%s.rds", subclass))
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
barcodes <- read.table(
    file.path(output_dir, sprintf("%s.tsv", subclass)),
    sep = "\t", header = TRUE
)

In [17]:
# TRUE when every sampled barcode is one of the column names of `object`.
!anyNA(match(barcodes$barcode, colnames(object)))

In [18]:
# Number of barcodes read back from the sampled file.
length(barcodes[["barcode"]])

In [19]:
# Number of rows currently held in the metadata table.
dim(meta_df)[1L]