# Downsample the dataset

In [1]:
suppressPackageStartupMessages({
    library(Seurat)
    library(repr)
    library(patchwork)
    library(ggplot2)
    library(Signac)
    library(tidyverse)
    library(GenomicRanges)
    library(edgeR)
    library(SingleCellExperiment)
    library(Matrix)
    library(scran)
    library(scater)
    library(ggrepel)
    library(fs)
    library(tidyverse)
    library(randomForest)
    library(reticulate)
    library(pheatmap)
    library(gridExtra)
    library(RColorBrewer)
    library(MAST)
    library(data.table)
    library(ComplexHeatmap)
})
options(future.globals.maxSize = Inf)
options(Seurat.object.assay.version = "v5")
options(ggrepel.max.overlaps = Inf)

In [2]:
setwd("/tscc/projects/ps-epigen/users/biy022/biccn/data/SNAREdata/")

## Preprocessing

In [3]:
regions_bed <- read.table(
    "cell_peak_matrix/peaks.bed", header = FALSE, sep = "\t", colClasses = "character")
regions <- apply(regions_bed[, 1:3], 1, base::paste, collapse = "-")

In [4]:
write.table(
    regions, "cell_peak_matrix/regions.csv", col.names = FALSE, row.names = FALSE,
    quote = FALSE, sep = ",")

In [5]:
# First create dgCMatrix rds objects for atac (WTF?)
for (file in dir_ls("cell_peak_matrix/", glob = "*mtx")) {
    subclass_name <- path_ext_remove(path_file(file))
    barcodes <- read.table(
        path_join(c("cell_peak_matrix/", paste0(subclass_name, "-barcodes.tsv"))),
        header = FALSE, sep = "\t")
    regions <- read.table("cell_peak_matrix/regions.csv", header = FALSE, sep = ",")
    curr_mat_raw <- readMM(file)
    curr_mat <- as(curr_mat_raw, "dgCMatrix")
    colnames(curr_mat) <- barcodes[, 1]
    rownames(curr_mat) <- regions[, 1]
    saveRDS(curr_mat, path_join(c("cell_peak_matrix/", paste0(subclass_name, ".rds"))))
}

'as(<dgTMatrix>, "dgCMatrix")' is deprecated.
Use 'as(., "CsparseMatrix")' instead.
See help("Deprecated") and help("Matrix-deprecated").



In [6]:
# RNA, for some reason IT neurons do not have the same number of genes as other cell types
L2_3_IT_rna <- readRDS("cell_gene_matrix/L2_3_IT.rds")
filtered_genes <- rownames(L2_3_IT_rna)

In [9]:
all_downsampled_barcodes <- c()
for (file in dir_ls("cell_gene_matrix/", glob = "*rds")) {
    subclass_name <- path_ext_remove(path_file(file))
    print(subclass_name)
    flush.console()
    curr_rna <- readRDS(file)
    curr_rna <- curr_rna[filtered_genes, ]
    curr_atac <- readRDS(path_join(c("cell_peak_matrix/", paste0(subclass_name, ".rds"))))
    downsample_barcodes <- sample(colnames(curr_atac), replace = FALSE, size = min(5000, ncol(curr_atac)))
    all_downsampled_barcodes <- c(all_downsampled_barcodes, downsample_barcodes)
    curr_atac_down <- curr_atac[, downsample_barcodes]
    curr_rna_down <- curr_rna[, downsample_barcodes]
    
    saveRDS(
        curr_rna_down, 
        path_join(c("scenicplus/all/downsample_5k_rna_mtx/", paste0(subclass_name, ".rds"))))
    saveRDS(
        curr_atac_down, 
        path_join(c("scenicplus/all/downsample_5k_atac_mtx/", paste0(subclass_name, ".rds"))))
}

[1] "Astro"
[1] "Chandelier"
[1] "Endo"
[1] "L2_3_IT"
[1] "L4_IT"
[1] "L5_6_NP"
[1] "L5_ET"
[1] "L5_IT"
[1] "L6B"
[1] "L6_CT"
[1] "L6_IT"
[1] "L6_IT_Car3"
[1] "LAMP5"
[1] "LAMP5_LHX6"
[1] "Micro"
[1] "OPC"
[1] "Oligo"
[1] "PAX6"
[1] "PVALB"
[1] "SNCG"
[1] "SST"
[1] "SST_CHODL"
[1] "VIP"
[1] "VLMC"


In [10]:
meta_data <- read.table(
    "cell_gene_matrix/20230313_RNA_metadata.xls", header = TRUE, row.names = 1, sep = "\t")

In [11]:
meta_data_downsample <- meta_data[all_downsampled_barcodes, ]

In [12]:
write.table(
    meta_data_downsample, 
    "scenicplus/all/BICCN_5K_meta.tsv", 
    col.names = TRUE, 
    row.names = TRUE, 
    quote = FALSE, 
    sep = "\t")

## Combine matrices into one

In [3]:
rna_mtx <- c()
for (file in dir_ls("scenicplus/all/downsample_5k_rna_mtx/", glob = "*rds", recurse = TRUE)) {
    rna_mtx <- c(rna_mtx, readRDS(file))
}

In [4]:
rna_matrix <- do.call(cbind, rna_mtx)

In [5]:
meta_table <- read.table(
    "scenicplus/all/BICCN_5K_meta.tsv", 
    header = TRUE, row.names = 1, sep = "\t")

In [6]:
all(colnames(rna_matrix) == rownames(meta_table))

In [7]:
dim(rna_matrix)

In [8]:
writeMM(rna_matrix, "scenicplus/all/BICCN_5K_rna.mtx")

NULL

In [9]:
write.table(
    rownames(rna_matrix), 
    "scenicplus//all/BICCN_5K_gene.tsv", 
    col.names = FALSE, 
    row.names = FALSE, 
    quote = FALSE, 
    sep = "\t")

In [10]:
atac_mtx <- c()
for (file in dir_ls("scenicplus/all/downsample_5k_atac_mtx/", glob = "*rds", recurse = TRUE)) {
    atac_mtx <- c(atac_mtx, readRDS(file))
}

In [11]:
atac_matrix <- do.call(cbind, atac_mtx)

In [12]:
all(colnames(atac_matrix) == rownames(meta_table))

In [13]:
dim(atac_matrix)

In [14]:
writeMM(atac_matrix, "scenicplus/all/BICCN_5K_atac.mtx")

NULL

In [15]:
region_names <- read.table("scenicplus/all/BICCN_5K_region.tsv", header = FALSE, sep = "\t")

In [16]:
all(rownames(atac_matrix) == region_names[, 1])

In [17]:
write.table(
    colnames(atac_matrix), 
    "scenicplus/all/BICCN_5K_barcodes.tsv", 
    col.names = FALSE, 
    row.names = FALSE, 
    quote = FALSE, 
    sep = "\t")