In [4]:
if (!requireNamespace("dplyr", quietly = TRUE)) {
    install.packages("dplyr")
}


also installing the dependencies ‘magrittr’, ‘tibble’


“installation of package ‘magrittr’ had non-zero exit status”
“installation of package ‘tibble’ had non-zero exit status”
“installation of package ‘dplyr’ had non-zero exit status”
Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



In [5]:
library(dplyr

ERROR: Error in parse(text = input): <text>:2:0: unexpected end of input
1: library(dplyr
   ^


In [1]:
## ==============================
## PARAMETERS
## ==============================

# Input files
rna_path  <- "rna_data_filtered wtumor.csv"          # raw RNA matrix (csv or tsv)
meta_path <- "complet_cell_metadata_grouped.csv"   # cleaned metadata with 'cell_id' and 'group'

# Output files
rna_standardized_out <- "rna_standardized.csv"   # standardized Gene × Cells
expr_pseudobulk_out  <- "expr_pseudobulk_symbols.csv" # Gene × Samples after pseudobulk
design_out           <- "design.csv"             # SampleID × Group (Tumor/Normal)

# Options
pseudobulk_method <- "mean"   # "mean" for normalized data, "sum" if raw counts
min_overlap_prop  <- 0.9      # threshold for barcode overlap warning


In [2]:
## ==============================
## RNA Reader + Standardizer
## ==============================

## ---- Helper functions
read_tab <- function(p) {
  if (grepl("\\.tsv$", p, ignore.case = TRUE)) {
    read.delim(p, stringsAsFactors = FALSE, check.names = FALSE)
  } else if (grepl("\\.csv$", p, ignore.case = TRUE)) {
    read.csv(p, stringsAsFactors = FALSE, check.names = FALSE)
  } else stop("Unknown format: expecting .csv or .tsv, got: ", p)
}

looks_barcode <- function(x) {
  grepl("[-_]", x) | grepl("-\\d+$", x)
}

looks_gene_symbol <- function(v) {
  v <- as.character(v)
  all(!grepl("-", v, fixed = TRUE)) &&
    mean(grepl("[A-Za-z]", v), na.rm = TRUE) > 0.6 &&
    mean(!grepl("^\\d+$", v), na.rm = TRUE) > 0.9
}

is_strict_index_col <- function(v) {
  v <- suppressWarnings(as.numeric(v))
  if (all(is.na(v))) return(FALSE)
  n <- length(v)
  sum(!is.na(v)) == n && identical(v, seq_len(n))
}

trim_all <- function(df) {
  names(df) <- trimws(names(df))
  for (j in seq_along(df)) if (is.character(df[[j]])) df[[j]] <- trimws(df[[j]])
  df
}

## ---- Read file
rna_raw <- read_tab(rna_path)
rna_raw <- trim_all(rna_raw)

empty_cols <- which(sapply(rna_raw, function(col) all(is.na(col) | col == "")))
if (length(empty_cols)) rna_raw <- rna_raw[ , -empty_cols, drop = FALSE]

stopifnot(ncol(rna_raw) >= 2)
dy
## ---- Detect format
cn <- colnames(rna_raw)

drop_first <- FALSE
if (is_strict_index_col(rna_raw[[1]]) || toupper(cn[1]) %in% c("X","INDEX","ROW","IDS","ID")) {
  drop_first <- TRUE
}

rna1 <- if (drop_first) rna_raw[, -1, drop = FALSE] else rna_raw
cn1 <- colnames(rna1)

gene_col_idx <- NA_integer_
cand_idx <- seq_len(min(2, ncol(rna1)))
for (k in cand_idx) {
  samp <- head(rna1[[k]], 200)
  if (looks_gene_symbol(samp)) { gene_col_idx <- k; break }
}

transposed <- FALSE
if (is.na(gene_col_idx)) {
  prop_barcode_in_colnames <- mean(looks_barcode(cn1))
  prop_barcode_in_firstcol <- mean(looks_barcode(head(rna1[[1]], 200)))
  if (!is.na(prop_barcode_in_firstcol) && prop_barcode_in_firstcol > 0.6 &&
      !is.na(prop_barcode_in_colnames) && prop_barcode_in_colnames < 0.4) {
    transposed <- TRUE
  } else {
    gene_col_idx <- 1L
  }
}

## ---- Transpose if needed
if (transposed) {
  message("Detected transposed file (barcodes in rows). Transposing ...")
  barcodes <- as.character(rna1[[1]])
  mat <- as.data.frame(t(rna1[, -1, drop = FALSE]), stringsAsFactors = FALSE, check.names = FALSE)
  colnames(mat) <- barcodes
  mat <- cbind(Gene = rownames(mat), mat, row.names = NULL, check.names = FALSE)
  rna2 <- mat
} else {
  if (is.na(gene_col_idx)) gene_col_idx <- 1L
  colnames(rna1)[gene_col_idx] <- "Gene"
  rna2 <- rna1
}

## ---- Clean up
cn2 <- colnames(rna2)
if (!"Gene" %in% cn2) stop("Could not determine 'Gene' column. Please check file.")

barcode_cols <- setdiff(cn2, "Gene")

for (bc in barcode_cols) {
  if (is.character(rna2[[bc]])) {
    rna2[[bc]] <- gsub(",+$", "", rna2[[bc]])
  }
}
for (bc in barcode_cols) {
  rna2[[bc]] <- suppressWarnings(as.numeric(rna2[[bc]]))
}

rna2$Gene <- as.character(rna2$Gene)
rna2 <- rna2[!is.na(rna2$Gene) & rna2$Gene != "", , drop = FALSE]

if (any(duplicated(rna2$Gene))) {
  message("Duplicate gene names found -> aggregating by mean.")
  rna2 <- stats::aggregate(. ~ Gene, data = rna2, FUN = function(x) mean(as.numeric(x), na.rm = TRUE))
}

for (bc in barcode_cols) {
  nas <- is.na(rna2[[bc]])
  if (any(nas)) rna2[[bc]][nas] <- 0
}

## ---- Write standardized file
write.csv(rna2, rna_standardized_out, row.names = FALSE)
message("rna_standardized.csv written: column 'Gene' + ", length(barcode_cols), " barcode columns.")

## ---- Optional: check overlap with metadata
if (file.exists(meta_path)) {
  meta <- tryCatch(read.csv(meta_path, stringsAsFactors = FALSE, check.names = FALSE), error = function(e) NULL)
  if (!is.null(meta) && "cell_id" %in% colnames(meta)) {
    meta_ids <- trimws(as.character(meta$cell_id))
    cols <- setdiff(colnames(rna2), "Gene")
    in_both <- intersect(meta_ids, cols)
    prop <- if (length(cols) == 0) 0 else length(in_both) / length(cols)
    cat("\n===== Barcode-Check (optional) =====\n")
    cat("Barcodes in RNA (columns): ", length(cols), "\n", sep = "")
    cat("Barcodes in Metadata:      ", length(meta_ids), "\n", sep = "")
    cat("Overlap:                   ", length(in_both), " (", sprintf("%.2f", 100*prop), "%)\n", sep = "")
    if (prop < min_overlap_prop) {
      warning("Less than 90% overlap between RNA barcodes and Metadata cell_id. Check naming/subsetting.")
    } else {
      message("Barcode overlap OK (>=90%).")
    }
  } else {
    message("Metadata file present, but no 'cell_id' column found – skipping barcode check.")
  }
}


rna_standardized.csv written: column 'Gene' + 1000 barcode columns.




===== Barcode-Check (optional) =====
Barcodes in RNA (columns): 1000
Barcodes in Metadata:      84526
Overlap:                   1000 (100.00%)


Barcode overlap OK (>=90%).



In [6]:
## ==============================
## Pseudobulk (Cells -> Samples) + Design
## Uses:
##   - rna_standardized_out (Gene × Cells)
##   - meta_path (metadata with 'cell_id', 'orig.ident', 'group')
## Produces:
##   - expr_pseudobulk_out  (Gene × Samples)
##   - design_out           (sample_id, group)
## ==============================

# ---- Read inputs
rna_std <- read.csv(rna_standardized_out, stringsAsFactors = FALSE, check.names = FALSE)
meta    <- read.csv(meta_path,             stringsAsFactors = FALSE, check.names = FALSE)

# ---- Basic checks
if (!"Gene" %in% colnames(rna_std)) stop("Standardized RNA must contain column 'Gene'.")
if (!"cell_id" %in% colnames(meta)) stop("Metadata must contain column 'cell_id'.")
if (!"orig.ident" %in% colnames(meta)) stop("Metadata must contain column 'orig.ident'.")
if (!"group" %in% colnames(meta)) stop("Metadata must contain column 'group' (Tumor/Normal).")

# ---- Align RNA barcodes (columns) with metadata cell_id
rna_barcodes <- setdiff(colnames(rna_std), "Gene")
meta_ids     <- trimws(as.character(meta$cell_id))
keep_cells   <- intersect(rna_barcodes, meta_ids)

if (length(keep_cells) == 0) {
  stop("No overlap between RNA barcodes and metadata cell_id.")
}

# Subset RNA to overlapping cells (keep Gene column)
rna_sub <- rna_std[, c("Gene", keep_cells), drop = FALSE]

# ---- Build mapping: cell_id -> orig.ident (sample) + group
meta_map <- meta[match(keep_cells, meta$cell_id), c("cell_id","orig.ident","group")]
colnames(meta_map) <- c("cell_id","sample_id","group")

# ---- Convert RNA to numeric matrix (cells only)
mat <- as.matrix(rna_sub[, -1, drop = FALSE])
mode(mat) <- "numeric"
rownames(mat) <- rna_sub$Gene

# ---- Aggregate per sample_id (orig.ident)
samples <- unique(meta_map$sample_id)

agg_one <- function(smpl) {
  cols <- which(meta_map$sample_id == smpl)
  if (length(cols) == 1L) {
    mat[, cols, drop = FALSE]
  } else {
    if (tolower(pseudobulk_method) == "sum") {
      matrix(rowSums(mat[, cols, drop = FALSE], na.rm = TRUE), ncol = 1,
             dimnames = list(rownames(mat), smpl))
    } else {
      matrix(rowMeans(mat[, cols, drop = FALSE], na.rm = TRUE), ncol = 1,
             dimnames = list(rownames(mat), smpl))
    }
  }
}

pb_list <- lapply(samples, agg_one)
pb_mat  <- do.call(cbind, pb_list)

# ---- Build pseudobulk dataframe
expr_pseudobulk <- data.frame(Gene = rownames(pb_mat), pb_mat, check.names = FALSE)

# ---- Collapse duplicated genes (mean)
if (any(duplicated(expr_pseudobulk$Gene))) {
  expr_pseudobulk <- aggregate(. ~ Gene, data = expr_pseudobulk, FUN = function(x) mean(as.numeric(x), na.rm = TRUE))
}

## ---- Build design: one row per sample_id, assign group by majority vote (Base R version)
design <- aggregate(group ~ sample_id, data = meta_map, FUN = function(x) {
  if (mean(x == "Tumor", na.rm = TRUE) >= 0.5) "Tumor" else "Normal"
})

# ---- Write outputs
write.csv(expr_pseudobulk, expr_pseudobulk_out, row.names = FALSE)
write.csv(design,           design_out,          row.names = FALSE)

# ---- Report
cat("\n===== PSEUDOBULK DONE =====\n")
cat("Samples (orig.ident): ", paste(colnames(expr_pseudobulk)[-1], collapse = ", "), "\n", sep = "")
cat("Genes (rows):         ", nrow(expr_pseudobulk), "\n", sep = "")
cat("Design rows:          ", nrow(design), "\n", sep = "")
cat("Files written:        ", expr_pseudobulk_out, ", ", design_out, "\n", sep = "")



===== PSEUDOBULK DONE =====
Samples (orig.ident): p786pos, p786neg, p811, p826, p846, p848, p4, p8pos, p8neg, p7, p9, p10, p11, p12, p13
Genes (rows):         31915
Design rows:          15
Files written:        expr_pseudobulk_symbols.csv, design.csv
