In [19]:
library(tidyverse)
library(DESeq2)
library(sva)

source("../../utils/plots_eda.R")

# Load data

In [20]:
# Dataset identifiers; each one is used as the directory name and the
# file-name prefix of its inputs under "before/".
datasets <- c(
    "GSE129508",
    "GSE58135",
    "GSE149276"
)

In [21]:
# Load every dataset, normalize its raw counts with DESeq2 size factors and
# accumulate three combined tables:
#   all_metadata   - sample annotations of all datasets stacked row-wise
#   all_expression - normalized counts, full join on gene_id (a gene absent
#                    from a dataset is NA for that dataset's samples)
#   raw_expression - raw counts, inner join on gene_id (shared genes only)
all_expression <- NULL
raw_expression <- NULL
all_metadata <- NULL

# Sample that is excluded from the analysis wherever it appears.
excluded_sample <- "GSM3714613"

for (dataset in datasets) {
    metadata <- read.table(paste0("before/", dataset, "/", dataset, ".sample_info.tsv"), header = TRUE, sep = "\t")
    expr_data <- read.table(paste0("before/", dataset, "/", dataset, ".counts.tsv"), header = TRUE, sep = "\t")

    # If the excluded sample is in the expression table or in the metadata,
    # remove it. Each table is checked on its own so that the two can never
    # get out of sync when the sample is present in only one of them.
    in_expr <- excluded_sample %in% colnames(expr_data)
    in_meta <- excluded_sample %in% metadata$sample_id
    if (in_expr || in_meta) {
        print(paste0("Removing sample ", excluded_sample, " from ", dataset))
        # Logical masks instead of -which(): safe for zero matches, and
        # drop = FALSE keeps a data frame even if one column/row remains.
        expr_data <- expr_data[, colnames(expr_data) != excluded_sample, drop = FALSE]
        metadata <- metadata[!(metadata$sample_id %in% excluded_sample), , drop = FALSE]
    }

    metadata$Dataset <- dataset
    # Fixed level order so the numeric coding derived from Status later does
    # not depend on which classes happen to be present in a given dataset.
    metadata <- metadata %>%
        mutate(Status = ifelse(Basal == 1, "Basal", "Luminal")) %>%
            # ifelse(is_LumA == 1, "LumA", "LumB"))) %>%
        mutate(Status = factor(Status, levels = c("Basal", "Luminal")))

    print(paste0("Samples: ", nrow(metadata), "; Features: ", nrow(expr_data)))

    #########################################################################################################################
    # Normalization #
    expr_data <- expr_data %>% column_to_rownames("gene_id")

    # DESeq2 pairs count columns with colData rows by position; metadata has
    # default row names, so a mismatch in sample order would go unnoticed.
    # Reorder the count columns to the metadata order and fail loudly if the
    # two tables do not describe the same samples.
    sample_ids <- as.character(metadata$sample_id)
    if (!setequal(colnames(expr_data), sample_ids)) {
        stop("Samples in counts and sample_info differ for ", dataset, call. = FALSE)
    }
    expr_data <- expr_data[, sample_ids, drop = FALSE]

    # drop rows with only zeros
    expr_data <- expr_data[rowSums(expr_data) > 0, ]

    # Create a DESeq2 dataset object
    dds <- DESeqDataSetFromMatrix(countData = expr_data, colData = metadata, design = ~ Status)
    # Normalize using median of ratios method
    dds <- estimateSizeFactors(dds)
    normalized_counts <- counts(dds, normalized = TRUE)

    norm_expr <- as.data.frame(normalized_counts) %>%
        rownames_to_column("gene_id")
    print(paste0("Normalized Samples: ", nrow(metadata), "; Features: ", nrow(norm_expr)))

    #########################################################################################################################
    #  plot
    # plot_res <- plot_diagnostic(norm_expr, metadata, dataset,
    #                             log_transform = FALSE, with_rowname = FALSE)
    # layout <- (plot_res[[1]] + plot_res[[2]] ) /
    #           (plot_res[[3]] )
    # ggsave(paste0("before/", dataset, "/diagnostic_plot.png"),
    #             plot = layout, width = 12, height = 12)

    # save data
    if (is.null(all_metadata)) {
        # First dataset: initialize the combined tables.
        all_metadata <- metadata
        all_expression <- norm_expr
        raw_expression <- expr_data %>% rownames_to_column("gene_id")
    } else {
        all_metadata <- rbind(all_metadata, metadata)
        # Full join keeps genes seen in any dataset (NA where absent);
        # the raw table keeps only genes present in every dataset.
        all_expression <- full_join(all_expression, norm_expr, by = "gene_id")
        raw_expression <- inner_join(raw_expression, expr_data %>% rownames_to_column("gene_id"), by = "gene_id")
    }
    print(paste0("Combined Samples: ", nrow(all_metadata), "; Features: ", nrow(all_expression)))
    print(" ")
}

# # plot the combined data
# print("Plotting combined data")
# plot_res <- plot_diagnostic(all_expression, all_metadata, "Combined")
# layout <- (plot_res[[1]] + plot_res[[2]] ) / 
#           (plot_res[[3]] )
# ggsave("before/diagnostic_plot.png", 
#             plot = layout, width = 12, height = 12)

[1] "Removing sample GSM3714613 from GSE129508"
[1] "Samples: 25; Features: 35238"
[1] "Normalized Samples: 25; Features: 30174"
[1] "Combined Samples: 25; Features: 30174"
[1] " "
[1] "Samples: 75; Features: 35238"
[1] "Normalized Samples: 75; Features: 34675"
[1] "Combined Samples: 100; Features: 34784"
[1] " "
[1] "Samples: 31; Features: 35238"
[1] "Normalized Samples: 31; Features: 31377"
[1] "Combined Samples: 131; Features: 34818"
[1] " "


# Save data for correction and after correction

In [22]:
# Numeric covariates written to the design files: a zero-based batch index
# taken from the factor codes of Dataset, and a zero-based code taken from
# the factor codes of Status.
all_metadata$batch <- as.numeric(as.factor(all_metadata$Dataset)) - 1
# all_metadata$batch <- 0
all_metadata$lum <- as.numeric(as.factor(all_metadata$Status)) - 1

all_expression <- column_to_rownames(all_expression, "gene_id")

# Shared preparation of an expression table before it is written out:
# drop genes with any NA, apply log2(x + 1), then drop genes whose variance
# across the remaining samples is not positive.
log2_variable_genes <- function(expr) {
    expr <- na.omit(expr)
    expr <- log2(expr + 1)
    expr[apply(expr, 1, var) > 0, ]
}

# Per-dataset export: design table and prepared expression table.
for (dataset in unique(all_metadata$Dataset)) {
    print(paste0("Save data prior to batch correction for ", dataset))

    dataset_metadata <- all_metadata[all_metadata$Dataset == dataset, ] %>%
        select(sample_id, lum, batch)

    # Restrict to this dataset's samples first, so the NA filter only removes
    # genes that are missing within this dataset.
    dataset_expression <- all_expression[, dataset_metadata$sample_id] %>%
        log2_variable_genes() %>%
        rownames_to_column("gene_id")
    print(paste0("Samples: ", nrow(dataset_metadata), "; Features: ", nrow(dataset_expression)))

    write.table(
        dataset_metadata,
        file = paste0("before/", dataset, "/design.tsv"),
        sep = "\t", quote = FALSE, row.names = FALSE
    )
    write.table(
        dataset_expression,
        file = paste0("before/", dataset, "/expr_for_correction.tsv"),
        sep = "\t", quote = FALSE, row.names = FALSE
    )
}

# Combined export: only genes without NA in any sample survive the filter;
# columns are then put in the row order of all_metadata.
all_expression <- log2_variable_genes(all_expression)
all_expression <- all_expression[, all_metadata$sample_id]

write.table(
    select(all_metadata, sample_id, lum, batch),
    file = "before/all_design.tsv",
    sep = "\t", quote = FALSE, row.names = FALSE
)
write.table(
    rownames_to_column(all_expression, "gene_id"),
    file = "before/all_expr_for_correction.tsv",
    sep = "\t", quote = FALSE, row.names = FALSE
)

[1] "Save data prior to batch correction for GSE129508"
[1] "Samples: 25; Features: 30174"
[1] "Save data prior to batch correction for GSE58135"
[1] "Samples: 75; Features: 34675"
[1] "Save data prior to batch correction for GSE149276"
[1] "Samples: 31; Features: 31377"
