# Prepare the datasets

Rerun this script only if you want to re-create the datasets.  
Otherwise, you can skip this script and go directly to the data correction one.

# Harmonize metafiles

In [1]:
library(tidyverse)
library(GEOquery)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.4     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
Loading required package: Biobase

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:lubridate’

## Load metadata

`i` indexes the entries returned for a dataset; it is used to check whether a dataset contains data from several sources/platforms.

In [4]:
options(timeout = max(300, getOption("timeout")))
options(download.file.method.GEOquery = "wget")

metadata_files <- list()
samples_IDs <- NULL

for(dataset in c(
    "GSE6008", "GSE26712", "GSE40595", "GSE69428", "GSE38666", "GSE14407"
    )){

    # check if dir exists
    if(!dir.exists(paste0("before/", dataset))){
        dir.create(paste0("before/", dataset))
    }

    gset <- getGEO(dataset, GSEMatrix=TRUE, getGPL=FALSE)
    metadata_files[[dataset]] <- list()

    n_platforms <- length(gset)
    for(i in c(1:n_platforms)){
        metadata <- pData(gset[[i]]) %>% as.data.frame()
        write.table(metadata %>% as.data.frame() %>% rownames_to_column("Sample_geo_ID"), 
            file = paste0("before/", dataset, "/intermediate/", i, "_metadata.tsv"), sep = "\t", row.names = F,quote = FALSE)

        metadata_files[[dataset]][[i]] <- paste0("before/", dataset, "/intermediate/", i, "_metadata.tsv")
        if(is.null(samples_IDs)){
            samples_IDs <- metadata %>% rownames()
        }else{
            # add new samples, but without duplicates
            samples_IDs <- c(samples_IDs, metadata %>% rownames()) %>% unique()
        }
    }
}

Found 1 file(s)

GSE6008_series_matrix.txt.gz

Using locally cached version: /tmp/RtmpP8bGJq/GSE6008_series_matrix.txt.gz

Found 1 file(s)

GSE26712_series_matrix.txt.gz

Using locally cached version: /tmp/RtmpP8bGJq/GSE26712_series_matrix.txt.gz

Found 1 file(s)

GSE40595_series_matrix.txt.gz

Using locally cached version: /tmp/RtmpP8bGJq/GSE40595_series_matrix.txt.gz

Found 1 file(s)

GSE69428_series_matrix.txt.gz

Using locally cached version: /tmp/RtmpP8bGJq/GSE69428_series_matrix.txt.gz

Found 1 file(s)

GSE38666_series_matrix.txt.gz

Found 1 file(s)

GSE14407_series_matrix.txt.gz



## Read and preprocess metadata

In [5]:
prepare_GSE6008 <- function(metadata){
    metadata <- metadata %>%
        # remove all columns with mutation in colname
        select(-c(grep("mutation", colnames(metadata), value = T))) %>%
        select(-c(grep("characteristics_ch1", colnames(metadata), value = T))) %>%
        select(-c("CEL_file_name:ch1", "P53_immunohistochemistry:ch1", "B-canenin_nuclear_accumulation:ch1", "PTEN_immunohistochemistry:ch1")) %>%
        select(-c("title")) %>%
        rename("tissue" = "source_name_ch1", "Grade" = "grade:ch1", "Stage" = "stage:ch1", "Tumor_Type" = "Tumor_Type:ch1") %>%
        mutate(
            Status = ifelse(grepl("Normal", tissue), "normal", "ovarian tumour"),
            Grade2 = ifelse(Grade == 1, "Low", ifelse(Grade == 2 | Grade == 3 | Grade == "2-3", "High", NA)),
            Stage = ifelse(Stage == "N/A", NA, toupper(gsub(" ", "", Stage))),
            Grade = ifelse(Grade == "N/A", NA, Grade),
            Tumor_Type = ifelse(Tumor_Type == "N/A", NA, Tumor_Type),
        ) %>%
        mutate(
            HistSubtypes = ifelse(grepl("Serous", Tumor_Type) & Grade2 == "High", "high-grade serous carcinoma",
                ifelse(grepl("Serous", Tumor_Type) & Grade2 == "Low", "low-grade serous carcinoma",
                        ifelse(grepl("Endometrioid", Tumor_Type), "endometrioid carcinoma",
                            ifelse(grepl("Clear_Cell", Tumor_Type), "clear cell carcinoma",
                                ifelse(grepl("Mucinous", Tumor_Type), "mucinous carcinoma", NA)
                            )
                        )
                    )
                 ),
            ClinicopathologicSubtypes = ifelse(HistSubtypes == "high-grade serous carcinoma", "Type II", "Type I"),
            Stage2 = ifelse(Stage %in% c("3A","3C","3B","3", "3D"), "III", 
                ifelse(Stage %in% c("1A","1C", "1A", "1"), "I", 
                    ifelse(Stage %in% c("4"), "IV", 
                        ifelse(Stage %in% c("2B", "2C", "2A", "2"), "II", NA)))),
            Stage3 = NA
        ) %>%
        mutate(HistSubtypes = ifelse(is.na(HistSubtypes), ifelse(Tumor_Type == "Serous", "serous carcinoma", NA), HistSubtypes)) %>%
        select(-c("Tumor_Type", "description"))
    metadata <- metadata %>%
        select(Sample_geo_ID, Status, HistSubtypes, Stage, Grade2, ClinicopathologicSubtypes, everything())
    return(metadata)
}


prepare_GSE26712 <- function(metadata){
    
    metadata <- metadata %>%
        mutate(title = sub(" .*", "", title)) %>%
        rename("tissue" = "source_name_ch1", "Tissue" = "tissue:ch1") %>%
        select(-c("title", "characteristics_ch1", "characteristics_ch1.1", "characteristics_ch1.2", "characteristics_ch1.3",
            "surgery outcome:ch1", "survival years:ch1"))  %>%
        mutate(
            Stage = ifelse(grepl("Late-stage", Tissue), "Late(IIIB–IV)", NA),
            Grade2 = ifelse(grepl("high-grade", Tissue), "High", NA),
            SubtypeRep = ifelse(grepl("high-grade", Tissue), "Ser/PapSer", NA),
            Status = ifelse(grepl("Normal", Tissue), "normal", "ovarian tumour"),
            HistSubtypes = ifelse(SubtypeRep == "Ser/PapSer", "high-grade serous carcinoma", NA)) %>%
        mutate(
            ClinicopathologicSubtypes = ifelse(HistSubtypes == "high-grade serous carcinoma", "Type II", "Type I"),
            Stage2 = Stage,
            Stage3 = NA) %>%
        select(-c("treatment_protocol_ch1", "description", "Tissue"))
    metadata <- metadata %>% 
        select(Sample_geo_ID, Status, HistSubtypes, Stage, Grade2, ClinicopathologicSubtypes, everything())
        
    return(metadata)
}

prepare_GSE40595 <- function(metadata){
    metadata <- metadata %>%
        filter(!grepl("stroma", title)) %>%
        mutate(
            Status = ifelse(grepl("Microdissected ovarian surface epthelium", source_name_ch1), "normal", "ovarian tumour"),
            tissue = "Ovary",
            HistSubtypes = ifelse(grepl("tumor", source_name_ch1), "high-grade serous carcinoma", NA),
            Grade2 = ifelse(grepl("tumor", source_name_ch1), "High", NA),
            ClinicopathologicSubtypes = ifelse(grepl("tumor", source_name_ch1), "Type II", NA),
            Stage = NA,
            Stage2 = NA,
            Stage3 = NA
        ) %>%
        select(-c("characteristics_ch1", "title", "tissue:ch1", "description", "source_name_ch1"))
    metadata <- metadata %>%
        select(Sample_geo_ID, Status, HistSubtypes, Stage, Grade2, ClinicopathologicSubtypes, everything())
    return(metadata)
}


prepare_GSE69428 <- function(metadata){
    metadata <- metadata %>%
        filter(!grepl("FTSC", title)) %>%
        mutate(
            Grade2 = ifelse(grepl("HGSOC", title), "High", NA),
            ClinicopathologicSubtypes = ifelse(grepl("HGSOC", title), "Type II", NA),
            HistSubtypes = ifelse(grepl("HGSOC", title), "high-grade serous carcinoma", NA),
            Status = ifelse(grepl("normal", title), "normal", "ovarian tumour"),
            tissue = ifelse(grepl("HGSOC", title), "serous ovarian cancer", "oviduct"),
            Stage = NA,
            Stage2 = NA,
            Stage3 = NA
        ) %>%
        select(-c("treatment_protocol_ch1", "characteristics_ch1.1", "characteristics_ch1", 
            "sample type:ch1", "source_name_ch1", "description", "title", "tissue:ch1"))
    metadata <- metadata %>%
        select(Sample_geo_ID, Status, HistSubtypes, Stage, Grade2, ClinicopathologicSubtypes, everything())
    return(metadata)
}


prepare_GSE38666 <- function(metadata){
    metadata <- metadata %>%
        filter(!grepl("Stroma", title)) %>%
        select(-c(grep("characteristics_ch1", colnames(metadata), value = T))) %>%
        rename("Stage" = "Stage:ch1", "tissue" = "tissue:ch1", "Grade" = "grade:ch1") %>%
        mutate(
            Status = ifelse(grepl("normal", title), "normal", "ovarian tumour"),
            Stage =  toupper(gsub(" ", "", Stage)),
            Grade2 = ifelse(Grade == 1, "Low", ifelse(Grade == 2 | Grade == 3, "High", NA)),
            HistSubtypes = ifelse(Status == "ovarian tumour" & Grade2 == "High", "high-grade serous carcinoma", NA),
            ClinicopathologicSubtypes = ifelse(HistSubtypes == "high-grade serous carcinoma", "Type II", "Type I"),
            Stage2 = ifelse(Stage %in% c("IIIA","IIIC","IIIB","III", "IIIB/C"), "III", 
                ifelse(Stage %in% c("IB","IC", "IA"), "I", 
                    ifelse(Stage %in% c("III/IV", "IIIC/IV", "IV"), "IV", 
                        ifelse(Stage %in% c("IIB", "IIC", "IIA", "II"), "II", NA)))),
            Stage3 = NA
        ) %>%
        select(-c("source_name_ch1", "description", "cell type:ch1", "title"))
    
    metadata <- metadata %>%
        select(Sample_geo_ID, Status, HistSubtypes, Stage, Grade2, ClinicopathologicSubtypes, everything())
    return(metadata)
}



prepare_GSE14407 <- function(metadata){
    metadata <- metadata %>%
        select(-c(grep("characteristics_ch1", colnames(metadata), value = T))) %>%
        select(-c("biomaterial_provider_ch1", "treatment_protocol_ch1")) %>%
        mutate(Stage = ifelse(grepl("CEPI", title), sub(".*_(.*)_[0-9]*", "\\1", title), NA)) %>%
        mutate(
            Stage = toupper(gsub(" ", "", Stage)),
            Status = ifelse(grepl("CEPI", title), "ovarian tumour", "normal"),
            HistSubtypes = ifelse(!grepl("OSE", title), "high-grade serous carcinoma", NA),
            Grade2 = ifelse(grepl("CEPI", title), "High", NA),
            ClinicopathologicSubtypes = ifelse(HistSubtypes == "high-grade serous carcinoma", "Type II", NA),
            Stage2 = ifelse(Stage %in% c("IIIA","IIIC","IIIB","III"), "III", 
                ifelse(Stage %in% c("IB","IC", "IA"), "I", 
                    ifelse(Stage %in% c("IV"), "IV", 
                        ifelse(Stage %in% c("IIB", "IIC", "IIA", "II"), "II", NA)))),
            Stage3 = NA
        ) %>%
        select(-c("title", "source_name_ch1", "description", "disease state:ch1", "specimen:ch1"))
    metadata <- metadata %>%
        select(Sample_geo_ID, Status, HistSubtypes, Stage, Grade2, ClinicopathologicSubtypes, everything())
    return(metadata)
}

In [6]:
metadata_list <- list()

for(dataset in c(
    "GSE6008", "GSE26712", "GSE40595", "GSE69428", "GSE38666", "GSE14407"
)){
    cat("Dataset: ", dataset, "\n")
    for(i in c(1:length(metadata_files[[dataset]]))){
        cat("\t\t", i, "\n")
        metadata_file <- metadata_files[[dataset]][[i]]
        metadata <- read_tsv(metadata_file, show_col_types = FALSE)
        # check if metadata$type is RNA
        if("type" %in% colnames(metadata)){
            if(!("RNA" %in% metadata$type)){
                cat("Not RNA dataset\n")
        }}

        metadata <- metadata %>%
            select(-matches("^(geo_accession|status|submission_date|last_update_date|type|channel_count|organism_ch1)")) %>%
            select(-matches("^(molecule_ch1|taxid_ch1|data_row_count|relation|contact_)")) %>%
            select(-matches("^(extract_protocol_ch1|label_ch1|label_protocol_ch1|scan_protocol|hyb_protocol|data_processing)")) %>%
            select(-matches("^(growth_protocol_ch1|data_processing.1|patient id:ch1|age:ch1|supplementary_file.1|gender:ch1)"))
        
        if (dataset == "GSE6008") { metadata_list[[paste0(dataset, "_", i)]] <- prepare_GSE6008(metadata) }
        if (dataset == "GSE26712") { metadata_list[[paste0(dataset, "_", i)]] <- prepare_GSE26712(metadata) }
        if (dataset == "GSE40595") { metadata_list[[paste0(dataset, "_", i)]] <- prepare_GSE40595(metadata) }
        if (dataset == "GSE69428") { metadata_list[[paste0(dataset, "_", i)]] <- prepare_GSE69428(metadata) }
        if (dataset == "GSE38666") { metadata_list[[paste0(dataset, "_", i)]] <- prepare_GSE38666(metadata) }
        if (dataset == "GSE14407") { metadata_list[[paste0(dataset, "_", i)]] <- prepare_GSE14407(metadata) }
    }
}

Dataset:  GSE6008 
		 1 
Dataset:  GSE26712 
		 1 
Dataset:  GSE40595 
		 1 
Dataset:  GSE69428 
		 1 
Dataset:  GSE38666 
		 1 
Dataset:  GSE14407 
		 1 


## Save the results

In [7]:
library(readr)

for(dataset in names(metadata_list)){
    dataset_path <- gsub("_\\d$", "", dataset)
    # get i from dataset
    i <- gsub(".*_", "", dataset)
    write_tsv(metadata_list[[dataset]], paste0("before/", dataset_path, "/intermediate/", i, "_metadata_short.tsv"))
}


In [8]:
# Function to process each dataset
process_dataset <- function(dataset) {
    # Read the metadata
    metadata <- metadata_list[[dataset]]
    # Get unique values for each column and format as a single string if multiple
    summarized <- metadata %>%
        summarise(across(everything(), ~paste(unique(.x), collapse = "; "))) %>%
        pivot_longer(cols = everything(), names_to = "column_name", values_to = "unique_values") %>%
        pivot_wider(names_from = column_name, values_from = unique_values)
    
    # Add dataset identifier
    summarized <- summarized %>% mutate(dataset = dataset)
    summarized$dataset <- as.character(dataset)
    # add column with total number of samples per dataset
    summarized$number_of_samples <- nrow(metadata)
    return(summarized)
}

# Process all datasets
processed_datasets <- map(names(metadata_list), process_dataset)

# Combine all processed datasets using full join, handling cases with no common columns
final_df <- reduce(processed_datasets, ~full_join(.x, .y, by = intersect(names(.x), names(.y)))) %>%
    as.data.frame()

# Print or view the final structured dataframe
# print(final_df)
final_df %>% write_tsv("preprocessing_info/metadata_summary.tsv", col_names = TRUE)

# Load the data

In [9]:
library(tidyverse)

library(affy)
library(GEOquery)


Attaching package: ‘affy’


The following object is masked from ‘package:lubridate’:

    pm




In [None]:
# Preparation (in case of errors)
# library(makecdfenv)

# make.cdf.package(
#     "GPL15048_HuRSTA_2a520709.CDF.gz",
#     packagename = 'hursta2a520709cdf',
#     # package.path = "./",
#     species = "Homo_sapiens",
#     cdf.path = "before/GSE66957_RAW/",
#     compress = TRUE)

# system("R CMD INSTALL hursta2a520709cdf", intern = TRUE)

In [10]:
options(timeout = max(300, getOption("timeout")))
options(download.file.method.GEOquery = "wget")

for(dataset in c(
    "GSE6008", "GSE26712", "GSE40595", "GSE69428", "GSE38666", "GSE14407"
    )){                   
    
    cat('Processing for dataset:', dataset)
    
    if (file.exists(paste0("before/", dataset, "/", dataset,"_RAW.tar"))) {
        untar(
            paste0("before/", dataset,"/", dataset, "_RAW.tar"), 
            exdir = paste0("before/", dataset, "_RAW")
        )
    } else {
        getGEOSuppFiles(dataset, baseDir="before")
        untar(
            paste0("before/", dataset,"/", dataset, "_RAW.tar"), 
            exdir = paste0("before/", dataset, "_RAW")
        )
    }

    # Save the metadata
    # gset <- getGEO(dataset, GSEMatrix=TRUE, getGPL=FALSE)
    gset = c(1)
    
    for(i in c(1:length(gset))){
        metadata <- read.table(paste0("before/", dataset, "/intermediate/", i, "_metadata_short.tsv"), header = TRUE, sep = "\t")
        platform_files <- sub(".*/", "", metadata$supplementary_file)
        
        # if any file contains "txt.gz" ending, remove them from the list
        if(any(grepl("txt.gz", platform_files))){
            platform_files <- platform_files[!grepl("txt.gz", platform_files)]
            cat("Removed txt.gz files from the list. CHECK THE DATASET!")
        }
        
        platform_files_list <- paste0("before/", dataset, "_RAW/", platform_files)

        filenames <- as.character(unlist(platform_files_list))
        rawData <- ReadAffy(
            filenames = unlist(filenames),
            # celfile.path = paste0("before/", dataset, "_RAW"), 
            verbose = FALSE)

        # Prepare sample names
        sample_names <- sampleNames(rawData)
        sample_names <- sub("_.*", "", sample_names)
        sample_names <- sub(".CEL.gz$", "", sample_names)

        # Normalize the data
        normalizedData <- rma(rawData)
        expr_data <- exprs(normalizedData)
    
        # Save the normalized data
        rownames(expr_data) <- featureNames(normalizedData)
        colnames(expr_data) <- sample_names
        write.table(expr_data %>% as.data.frame() %>% rownames_to_column("row_ID"), 
            file = paste0("before/", dataset, "/intermediate/", i, "_expr.tsv"), sep = "\t", row.names = F, quote = FALSE)
        
    }
}

# remove all *_RAW folders from before
system("rm -r before/*_RAW")

Processing for dataset: GSE6008

“replacing previous import ‘AnnotationDbi::tail’ by ‘utils::tail’ when loading ‘hgu133acdf’”
“replacing previous import ‘AnnotationDbi::head’ by ‘utils::head’ when loading ‘hgu133acdf’”




Background correcting
Normalizing
Calculating Expression
Processing for dataset: GSE26712Background correcting
Normalizing
Calculating Expression
Processing for dataset: GSE40595

“replacing previous import ‘AnnotationDbi::tail’ by ‘utils::tail’ when loading ‘hgu133plus2cdf’”
“replacing previous import ‘AnnotationDbi::head’ by ‘utils::head’ when loading ‘hgu133plus2cdf’”



Attaching package: ‘hgu133plus2cdf’


The following objects are masked from ‘package:hgu133acdf’:

    i2xy, xy2i




Background correcting
Normalizing
Calculating Expression
Processing for dataset: GSE69428Background correcting
Normalizing
Calculating Expression
Processing for dataset: GSE38666Background correcting
Normalizing
Calculating Expression
Processing for dataset: GSE14407Background correcting
Normalizing
Calculating Expression


## Preprocess the data

In [11]:
library(tidyverse)
library(WGCNA)

Loading required package: dynamicTreeCut



Loading required package: fastcluster


Attaching package: ‘fastcluster’


The following object is masked from ‘package:stats’:

    hclust





Attaching package: ‘WGCNA’


The following object is masked from ‘package:stats’:

    cor




In [12]:
platform_ids_list <- c()

for(dataset in c(
    "GSE6008", "GSE26712", "GSE40595", "GSE69428", "GSE38666", "GSE14407"
    )){
    if(dataset == "GSE131978" | dataset == "GSE68928"){ 
        subsets_len <- c(1, 2)
    } else {subsets_len <- c(1) }

    for(i in subsets_len){
        #read the expression data and metadata file
        metadata <- read.table(paste0("before/", dataset, "/intermediate/", i, "_metadata_short.tsv"), header = TRUE, sep = "\t")
        platform_name <- metadata$platform_id %>% unique()
        platform_ids_list <- c(platform_ids_list, platform_name) %>% unique()
    }
}

In [13]:
# Show the unique platform IDs collected from the metadata tables.
platform_ids_list

In [14]:
# Platform annotation tables, keyed by platform ID.
libraries_list <- list()

# Return the first entry of a delimited annotation string.
#
# Args:
#   x: a single character string.
#   delimiter: pattern separating the entries (passed to strsplit()).
# Returns:
#   The text before the first delimiter, or NA when `x` is NA or empty.
parse_triple <- function(x, delimiter = " /// ") {
    tokens <- strsplit(x, split = delimiter)[[1]]
    tokens[1]
}


for (lib_name in platform_ids_list){
    cat("Processing", lib_name, "\n")
    lib_df <- read_tsv(paste0("preprocessing_info/libraries/", lib_name, ".txt"), comment="#", show_col_types = FALSE)
    
    if(lib_name %in% c("GPL570", "GPL96")){
        lib_df <- lib_df %>% 
                  select(ID, GB_ACC, `Gene Symbol`, ENTREZ_GENE_ID) %>%
                  rename(GeneSymbol = `Gene Symbol`) %>%
                  mutate(GeneSymbol = sapply(GeneSymbol, parse_triple, delimiter = " /// "),
                         ENTREZ_GENE_ID = sapply(ENTREZ_GENE_ID, parse_triple, delimiter = " /// "))
    }

    lib_df <- lib_df[!duplicated(lib_df$ID),]
    libraries_list[[lib_name]] <- lib_df
}

Processing GPL96 
Processing GPL570 


In [15]:
for(lib_name in names(libraries_list)){
    #print len of unqiue values of each column
    cat(lib_name, "\n")
    print(libraries_list[[lib_name]] %>% summarise(across(everything(), ~length(unique(.x)))))
}

GPL96 
[90m# A tibble: 1 × 4[39m
     ID GB_ACC GeneSymbol ENTREZ_GENE_ID
  [3m[90m<int>[39m[23m  [3m[90m<int>[39m[23m      [3m[90m<int>[39m[23m          [3m[90m<int>[39m[23m
[90m1[39m [4m2[24m[4m2[24m283  [4m2[24m[4m1[24m129      [4m1[24m[4m3[24m238          [4m1[24m[4m2[24m938
GPL570 
[90m# A tibble: 1 × 4[39m
     ID GB_ACC GeneSymbol ENTREZ_GENE_ID
  [3m[90m<int>[39m[23m  [3m[90m<int>[39m[23m      [3m[90m<int>[39m[23m          [3m[90m<int>[39m[23m
[90m1[39m [4m5[24m[4m4[24m675  [4m5[24m[4m1[24m277      [4m2[24m[4m2[24m881          [4m2[24m[4m1[24m181


In [16]:
length(intersect(unique(libraries_list[["GPL570"]]$GB_ACC), unique(libraries_list[["GPL96"]]$GB_ACC)))
length(intersect(unique(libraries_list[["GPL570"]]$GeneSymbol), unique(libraries_list[["GPL96"]]$GeneSymbol)))
length(intersect(unique(libraries_list[["GPL570"]]$ENTREZ_GENE_ID), unique(libraries_list[["GPL96"]]$ENTREZ_GENE_ID)))

In [17]:
platfrom_list <- list()

for(dataset in c(
    "GSE6008", "GSE26712", "GSE40595", "GSE69428", "GSE38666", "GSE14407"
    )){
    
    cat('\n\nProcessing for dataset:', dataset, "\n")
    if(dataset == "GSE131978" | dataset == "GSE68928"){ 
        subsets_len <- c(1, 2)
    } else {subsets_len <- c(1) }

    for(i in subsets_len){
        #read the expression data and metadata file
        metadata <- read.table(paste0("before/", dataset, "/intermediate/", i, "_metadata_short.tsv"), header = TRUE, sep = "\t")
        expr_data <- read.table(paste0("before/", dataset, "/intermediate/", i, "_expr.tsv"), header = TRUE, sep = "\t")
        cat('Loaded the data, shape of metadata:', dim(metadata), 'shape of expr_data:', dim(expr_data), '\n')

        platform_name <- metadata$platform_id %>% unique()

        # MaxVar# Collapse to GB_ACC
        gene_names <- libraries_list[[platform_name]]
        gene_names <- gene_names[!is.na(gene_names$GB_ACC),]
        expr_data_2 <- expr_data %>% filter(row_ID %in% gene_names$ID) %>% column_to_rownames("row_ID")
        cat('After filtering, shape of expr_data:', dim(expr_data_2), '\n')

        expr_data_ENTREZ <- WGCNA::collapseRows(
            expr_data_2, 
            rowGroup = gene_names$GB_ACC, 
            rowID = gene_names$ID,
            method="maxRowVariance", connectivityBasedCollapsing=FALSE,
            methodFunction=NULL, connectivityPower=1,
            selectFewestMissing=TRUE, thresholdCombine=NA)$datETcollapsed
        cat('After collapsing rows (Entrez IDs), shape of expr_data:', dim(expr_data_ENTREZ), '\n')
        # save to file
        write.table(expr_data_ENTREZ %>% as.data.frame() %>% rownames_to_column("row_ID"), 
            file = paste0("before/", dataset, "/intermediate/expr_GBacc_MaxVar.tsv"), sep = "\t", row.names = F, quote = FALSE)

}}



Processing for dataset: GSE6008 
Loaded the data, shape of metadata: 103 12 shape of expr_data: 22283 104 
After filtering, shape of expr_data: 22215 103 
After collapsing rows (Entrez IDs), shape of expr_data: 21128 103 


Processing for dataset: GSE26712 
Loaded the data, shape of metadata: 195 12 shape of expr_data: 22283 196 
After filtering, shape of expr_data: 22215 195 
After collapsing rows (Entrez IDs), shape of expr_data: 21128 195 


Processing for dataset: GSE40595 
Loaded the data, shape of metadata: 38 11 shape of expr_data: 54675 39 
After filtering, shape of expr_data: 54613 38 
After collapsing rows (Entrez IDs), shape of expr_data: 51276 38 


Processing for dataset: GSE69428 
Loaded the data, shape of metadata: 20 11 shape of expr_data: 54675 21 
After filtering, shape of expr_data: 54613 20 
After collapsing rows (Entrez IDs), shape of expr_data: 51276 20 


Processing for dataset: GSE38666 
Loaded the data, shape of metadata: 30 12 shape of expr_data: 54675 31 
A

In [18]:
GB_ACC_id_lists <- list()

for(dataset in c(
    "GSE6008", "GSE26712", "GSE40595", "GSE69428", "GSE38666", "GSE14407"
    )){                   
    
    cat('\nProcessing for dataset:', dataset, "\n")
    if(dataset == "GSE131978" | dataset == "GSE68928"){ 
        subsets_len <- c(1, 2)
    } else {subsets_len <- c(1) }

    for(i in subsets_len){
        # read data
        expr_data <- read.table(paste0("before/", dataset, "/intermediate/expr_GBacc_MaxVar.tsv"), header = TRUE, sep = "\t")
        # get the  names
        GB_ACC_id_lists[[paste0(dataset, "_", i)]] <- expr_data$row_ID
    }
} 


Processing for dataset: GSE6008 

Processing for dataset: GSE26712 

Processing for dataset: GSE40595 

Processing for dataset: GSE69428 

Processing for dataset: GSE38666 

Processing for dataset: GSE14407 


In [19]:
library(jsonlite)

write_json(GB_ACC_id_lists, "preprocessing_info/GBacc_IDs.json")


Attaching package: ‘jsonlite’


The following object is masked from ‘package:purrr’:

    flatten


