# seqFormatteR V2
### Contact Information

Anthony S. Castanza, PhD - acastanza@ucsd.edu
### Summary
https://github.com/ACastanza/seqformat

## Installation 
Install required R libraries for the pipeline.

In [None]:
# BiocManager is installed from CRAN first because it is the installer used
# for the Bioconductor packages listed below.
install.packages(pkgs = "BiocManager", repos = "http://cran.r-project.org")
library("BiocManager")

# Bioconductor packages, installed through BiocManager.
BiocManager::install(
  pkgs = c(
    "GEOquery", "tximport", "GenomicFeatures",
    "biomaRt", "rhdf5", "DESeq2"
  )
)

# CRAN package, installed from the session's default repository.
install.packages(pkgs = "tidyverse")

## Configure Environment
Load required packages and configure global variables

In [None]:
library("tools")
library("dplyr")
library("tidyr")
library("GEOquery")
library("tximport")
library("GenomicFeatures")
library("biomaRt")
library("DESeq2")

In [None]:
## Optional extra normalization. To request it, define altnorm BEFORE running
## this script as 'userlog', 'usevst' or 'useall'; the script then additionally
## outputs data normalized with DESeq2's rlog and/or vst functions.
## A value that already exists is kept; otherwise the option is switched off.
if (!exists("altnorm")) {
  altnorm <- FALSE
}

## Default state for the rest of the script. The GEO import step may overwrite
## getgeofiles, iscounts, isnormalized, NORM and TYPE.

## Data type of the input
TYPE <- NULL
txtype <- 0
iscounts <- FALSE
txlevel <- FALSE

## Normalization status of the input
isnormalized <- FALSE
NORM <- FALSE

## Input source
getgeofiles <- FALSE
is_directory <- FALSE

## Miscellaneous flags
FAIL <- FALSE
## NOTE(review): this name looks like a typo of 'secondaryfactor' -- confirm
## which spelling later code reads before renaming it.
seondaryfactor <- FALSE


## Import Data Files from GEO Directly
Optional step to enable directly importing deposited data from a published experiment.

In [None]:
# Optionally download an experiment's "supplementary files" from GEO, let the
# user pick which download to process, and use the Series Matrix metadata to
# pre-fill the data-type / normalization flags.
#
# Variables set for later steps: getgeofiles, and (when the import runs) geoid,
# geooutfiles, gsefile, geostructure, phenotypedata, geoimporttable,
# genematrix, outfile, plus -- depending on the answers given -- iscounts,
# countsdetected, istx, TYPE, isnormalized, NORM and DESEQ2DONE.
#
# askYesNo() returns NA when the prompt is cancelled. Every answer is therefore
# passed through isTRUE()/isFALSE() so that a cancel counts as "No" instead of
# aborting with "missing value where TRUE/FALSE needed".
getgeofiles <- isTRUE(askYesNo("Attempt to get data directly from GEO \"supplementary files\"? "))
if (getgeofiles) {
  # Returns the path to process for one downloaded file: a .tar archive is
  # extracted into a sibling directory named after the archive and that
  # directory is returned; any other file is returned unchanged.
  .unpack_geo_file <- function(path) {
    if (tools::file_ext(path) != "tar") {
      return(path)
    }
    exdir <- paste0(dirname(path), "/", basename(tools::file_path_sans_ext(path)))
    untar(path, exdir = exdir)
    exdir
  }

  # Applies grepl(pattern, .) to every column of the sample metadata. The
  # result is used below to index `metadata` and show the matching values.
  .scan_geo_metadata <- function(metadata, pattern) {
    apply(metadata, 2, function(x) {
      grepl(pattern, x)
    })
  }

  geoid <- readline(prompt = ("Enter the GEOID for the datafiles (eg: GSE38786): "))
  geooutfiles <- getGEOSuppFiles(geoid, makeDirectory = TRUE, baseDir = getwd(),
    fetch_files = TRUE, filter_regex = NULL)

  cat("Getting experiment information from Series Matrix...\n")
  gsefile <- getGEO(geoid)
  geostructure <- pData(phenoData(gsefile[[1]]))
  phenotypedata <- as.data.frame(geostructure[, c("geo_accession", "title")],
    stringsAsFactors = FALSE)
  # seq_len() instead of 1:nrow() so an empty table does not yield c(1, 0).
  rownames(phenotypedata) <- seq_len(nrow(phenotypedata))
  cat("\n")
  print(phenotypedata)
  cat("\n")

  cat("Files acquired from GEO:\n")
  geofiles <- rownames(geooutfiles)
  geoimporttable <- as.data.frame(rownames(geooutfiles), stringsAsFactors = FALSE)
  print(geofiles)
  cat("\n")

  if (length(geofiles) > 1) {
    useall <- askYesNo("Use all downloaded files? (\"No\" allows you to select a specific file)")
    if (isTRUE(useall)) {
      # Process the whole download directory created by getGEOSuppFiles().
      genematrix <- paste0(getwd(), "/", geoid)
    } else if (isFALSE(useall)) {
      geoselected <- readline(prompt = ("Select which GEO file to use for downstream processing: "))
      selection <- trimws(geoselected)
      # Accept the position in the printed list, the full path as printed,
      # or just the file name.
      geoselectednumber <- match(selection, as.character(seq_along(geofiles)))
      if (is.na(geoselectednumber)) {
        geoselectednumber <- match(selection, geofiles)
      }
      if (is.na(geoselectednumber)) {
        geoselectednumber <- match(selection, basename(geofiles))
      }
      if (is.na(geoselectednumber)) {
        # Previously an unmatched selection failed later with an unrelated
        # error from the file-extension check.
        stop("'", geoselected, "' does not match any of the downloaded GEO files.",
          call. = FALSE)
      }
      outfile <- .unpack_geo_file(geofiles[geoselectednumber])
      genematrix <- outfile
    } else {
      message("File selection cancelled; no GEO file was chosen for downstream processing.")
    }
  } else if (length(geofiles) == 1) {
    outfile <- .unpack_geo_file(geofiles[1])
    genematrix <- outfile
  }

  cat("\n")
  cat("Experiment Imported.\n")
  cat("\n")

  # Does any metadata field mention counts?
  findcounts <- .scan_geo_metadata(geostructure, "counts|Counts")
  if (any(findcounts)) {
    message("Series Matrix implies that this data consists of COUNTS:\n")
    print(unique(geostructure[findcounts]))
    iscounts <- isTRUE(askYesNo("Do you agree that this is gene COUNTS data? "))
    cat("\n")
    if (iscounts) {
      countsdetected <- TRUE
      istx <- FALSE
      TYPE <- "COUNTS"
    }
  } else {
    cat("We couldn't automatically set the datatype\n")
    cat("We'll prompt you to manually select datatype next.\n")
  }

  # Does any metadata field name a transcript-level quantifier? The leading
  # letter is left off each name so either capitalisation matches.
  findtxquant <- .scan_geo_metadata(geostructure, "almon|ailfish|allisto")
  if (any(findtxquant)) {
    cat("\n")
    message("Series Matrix implies that this data might be transcript level quantifications:\n")
    print(unique(geostructure[findtxquant]))
    cat("\n")
    message("Validate the presence of transcript level quantifications in data files, then continue.\n")
    cat("\n")
  }

  # Does any metadata field say the data is already normalized?
  # NOTE(review): NORM/DESEQ2DONE are only assigned when the answer agrees with
  # what the metadata suggested (kept exactly as before). In particular,
  # answering "Yes" in the second branch leaves NORM at its previous value --
  # confirm whether that is intended.
  findnormal <- .scan_geo_metadata(geostructure,
    "normalized|Normalized|NORMALIZED|normalised|Normalised|NORMALISED")
  if (any(findnormal)) {
    message("Series Matrix implies that this data might be ALREADY NORMALIZED:\n")
    print(unique(geostructure[findnormal]))
    isnormalized <- isTRUE(askYesNo("Is this data already normalized? "))
    if (isnormalized) {
      NORM <- TRUE
      DESEQ2DONE <- FALSE
    }
  } else {
    message("Series Matrix implies that this data might require normalization.\n")
    isnormalized <- isTRUE(askYesNo("Is this data already normalized? "))
    if (!isnormalized) {
      NORM <- FALSE
      DESEQ2DONE <- FALSE
    }
  }

}