In [36]:
# 1. Create the scratch folder data/temp and list the raw archives.
# Nothing is copied here; the archives are unpacked into data/temp in step 2.
dir.create("data/temp")

# Keep only *.zip files so that stray files in data/raw are never handed to
# unzip(). With full.names = TRUE the entries look like "data/raw/<name>.zip",
# and an empty folder gives character(0) instead of the bogus path "data/raw/".
data.path <- list.files("data/raw", pattern = "\\.zip$", full.names = TRUE)


In [37]:
# 2. Unzip raw files in data/temp folder
# Each archive <dir>/<name>.zip listed in data.path is extracted into
# data/temp/<name>.
for (file in data.path) {
    # Strip only a trailing, literal ".zip". The previous unanchored regex
    # ".zip" also matched e.g. "Azip" in the middle of a name, and replacing
    # every "raw" by "temp" mangled archive names that contain "raw".
    file.dir <- file.path("data/temp", sub("\\.zip$", "", basename(file)))
    unzip(zipfile = file, exdir = file.dir)
}

In [38]:
# 3. Compute md5 checksums for the *.gz files of every unpacked archive.
# Each sub-folder of data/temp is expected to hold a raw_sequences directory.
data.path.temp <- file.path("data/temp", list.files("data/temp"),
                            "raw_sequences")

# One data frame per folder with the columns SampleID (file name) and md5
# (checksum), stacked into a single table. tools::md5sum() is vectorised, so
# no per-file loop or incremental c()/rbind() growth is needed. The pattern is
# escaped and anchored so only names that really end in ".gz" are selected.
# md5dfs is NULL when data/temp contains no sub-folders.
md5dfs <- do.call(rbind, lapply(data.path.temp, function(folder) {
    namesFilesGz <- list.files(path = folder, pattern = "\\.gz$")
    data.frame(SampleID = namesFilesGz,
               md5 = tools::md5sum(file.path(folder, namesFilesGz)))
}))


In [39]:
# Load the md5 listings shipped by BaseClear and merge them into one table
###########################################################

# All files below data/temp whose name contains "md5", relative to data/temp.
md5.files <- list.files(path = "data/temp", pattern = "md5", recursive = TRUE)

# Read the listing at position `idx` of md5.files as a header-less table.
read.md5.listing <- function(idx, sep) {
    read.delim(paste0("data/temp/", md5.files[idx]), sep = sep, header = FALSE)
}

# Run 1 is spread over listings 1 and 2, run 2 is listing 5; all three are
# space separated, with the checksum in V1 and the file name in V3.
# NOTE(review): the positions 1, 2, 5 and 3 depend on the sort order of
# list.files() - confirm they still hold when the raw data change.
md5.run1 <- rbind(read.md5.listing(1, sep = " "),
                  read.md5.listing(2, sep = " "))

md5.run2 <- read.md5.listing(5, sep = " ")

# Run 3 is tab separated; V2 holds a path, reduced here to the bare file name.
md5.run3 <- read.md5.listing(3, sep = "\t")
md5.run3$V2 <- sub(".*/", "", md5.run3$V2)

# Two-column character matrix: column 1 = checksum, column 2 = file name.
provided.sums <- c(as.character(md5.run1$V1),
                   as.character(md5.run2$V1),
                   as.character(md5.run3$V1))
provided.names <- c(as.character(md5.run1$V3),
                    as.character(md5.run2$V3),
                    as.character(md5.run3$V2))
md5.run123 <- cbind(provided.sums, provided.names, deparse.level = 0)



In [40]:
# Compare the locally computed md5 checksums with those provided by BaseClear
#############################################################

# Sort both tables by file name so that row k refers to the same file in both.
# drop = FALSE keeps md5.run123 a matrix even if it has a single row.
md5dfs <- md5dfs[order(as.character(md5dfs$SampleID)), ]
md5.run123 <- md5.run123[order(md5.run123[, 2]), , drop = FALSE]

# `==` recycles the shorter vector silently when the lengths divide evenly,
# which would make the table below meaningless; refuse to compare instead.
# NOTE(review): rows are paired by sort position only - the file names
# themselves are not checked for equality here.
stopifnot(nrow(md5dfs) == nrow(md5.run123))

table(as.character(md5dfs$md5) == md5.run123[, 1])


TRUE 
 558 

In [41]:
# Create a fresh data/Dada2_input directory and copy all fastq.gz files into it
unlink("data/Dada2_input", recursive = TRUE)

dir.create("data/Dada2_input")

# Escaped and anchored pattern: only names that end in ".fastq.gz" literally.
# full.names = TRUE yields paths of the form "data/temp/<sub-folders>/<file>".
gz.all <- list.files("data/temp", pattern = "\\.fastq\\.gz$",
                     recursive = TRUE, full.names = TRUE)

# file.copy() does not overwrite by default and returns FALSE for every file
# it could not copy (e.g. two archives containing the same file name). Report
# those instead of dropping them silently, since data/temp is deleted later.
copied <- file.copy(from = gz.all, to = "data/Dada2_input")

if (!all(copied)) {
    warning("Could not copy ", sum(!copied), " file(s) to data/Dada2_input: ",
            paste(gz.all[!copied], collapse = ", "), call. = FALSE)
}

In [42]:
# Clean up: delete the scratch folder data/temp together with its contents
unlink(x = "data/temp", recursive = TRUE)


In [43]:
# Load the sample metadata table and list the files in data/Dada2_input
metadata.f <- read.csv(file = "data/metadata/samples_metadata.csv")

fastFiles <- list.files(path = "data/Dada2_input/")

# Number of files currently present in data/Dada2_input
length(fastFiles)

In [44]:
# Replace every "-" in the OriginalID column with "_"; the same substitution
# is applied to the file names in data/Dada2_input further down.
metadata.f[["OriginalID"]] <- gsub("-", "_", metadata.f[["OriginalID"]],
                                   fixed = TRUE)

In [45]:
# Replace all "-" with "_" in the file names of data/Dada2_input
##########################
fastFiles <- list.files("data/Dada2_input/")

# file.rename() is vectorised, so no index loop is needed. The previous
# 1:length(fastFiles) loop iterated over c(1, 0) on an empty directory and
# tried to rename the non-existent file "data/Dada2_input/NA".
fastFiles.new <- gsub("-", "_", fastFiles, fixed = TRUE)

# Only touch files whose name actually changes.
needs.rename <- fastFiles != fastFiles.new

renamed <- file.rename(
    from = file.path("data/Dada2_input", fastFiles[needs.rename]),
    to = file.path("data/Dada2_input", fastFiles.new[needs.rename])
)

if (!all(renamed)) {
    warning("Could not rename: ",
            paste(fastFiles[needs.rename][!renamed], collapse = ", "),
            call. = FALSE)
}


In [46]:
# Remove the files of samples that did not pass control
########################################

fastFiles <- list.files("data/Dada2_input/")

# Metadata rows flagged "Remove" in the FilesKeep column
s.remove <- metadata.f[metadata.f$FilesKeep %in% "Remove", ]

for (f in as.character(s.remove$OriginalID)) {
    if (is.na(f)) {
        next
    }

    # fixed = TRUE: the ID is matched literally, not interpreted as a regex.
    # NOTE(review): this is still a substring match, so an ID such as "S1"
    # also selects the files of "S10" - confirm the IDs cannot collide.
    hits <- fastFiles[grepl(f, fastFiles, fixed = TRUE)]

    # Without a match the previous paste0() call collapsed to the bare
    # directory path, which was then passed to unlink(); skip instead.
    if (length(hits) > 0) {
        unlink(file.path("data/Dada2_input", hits))
    }
}


In [47]:
# Rename samples
################
# Every kept sample's R1/R2 file is renamed from its original name to
# <NewId>_R1.fastq.gz / <NewId>_R2.fastq.gz inside data/Dada2_input.

fastFiles2 <- list.files("data/Dada2_input/")

# Subset metadata to the samples flagged "Keep"
metadata.f1 <- metadata.f[metadata.f$FilesKeep %in% "Keep", ]

metadata.f1$OriginalID <- gsub("-", "_", metadata.f1$OriginalID)

# seq_len() instead of 1:nrow(): no iteration when no sample is kept.
for (i in seq_len(nrow(metadata.f1))) {

    fg <- as.character(metadata.f1$OriginalID[i])

    if (is.na(fg)) {
        warning("Row ", i, " of the kept metadata has no OriginalID; skipped",
                call. = FALSE)
        next
    }

    # fixed = TRUE: match the ID literally rather than as a regex.
    # NOTE(review): substring match - an ID that is a prefix of another ID
    # selects the other sample's files as well and triggers the stop() below.
    select.f <- fastFiles2[grepl(fg, fastFiles2, fixed = TRUE)]

    for (read in c("R1", "R2")) {

        old.name <- select.f[grepl(read, select.f, fixed = TRUE)]

        # No file: previously the directory itself became the rename source.
        if (length(old.name) == 0) {
            warning("No ", read, " file found for sample ", fg,
                    call. = FALSE)
            next
        }

        # Several files: file.rename() failed with a length-mismatch error;
        # fail with a message that names the sample instead.
        if (length(old.name) > 1) {
            stop("Ambiguous ", read, " files for sample ", fg, ": ",
                 paste(old.name, collapse = ", "), call. = FALSE)
        }

        new.name <- paste0(metadata.f1$NewId[i], "_", read, ".fastq.gz")

        file.rename(from = file.path("data/Dada2_input", old.name),
                    to = file.path("data/Dada2_input", new.name))
    }
}

In [48]:
# Write the kept-sample metadata, without its first two columns, to disk
write.csv(x = metadata.f1[, -(1:2)],
          file = "data/metadata/samples_metadata_f.csv")

In [49]:
# Sanity check: count duplicated NewId values (a TRUE entry means duplicates)
table(duplicated(metadata.f1[["NewId"]]))


FALSE 
  262 