## Process amplicon sequences (16S + ITS)

Original from Christian Santos Medellin, rewritten by Anneliek ter Horst, May 2023
- Take amplicon libraries
- Demultiplex
- Check quality of libraries
- Run clustering algorithm, generating final OTU table and taxonomy table


In [None]:
# Create the project directory layout.
# There are two sequencing lanes: one for 16S and one for ITS.

# Go to the project folder (change this to your own directory)
cd /home/amhorst/amplicon_seq/

# Top-level working directories.
# -p makes this cell safe to re-run: directories that already exist are left
# untouched instead of making mkdir fail, and missing parents are created.
mkdir -p reads dada_files qual_plots scripts err log

# Per-lane read directories: forward reads (FWD), reverse reads (RVS) and
# quality-filtered reads (filtered)
cd reads
mkdir -p raw_16s/FWD raw_16s/RVS raw_16s/filtered
mkdir -p raw_its/FWD raw_its/RVS raw_its/filtered

# Place the raw reads of each lane in the appropriate folder (raw_16s or raw_its)

# Demultiplex reads
- Use BananaStand to demultiplex the raw reads
- https://github.com/bulksoil/BananaStand

In [None]:
## Demultiplex reads
# Submit the demultiplexing script with sbatch from the scripts folder
# (change the path to your corresponding directory).
# The log/err paths and the script name are relative, so sbatch is chained with
# && and only runs when the cd succeeded.
cd /home/amhorst/amplicon_seq/scripts \
  && sbatch --output=../log/dmx1.log --error=../err/dmx1.err dmx1.sh

In [None]:
#!/bin/bash
# dmx1.sh - demultiplex the 16S lane with BananaStand.
# This is the script submitted as "sbatch ... dmx1.sh" from the scripts folder.
# The interpreter line above has to stay the first line of the file: sbatch
# rejects batch scripts that do not start with "#!".

# Load the conda environment used to run demultiplex.py
source /home/csantosm/initconda
conda activate PYTHON2.7

# Work from the folder that holds the raw reads of this lane. All inputs below
# are relative paths, so stop if the folder cannot be entered.
cd /home/amhorst/amplicon_seq/reads/raw_16s || exit 1

# Options passed to demultiplex.py:
#   -f / -r      forward (R1) and reverse (R2) read files
#   --I1 / --I2  index read files
#   -m           map file, expected in this folder
#                (presumably sample-to-barcode mapping; see the BananaStand README)
#   -p           prefix; the downstream R code looks for files matching "T16S.*fastq"
#   -a           "dada2" (presumably selects the output layout used by the
#                dada2 steps; confirm in the BananaStand README)
python /home/csantosm/BananaStand/demultiplex.py \
-f ./Undetermined_S0_L001_R1_001.fastq.gz \
-r ./Undetermined_S0_L001_R2_001.fastq.gz \
--I1 ./Undetermined_S0_L001_I1_001.fastq.gz \
--I2 ./Undetermined_S0_L001_I2_001.fastq.gz \
-m dmx1.map \
-p T16S \
-a "dada2"

    

## Check read quality
- Plot the quality of libraries using dada2 in R
- Plotting scripts from Christian
- How to inspect read quality profiles: https://benjjneb.github.io/dada2/tutorial.html


In [None]:
# qual plot code
# Load libraries
library(Rcpp, lib.loc="/home/csantosm/R_Packages/R3.6.3/")
library(farver, lib.loc="/home/csantosm/R_Packages/R3.6.3/")
library(labeling, lib.loc="/home/csantosm/R_Packages/R3.6.3/")
library(digest, lib.loc="/home/csantosm/R_Packages/R3.6.3/")
library(dada2, lib.loc="/home/csantosm/R_Packages/R3.6.3/")

# Absolute paths to the demultiplexed reads of the 16S lane
fwd1.path <- "/home/amhorst/amplicon_seq/reads/raw_16s/FWD/" # forward
rvs1.path <- "/home/amhorst/amplicon_seq/reads/raw_16s/RVS/" # reverse

# Collect the fastq files whose names match the T16S prefix
fnFs1 <- sort(list.files(fwd1.path, pattern = "T16S.*fastq", full.names = TRUE))
fnRs1 <- sort(list.files(rvs1.path, pattern = "T16S.*fastq", full.names = TRUE))

# Fail early, before any PDF device is opened, when no libraries are found
if (length(fnFs1) == 0) {
  stop("No forward fastq files matching 'T16S.*fastq' in ", fwd1.path, call. = FALSE)
}
if (length(fnRs1) == 0) {
  stop("No reverse fastq files matching 'T16S.*fastq' in ", rvs1.path, call. = FALSE)
}

# Plot the first 10 forward libraries (or all of them if there are fewer).
# head() is used instead of [1:10] because indexing past the end of the vector
# yields NA file names. print() renders the plot explicitly, so the PDF is also
# filled when this code is run through source() instead of at the top level.
pdf("/home/amhorst/amplicon_seq/qual_plots/raw_T16S_FWD.pdf")
print(plotQualityProfile(head(fnFs1, 10)))
dev.off()

# Plot the first 10 reverse libraries (or all of them if there are fewer)
pdf("/home/amhorst/amplicon_seq/qual_plots/raw_T16S_RVS.pdf")
print(plotQualityProfile(head(fnRs1, 10)))
dev.off()


## Run the clustering algorithm
- Using dada2 in R
- For taxonomy of 16S: Use silva_v138.1 database
- For taxonomy of ITS: Use the UNITE database (v 10.05.2021)

Output files:

- Plots for the error training models (stored in the qual_plots folder). Read this if you want to learn more: https://benjjneb.github.io/dada2/tutorial.html
- otu1_16s.RDS (stored in the dada_files folder) - formatted OTU table ready to use in R (a .csv copy is written alongside)
- tax1_16s.RDS (stored in the dada_files folder) - formatted taxonomy table ready to use in R (a .csv copy is written alongside)


In [2]:
# Load libraries
library(Rcpp, lib.loc="/home/csantosm/R_Packages/R3.6.3/")
library(stringr, lib.loc="/home/csantosm/R_Packages/R3.6.3/")
library(farver, lib.loc="/home/csantosm/R_Packages/R3.6.3/")
library(labeling, lib.loc="/home/csantosm/R_Packages/R3.6.3/")
library(digest, lib.loc="/home/csantosm/R_Packages/R3.6.3/")
library(magrittr, lib.loc="/home/csantosm/R_Packages/R3.6.3/")
library(dplyr, lib.loc="/home/csantosm/R_Packages/R3.6.3/")
library(tidyr, lib.loc="/home/csantosm/R_Packages/R3.6.3/")
library(dada2, lib.loc="/home/csantosm/R_Packages/R3.6.3/")

# Absolute paths for the FWD and RVS files of this lane (16S)
fwd.path <- "/home/amhorst/amplicon_seq/reads/raw_16s/FWD/"  ### change this to your path with the demultiplexed forward reads for this lane
rvs.path <- "/home/amhorst/amplicon_seq/reads/raw_16s/RVS/"  ### change this to your path with the demultiplexed reverse reads for this lane

# Folder that will hold the filtered reads for this lane. This is the per-lane
# "filtered" folder created during the directory setup, so that 16S and ITS
# filtered reads do not end up mixed in one shared folder.
filt.path <- "/home/amhorst/amplicon_seq/reads/raw_16s/filtered"  ### change to the empty folder that will hold the filtered reads for this lane

# Forward and reverse fastq files; file names start with the sample prefix
fnFs <- sort(list.files(fwd.path, pattern = "T16S.*fastq", full.names = TRUE)) ### change the pattern to fit the prefix used for your SampleIDs, in this case the prefix is "T16S"
fnRs <- sort(list.files(rvs.path, pattern = "T16S.*fastq", full.names = TRUE)) ### change the pattern to fit the prefix used for your SampleIDs, in this case the prefix is "T16S"

# Stop early with a clear message rather than failing somewhere inside dada2
if (length(fnFs) == 0) {
  stop("No forward fastq files matching 'T16S.*fastq' in ", fwd.path, call. = FALSE)
}
if (length(fnFs) != length(fnRs)) {
  stop("Found ", length(fnFs), " forward but ", length(fnRs),
       " reverse fastq files; every sample needs both", call. = FALSE)
}

# Extract sample names from the file names: the prefix followed by 3 digits.
# str_extract() is vectorised and returns the first match per file name.
sample.names <- str_extract(basename(fnFs), "T16S\\d\\d\\d") ### change the pattern to the naming convention you use for the samples, in this case "T16S" followed by 3 digits

# Sample names become file names and row names further on, so they have to be
# present, unique, and in the same order for the forward and reverse files
if (anyNA(sample.names) || anyDuplicated(sample.names) > 0) {
  stop("Sample names extracted from the forward file names are missing or duplicated",
       call. = FALSE)
}
if (!identical(sample.names, str_extract(basename(fnRs), "T16S\\d\\d\\d"))) {
  stop("Forward and reverse fastq files do not pair up by sample name", call. = FALSE)
}

# Generate the filtered file paths, named by sample
filtFs <- file.path(filt.path, paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(filt.path, paste0(sample.names, "_R_filt.fastq.gz"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names


# Quality-filter and truncate the paired reads. `out` has one row per input
# library with the read counts before and after filtering.
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen = c(240, 160),  #### change truncLen based on your quality scores
                     maxN = 0, maxEE = c(2, 2), truncQ = 2, rm.phix = TRUE,
                     compress = TRUE, multithread = TRUE, verbose = TRUE)

out

# filterAndTrim does not write an output file for a sample that loses all of
# its reads. Such samples are dropped here, because the following steps fail on
# missing files. `keep` is reused below to keep the tracking table aligned.
keep <- file.exists(filtFs) & file.exists(filtRs)
if (!any(keep)) {
  stop("No sample has reads left after filtering; check truncLen and maxEE",
       call. = FALSE)
}
if (!all(keep)) {
  warning("Dropping samples without reads after filtering: ",
          paste(names(filtFs)[!keep], collapse = ", "), call. = FALSE)
}
filtFs <- filtFs[keep]
filtRs <- filtRs[keep]

# Learn the error models for the forward and reverse reads
errF <- learnErrors(filtFs, multithread = TRUE, randomize = TRUE)
errR <- learnErrors(filtRs, multithread = TRUE, randomize = TRUE)

# Write the error-model plots to file. print() renders each plot explicitly, so
# the PDFs are also filled when this code is run through source().
pdf("/home/amhorst/amplicon_seq/qual_plots/err1_FWD.pdf")
print(plotErrors(errF, nominalQ = TRUE))
dev.off()

pdf("/home/amhorst/amplicon_seq/qual_plots/err1_RVS.pdf")
print(plotErrors(errR, nominalQ = TRUE))
dev.off()

# Denoise the forward and reverse reads with the learned error models
dadaFs <- dada(filtFs, err = errF, multithread = TRUE)
dadaRs <- dada(filtRs, err = errR, multithread = TRUE)

# Inspect the result for the first sample
dadaFs[[1]]
dadaRs[[1]]

# Merge the denoised forward and reverse reads
mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose = TRUE)
# Inspect the merger data.frame from the first sample
head(mergers[[1]])

# Build the sequence table and look at the distribution of sequence lengths
seqtab <- makeSequenceTable(mergers)
dim(seqtab)
table(nchar(getSequences(seqtab)))

# Remove chimeras, then report the fraction of reads that is kept
seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus", multithread = TRUE, verbose = TRUE)
dim(seqtab.nochim)
sum(seqtab.nochim) / sum(seqtab)
table(nchar(getSequences(seqtab.nochim)))

# Track the number of reads that made it through each step, per sample.
# Rows of `out` for dropped samples are excluded so every column lines up.
getN <- function(x) sum(getUniques(x))
track <- cbind(out[keep, , drop = FALSE],
               vapply(dadaFs, getN, numeric(1)),
               vapply(dadaRs, getN, numeric(1)),
               vapply(mergers, getN, numeric(1)),
               rowSums(seqtab.nochim))

colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names[keep]
head(track)


# Assign taxonomy to the chimera-free sequences with the SILVA v138.1 training
# set, then add species assignments from the SILVA species file
taxa <- seqtab.nochim %>%
  assignTaxonomy(
    "/home/csantosm/databases/silva_v138.1/silva_nr99_v138.1_train_set.fa.gz",
    multithread = TRUE
  ) %>%
  addSpecies("/home/csantosm/databases/silva_v138.1/silva_species_assignment_v138.1.fa.gz")

# Copy without the sequence row names, used for display only
taxa.print <- taxa
rownames(taxa.print) <- NULL
head(taxa.print)

# Transposed sequence table: sequences as rows, samples as columns
otu <- t(seqtab.nochim)

# Taxonomy as a data frame, with the row names moved into an OTU_ID column
otu.tax <- as.data.frame(taxa)
otu.tax$OTU_ID <- row.names(otu.tax)
rownames(otu.tax) <- NULL

# Write every table both as RDS (for R) and as csv

# dada2 sequence table (samples as rows)
saveRDS(seqtab.nochim, "/home/amhorst/amplicon_seq/dada_files/otu_dada1_16s.RDS")
write.csv(seqtab.nochim, "/home/amhorst/amplicon_seq/dada_files/otu_dada1_16s.csv")

# dada2 taxonomy matrix
saveRDS(taxa, "/home/amhorst/amplicon_seq/dada_files/tax_dada1_16s.RDS")
write.csv(taxa, "/home/amhorst/amplicon_seq/dada_files/tax_dada1_16s.csv")

# Formatted OTU table
saveRDS(otu, "/home/amhorst/amplicon_seq/dada_files/otu1_16s.RDS")
write.csv(otu, "/home/amhorst/amplicon_seq/dada_files/otu1_16s.csv")

# Formatted taxonomy table
saveRDS(otu.tax, "/home/amhorst/amplicon_seq/dada_files/tax1_16s.RDS")
write.csv(otu.tax, "/home/amhorst/amplicon_seq/dada_files/tax1_16s.csv")
