In [23]:
import glob
import hashlib
import pandas as pd
from Bio import SeqIO

### Extract gz files for dada2

In [10]:
!mkdir DB
!mkdir temp
!mkdir -p dada/ITS1 

for i in glob.glob('raw_data/ITS1/*.fastq.gz'):
    outf = 'temp/%s' % i.split('/')[-1].replace('.gz','')
    !zcat $i > $outf

mkdir: cannot create directory ‘DB’: File exists
mkdir: cannot create directory ‘temp’: File exists


### Write dada2 script

In [16]:
%%file ITS1_DADA2.R

# DADA2 workflow for the paired-end ITS1 reads: filter/trim, learn error rates,
# dereplicate, denoise, join read pairs, remove chimeras, assign taxonomy and
# write the sequence table, representative sequences and taxonomy to dada/ITS1.
library(Matrix);
library(dada2);

path <- "~/GeoDiversity/temp" # CHANGE ME to the directory containing the fastq files after unzipping.
# Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq
fnFs <- sort(list.files(path, pattern="_R1_001.fastq", full.names = TRUE))
fnRs <- sort(list.files(path, pattern="_R2_001.fastq", full.names = TRUE))
# Sample name = everything before the first "_" of the forward file name
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
# Filtered reads are written to <path1>/ITS1/ as gzipped fastq
path1 <- "~/GeoDiversity/cut_reads"
filtFs <- file.path(path1, "ITS1", paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(path1, "ITS1", paste0(sample.names, "_R_filt.fastq.gz"))
# Truncate both reads to 125 nt; maxEE is given as c(forward, reverse)
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(125,125),
              maxN=0, maxEE=c(4,5), truncQ=10, rm.phix=TRUE, trimLeft=0,
              compress=TRUE, multithread=TRUE)
# learn F errors
errF <- learnErrors(filtFs, multithread=TRUE)
print('learn R errors')
errR <- learnErrors(filtRs, multithread=TRUE)

# derep F
derepFs <- derepFastq(filtFs, verbose=TRUE)
# derep R
derepRs <- derepFastq(filtRs, verbose=TRUE)
# Name the derep-class objects by the sample names
names(derepFs) <- sample.names
names(derepRs) <- sample.names

# dada F
dadaFs <- dada(derepFs, err=errF, multithread=TRUE)
# dada R
dadaRs <- dada(derepRs, err=errR, multithread=TRUE)

# merging: justConcatenate=TRUE joins each forward/reverse pair end-to-end
# rather than requiring the two reads to overlap
mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE, justConcatenate=TRUE)
seqtab <- makeSequenceTable(mergers)

# chimera removal
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)

# assign taxonomy
# Everything downstream uses the chimera-free table; previously seqtab.nochim
# was computed and then discarded, so chimeric sequences reached every output.
taxa <- assignTaxonomy(seqtab.nochim, "~/GeoDiversity/DB/sh_general_release_dynamic_02.02.2019.fasta")

# write outputs (all three derive from the same table so their sequences agree)
uniquesToFasta(seqtab.nochim, 'dada/ITS1/rep-seqs.fasta')
write.csv(seqtab.nochim, file = "dada/ITS1/table.csv")
write.csv(taxa, file = "dada/ITS1/taxa.csv")

Overwriting ITS1_DADA2.R


### Run dada2

In [17]:
# Run the DADA2 R script (ITS1_DADA2.R, written by the %%file cell) with Rscript.
!Rscript ITS1_DADA2.R

Loading required package: Rcpp
Creating output directory: /home/amir/GeoDiversity/cut_reads/ITS1
101497500 total bases in 811980 reads from 62 samples will be used for learning the error rates.
[1] "learn R errors"
101497500 total bases in 811980 reads from 62 samples will be used for learning the error rates.
Dereplicating sequence entries in Fastq file: ~/GeoDiversity/cut_reads/ITS1/ITS-AS3000_F_filt.fastq.gz
Encountered 5657 unique sequences from 20211 total sequences read.
Dereplicating sequence entries in Fastq file: ~/GeoDiversity/cut_reads/ITS1/ITS-AS3001_F_filt.fastq.gz
Encountered 3698 unique sequences from 15477 total sequences read.
Dereplicating sequence entries in Fastq file: ~/GeoDiversity/cut_reads/ITS1/ITS-AS3002_F_filt.fastq.gz
Encountered 3981 unique sequences from 17678 total sequences read.
Dereplicating sequence entries in Fastq file: ~/GeoDiversity/cut_reads/ITS1/ITS-AS3003_F_filt.fastq.gz
Encountered 1817 unique sequences from 8343 total sequences read.
Dereplica

### Import to qiime2

In [24]:
# add md5 hashes to table
df = pd.read_csv('dada/ITS1/table.csv',index_col=0).transpose()
df.index = [hashlib.md5(i.encode()).hexdigest() for i in df.index]
df.index.name = '#Feature_id'
df.to_csv('temp1',sep='\t')

# convert table to biom
!biom convert -i temp1 -o dada/ITS1/table.biom --table-type="OTU table" --to-hdf5
!rm temp1

# convert biom table to qza
!qiime tools import \
  --input-path dada/ITS1/table.biom \
  --type 'FeatureTable[Frequency]' \
  --input-format BIOMV210Format \
  --output-path dada/ITS1/table.qza

# add md5 hashes to sequence headers
repseqs = list(SeqIO.parse('dada/ITS1/rep-seqs.fasta','fasta'))
for r in repseqs:
    r.id = hashlib.md5(str(r.seq).encode()).hexdigest()
    r.description = ''
SeqIO.write(repseqs,'temp1','fasta')

# convert fasta to qza
!qiime tools import \
  --input-path temp1 \
  --output-path dada/ITS1/rep-seqs.qza \
  --type 'FeatureData[Sequence]'

!rm temp1

# add md5 hashes to taxonomy table
with open('temp.tsv','wt') as hndl:
    first = True
    for l in open('dada/ITS1/taxa.csv'):
        if l.startswith('""'):
            continue
        if first:
            first=False
        else:
            hndl.write('\n')
        parts = l.rstrip().replace('"','').split(',')
        hndl.write(hashlib.md5(str(parts.pop(0)).encode()).hexdigest())
        hndl.write('\t')
        hndl.write(';'.join(parts))
        
# convert taxonomy table to qza
!qiime tools import \
   --type FeatureData[Taxonomy] \
   --input-path temp.tsv \
   --output-path dada/ITS1/taxonomy.qza \
   --input-format HeaderlessTSVTaxonomyFormat

!rm temp.tsv
!rm -r temp

[32mImported dada/ITS1/table.biom as BIOMV210Format to dada/ITS1/table.qza[0m
[32mImported temp1 as DNASequencesDirectoryFormat to dada/ITS1/rep-seqs.qza[0m
[32mImported temp.tsv as HeaderlessTSVTaxonomyFormat to dada/ITS1/taxonomy.qza[0m


### make tree

In [26]:
# Run QIIME 2's align-to-tree-mafft-fasttree pipeline on the representative
# sequences; it writes the alignment, masked alignment, unrooted tree and
# rooted tree artifacts under dada/ITS1/.
!qiime phylogeny align-to-tree-mafft-fasttree \
  --i-sequences dada/ITS1/rep-seqs.qza \
  --o-alignment dada/ITS1/aligned-rep-seqs.qza \
  --o-masked-alignment dada/ITS1/masked-aligned-rep-seqs.qza \
  --o-tree dada/ITS1/unrooted-tree.qza \
  --o-rooted-tree dada/ITS1/rooted-tree.qza

[32mSaved FeatureData[AlignedSequence] to: dada/ITS1/aligned-rep-seqs.qza[0m
[32mSaved FeatureData[AlignedSequence] to: dada/ITS1/masked-aligned-rep-seqs.qza[0m
[32mSaved Phylogeny[Unrooted] to: dada/ITS1/unrooted-tree.qza[0m
[32mSaved Phylogeny[Rooted] to: dada/ITS1/rooted-tree.qza[0m


### ITS1 metadata

In [27]:
# Derive the ITS1 metadata from the 16S metadata: the header row is copied
# verbatim and every sample row gets an 'ITS-' prefix on its first column.
# Both files are opened in one `with`, so the input handle is closed as well.
with open('16SrRNA_metadata.tsv') as source, open('ITS1_metadata.tsv','wt') as hndl:
    for line_number, l in enumerate(source):
        is_header = line_number == 0
        # Blank lines and '#' comment/directive rows are not sample rows;
        # prefixing them would turn them into bogus sample IDs.
        if is_header or not l.strip() or l.startswith('#'):
            hndl.write(l)
        else:
            hndl.write('ITS-'+l)