#### Reformat DADA2 output as QIIME2 QZA files
This includes the replacement of sequence IDs with MD5 hashes

In [2]:
import pandas as pd
import hashlib
from Bio import SeqIO

#### Reformat the feature table

In [3]:
df = pd.read_csv('dada_table.csv',index_col=0).transpose()
df.index = [hashlib.md5(i).hexdigest() for i in df.index]
df.index.name = '#Feature_id'
df.to_csv('temp',sep='\t')

!biom convert -i temp -o dada_table.biom --table-type="OTU table" --to-hdf5
!rm temp

In [4]:
%%bash
# Import the HDF5 BIOM table as a QIIME 2 FeatureTable[Frequency] artifact.
source activate qiime2-2019.4 \
  && qiime tools import \
       --type "FeatureTable[Frequency]" \
       --input-format BIOMV210Format \
       --input-path dada_table.biom \
       --output-path dada_table.qza

Imported dada_table.biom as BIOMV210Format to dada_table.qza


QIIME is caching your current deployment for improved performance. This may take a few moments and should only happen once per deployment.


In [5]:
%%bash
# Build the summary visualization (.qzv) for the imported feature table.
source activate qiime2-2019.4 \
  && qiime feature-table summarize \
       --o-visualization dada_table.qzv \
       --i-table dada_table.qza

Saved Visualization to: dada_table.qzv


#### Reformat the representative sequences

In [6]:
# Load every representative sequence into memory so the records can be renamed.
repseqs = list(SeqIO.parse('dada-rep-seqs.fasta','fasta'))
for r in repseqs:
    # Use the MD5 of the sequence text as the record ID. hashlib.md5 requires
    # bytes on Python 3, hence the explicit encode.
    r.id = hashlib.md5(str(r.seq).encode('utf-8')).hexdigest()
    # Clear the description so the FASTA header contains only the hash.
    r.description = ''
# Last expression of the cell: SeqIO.write returns the number of records written.
SeqIO.write(repseqs,'temp','fasta')

28142

In [7]:
%%bash
# Import the renamed FASTA as a FeatureData[Sequence] artifact.
# The cleanup is chained with && so that a failed import is not masked by a
# successful `rm`, and the intermediate file is kept for inspection on failure.
source activate qiime2-2019.4 && \
  qiime tools import \
  --input-path temp \
  --output-path dada-rep-seqs.qza \
  --type 'FeatureData[Sequence]' && \
  rm temp

Imported temp as DNASequencesDirectoryFormat to dada-rep-seqs.qza


#### Reformat the taxonomy table

In [8]:
# Convert the DADA2 taxonomy CSV into a headerless two-column TSV:
#   <md5 of the first CSV field> <TAB> <remaining fields joined with ';'>
# Both files are opened in one `with` so the input handle is closed as well.
with open('dada_taxa.csv') as src, open('temp.tsv','wt') as hndl:
    records = []
    for l in src:
        # Skip the header row, which starts with an empty quoted field.
        if l.startswith('""'):
            continue
        # Skip blank lines so they do not become a record with an empty ID.
        if not l.strip():
            continue
        parts = l.rstrip().replace('"','').split(',')
        # hashlib.md5 requires bytes on Python 3, hence the explicit encode.
        feature_id = hashlib.md5(str(parts.pop(0)).encode('utf-8')).hexdigest()
        records.append(feature_id + '\t' + ';'.join(parts))
    # Records are newline-separated with no trailing newline, as before.
    hndl.write('\n'.join(records))

In [9]:
%%bash
# Import the headerless taxonomy TSV as a FeatureData[Taxonomy] artifact.
# The type is quoted so the shell cannot treat [Taxonomy] as a glob pattern.
# The cleanup is chained with && so that a failed import is not masked by a
# successful `rm`, and the intermediate file is kept for inspection on failure.
source activate qiime2-2019.4 && \
 qiime tools import \
 --type 'FeatureData[Taxonomy]' \
 --input-path temp.tsv \
 --output-path dada_taxa.qza \
 --input-format HeaderlessTSVTaxonomyFormat && \
 rm temp.tsv

Imported temp.tsv as HeaderlessTSVTaxonomyFormat to dada_taxa.qza


QIIME is caching your current deployment for improved performance. This may take a few moments and should only happen once per deployment.
