## Download
Do not run this cell: it takes hours to complete.

In [None]:
%%bash
# Gather the IDs of all samples whose sample_type is "stool" or "Stool"
# (one metadata search per spelling) into the single file stool_samples.
{
    redbiom search metadata 'where sample_type == "stool"'
    redbiom search metadata 'where sample_type == "Stool"'
} > stool_samples
# Fetch those samples from the chosen context and write them to stool_sv.biom.
export CTX=Deblur-illumina-16S-v4-150nt-10d7e0
redbiom fetch samples --from stool_samples --context "$CTX" --output stool_sv.biom

## Preprocess
Do not run this cell: it takes overnight to finish.

In [3]:
%%bash
# Trim the reference sequences in 99_otus.fasta to the region between the
# given primers, then BLAST every observation of stool_sv.biom against the
# trimmed reference to produce stool_sv_map.blast.
#
# Stop at the first failing command: without this, a failed step would let
# the later (very long) steps run on missing or stale intermediate files.
set -euo pipefail

# Import the reference FASTA as a QIIME 2 artifact. The type is quoted so the
# shell can never treat the square brackets as a filename glob pattern.
qiime tools import --input-path 99_otus.fasta --output-path 99_otus.qza --type 'FeatureData[Sequence]'
# Extract the reads delimited by the forward/reverse primers.
qiime feature-classifier extract-reads --i-sequences 99_otus.qza --p-f-primer GTGYCAGCMGCCGCGGTAA --p-r-primer GGACTACNVGGGTWTCTAAT --o-reads 99_otus_v4.qza
# Export the trimmed reads back to FASTA and give the file a stable name.
qiime tools export 99_otus_v4.qza --output-dir .
mv dna-sequences.fasta 99_otus_v4.fasta
# Build the BLAST query file: each observation ID becomes a record whose
# header is the ID plus a "blast_rocks" suffix and whose sequence line is the
# ID itself (this assumes observation IDs are the sequences themselves).
biom table-ids --observations -i stool_sv.biom | awk '{print ">"$1"blast_rocks\n"$1}' > stool_sv.fasta
# Index the trimmed reference and run BLAST, writing two tab-separated
# columns per hit: query accession and subject accession.
makeblastdb -in 99_otus_v4.fasta -dbtype nucl -out 99_otus_v4.db
blastn -num_threads 4 -query stool_sv.fasta -outfmt "6 qacc sacc" -db 99_otus_v4.db -max_target_seqs 1 -out stool_sv_map.blast

## Simulate

In [9]:
import csv
import shutil
import io

import biom
from numpy.random import choice
import skbio.io

In [2]:
# Load the BIOM table written by the Download step.
stool_sv_path = 'stool_sv.biom'
stool_sv = biom.load_table(stool_sv_path)

In [41]:
# Build a mapping from each query sequence variant to the subject (reference)
# ID in the BLAST results. Query IDs must end with the 'blast_rocks' suffix,
# which is stripped to recover the variant. A variant may appear on several
# rows, but every row must name the same subject; otherwise the assertion
# fails with the variant and the two conflicting subject IDs.
blast_suffix = 'blast_rocks'
stool_sv_map = {}
with open('stool_sv_map.blast') as blast_results:
    for fields in csv.reader(blast_results, csv.excel_tab):
        query_id = fields[0]
        assert query_id.endswith(blast_suffix)
        sv = query_id[:-len(blast_suffix)]
        # setdefault stores the subject on first sight and otherwise returns
        # the one already recorded, so one check covers both cases.
        recorded = stool_sv_map.setdefault(sv, fields[1])
        assert recorded == fields[1],\
            ' '.join([sv, recorded, fields[1]])

In [42]:
# Index the trimmed reference FASTA: sequence ID -> sequence as a plain string.
with open('99_otus_v4.fasta') as ref_fh:
    ref_seqs = {
        record.metadata['id']: str(record)
        for record in skbio.io.read(ref_fh, 'fasta')
    }

In [48]:
# Draw one sample ID at random and reduce the table to that single sample.
# NOTE(review): no seed is set in the visible cells, so each run presumably
# picks a different sample -- confirm whether that is intended.
sample_ids = stool_sv.ids()
random_sample_id = sample_ids[choice(stool_sv.length())]
random_sample = stool_sv.filter([random_sample_id], inplace=False)
# Keep only observations whose value in that one sample exceeds 1e-9.
# Left as the cell's final expression so the filtered table is displayed.
random_sample.filter(lambda counts, _, __: counts[0] > 1e-9, axis='observation', inplace=True)

538 x 1 <class 'biom.table.Table'> with 538 nonzero entries (100% dense)

In [53]:
# Write abundance.fasta: one record per observation of the sampled table that
# has an entry in stool_sv_map. The header is '>' + variant + ';size=' + the
# integer-truncated value; the sequence line is the mapped reference sequence.
# Observations without a mapping are skipped.
with open('abundance.fasta', 'w') as a_fh:
    for counts, sv, _ in random_sample.iter(axis='observation'):
        abundance = int(counts[0])
        if sv not in stool_sv_map:
            continue
        a_fh.write('>{};size={}\n'.format(sv, abundance))
        a_fh.write('{}\n'.format(ref_seqs[stool_sv_map[sv]]))

In [67]:
%%bash
# Turn abundance.fasta into simulated Illumina reads with ART, import them
# into QIIME 2 through a single-sample manifest, and denoise with Deblur.
#
# Stop at the first failing command so a broken step cannot feed stale or
# missing files into the steps after it.
set -euo pipefail

# Expand the ';size=' annotated records into individual reads.
vsearch --rereplicate abundance.fasta --output prior_art.fasta
# Make the ART binaries in the local art_bin_MountRainier directory callable.
export PATH=$PATH:art_bin_MountRainier
# Amplicon-mode simulation of 150 nt reads with the built-in HS25 profile;
# writes post_art.fq plus the .aln and .sam alignment files.
art_illumina -ss HS25 -sam -amp -i prior_art.fasta -l 150 -o post_art -c 1
# Manifest pointing QIIME 2 at the simulated FASTQ; $PWD is expanded by the
# shell because the manifest column is an absolute file path.
cat > post_art.manifest << END
sample-id,absolute-filepath,direction
sample-1,$PWD/post_art.fq,forward
END
# The type is quoted so the shell can never treat the square brackets as a
# filename glob pattern.
qiime tools import --input-path post_art.manifest --output-path post_art.qza --type 'SampleData[SequencesWithQuality]' --source-format SingleEndFastqManifestPhred33
# Alternative denoiser, kept for reference:
#qiime dada2 denoise-single --i-demultiplexed-seqs post_art.qza --o-representative-sequences dada2-seqs.qza --o-table simulated.qza --p-trunc-len 0 --p-n-threads 4 --p-no-hashed-feature-ids
# Note: the representative sequences keep the dada2-seqs.qza file name even
# though Deblur is the denoiser that produces them here.
qiime deblur denoise-16S --i-demultiplexed-seqs post_art.qza --o-representative-sequences dada2-seqs.qza --o-table simulated.qza --p-trim-length -1 --p-sample-stats --o-stats stats.qza --p-jobs-to-start 4 --p-no-hashed-feature-ids


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

              Amplicon 5'-end sequencing simulation

Total CPU time used: 3.00006

The random seed for the run: 1521606071

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	# Reads per Amplion:       0
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 

Output files

  FASTQ Sequence File:
	post_art.fq

  ALN Alignment File:
	post_art.aln

  SAM Alignment File:
	post_art.sam

Saved FeatureTable[Frequency] to: simulated.qza
Saved FeatureData[Sequence] to: dada2-seqs.qza
Saved DeblurStats to: stats.qzv.qza


vsearch v2.5.0_macos_x86_64, 16.0GB RAM, 8 cores
https://github.com/torognes/vsearch

Rereplicating 100%
Rereplicated 35947 reads from 538 amplicons
