## use sourmash to make a depth file similar to read depth but based on kmers
- symlink the read sketches to folder
- create a file from all contigs
- sketch the contigs into kmers (k21,31,51)
- collect the sigs into sql db
- Run fastmultigather of all contigs <-> all reads
- Run mgmanysearch, using the fastmultigather (fmg) result as a picklist, via snakemake


In [None]:
# Request an interactive bash shell through Slurm:
# partition bmm, 1 CPU, 10 GB of memory, 5 hour time limit.
srun \
  --account=ctbrowngrp \
  --partition=bmm \
  --job-name=sketch \
  --time=5:00:00 \
  --cpus-per-task=1 \
  --mem=10gb \
  --pty bash

In [None]:
# Symlink the read signatures into the current directory, one accession
# prefix at a time (ERR113518* first, then ERR113517*).
read_sig_dir=../../../2023-swine-sra/sourmash/sig_files/sketch_reads
for prefix in ERR113518 ERR113517; do
  ln -s "${read_sig_dir}/${prefix}"* .
done

# Use manysearch to look for the abundance of contigs in the metagenome reads
- Does not work with full read files right now
- Try the following:
    - Run manysearch with a subsetted read file, made from forward and reverse reads
    - then try with the full read file
    


In [None]:
# Goal: get manysearch to work.
# Step 1 runs on a small subset of the reads (WORKS); step 2 repeats the same
# sketch + manysearch on the full read set.

# Build the subset: keep the first 1000 lines of each mate file.
# (That is 250 reads if the FASTQ records are the usual 4 lines each.)
qc_dir=/group/ctbrowngrp2/scratch/annie/2023-swine-sra/atlas/atlas_ERR1135178/ERR1135178/sequence_quality_control
for mate in R2 R1; do
  gunzip -c "${qc_dir}/ERR1135178_QC_${mate}.fastq.gz" \
    | head -n 1000 \
    | gzip > "ERR1135178_QC_${mate}.subset.fq.gz"
done

# Sketch parameters shared by the subset and the full run:
# k=21, 31 and 51, abundance tracking on, scaled=100.
sketch_params=k=21,k=31,k=51,abund,scaled=100
# Second manysearch input; presumably the contig sketches -- TODO confirm.
contig_db=../sourmash_sketches/ERR11351.zip

# Sketch the two subset files.
sourmash sketch dna \
  -p "${sketch_params}" \
  ERR1135178_QC_R1.subset.fq.gz ERR1135178_QC_R2.subset.fq.gz \
  --name ERR1135178 -o ../sketch_reads/ERR1135178_QC.subset.zip

# manysearch on the subset sketch.
# NOTE(review): the sketch above is written to ../sketch_reads/ but is read
# here from ../sourmash_sketches/sketch_reads/ -- these only agree if the
# working directory changes between the two steps; confirm.
sourmash scripts manysearch \
  ../sourmash_sketches/sketch_reads/ERR1135178_QC.subset.zip \
  "${contig_db}" \
  -k 21 --scaled 100 -o ERR1135178_QC.subset.csv -c 1

# Full read set: sketch and manysearch in exactly the same way.
sourmash sketch dna \
  -p "${sketch_params}" \
  ERR1135178_QC_R1.fastq.gz ERR1135178_QC_R2.fastq.gz \
  --name ERR1135178 -o ../sketch_reads/ERR1135178.zip

sourmash scripts manysearch \
  ../sourmash_sketches/sketch_reads/ERR1135178.zip \
  "${contig_db}" \
  -k 21 --scaled 100 -o ERR1135178.csv -c 1



## Compare mgmanysearch and manysearch output
- Create a small subset of data
- Run manysearch and mgmanysearch on these
- Get manysearch to run in general...



In [None]:
# Build a tiny test set: the first 100 lines of the R2 file
# (25 reads if the FASTQ records are the usual 4 lines each).
r2_fastq=/group/ctbrowngrp2/scratch/annie/2023-swine-sra/atlas/atlas_ERR1135178/ERR1135178/sequence_quality_control/ERR1135178_QC_R2.fastq.gz
gunzip -c "${r2_fastq}" \
  | head -n 100 \
  | gzip > ERR1135178_QC_R2.subset.fq.gz

# Sketch the subset (k=21, 31 and 51, abundance tracking, scaled=100).
sourmash sketch dna \
  -p k=21,k=31,k=51,abund,scaled=100 \
  ERR1135178_QC_R2.subset.fq.gz \
  --name ERR1135178 -o ../sketch_reads/ERR1135178_QC_R2.subset.zip

# Sketch path as it is read by both searches below.
# NOTE(review): differs from the ../sketch_reads/ path written above, so the
# working directory presumably changes between steps -- confirm.
subset_sig=../sourmash_sketches/sketch_reads/ERR1135178_QC_R2.subset.zip

# Run manysearch on the subset sketch.
sourmash scripts manysearch \
  "${subset_sig}" \
  ../sourmash_sketches/ERR11351.zip \
  -k 21 --scaled 100 -o ERR1135178_QC_R2.subset.csv -c 1

# Keep only the signature(s) matching the successful hit, so there is less
# to compare.
sourmash sig grep 'ERR1135178_5527' ERR11351.zip -o ERR11351_5527.zip

# Run mgmanysearch on the same data. This works, but the output file it
# creates has abundances of 1.
sourmash scripts mgmanysearch \
  --against "${subset_sig}" \
  --queries ../sourmash_sketches/ERR11351_5527.zip \
  -k 21 --scaled 100 -o ERR1135178_QC_R2.subset.mgm.csv

In [None]:
# Retry at a scaled of 100, threshold of 10.
# Sketch the reads and save them as .sig.gz.
# Plan: make a zip for each fasta, concatenate those, then make a mf
# (presumably a manifest -- TODO confirm).
#
# The loop prints one `sourmash sketch` command line per *_R1.fastq.gz file
# (paired with its *_R2.fastq.gz mate); GNU parallel then executes the printed
# commands, 40 at a time.
for f in *_R1.fastq.gz; do
  # With no matching files the glob stays literal; skip it instead of
  # emitting a command for a file that does not exist.
  [ -e "$f" ] || continue
  echo sourmash sketch dna \
    -p abund,k=31,scaled=100,abund,k=21,scaled=100,abund,k=51,scaled=100 \
    "$f" "${f%_R1*}_R2.fastq.gz" \
    --name "${f%_QC*}" -o "${f%_QC*}.abund.sig.gz"
done | parallel -j 40

# R1-only sketch of a single sample, written to a zip. This is one plain
# command (there is no enclosing loop), so it is run directly and is not
# piped into parallel.
sourmash sketch dna \
  -p abund,k=31,scaled=100,abund,k=21,scaled=100,abund,k=51,scaled=100 \
  ERR1135186_QC_R1.fastq.gz \
  --name ERR1135186 -o ERR1135186_R1.abund.zip



#manysketch??

In [None]:
# Run the snakefile.

# Interactive Slurm shell: partition med2, 4 CPUs, 50 GB memory, 8.5 h limit.
srun \
  --account=ctbrowngrp \
  --partition=med2 \
  --job-name=sketch \
  --time=8:30:00 \
  --cpus-per-task=4 \
  --mem=50gb \
  --pty bash

mamba activate branchwater

# NOTE(review): snakemake is given -c 24 while the srun request above asks for
# 4 CPUs -- confirm that this is intended.
snakemake \
  --resources mem_mb=50000 \
  --rerun-triggers mtime \
  -c 24 \
  --rerun-incomplete \
  -k \
  --latency-wait 1 \
  -n


In [None]:
# Try manysearch.
# Interactive Slurm shell: partition high2, 100 CPUs, 50 GB memory, 1 h limit.
srun \
  --account=ctbrowngrp \
  --partition=high2 \
  --job-name=bwabund \
  --time=1:00:00 \
  --cpus-per-task=100 \
  --mem=50gb \
  --pty bash
mamba activate branchwater-abund


# manysearch: full read sketch against the full ERR11351 sketch collection.
# The environment is activated a second time here, exactly as in the
# original notes.
mamba activate branchwater-abund
full_reads_sig=../sourmash_sketches/sketch_reads/ERR1135186.abund.sig.gz
full_db=../sourmash_sketches/ERR11351.zip
sourmash scripts manysearch \
  "${full_reads_sig}" \
  "${full_db}" \
  -k 21 --scaled 100 -o ERR1135186.fullread.fulldb.csv -c 100

# Disabled alternative, kept for reference: the "normal" mgmanysearch run in
# the plain `branchwater` environment (k=21, scaled=100, output
# fmg_mgmanysearch.csv).
# # normal mgmany
# mamba activate branchwater
# sourmash scripts mgmanysearch \
# --against ../sourmash_sketches/sketch_reads/ERR1135181.k21.sig.gz \
# --queries ../sourmash_sketches/split_sig/ERR11351_01.zip \
# -k 21 --scaled 100 -o fmg_mgmanysearch.csv -c 16

# Sketch the R1 reads of ERR1135186 with abundance tracking at k=31, 21 and 51
# (scaled=100). No --name is passed here, hence the "noname" output file.
sourmash sketch dna \
-p abund,k=31,scaled=100,abund,k=21,scaled=100,abund,k=51,scaled=100 \
ERR1135186_QC_R1.fastq.gz \
-o ERR1135186_R1.abund.noname.zip


# Test 1: full R1 read sketch against the small 3-contig sketch file.
sourmash scripts manysearch \
../sourmash_sketches/sketch_reads/ERR1135186_R1.abund.zip \
../test_mgsearch/ERR11351_3contig.zip \
-k 21 --scaled 100 -o ERR1135186_testwfullreads_smalldb.csv -c 2

# Test 2: small read sketch (going by the output name) against the full
# ERR11351 sketch collection.
sourmash scripts manysearch \
../test_mgsearch/ERR11351_R1.zip \
../sourmash_sketches/ERR11351.zip \
-k 21 --scaled 100 -o ERR11351_testwsmallreads_fulldb.csv -c 100



# Test 3: same as test 1 but with the *.k21.abund.zip read sketch
# (presumably a k=21-only sketch -- TODO confirm).
# NOTE(review): this writes to the same -o file as test 1, so whichever of the
# two runs last overwrites the other's results -- confirm that is intended.
sourmash scripts manysearch \
../sourmash_sketches/sketch_reads/ERR1135186_R1.k21.abund.zip \
../test_mgsearch/ERR11351_3contig.zip \
-k 21 --scaled 100 -o ERR1135186_testwfullreads_smalldb.csv -c 2

In [None]:
# manysearch of all_reads.zip (presumably the combined read sketches -- TODO
# confirm) against the 3-contig sketch file in the current directory,
# at k=21 and scaled=100.
reads_sig=../sourmash_sketches/sketch_reads/all_reads.zip
contig_sig=ERR11351_3contig.zip
sourmash scripts manysearch \
  "${reads_sig}" \
  "${contig_sig}" \
  -k 21 --scaled 100 -o all_reads_smalldb.csv -c 10