## Use sourmash to make a depth file, similar to read depth but based on k-mers
- symlink the read sketches to folder
- create a file from all contigs
- sketch the contigs into k-mers (k=21, 31, 51)
- collect the sigs into an SQL db
- run fastmultigather (fmg) of all contigs <-> all reads
- run mgmanysearch using the fmg results as a picklist, using snakemake


In [None]:
# Link the read signatures for the ERR113518* and ERR113517* runs into the
# current directory.
sketch_dir=../../../2023-swine-sra/sourmash/sig_files/sketch_reads
for run_prefix in ERR113518 ERR113517; do
  ln -s "${sketch_dir}/${run_prefix}"* .
done

In [None]:
# Request an interactive Slurm shell: job "fmg_bin" on the bmm partition,
# 100 CPUs, 100 GB of memory, 24 hour limit.
srun \
  --account=ctbrowngrp \
  --partition=bmm \
  --job-name=fmg_bin \
  --time=24:00:00 \
  --cpus-per-task=100 \
  --mem=100gb \
  --pty bash

In [None]:
# Retry at scaled=100 (original note: threshold of 10).
# Sketch each paired-end read set (R1 + R2) into one k=31 signature, saved as
# <sample>.k31.sig.gz, where <sample> is the file name up to "_QC".
# Follow-up from the original notes: make a zip for each fasta, concat those,
# then make a manifest.
#
# One command line is printed per sample and run by GNU parallel, 24 at a time.
# %q shell-quotes every file name so that names with spaces or glob characters
# survive being re-parsed by the shell that parallel starts.
for r1 in *_R1.fastq.gz; do
  # An unmatched glob stays literal; skip it instead of sketching a
  # non-existent file.
  [[ -e "$r1" ]] || continue
  r2="${r1%_R1*}_R2.fastq.gz"
  sample="${r1%_QC*}"
  printf 'sourmash sketch dna -p k=31,scaled=100 %q %q --name %q -o %q\n' \
    "$r1" "$r2" "$sample" "${sample}.k31.sig.gz"
done | parallel -j 24

In [None]:
# fastmultigather at scaled=100, k=21, threshold 1000, 100 cores:
# each read-sketch collection is run against the combined contig sketches.
# NOTE(review): the first path, ../sketch_reads/.k21.txt, has no sample prefix
# before ".k21.txt" -- looks like a missing name; confirm before running.
for reads_query in \
  ../sketch_reads/.k21.txt \
  ../sketch_reads/ERR1135178.k21.zip
do
  mamba activate branchwater
  sourmash scripts fastmultigather \
    "$reads_query" \
    ../sketch_contigs/ERR11351.k21.zip \
    -c 100 -k 21 -t 1000 -s 100
done

In [None]:
# mgmanysearch can be given a list of query files; this run searches one
# contig signature file against one read sketch and writes a CSV.
# NOTE(review): --scaled is 1000 here while the other steps use 100 -- confirm
# this is intended.
sourmash scripts mgmanysearch \
  -k 21 \
  --scaled 1000 \
  --queries sketch_contigs/ERR1135178.sig \
  --against sketch_reads/ERR1135178.sig.gz \
  -o ERR1135178.mgm.csv

In [None]:
# Run the Snakefile from an interactive Slurm shell
# (job "sketch", med2 partition, 4 CPUs, 50 GB, 8.5 hour limit).
srun \
  --account=ctbrowngrp \
  --partition=med2 \
  --job-name=sketch \
  --time=8:30:00 \
  --cpus-per-task=4 \
  --mem=50gb \
  --pty bash

# Inside that shell: the trailing -n makes this a dry run; drop it to execute.
mamba activate branchwater
snakemake \
  -c 4 \
  --resources mem_mb=50000 \
  --rerun-triggers mtime \
  --rerun-incomplete \
  -k \
  --latency-wait 1 \
  -n


In [None]:
# Open question: should sigs with n=1 be removed?
# These sketches were not made with abundance, so that won't work.

# Split the contig FASTA into 100 files with BBMap's partition.sh
# (gives roughly 3500 per file).
mamba activate bbmap
partition.sh \
  in=../ERR11351.fasta \
  out=./ERR11351_%.fa \
  ways=100



In [None]:
# Plan: split the contig FASTA file into many smaller dbs, run fastmultigather
# per split, then refine with mgmanysearch.
# NOTE: names in <angle brackets> are placeholders -- substitute real paths
# before running any of these lines.

# then fmg for each smaller db per read file
sourmash scripts fastmultigather <metagenome> <database-split-N> -o <database-split-N.csv>

# fastmultigather --output only works with a rocksdb..


# concat the picklists:
# concatenate the resulting <database-split-N.csv> files with csvtk concat
csvtk concat <database-split-*.csv> -o <combined_csv>

# run mgmanysearch using this combined picklist against all the db files.
# create the picklist manifest first
sourmash sig check -k 21 <database-split-*.zip> --picklist <combined_csv>:match_md5:md5 -m <manifest>

# mgmanysearch using a manifest
sourmash scripts mgmanysearch --queries <manifest> \
--against <reads> -k 21 --scaled 100 -o <output.csv>

In [None]:
# Create a RocksDB index for one contig split and run fastmultigather on it.
# (Original note: "create a rocksdb and run it?")
#
# Every path is named once here so each step reads exactly what the previous
# step wrote. The original pointed the list file at ERR11351_0.zip and read
# the index from the current directory, neither of which is produced here.
split=ERR11351_0
split_sig="../split_sig/${split}.sig.gz"
split_list="${split}.txt"
split_db="../rocksdb/${split}.rocksdb"
fmg_csv="../results/sourmash/fastmultigather/ERR1135178x${split}.csv"

# Make sure the output directories exist before the tools write into them.
mkdir -p ../split_sig ../rocksdb ../results/sourmash/fastmultigather

# The index needs to be built from a sig.gz file.
sourmash sketch dna -p k=21,scaled=100 --singleton "${split}.fa" -o "$split_sig"

# make a txt file pointing to the sig.gz (absolute path)
readlink -f "$split_sig" > "$split_list"

# create rocksdb
sourmash scripts index \
  "$split_list" -m DNA -k 21 --scaled 100 \
  -o "$split_db"

# fastmultigather of the reads against the RocksDB index, written to one CSV
sourmash scripts fastmultigather \
  ../sketch_reads/ERR1135178.k21.zip \
  "$split_db" -c 4 -k 21 -t 1000 -s 100 -m DNA \
  -o "$fmg_csv"