## Use sourmash to make a depth file similar to read depth, but based on k-mers
- Symlink the read sketches into the working folder
- Create a single file from all contigs
- Sketch the contigs into k-mers (k=21, 31, 51)
- Collect the signatures (sigs) into a SQL database
- Run fastmultigather of all contigs <-> all reads
- Run mgmanysearch using the fastmultigather (fmg) result as a picklist, using snakemake


In [None]:
# Open an interactive SLURM shell for sketching: bmm partition, 4 CPUs, 10 GB RAM, 10 h wall time
srun --account=ctbrowngrp --partition=bmm --job-name=sketch --time=10:00:00 --cpus-per-task=4 --mem=10gb --pty bash

In [None]:
# Link the existing read signatures of both accession series into the current directory
for prefix in ERR113518 ERR113517
do
    ln -s ../../../2023-swine-sra/sourmash/sig_files/sketch_reads/${prefix}* .
done

## Run manysearch to score abundance of contig in each metagenome
- manysearch needs a threshold of 0, because there will never be 1% of the reads linked to a single genome: there are simply far too many reads.
- Compare manysearch to mgmanysearch
- Run Tessa's scripts to create a depth file from manysearch output

### Compare mgmanysearch and manysearch output
- Create a small subset of data
- Run manysearch and mgmanysearch on these


Workflow:
- sketch all reads with manysketch (abundance, k21,k31,k51)
- sketch the contigs (no abundance, k21,k31,k51)
- run manysearch (k31, scaled=100, t=0)


In [None]:
# manysketch: sketch all paired-end read sets in a single run
# Usage notes:
# https://github.com/sourmash-bio/sourmash_plugin_branchwater/tree/main/doc#running-manysketch
# Open question: one file for all the sketches?

# Build the input table for manysketch: one row per sample (name, R1 file, R2 file)
echo name,read1,read2 > manysketch.csv
for r1 in *_R1.fastq.gz
do
    sample=${r1%_QC*}              # file name with the trailing "_QC..." part removed
    r2=${r1%_R1*}_R2.fastq.gz      # mate file: same stem, "_R2" instead of "_R1"
    echo "${sample},${r1},${r2}"
done >> manysketch.csv

# Sketch everything listed in the table (k=21/31/51, scaled=100, with abundance)
mamba activate branchwater
sourmash scripts manysketch \
    -o ../all_reads.resketch.zip \
    -c 100 \
    -p k=21,k=31,k=51,scaled=100,abund \
    manysketch.csv

In [None]:
# sketch contigs
# Symlink the atlas folders into the current directory.
# The explicit "." destination is required: when a glob matches several
# folders and no destination is given, ln treats the LAST match as the
# destination directory and creates the links inside it instead of here.
ln -s /group/ctbrowngrp2/scratch/annie/2023-swine-sra/atlas/atlas_ERR113517*/ .
ln -s /group/ctbrowngrp2/scratch/annie/2023-swine-sra/atlas/atlas_ERR113518*/ .

# cat all contigs to a results file (n=349,706)
cat atlas_*/*/*_contigs.fasta > ../../results/sourmash_sketches/ERR11351.fasta

# NOTE(review): the sketch step reads ERR11351.fasta from the current directory,
# while the cat above writes it to ../../results/sourmash_sketches/ -- presumably
# this is run after changing into that folder; confirm.
# mamba activate branchwater
# One sketch per contig (--singleton) at k=21, 31 and 51; scaled=100 applies to all three k-sizes.
sourmash sketch dna -p k=21,k=31,k=51,scaled=100 \
    --singleton ERR11351.fasta -o ERR11351.zip

In [None]:
# manysearch of all read sketches x all contig sketches (run from ./results/manysearch/)
# Threshold is 0 so that no match is filtered out.
mamba activate branchwater-abund
sourmash scripts manysearch \
    --ksize 31 --scaled 100 \
    --threshold 0 --cores 100 \
    --output allreads_x_allcontigs.k31.csv \
    ../sourmash_sketches/all_reads.zip \
    ../sourmash_sketches/ERR11351.zip

In [None]:
# create a contig length file and the depth file for binning
# (run from ./results/ -- all relative paths below are relative to that folder)
mkdir -p depth_files
python ../workflow/scripts/get-contig-lengths.py ../resources/ERR11351.fasta depth_files/contig_lengths.csv

# make a list of input csv files (in this case just 1)
# The manysearch CSV lives in ./manysearch/ and the depth script reads the list
# from ./manysearch/file_list.txt, so resolve and write both inside that folder.
readlink -f manysearch/allreads_x_allcontigs.k31.csv > manysearch/file_list.txt

# create the depth file
python ../workflow/scripts/manysearch-to-mb-depth.py \
    ./manysearch/file_list.txt \
    --lengths ./depth_files/contig_lengths.csv \
    -o ./depth_files/depth_manysearch.txt

In [None]:
# Goal: feed this k-mer based depth file to metabat and/or vamb -- does it improve the bins?
# Bins are created with either vamb or metabat:

# run with intravariance var
# METABAT (interactive session: 8 CPUs, 40 GB RAM, 10 h; paths relative to the project root)
srun --account=ctbrowngrp --partition=med2 --job-name=metabat2 --time=10:00:00 --cpus-per-task=8 --mem=40gb --pty bash
mamba activate metabat2
metabat2 \
    --inFile ./resources/ERR11351.fasta \
    --abdFile ./results/depth_files/depth_manysearch.txt \
    --outFile ./results/manysearch/metabat \
    --minContig 1500 \
    --numThreads 8
 
 # VAMB
# Interactive session for vamb: 1 CPU, 40 GB RAM, 10 h; paths relative to the project root
srun --account=ctbrowngrp --partition=med2 --job-name=vamb --time=10:00:00 --cpus-per-task=1 --mem=40gb --pty bash
mamba activate vamb
# Same contigs and depth file as the metabat run above
vamb \
    --fasta ./resources/ERR11351.fasta \
    --jgi ./results/depth_files/depth_manysearch.txt \
    --minfasta 50000 \
    --outdir ./results/manysearch/vamb


In [None]:
# compare manysearch and mgmanysearch
# subset a set of reads (n=100,000):
# head counts LINES, and a FASTQ record is 4 lines long, so keeping n reads
# means keeping the first 4*n lines of each mate file.
# (assumes standard 4-line FASTQ records -- confirm for these QC files)
n_reads=100000
qc_dir=/group/ctbrowngrp2/scratch/annie/2023-swine-sra/atlas/atlas_ERR1135178/ERR1135178/sequence_quality_control
for mate in R1 R2
do
    gunzip -c "${qc_dir}/ERR1135178_QC_${mate}.fastq.gz" \
        | head -n "$((n_reads * 4))" \
        | gzip > "ERR1135178_QC_${mate}.subset.fq.gz"
done

# Sketch the two subset mate files under the name ERR1135178
# (k=21/31/51, scaled=100, abundance tracked)
mamba activate branchwater
sourmash sketch dna \
    --name ERR1135178 \
    -o ../sketch_reads/ERR1135178_QC.subset.zip \
    -p k=21,k=31,k=51,abund,scaled=100 \
    ERR1135178_QC_R1.subset.fq.gz ERR1135178_QC_R2.subset.fq.gz

# Take 1000 contigs out of the combined contig fasta (bbmap's reformat.sh)
mamba activate bbmap
reformat.sh reads=1000 in=ERR11351.fasta out=ERR11351_1000.fasta

# Sketch the contig subset, one sketch per contig (--singleton), no abundance
mamba activate branchwater
sourmash sketch dna \
    --singleton \
    -o ERR11351_1000.zip \
    -p k=21,scaled=100,k=31,scaled=100,k=51,scaled=100 \
    ERR11351_1000.fasta

# Spot check with mgmanysearch (run from ./results/mgmanysearch);
# /usr/bin/time -v reports wall time and peak memory for the comparison
mamba activate branchwater
/usr/bin/time -v sourmash scripts mgmanysearch \
    -k 31 --scaled 100 \
    --queries ../sourmash_sketches/ERR11351_1000.zip \
    --against ../sourmash_sketches/sketch_reads/ERR1135178_QC.subset.zip \
    -o ERR1135178.timed.mgm.csv

# Same spot check with manysearch: single core, threshold 0, timed with /usr/bin/time -v
mamba activate branchwater-abund
/usr/bin/time -v sourmash scripts manysearch \
    -k 31 --scaled 100 \
    -t 0 -c 1 \
    -o ERR1135178.mn.timed.csv \
    ../sourmash_sketches/sketch_reads/ERR1135178_QC.subset.zip \
    ../sourmash_sketches/ERR11351_1000.zip

In [None]:
# run a snakefile
# Interactive session: 4 CPUs, 50 GB RAM, 8.5 h
srun --account=ctbrowngrp -p med2 -J sketch -t 8:30:00 -c 4 --mem=50gb --pty bash
mamba activate branchwater
# Keep snakemake's core count in step with the SLURM allocation: asking for more
# cores than srun granted (previously 24 vs. 4) only oversubscribes the job.
# mem_mb mirrors the 50 GB requested above. -n makes this a dry run; drop it to execute.
snakemake -n -k \
    --cores "${SLURM_CPUS_PER_TASK:-4}" \
    --resources mem_mb=50000 \
    --rerun-triggers mtime \
    --rerun-incomplete \
    --latency-wait 1
