## Linking viruses to hosts
- Predicting viral sequences using Genomad
- Using Minced to predict spacer regions (minced.smk)
- Need to run DRAM for functional annotation and for detecting cas genes

In [None]:
## Run the minced Snakemake workflow (minced.smk)
# Open an interactive Slurm shell first, because the run needs memory and time
srun --account=ctbrowngrp --partition=high2 --job-name=minced --time=1:00:00 --cpus-per-task=10 --mem=50gb --pty bash
mamba activate minced

# --dry-run only prints the planned jobs; drop it to actually execute the workflow
snakemake --snakefile minced.smk --use-conda --resources mem_mb=50000 --rerun-triggers mtime \
--cores 10 --rerun-incomplete --keep-going --dry-run

In [None]:
# Try cctyper on a single genome.
# NOTE(review): {threads} looks like a Snakemake-style placeholder — replace it with a number when running by hand.
cctyper genome.fa output_dir --threads {threads}

In [None]:
# All cctyper results need to be concatenated into combined files one directory up.
mamba activate csvtk

# Merge each tab-separated cctyper table across all subdirectories
for table in CRISPR_Cas cas_operons crisprs_all; do
    csvtk concat --tabs */"${table}".tab > "../250416_${table}.cctyper.tsv"
done

# Pool every spacer FASTA into a single file.
# The "ccytper" spelling is kept on purpose: the blastn step reads this exact filename.
cat */spacers/*.fa > ../250416_spacers.ccytper.fa

### CRISPR sequences of ETEC
- Check whether the publicly available ETEC genomes contain CRISPR spacers
- Download sequences (needs: accession,name)
- Run minced
- BLAST the spacers against the vOTUs

In [None]:
# Download the genomes listed in etec_genbank.csv
# Interactive Slurm shell for the download
srun --account=ctbrowngrp --partition=med2 --job-name=gbsketch --time=1:00:00 --cpus-per-task=2 --mem=10gb --pty bash

mamba activate branchwater-skipmer
# Download only (--download-only), keeping the FASTA files in etec_genomes/;
# failed accessions are written to etec.fail.csv
sourmash scripts gbsketch etec_genbank.csv \
--genomes-only --keep-fasta --download-only \
-f etec_genomes --failed etec.fail.csv -c 2

In [None]:
# Run the cctyper Snakemake workflow (cctyper.smk)
# Interactive Slurm shell with 40 CPUs and 20 GB of memory
srun --account=adamgrp --partition=high2 --job-name=cctyper --time=8:00:00 --cpus-per-task=40 --mem=20gb --pty bash

# --dry-run only prints the planned jobs; drop it to actually execute the workflow
snakemake --snakefile cctyper.smk --use-conda --resources mem_mb=20000 --rerun-triggers mtime \
--cores 40 --rerun-incomplete --keep-going --dry-run

In [None]:
rule directsketch:
    # Run `sourmash scripts gbsketch` on the accessions in the input CSV:
    # writes a sketch zip, keeps the downloaded FASTA files in params.output_folder,
    # and records failed downloads and checksum failures in two CSV files.
    input:
        csv = f"{OUTPUT_DIR}/{pang_name_out}/{pang_name_out}xgtdb.csv",
    output:
        sig = f"{OUTPUT_DIR}/{pang_name_out}/sourmash/{pang_name_out}.gtdb.zip",
        failed_test = f"{OUTPUT_DIR}/{pang_name_out}/check/{pang_name_out}.failed.csv",
        fail_checksum = f"{OUTPUT_DIR}/{pang_name_out}/check/{pang_name_out}.checksum.failed.csv",
    conda:
        "branchwater-skipmer"
    threads: 10
    params:
        # Directory that receives the FASTA files (passed to gbsketch via -f)
        output_folder=f"{OUTPUT_DIR}/{pang_name_out}/MAGs"
    # NOTE(review): -k appears to be the short form of --keep-fasta, so it is likely
    # redundant with the long flag already present — confirm before removing.
    shell:
        """
        sourmash scripts gbsketch  --keep-fasta --genomes-only \
        {input.csv} -o {output.sig} -p dna,k=21,k=31,scaled=100,abund \
        -f {params.output_folder} -k -c {threads} \
        --failed {output.failed_test} -r 1 --checksum-fail {output.fail_checksum}
        """

## Making a blastdb from viral sequences:
- Build a BLAST database from the viral sequences, then BLAST the spacers against it.

In [None]:
# Build a nucleotide BLAST database from RVD_owncontigs.fa
# in: /group/ctbrowngrp2/amhorst/2025-pigparadigm/results/viral
mkdir -p blastdb   # -p: do not fail if the directory already exists (cell can be re-run)
srun --account=ctbrowngrp -p high2 -J blastn -t 12:00:00 -c 24 --mem=100gb --pty bash

# db
mamba activate blast
makeblastdb -in RVD_owncontigs.fa -dbtype nucl -out ./blastdb/RVD_owncontigs

In [None]:

# Make a blastdb of all vOTU sequences (no dereplication, may be interesting strain diversity)
# NOTE(review): the input file is named *.95.cluster.fa, which suggests clustered sequences —
# confirm whether "no dereplication" still applies.
srun --account=ctbrowngrp -p high2 -J blastn -t 4:00:00 -c 20 --mem=50gb --pty bash

# make a db (written into ./blastdb, which must already exist)
mamba activate blast
makeblastdb -in 250419_viral_sequences.95.cluster.fa \
-dbtype nucl -out ./blastdb/250419_viral_sequences

# Blast spacers to vOTUs
# -num_threads matches the 20 CPUs requested from srun above (was 24, which oversubscribes the allocation)
blastn -task blastn-short -num_threads 20 \
-query ../crispr/250416_spacers.ccytper.fa \
-db ./blastdb/250419_viral_sequences \
-evalue 1e-5 -perc_identity 95 -outfmt 6 \
-out 250421_spacers_vOTUs.pig.cctyper 

# same for the etec genomes
blastn -task blastn-short -num_threads 20 \
-query ../crispr/ETEC_crispr/250409_all_ETEC_spacers.fa \
-db ./blastdb/250419_viral_sequences \
-evalue 1e-5 -perc_identity 95 -outfmt 6 \
-out 250421_spacers_vOTUs.ETEC.cctyper 

In [None]:
# Write a two-column table: ">" + sequence ID, and sequence length, for every plasmid sequence.
# fx2tab -n prints the full header, which may contain spaces, so the length is taken from the
# last field ($NF) instead of $2; $1 is still the first whitespace-delimited word of the header.
seqkit fx2tab -l -n 250419_plasmid_sequences.95.cluster.fa | awk '{print ">"$1"\t"$NF}' > plasmid_lengths.txt
