## Steps that weren't done with snakemake 

In [None]:
# RdRp-Scan and tree building got stuck, so geNomad is used as well.
# Link the shared geNomad database into the working directory.
ln -s /group/jbemersogrp/databases/genomad/genomad_db .

# Request an interactive node first; geNomad needs quite some memory.
srun --account=ctbrowngrp -p bmm -J genomad -t 12:00:00 -c 50 --mem=100gb --pty bash

# Run end-to-end on everything; annotation is needed for classification.
# After geNomad finishes, screen the results by hand.
# For the contigs predicted to carry an RdRp: create phylo trees and map reads to contigs.
mamba activate genomad
# Options come first and the positionals (input, output dir, database) last,
# so the command does not end on a dangling line continuation.
genomad end-to-end \
    --threads 50 --enable-score-calibration \
    --splits 10 --cleanup \
    Hugo_metaT.over600.trim.fa ../results/genomad_out genomad_db

In [None]:

# Optional step: VirSorter2 screen for RNA viral genomes (n=18261).
# Interactive allocation sized to match the 12 jobs requested below.
srun --account=ctbrowngrp --partition=bmm --job-name=vs2 --time=5:00:00 \
    --cpus-per-task=12 --mem=30gb --pty bash

mamba activate virsorter2
# Only the RNA classifier group is used; contigs shorter than 1000 bp
# or scoring below 0.5 are dropped.
virsorter run all \
    --working-dir ./RNA_virus_genomad \
    --seqfile contig.virprot.genomad.fa \
    --db-dir /group/jbemersogrp/databases/virsorter/ \
    --include-groups RNA \
    --min-length 1000 --min-score 0.5 \
    --jobs 12


In [None]:
# Should we use a different protein prediction model?
# geNomad uses prodigal-gv; we may use those proteins for the rest of the pipeline (https://github.com/apcamargo/prodigal-gv)
# Use iPHoP for host predictions.
# Cross-reference the iPHoP results with the geNomad taxonomy.
# Run pharokka for CRISPR repeat sequences?



In [None]:
# iPHoP host prediction
# Restricted to contigs with a predicted RdRp; running it on the remaining
# contigs does not seem useful here.
# (iPHoP can be run on DNA viruses too.)

# Pull the RdRp-carrying contigs out of the assembly by name.
mamba activate bbmap
filterbyname.sh \
    in=../../resources/Hugo_metaT.assembly.fa \
    names=../genomad_out/contigs_w_rdrp.txt \
    include=t \
    out=contigs_w_rdrp.fa

# Link the shared iPHoP database into the working directory.
ln -s /group/jbemersogrp/databases/iphop .

# Interactive allocation for the prediction run.
srun --account=ctbrowngrp --partition=med2 --job-name=iphop --time=24:00:00 \
    --cpus-per-task=30 --mem=70gb --pty bash

mamba activate iphop_env
iphop predict \
    --fa_file contigs_w_rdrp.fa \
    --db_dir ./iphop/latest/Aug_2023_pub_rw \
    --out_dir ./ \
    --num_threads 30

In [None]:
# dereplicate the RNA contigs (95% ani, n=3,491)
# run drep on all viral contigs predicted by vibrant (n=17703) + all found by PIGEON (581)
# Split the contigs into one FASTA per contig (dRep treats every input file as one genome).
mkdir -p contigs
cd ./contigs
# Close each output file before opening the next one; otherwise thousands of
# contigs can exhaust the open-file limit of the awk implementation.
awk '/^>/ { if (out) close(out); out = substr($0, 2) ".fa" } out { print > out }' ../*.fa
# Return to the parent directory so the ./contigs and ./drep paths below resolve.
cd ..

# Hand dRep a list of paths instead of a shell wildcard: expanding thousands
# of file names on the command line can exceed the argument-length limit.
find ./contigs -maxdepth 1 -name '*.fa' > contigs_list.txt

srun --account=ctbrowngrp -p med2 -J drep -t 2:00:00 -c 30 --mem=50gb --pty bash
mamba activate drep

# Run dRep at 95% ANI over 85% of length of longest contigs
dRep dereplicate ./drep --S_algorithm ANImf --ignoreGenomeQuality -l 300 -sa 0.95 -nc 0.85 \
    -g contigs_list.txt -p 30

In [None]:
# Create a bowtie2 index from the dereplicated contigs and map the reads (n=2378).
cat ../drep/drep/dereplicated_genomes/*.fa > 240708_rdrp_contigs.fa
mamba activate bowtie2
# bowtie2-build takes its thread count via --threads; its -p flag means
# --packed, not threads as in the bowtie2 aligner.
bowtie2-build --threads 30 240708_rdrp_contigs.fa 240708_rdrp_contigs

# I think the clean reads are in: /home/hfm/Rumen_Microbiome_Genomics/1_Sequences_Guanhui/TRIMMED/
ln -s /home/hfm/Rumen_Microbiome_Genomics/1_Sequences_Guanhui/TRIMMED/*R1_001_trim* .
ln -s /home/hfm/Rumen_Microbiome_Genomics/1_Sequences_Guanhui/TRIMMED/*R2_001_trim* .

# Use snakemake for the mapping itself.
# -n makes this a dry run; drop it to actually execute the workflow.
snakemake --use-conda --resources mem_mb=50000 --rerun-triggers mtime \
    -c 30 --rerun-incomplete -k -n

## Old notes, for RdRp-scan

In [None]:
## Step 5: blastx all contigs with an RdRp hit against nr
# Pull every contig with an RdRp hit out of the assembly FASTA.

# One variable for the extracted FASTA so the extraction step and the
# diamond query below read and write the same file.
RDRP_CONTIGS=../results/concat_files/metaT.Rdrphits.fa

mamba activate bbmap
filterbyname.sh in=../resources/Hugo_metaT.assembly.fa out="$RDRP_CONTIGS" \
    names=all_contigs_rdrphit.txt include=t

# Perform blastx using diamond; only the single best target per query is kept.
# NOTE(review): no --outfmt is given, so the .csv extension may not match the
# actual output format -- confirm against the diamond defaults.
mamba activate diamond
diamond blastx -q "$RDRP_CONTIGS" \
    --min-orf 600 --max-target-seqs 1 \
    -e 1e-5 --threads 50 \
    --very-sensitive \
    --db nr.dmnd \
    -o ../diamond/RdRp_hits.to.nr.csv

## Manually compare the output hits to NCBI protein names and remove all contigs whose hit is to a non-viral protein.

### Step 6: Cluster the resulting proteins at 99% identity with cd-hit before alignment

Before alignment: for contigs that only had a blastx hit and no HMM hit, find the specific proteins that are an RdRp.
- Take out the contigs that had only a blastx hit (n=42).
- Predict proteins (n=46); we want only the 42 RdRps.
- Run blastp against the RdRp db.
- This results in 42 matches. Good!

In [None]:
# Strip the pyrodigal protein headers down to the ID
# (everything from the first space onwards is removed).
sed 's/ .*//' ../protein/Hugo.metaT_all.faa > ../protein/Hugo.metaT_all.trim.faa

# Extract all proteins predicted to be an RdRp (5,139), matching on the full header name.
mamba activate seqkit
seqkit grep --by-name \
    --pattern-file all_proteins_rdrphit.txt \
    --out-file metaT.Rdrphits.faa \
    Hugo.metaT_all.cdhit.99.trim.faa

# Deduplicate at 99% identity on a single thread; 3,859 sequences remain.
mamba activate cdhit
cd-hit -c 0.99 -T 1 \
    -i metaT.Rdrphits.faa \
    -o metaT.Rdrphits.dedup.faa

In [None]:
# Step 7: align the RdRp proteins against the RdRp-scan profile with Clustal Omega
# The reference alignment has spaces in its headers, and ClustalO drops
# everything after a space, so swap them for underscores in place first.
sed -i 's/ /_/g' RdRp-scan.CLUSTALO_0.4.fasta

srun --account=ctbrowngrp --partition=bmm --job-name=clustalo --time=12:00:00 \
    --cpus-per-task=50 --mem=50gb --pty bash

mamba activate clustalo
# Add the deduplicated RdRp hits to the existing RdRp-scan profile alignment.
clustalo \
    --profile1 ../resources/RdRp-scan/Phylogenies/RdRp-scan.CLUSTALO_0.4.fasta \
    --infile ./concatenated_files/metaT.Rdrphits.dedup.faa \
    --seqtype Protein --auto --threads=50 \
    --outfile ./alignments/alignrdrp.clustalo.faa

TODO:
- trim alignments (TrimAL)
- phylo trees (FastTree)

https://apsjournals.apsnet.org/doi/full/10.1094/PBIOMES-12-21-0080-R



In [None]:
# Align all RdRp-like sequences (around 3800) against the RdRp-scan diamond db
# --> based on that, see which clade each sequence matches best.
# The info file lists the family each reference sequence belongs to.
# Get alignment for 3516.

# Activate diamond before its first use (makedb below needs it as well).
mamba activate diamond

# Remove spaces from the reference fasta headers.
sed 's, ,_,g' RdRp-scan_0.90.fasta > RdRp-scan_0.90.ns.fasta

# Create a diamond db.
# NOTE(review): this writes the db to the current directory, while blastp
# below reads it from ../../resources/RdRp-scan/ -- presumably makedb is run
# from that directory; confirm.
diamond makedb --in RdRp-scan_0.90.ns.fasta --db RdRp-scan_0.90.ns.dmnd

# blastp, keeping only the best target per query.
diamond blastp -q metaT.Rdrphits.dedup.faa \
    --max-target-seqs 1 \
    -e 1 --threads 1 \
    --very-sensitive \
    --db ../../resources/RdRp-scan/RdRp-scan_0.90.ns.dmnd \
    -o prot_RdRp_todb_foralign.csv

In [None]:
# trimAl (189 seqs removed because they contained only gaps)
mamba activate trimal

# Trim the Clustal Omega alignment with the gappyout heuristic.
# The input name matches the file written by the clustalo step
# (alignrdrp.clustalo.faa).
trimal -in alignrdrp.clustalo.faa \
    -out alignrdrp.clustalo_trimal.faa -gappyout


In [None]:
# FastTree on the trimmed alignment
srun --account=ctbrowngrp --partition=bmm --job-name=fastree --time=5:00:00 \
    --cpus-per-task=2 --mem=50gb --pty bash

# Build the phylogenetic tree with the WAG model; the alignment is read from
# stdin and the Newick tree is written to stdout.
mamba activate fasttree
FastTree -wag -log all.fasttree.log \
    < alignrdrp.clustalo_trimal.faa \
    > rdrd.align.all.fasttree.nwk

In [None]:
# Do an alignment for each phylum with all seqs? See what fits best? It will be easy in a snakefile
# The RdRp-scan profile alignments are referenced everywhere else in these notes
# with a .fasta extension (e.g. RdRp-scan.CLUSTALO_0.4.fasta), so the pattern
# uses .fasta too; with .fa the glob would find no profiles.
PHYLA, = glob_wildcards('../resources/RdRp-scan/Phylogenies/{ident}.CLUSTALO_0.4.fasta')

# Target rule: one alignment of all query proteins per reference profile.
rule all:
    input:
        expand("../results/alignments/all.vs.{ident}.faa", ident=PHYLA),

# Add the deduplicated RdRp hits to one reference profile alignment ({ident}).
rule clustalo:
    input:
        fasta = './concatenated_files/metaT.Rdrphits.dedup.faa', 
        alignment = "../resources/RdRp-scan/Phylogenies/{ident}.CLUSTALO_0.4.fasta",
    output:
        res='../results/alignments/all.vs.{ident}.faa',
    conda: 
        "clustalo"
    threads: 12
    shell:
        """
        clustalo --p1 {input.alignment} \
        --auto -i {input.fasta} -t Protein --threads={threads} \
        -o {output.res}
        """

In [None]:
# Export every newly created environment to a yml file
# (run with the environment in question activated).
conda env export --file environment.yml

See https://github.com/AnneliektH/2024-caleb-snakemake/ for how to call on these exported environments.

In [None]:
# Run the Snakefile from an interactive allocation.
srun --account=ctbrowngrp --partition=bmm --job-name=clustalo --time=1:00:00 \
    --cpus-per-task=36 --mem=50gb --pty bash

# Dry run only: nothing is executed until --dry-run is removed.
snakemake \
    --use-conda \
    --cores 36 \
    --resources mem_mb=50000 \
    --rerun-triggers mtime \
    --rerun-incomplete \
    --keep-going \
    --dry-run

In [None]:
# Should putative RdRps be checked for motifs?
# The motif dbs are deduplicated first to make that easier.

mamba activate seqkit
# Make sure the output directory exists before redirecting into it.
mkdir -p ../motif_dedup
for f in *.fasta
do
    # Remove duplicates by sequence (-s); the quotes keep file names that
    # contain spaces or glob characters intact.
    seqkit rmdup -s < "$f" > "../motif_dedup/$f"
done

In [None]:
# Test making a smaller tree with specific viral taxa (Hepelivirales).
mamba activate clustalo
# Add the candidate sequences to the Kitrinoviricota profile alignment.
clustalo --p1 ../../resources/RdRp-scan/Phylogenies/Kitrinoviricota.CLUSTALO_0.4.fasta \
    --auto -i ./hepelevir.faa -t Protein -o hepelevir.align.faa --threads=36

# Replace spaces in the alignment with underscores before trimming.
sed 's, ,_,g' hepelevir.align.faa > hepelevir.align.ns.faa


mamba activate trimal
# Trim the alignment with the gappyout heuristic.
trimal -in hepelevir.align.ns.faa \
    -out hepelevir.align.trimal.faa -gappyout

mamba activate fasttree
# Use a dedicated log file so this test run does not overwrite
# all.fasttree.log from the full-tree run.
FastTree -wag -log hepelevir.fasttree.log \
    < hepelevir.align.trimal.faa > hepelivir.fasttree.nwk
