# Extracted total soil RNA from rhizosphere samples

Anneliek ter Horst, March 2023

- Use this data to look for RNA viruses
- Looking for RdRp sequences in the data
- RNA extraction was unsuccessful for 3 samples (14C2RT1, 78C1RT1, 78T1RT1)

## Trimmomatic
- Trim adapter sequences
- http://www.usadellab.org/cms/?page=trimmomatic
- Bolger, A. M., Lohse, M., & Usadel, B. (2014). Trimmomatic: A flexible trimmer for Illumina Sequence Data. Bioinformatics, btu170.

In [None]:
# Trim adapters and low-quality bases from one paired-end library.
# Run as a script with the forward-read file as the first argument
# (<sample>_R1_<suffix>.fastq.gz); the mate is expected next to it as
# <sample>_R2_001.fastq.gz.
# Had multiple sequencing files per sample, so this is run once per file pair.
# Load java
module load java

# Jar and adapter file both come from the same Trimmomatic install.
trimmomatic_dir=/home/amhorst/programs/Trimmomatic-0.39
r1_in=${1:?usage: pass the R1 fastq.gz file as the first argument}
# Everything before "_R1_" is the sample prefix shared by all outputs.
sample=${r1_in%_R1_*}

# PE output order is: R1 paired, R1 unpaired, R2 paired, R2 unpaired.
# The R2 unpaired file must be named *_R2_unpaired.fastq.gz because the
# host-removal step reads it under that name.
java -jar "${trimmomatic_dir}/trimmomatic-0.39.jar" \
  PE -threads 8 -phred33 \
  "$r1_in" "${sample}_R2_001.fastq.gz" \
  "${sample}_R1.fastq.gz" "${sample}_R1_unpaired.fastq.gz" \
  "${sample}_R2.fastq.gz" "${sample}_R2_unpaired.fastq.gz" \
  ILLUMINACLIP:"${trimmomatic_dir}/adapters/TruSeq3-PE.fa":2:30:10 \
  SLIDINGWINDOW:4:30 MINLEN:50
                            

## Use Bowtie2 to map back reads to tomato genome
- Mapping to https://www.ncbi.nlm.nih.gov/assembly/GCA_012431665.1#/st (tomato)
- Make a bowtie2 index
- map reads to index using bowtie2
- http://bowtie-bio.sourceforge.net/bowtie2/index.shtml
- to save room we write the unmapped reads with --un-conc-gz --> this gives gzipped fastq files with the reads that did not map to the reference
- the sam file will contain only mapped reads (mapped to host genome) and is removed as soon as bowtie2 has finished with that sample


In [None]:
# Create the output folders for host-read removal inside the reads folder:
#   rmhost/           reads that did not map to the host genome
#   rmhost/samfiles/  SAM files written by bowtie2
#   rmhost/unpaired/  unmapped reads from the unpaired libraries
# mkdir -p keeps the cell re-runnable, and the folders are only created
# when the cd succeeded (otherwise they would end up in the wrong place).
cd reads && mkdir -p rmhost/samfiles rmhost/unpaired

# Load bowtie
module load bowtie2

# ---- PAIRED ----
# Reference file (bowtie2 index "tomato_genome") is in /home/amhorst/plant_genomes/
# Everything from the shebang down is meant to be saved as its own sbatch script.

#!/bin/bash
#SBATCH --job-name=bowtie
#SBATCH --nodes=1
#SBATCH -t 4:00:00
#SBATCH --cpus-per-task=80
#SBATCH --output=./bowtiehost_%j.out
#SBATCH --error=./bowtiehost_%j.err
#SBATCH --partition=high2

#load modules
module load bowtie2
aklog

# Map every trimmed read pair in the current folder to the tomato genome.
# Pairs that do NOT align concordantly are written gzipped to
# ../rmhost/<sample> (--un-conc-gz); the SAM file only holds aligned reads
# (--no-unal) and is deleted once bowtie2 exits successfully.
for f in *_R1*.fastq.gz; do
  # An unmatched glob stays literal; skip it instead of calling bowtie2 on it.
  [[ -e "$f" ]] || continue
  # Sample prefix: everything before the last "_R1".
  sample=${f%_R1*}
  echo "$f"
  bowtie2 --threads 80 --sensitive \
    -x /home/amhorst/plant_genomes/tomato_genome \
    -1 "$f" -2 "${sample}_R2.fastq.gz" \
    --un-conc-gz "../rmhost/${sample}" \
    -S "../rmhost/samfiles/${sample}_paired.sam" --no-unal \
    && rm -- "../rmhost/samfiles/${sample}_paired.sam"
done


# ---- UNPAIRED ----
# Everything from the shebang down is meant to be saved as its own sbatch script.

#!/bin/bash
#SBATCH --job-name=bowtie
#SBATCH --nodes=1
#SBATCH -t 02:00:00
#SBATCH --cpus-per-task=33
#SBATCH --output=./bowtiehost_%j.out
#SBATCH --error=./bowtiehost_%j.err
#SBATCH --partition=high2

#load modules
module load bowtie2
aklog

# Map the orphaned (unpaired) reads of every sample to the tomato genome.
# The sample list is taken from the paired R1 files in the current folder;
# the unpaired reads themselves are read from ./unpaired/.
# Reads that do NOT align are written gzipped to ../rmhost/unpaired/<sample>
# (--un-gz); the SAM file only holds aligned reads (--no-unal) and is deleted
# once bowtie2 exits successfully.
for f in *_R1*.fastq.gz; do
  # An unmatched glob stays literal; skip it instead of calling bowtie2 on it.
  [[ -e "$f" ]] || continue
  # Sample prefix: everything before the last "_R1". The same prefix is used
  # for both unpaired files; stripping "_R1_*" would not match names such as
  # <sample>_R1.fastq.gz and would leave the full file name in the path.
  sample=${f%_R1*}
  echo "$f"
  bowtie2 --threads 33 --sensitive \
    -x /home/amhorst/plant_genomes/tomato_genome \
    -U "./unpaired/${sample}_R2_unpaired.fastq.gz,./unpaired/${sample}_R1_unpaired.fastq.gz" \
    --un-gz "../rmhost/unpaired/${sample}" \
    -S "../rmhost/samfiles/${sample}_unpaired.sam" --no-unal \
    && rm -- "../rmhost/samfiles/${sample}_unpaired.sam"
done

# ---- RENAME ----
# Bowtie is done but gives files weird names: Rename

#######################################
# Give the bowtie2 "unaligned reads" outputs proper fastq names.
# Must be run from the rmhost folder:
#   unpaired/<sample>  -> unpaired/<sample>_unpaired.fq.gz
#   <sample>.1         -> <sample>_R1_rmhost.fq.gz
#   <sample>.2         -> <sample>_R2_rmhost.fq.gz
# Files that already carry the new unpaired name are skipped, so the
# function can be re-run safely.
# Outputs: prints the old and the new name of every renamed file.
#######################################
rename_bowtie_outputs() {
  local f

  # for the unpaired ones
  for f in unpaired/*; do
    [[ -f "$f" ]] || continue
    [[ "$f" == *_unpaired.fq.gz ]] && continue
    echo "$f"
    echo "${f}_unpaired.fq.gz"
    mv -- "$f" "${f}_unpaired.fq.gz"
  done

  # for paired, mate 1
  for f in *.1; do
    [[ -f "$f" ]] || continue
    echo "$f"
    echo "${f%.1}_R1_rmhost.fq.gz"
    mv -- "$f" "${f%.1}_R1_rmhost.fq.gz"
  done

  # for paired, mate 2
  for f in *.2; do
    [[ -f "$f" ]] || continue
    echo "$f"
    echo "${f%.2}_R2_rmhost.fq.gz"
    mv -- "$f" "${f%.2}_R2_rmhost.fq.gz"
  done
}

rename_bowtie_outputs

## Assemble using MEGAHIT
- https://github.com/voutcn/megahit
- https://academic.oup.com/bioinformatics/article/31/10/1674/177884
- Single sample assemblies

In [None]:
# load modules
module load megahit

# Assemble every sample on its own from its host-filtered reads:
# paired reads from the current folder, unpaired reads from ./unpaired/.
# After a successful assembly the R1 file is moved into ./done.
# The folder is created first: without it, mv would rename the R1 file to a
# plain file called "done" and every later sample would overwrite it.
mkdir -p done

for f in *R1*; do
  # Skip an unmatched glob and anything that is not a regular file.
  [[ -f "$f" ]] || continue
  # Sample prefix: everything before the first "_R1".
  sample=${f%%_R1*}
  echo "$f"
  megahit -1 "$f" -2 "${sample}_R2_rmhost.fq.gz" \
    -r "./unpaired/${sample}_unpaired.fq.gz" \
    -t 120 --continue --k-min 27 --min-contig-len 200 -m 0.095 \
    --presets meta-large \
    -o "../../assemblies/${sample}" \
    && mv -- "$f" ./done/
done

## Rename contigs using bbmap package
- https://jgi.doe.gov/data-and-tools/software-tools/bbtools/
- https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0185056

In [None]:
# load module
module load bbmap

# rename all contigs in each fasta file after the fasta file:
# <name>.fa -> <name>_renamed.fa, with <name> passed to rename.sh as prefix
for f in *.fa; do
  # Skip an unmatched glob.
  [[ -e "$f" ]] || continue
  # Skip outputs of an earlier run so they are not renamed a second time.
  [[ "$f" == *_renamed.fa ]] && continue
  base=${f%.fa}
  echo "$f"
  rename.sh in="$f" out="${base}_renamed.fa" prefix="$base"
done

## Predict genes using Prodigal
Use predicted proteins for 2 things:
1. Look for RdRp sequences using HMM profiles
2. BlastP against DIAMOND prokaryote db (RefSeq nr v 203)


In [None]:
# Load prodigal env
source ~/.bashrc
conda activate prodigal

# make prodigal directory (-p: no error when it already exists)
mkdir -p ../prodigal/

# Loop through fasta files (contigs)
for f in *.fa; do
  # Skip an unmatched glob.
  [[ -e "$f" ]] || continue
  # predict genes with prodigal for each file; the predicted proteins go to
  # ../prodigal/<name>_proteins.faa
  prodigal -i "$f" -a "../prodigal/${f%.fa}_proteins.faa" -p meta -q
done

## HMM search for RdRp sequences
- Read more about RdRps here: http://pfam.xfam.org/family/PF00680
- Download the alignment in selex format from the PFAM website
- Hmms from Star et al., 2019
- E-value threshold of 0.00001 (1e-5)

In [None]:
# make output directories (-p: creates both levels, no error on re-run)
mkdir -p ../hmm_search/out

#Load module
module load hmmer/3.3.2

# Loop through the gene prediction files (proteins)
# Need a domtblout (per-domain table) for python input.
# Per input file this writes:
#   ../hmm_search/out/<name>.out      full hmmsearch report
#   ../hmm_search/<name>_tabular.out  domain table
for f in 230110_all_3rdco_proteins.faa; do
  # Skip when the protein file is not in the current folder.
  [[ -e "$f" ]] || continue
  name=${f%_proteins.faa}
  echo "$f"
  hmmsearch --cpu 1 -o "../hmm_search/out/${name}.out" \
    --domtblout "../hmm_search/${name}_tabular.out" \
    /home/amhorst/hmm_profiles_rdrp/all_RdRp.hmm "$f"
done

## BlastP against DIAMOND prokaryote db 
- Use the DIAMOND prokaryote db for blasting (RefSeq nr v 203)
- do a DIAMOND blastP of all predicted proteins to the DIAMOND db of prokaryotes. 
- Then isolate the contigs that have predicted proteins from prokaryotes
- Map reads against these contigs and remove
- re-assemble remaining reads

In [None]:
# Build the DIAMOND-formatted protein database for the prokaryote search
module load diamond/0.9.22.123

# Input fasta: bact_nr_prot.faa, database written to ./prok.dmnd, 60 threads
diamond makedb --threads 60 --db ./prok.dmnd --in bact_nr_prot.faa

In [None]:
# remove spaces from protein fasta headers (keep only the first word).
# Written with > rather than >> so that re-running the cell replaces the
# file instead of appending a second copy of every protein.
sed '/^>/ s/ .*//' predicted_proteins.faa > predicted_proteins_nospaces.faa

# make sure the output folder exists
mkdir -p ../diamond

# DIAMOND blast (very fast)
module load diamond/0.9.22.123

# Tabular output (format 6) with query id, subject id and e-value;
# only the best hit per protein (-k 1) with e-value <= 1e-6 is reported.
diamond blastp \
  -d /home/amhorst/diamond/prok.dmnd \
  -q ./predicted_proteins_nospaces.faa \
  -o ../diamond/all_proteins_diamond_prok.results \
  -f 6 qseqid sseqid evalue \
  --evalue 1e-6 \
  -k 1 -p 160

#######################################
# Turn protein names into contig names.
# Reads protein names (<contig>_<gene number>, one per line) on stdin and
# strips the trailing "_<gene number>". Cutting a fixed two characters would
# break for gene numbers of 10 and above.
# Outputs: sorted, de-duplicated contig names on stdout.
#######################################
protein_to_contig() {
  sed 's/_[0-9][0-9]*$//' | sort -u
}

# Now for the result file, keep only the list with protein sequence names
# that have a prokaryote hit. The DIAMOND results are written to ../diamond/,
# so they are read from and written back to that folder.
for f in ../diamond/*.results; do
  # Skip an unmatched glob.
  [[ -e "$f" ]] || continue
  tr -d ' ' < "$f" | cut -d$'\t' -f1 | sort -u > "${f%.results}.txt"
done

# reduce the protein names to contig names (bc we want the contig names)
if [[ -e ../diamond/all_proteins_diamond_prok.txt ]]; then
  protein_to_contig < ../diamond/all_proteins_diamond_prok.txt \
    > ../diamond/diamond_prok_contigs.txt
fi


In [None]:
# Use bbmap to keep all contigs with a prokaryotic protein predicted
# (include=t keeps only the contigs whose name is in the names list)
module load bbmap
filterbyname.sh in=all_nucl_contigs_rename.cluster.fa out=../diamond/diamond_prok.fa \
  names=../diamond/diamond_prok_contigs.txt include=t

# Make a bowtie2 db from all the nucleotide sequences that had a predicted prokaryote protein.
# The index is built from the fasta written by filterbyname.sh just above;
# the index base name stays pred_prok.
module load bowtie2
bowtie2-build --threads 150 ../diamond/diamond_prok.fa pred_prok


# map reads and only keep reads that don't map to these

## Re-assemble using MEGAHIT
- After removing prokaryote reads, re-assemble
- Do a co-assembly (because the dataset is smaller after filtering)
- after re-assembling, predict proteins again, and do the same thing as above

In [None]:
# example co-assembly for one set of data (14)
module load megahit

# Samples that are assembled together
coassembly_samples=(14C1RT1 14C1RT2 14C1RT3 14C2RT2 14C2RT3)

# Build the comma-separated read lists that megahit takes:
# <sample>.1 / <sample>.2 for the pairs, ./unpaired/<sample>_unpaired for singles
fwd_reads=
rev_reads=
single_reads=
for s in "${coassembly_samples[@]}"; do
  fwd_reads+="${fwd_reads:+,}${s}.1"
  rev_reads+="${rev_reads:+,}${s}.2"
  single_reads+="${single_reads:+,}./unpaired/${s}_unpaired"
done

megahit \
  -1 "$fwd_reads" \
  -2 "$rev_reads" \
  -r "$single_reads" \
  -t 120 --continue --k-min 27 --min-contig-len 200 -m 0.095 \
  --presets meta-large \
  -o ../../co_assemblies/14C1R

## Use dRep to dereplicate final set of sequences
- Dereplicate the nucleotide sequences that had a predicted RdRp
- Use same set of conditions as for DNA viruses 
- https://drep.readthedocs.io/en/latest/
- https://www.nature.com/ismej/articles

In [None]:
# Dereplicate the contigs with dRep
source ~/.bashrc
conda activate drep

# Modules loaded alongside the dRep environment
for mod in mummer mash bowtie2; do
  module load "$mod"
done

# Work directory is ./drep; input genomes are all fasta files in ./contigs.
# NOTE(review): -l / -sa / -nc are presumably the length, ANI and coverage
# thresholds -- confirm against the dRep documentation.
drep_args=(
  ./drep
  --S_algorithm ANImf
  --ignoreGenomeQuality
  -l 200
  -sa 0.95
  -nc 0.85
  -g ./contigs/*.fa
  -p 14
)
dRep dereplicate "${drep_args[@]}"

## Use Bowtie2 to map back reads to viral contigs
- Make a bowtie2 index
- map reads to index using bowtie2
- http://bowtie-bio.sourceforge.net/bowtie2/index.shtml
- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3322381/
- Convert samfiles to bamfiles using samtools
- index bamfiles using samtools
- http://www.htslib.org/
- https://academic.oup.com/bioinformatics/article/25/16/2078/204688


In [None]:
# Map to derepped set of viral like sequences.
# Uses the bowtie2 index ../../mapping/viral_map_db and writes one SAM file
# per sample to ../../mapping/; reads that do not align are left out (--no-unal).
for f in *_R1_*.fq.gz; do
  # Skip an unmatched glob.
  [[ -e "$f" ]] || continue
  # Sample prefix: everything before the first "_R1_".
  sample=${f%%_R1_*}
  bowtie2 --threads 40 \
    -x ../../mapping/viral_map_db \
    -1 "$f" \
    -2 "${sample}_R2_rmhost.fq.gz" \
    -S "../../mapping/${sample}.sam" --no-unal --sensitive
done

## Use samtools to convert 
- Convert sam to bam files
- Index the bam files
- https://github.com/samtools/samtools

In [None]:
# sam to bam: drop unmapped reads (-F 4), convert to BAM and sort.
# <name>.sam -> <name>.bam
for f in *.sam; do
  # Skip an unmatched glob.
  [[ -e "$f" ]] || continue
  samtools view -@ 12 -F 4 -bS "$f" | samtools sort > "${f%.sam}.bam"
done

# index the bam files
for f in *.bam; do
  # Skip an unmatched glob.
  [[ -e "$f" ]] || continue
  samtools index "$f"
done


## Use CoverM to make a coverage table
- https://github.com/wwood/CoverM
- https://wwood.github.io/CoverM/coverm-genome.html#author

In [None]:
# Activate the CoverM environment
source ~/.bashrc
conda activate coverm

# Per-contig mean coverage over all BAM files in the current folder,
# with a minimum covered fraction of 0.75; the table is written as TSV.
coverm contig \
  --methods mean \
  --min-covered-fraction 0.75 \
  --bam-files *.bam \
  > 230226_covtab_RNA_tomato.tsv
