
CBW 2024 Advanced Module 2: Metagenomic Assembly and Binning


Setup

mkdir amb_module2
cd amb_module2
ln -s ~/CourseData/MIC_data/AMB_data/mapped_matched_fastq/ .
ln -s ~/CourseData/MIC_data/AMB_data/raw_data/ .
cp ~/CourseData/MIC_data/AMB_data/sample_ids.txt .
cp ~/CourseData/MIC_data/AMB_data/mgs_metadata.txt .
conda activate anvio-7
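As a quick optional sanity check, you can confirm that the linked fastq files are visible and that the anvi'o programs are on your PATH before moving on:

# Optional sanity check: list a few of the linked fastq files and confirm anvi'o is available
ls mapped_matched_fastq/ | head
which anvi-gen-contigs-database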

Assemble with Megahit

# Assemble with megahit
R1=$( ls mapped_matched_fastq/*_R1.fastq | tr '\n' ',' | sed 's/,$//' )
R2=$( ls mapped_matched_fastq/*_R2.fastq | tr '\n' ',' | sed 's/,$//' )
mkdir anvio
    
megahit -1 $R1 \
        -2 $R2 \
        --min-contig-len 1000 \
        --num-cpu-threads 4 \
        --presets meta-large \
        --memory 0.8 \
        -o anvio/megahit_out \
        --verbose
# The assembly step can take a long time, so a pre-assembled final.contigs.fa has been prepared in advance

# If you ran megahit yourself, remove the intermediate contigs to save space
rm -r anvio/megahit_out/intermediate_contigs
# If you skipped the assembly, set up the output folder and copy in the pre-assembled contigs
mkdir -p anvio/megahit_out
cp ~/CourseData/MIC_data/AMB_data/final.contigs.fa anvio/megahit_out/
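As a quick, optional check that the assembly (or the pre-assembled copy) is in place, you can count the contigs and look at the first few headers:

# Optional: count the contigs and preview their headers
grep -c '^>' anvio/megahit_out/final.contigs.fa
grep '^>' anvio/megahit_out/final.contigs.fa | head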

Reformat for Anvi'o and make the contigs database

anvi-script-reformat-fasta anvio/megahit_out/final.contigs.fa \
                               --simplify-names \
                               --min-len 2500 \
                               -o anvio/megahit_out/final.contigs.fixed.fa
mkdir anvio/anvio_databases
anvi-gen-contigs-database -f anvio/megahit_out/final.contigs.fixed.fa \
                              -o anvio/anvio_databases/CONTIGS.db \
                              -n HMP2
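As a quick check that the renaming and length filtering worked, you can look at the new headers and count how many contigs passed the 2500 bp cut-off (optional). If you want to keep a record of how the contig names were changed, anvi-script-reformat-fasta also has a --report-file option that writes the old-to-new name mapping.

# Optional: check the simplified headers and count contigs that passed the 2500 bp filter
grep '^>' anvio/megahit_out/final.contigs.fixed.fa | head
grep -c '^>' anvio/megahit_out/final.contigs.fixed.fa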

Run HMMs to identify single-copy marker genes and export gene calls

anvi-run-hmms -c anvio/anvio_databases/CONTIGS.db \
                              --num-threads 4
                              
anvi-get-sequences-for-gene-calls -c anvio/anvio_databases/CONTIGS.db \
                              -o anvio/anvio_databases/gene_calls.fa
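If you're curious how many genes were called in the contigs database, you can count the sequences in the exported file (optional):

# Optional: count the gene calls exported from the contigs database
grep -c '^>' anvio/anvio_databases/gene_calls.fa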

Make a bowtie2 database of the contigs and map each sample's reads to them

bowtie2-build anvio/megahit_out/final.contigs.fixed.fa \
                              anvio/megahit_out/final.contigs.fixed
mkdir anvio/bam_files

# Map reads from each sample to the contigs

for SAMPLE in `awk '{print $1}' sample_ids.txt`
do

    # do the bowtie mapping to get the SAM file:
    bowtie2 --threads 4 \
            -x anvio/megahit_out/final.contigs.fixed \
            -1 "raw_data/"$SAMPLE"_R1_subsampled.fastq.gz" \
            -2 "raw_data/"$SAMPLE"_R2_subsampled.fastq.gz" \
            --no-unal \
            -S anvio/bam_files/$SAMPLE.sam

    # convert the resulting SAM file to a BAM file:
    samtools view -F 4 -bS anvio/bam_files/$SAMPLE.sam > anvio/bam_files/$SAMPLE-RAW.bam
    
    anvi-init-bam anvio/bam_files/$SAMPLE-RAW.bam -o anvio/bam_files/$SAMPLE.bam
    rm anvio/bam_files/$SAMPLE-RAW.bam

done

Note that the for loop above runs the mapping once for each sample listed in sample_ids.txt. Also note that here we would usually be mapping the original fastq files rather than the subsampled ones, and both the fastq files and the resulting BAM files would be much larger!
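If you want to check how well each sample's reads mapped back to the assembly, samtools can summarise each BAM file (an optional check; the exact numbers will depend on the subsampling):

# Optional: mapping summary for each sample's sorted BAM file
for SAMPLE in `awk '{print $1}' sample_ids.txt`
do
    echo $SAMPLE
    samtools flagstat anvio/bam_files/$SAMPLE.bam
done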

Add coverage and detection statistics to the Anvi'o profile

mkdir anvio/anvio_databases/profiles

for SAMPLE in `awk '{print $1}' sample_ids.txt`
do

    anvi-profile -c anvio/anvio_databases/CONTIGS.db \
                 -i anvio/bam_files/$SAMPLE.bam \
                 --num-threads 4 \
                 -o anvio/anvio_databases/profiles/$SAMPLE
done
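Before merging the profiles in the next step, it's worth a quick check that a PROFILE.db was produced for every sample (optional):

# Optional: each sample should have its own profile database
ls anvio/anvio_databases/profiles/*/PROFILE.db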

Merge sample profiles

anvi-merge -c anvio/anvio_databases/CONTIGS.db \
           -o anvio/anvio_databases/merged_profiles \
           anvio/anvio_databases/profiles/*/PROFILE.db
           
# Display contig stats
anvi-display-contigs-stats --report-as-text \
                           --output-file contigs_stats.txt \
                           anvio/anvio_databases/CONTIGS.db
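contigs_stats.txt is a tab-separated table; if you want to read it in the terminal, aligning the columns makes it easier (optional):

# Optional: view the contig stats with aligned columns
column -t -s $'\t' contigs_stats.txt | less -S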

Cluster contigs

anvi-cluster-contigs -c anvio/anvio_databases/CONTIGS.db \
                         -p anvio/anvio_databases/merged_profiles/PROFILE.db \
                         -C "merged_concoct_2500" \
                         --driver CONCOCT \
                         --length-threshold 2500 \
                         --num-threads 4 \
                         --just-do-it
mkdir anvio/concoct_summary_2500
anvi-summarize -c anvio/anvio_databases/CONTIGS.db \
                   -p anvio/anvio_databases/merged_profiles/PROFILE.db \
                   -C "merged_concoct_2500" \
                   -o anvio/concoct_summary_2500/summary/

#now take a look at the summary of all bins!
less anvio/concoct_summary_2500/summary/bins_summary.txt
less anvio/concoct_summary_2500/summary/bins_across_samples/abundance.txt
less anvio/concoct_summary_2500/summary/bins_across_samples/mean_coverage.txt
#to view these in your browser instead, go to amb_module2/anvio/concoct_summary_2500/, right click on "summary" and open it in a new tab
#alternatively, go to amb_module2/anvio/concoct_summary_2500 and add /summary/bins_summary.txt to the end of the URL
#copy and paste the contents of bins_summary.txt into a new Excel (or similar) spreadsheet - it will be useful to refer back to
#If you're in Excel, to split this into columns go to the Data tab > Text to Columns > Delimited > check "Space" > Finish
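If you'd rather stay in the terminal than use Excel, you can also pull out the bins that look like good MAG candidates directly from bins_summary.txt. The column order can vary between anvi'o versions, so check the header first; the column numbers below are only an example:

# Print the column names with their positions, then filter on the completion/redundancy columns
head -1 anvio/concoct_summary_2500/summary/bins_summary.txt | tr '\t' '\n' | cat -n
# Example: if percent_completion is column 6 and percent_redundancy is column 7,
# list bins that are >50% complete and <10% redundant
awk -F'\t' 'NR==1 || ($6 > 50 && $7 < 10)' anvio/concoct_summary_2500/summary/bins_summary.txt | column -t -s $'\t'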

Now the interactive refinement!

#let's first look at the top bin that's 100% complete and 0% redundant - for me this is called Bin_17, but the numbering isn't necessarily consistent, so if it's a different bin for you, that's fine!
anvi-refine -c anvio/anvio_databases/CONTIGS.db \
            -p anvio/anvio_databases/merged_profiles/PROFILE.db \
            -C "merged_concoct_2500" \
            -b Bin_17 --server-only -P 8081
            
#in second window
ssh -L 8081:localhost:8081 -i BMB.pem ubuntu@##.uhn-hpc.ca
#it will look like you just logged in in a new window
#go to http://localhost:8081/ in your browser
#then press "Draw" in the browser window!
#click on the "Bins" tab

#If we select the whole tree, we can see that this is 100% complete and 0% redundant. If we unselect (right click) some of the tree branches, this makes the completion and length of the genome go down slightly, but doesn't affect the redundancy. We'll leave this and try another.
#go back to the first console window and press ctrl+c to stop
anvi-refine -c anvio/anvio_databases/CONTIGS.db \
            -p anvio/anvio_databases/merged_profiles/PROFILE.db \
            -C "merged_concoct_2500" \
            -b Bin_27 --server-only -P 8081
#Refresh your Anvi'o browser window and you should see a different chart now.
#Again, click "Draw" and then go to the "Bins tab"
#This one is 59% complete but has 54.9% redundancy - we ideally want to get the redundancy below 10% while keeping the completion above 50%
#Start by deselecting (right clicking) branches that appear to have a different abundance profile than others. There is one that seems to only be present in sample CSM7KOMH - deselecting this one only reduces the completion to 56.3%, but reduces redundancy to 15.5%
#You can deselect large areas to determine whether they'll have an impact on the redundancy, and then go through in more detail and deselect/reselect individual parts of this
#Make sure you use the zoom +/- buttons to zoom in and out and see the tree in more detail, as well as checking the abundance/GC content profiles of the parts you've deselected
#Sometimes, you might want to add another bin to keep some of the contigs that you've deselected from the first one
#When you are happy with what you've selected, click on "Store refined bins in the database"
#you can go back to the first console window and press ctrl+c
#The other two bins that are >50% completion are already <10% redundancy so it's not necessary to do anything to these, but you can have a look at them if you like

Rename bins

anvi-rename-bins -c anvio/anvio_databases/CONTIGS.db \
                     -p anvio/anvio_databases/merged_profiles/PROFILE.db \
                     --collection-to-read merged_concoct_2500 \
                     --collection-to-write FINAL \
                     --call-MAGs \
                     --min-completion-for-MAG 50 \
                     --max-redundancy-for-MAG 10 \
                     --prefix HMP2 \
                     --report-file renaming_bins.txt
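The report file lists how each bin was renamed and whether it was called a MAG; it should be tab-separated, so you can take a quick look like this (optional):

# Optional: see which bins were promoted to MAGs and what they were renamed to
column -t -s $'\t' renaming_bins.txt | less -S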

Summarize the new FINAL collection

anvi-summarize -c anvio/anvio_databases/CONTIGS.db \
                   -p anvio/anvio_databases/merged_profiles/PROFILE.db \
                   -C "FINAL" \
                   -o anvio/FINAL_summary/
                   
#Now we'll have a look at the bins that we've created. Take a look in the folder anvio/FINAL_summary/bin_by_bin
#You should see that some of these are called *MAG* while some are called *Bin* - this is because in the command above, we're only calling things a MAG if they're >50% completion and <10% redundancy. Have a look at the summary file again. You can see that for most of them, the source is shown as concoct, but the MAG that we refined is shown as anvi-refine, and all of the MAGs are identified as bacteria.

#Now have a look in one of those MAG folders (anvio/FINAL_summary/bin_by_bin/) - you'll see a lot of statistics, as well as a *-contigs.fa file. This is a fasta file of the contigs in each MAG, and we can use it as that MAG's genome for further analyses. Now we'll create a copy of these in a new folder.
mkdir MAG_fasta
cp anvio/FINAL_summary/bin_by_bin/*MAG*/*contigs.fa MAG_fasta/
ls MAG_fasta/
#Now that we have our refined MAGs, we can do anything that we like with them!
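As a quick look at what we've ended up with, you can count the contigs in each MAG fasta and check the file sizes (optional):

# Optional: number of contigs in each MAG fasta, and the file sizes
grep -c '^>' MAG_fasta/*.fa
ls -lh MAG_fasta/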

Now CheckM

conda activate checkm
export CHECKM_DATA_PATH=/home/ubuntu/CourseData/MIC_data/tools/CheckM_databases
checkm lineage_wf --reduced_tree -t 4 -x fa MAG_fasta MAGs-CHECKM-lineage -f MAGs-CHECKM.txt --tab_table
#this one will take a little while to run! Feel free to start the CARD RGI database download below while it runs
checkm tree_qa MAGs-CHECKM-lineage -f MAGs-CHECKM-tax.txt --tab_table

less MAGs-CHECKM.txt 
less MAGs-CHECKM-tax.txt
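MAGs-CHECKM.txt is tab-separated; if you just want the completeness and contamination estimates, you can pull those columns out. In the default lineage_wf table they should be the 12th and 13th columns, but check the header if your version differs (a sketch only):

# Bin name, completeness and contamination (columns 1, 12 and 13 in the default lineage_wf output)
cut -f 1,12,13 MAGs-CHECKM.txt | column -t -s $'\t'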

CARD RGI annotation

Get necessary databases:

conda activate rgi
cd ..
mkdir card_data
cd card_data
wget https://card.mcmaster.ca/latest/data --no-check-certificate
tar -xvf data ./card.json
rgi load --card_json ./card.json --local
rgi database --version --local
#3.2.9
wget -O wildcard_data.tar.bz2 https://card.mcmaster.ca/latest/variants
mkdir -p wildcard
tar -xjf wildcard_data.tar.bz2 -C wildcard
gunzip wildcard/*.gz
rgi card_annotation -i localDB/card.json > card_annotation.log 2>&1
rgi wildcard_annotation -i wildcard --card_json localDB/card.json -v 3.2.9 > wildcard_annotation.log 2>&1
rgi load \
  --card_json localDB/card.json \
  --debug --local \
  --card_annotation card_database_v3.2.9.fasta \
  --wildcard_annotation wildcard_database_v3.2.9.fasta \
  --wildcard_index wildcard/index-for-model-sequences.txt \
  --wildcard_version 3.2.9 \
  --amr_kmers wildcard/all_amr_61mers.txt \
  --kmer_database wildcard/61_kmer_db.json \
  --kmer_size 61
cd ..

Run:

mkdir amb_module2/card_out/
cd card_data
parallel -j 1 'rgi main -i {} -o ~/workspace/amb_module2/card_out/{/.} -t contig -a DIAMOND -n 4 --include_loose --local --clean' ::: ~/workspace/amb_module2/MAG_fasta/*
cd ~/workspace/amb_module2/
rgi heatmap --input card_out/ --output card_heatmap
#The output file will be called card_heatmap-4 (the number corresponds to the number of genomes included): yellow represents a perfect hit, teal represents a strict hit, and purple represents no hit.
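rgi heatmap can also cluster the genomes and/or genes, which can make patterns easier to spot when more genomes are included; for example (optional, assuming the --cluster option is available in your RGI version):

# Optional: cluster both the genomes and the AMR genes in the heatmap
rgi heatmap --input card_out/ --output card_heatmap_clustered --cluster both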

If you have time...

Explore the other options within Anvi'o

Some other things that can be done with MAGs:

  • GTDB-Tk (a sketch of how this might be run is shown below)
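GTDB-Tk can assign taxonomy to the MAG fasta files. It isn't set up as part of this module and needs its own environment plus the (large) GTDB reference data pointed to by GTDBTK_DATA_PATH, but the command would look something like the sketch below; the environment name is hypothetical, and newer GTDB-Tk versions may also require extra flags such as --skip_ani_screen.

# Sketch only: taxonomic classification of the MAGs with GTDB-Tk (requires the GTDB reference data)
conda activate gtdbtk   # hypothetical environment name
gtdbtk classify_wf --genome_dir MAG_fasta \
                   --out_dir gtdbtk_out \
                   --extension fa \
                   --cpus 4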