## Downloading sequence data

#### Isolate Genomes and SAGs

This notebook is a walkthrough of our workflow and scripts. Note that we did not perform any of the analysis or visualization inside Jupyter notebooks; the cells below document the commands that were run.

In [None]:
##Downloading data

##Current directory: /vortexfs1/omics/env-bio/collaboration/Prochlorococcus-metapangenome/data/

#download and unpack the FASTA archive, then combine the isolate and SAG contigs
!curl -L https://ndownloader.figshare.com/files/9416614 -o PROCHLOROCOCCUS-FASTA-FILES.tar.gz
!tar -xzvf PROCHLOROCOCCUS-FASTA-FILES.tar.gz
!cat CONTIGS-FOR-ISOLATES.fa CONTIGS-FOR-SAGs.fa > Prochlorococcus-genomes.fa

#unzip MARIA SAG files downloaded from NCBI
#(each `!` line runs in its own shell, so a for/do/done loop cannot be split
# across several `!` lines; a single gunzip over the glob does the same job)
!gunzip Maria_SAGs/*.gz

#merge all genome sequences into single file
#(`>` rather than `>>` so that re-running this cell does not duplicate sequences)
!cat Maria_SAGs/*.fna Prochlorococcus-genomes.fa > all-genome-seqs.fa

#edit deflines
#lines whose first field starts with ">CA" are rewritten as ">" followed by
#characters 1-10 of field 10 and the characters from position 16 onward;
#all other lines are printed unchanged
!awk '{ if (substr($1,1,3) == ">CA") print ">" substr($10,1,10) substr($10,16,length($10)-1); else print $0}' all-genome-seqs.fa > all-genome-seqs-fixed.fa


In [None]:
##Metagenome quality filtering

# Fetch the sample information tables.
wget http://merenlab.org/data/tara-oceans-mags/files/sets.txt
wget http://merenlab.org/data/tara-oceans-mags/files/samples.txt

# Build one config (.ini) file per Atlantic sample.
# NOTE(review): Atlantic_samples.txt is not created in this cell -- presumably
# it is a subset of samples.txt prepared by hand; confirm.
iu-gen-configs Atlantic_samples.txt

# Quality-filter every metagenome listed in the first column (quality-filtering.sh).
for sample in $(awk '{print $1}' Atlantic_samples.txt)
do
    # the header row has the literal value "sample" in column one; skip it
    [ "$sample" == "sample" ] && continue
    iu-filter-quality-minoche ${sample}.ini --ignore-deflines
done

In [None]:
##Generate anvi'o contigs database

#edit deflines
#write the simplified FASTA next to its input so that the path matches the one
#anvi-gen-contigs-database reads below; report-deflines.tab records the
#reformatting report requested with --report-file
anvi-script-reformat-fasta data/PROCHLOROCOCCUS-FASTA-FILES/all-genome-seqs-fixed.fa -o data/PROCHLOROCOCCUS-FASTA-FILES/seqs-fixed.fa -l 0 --simplify-names --report-file report-deflines.tab

#generate contigs database with HMMs and Blast (all steps in anvi-db-cogs-hmms.sh)
#anvio-6.1 conda environment must be activated before running this script
#generate a contigs database
anvi-gen-contigs-database -f data/PROCHLOROCOCCUS-FASTA-FILES/seqs-fixed.fa -o databases/Prochlorococcus-CONTIGS.db -n "Prochlorococcus Isolates and SAGs"

#perform HMM search for protein families on the contigs database
anvi-run-hmms -c databases/Prochlorococcus-CONTIGS.db --num-threads 20

#BLAST search for COGs on the contigs database
anvi-run-ncbi-cogs -c databases/Prochlorococcus-CONTIGS.db --num-threads 20


In [None]:
##Map metagenome reads to the genomes and profile each sample

#Build Bowtie database
#index the same simplified-name FASTA that the contigs database is built from,
#so contig names in the BAM files match those in Prochlorococcus-CONTIGS.db
bowtie2-build data/PROCHLOROCOCCUS-FASTA-FILES/seqs-fixed.fa databases/prochlorococcus-bowtie

#Map metagenome reads to genomes (bowtie-map.sh)
for sample in `awk '{print $1}' data/quality-filtered-fastqs/Atlantic_samples.txt`
do
    # skip the header row of the samples table
    if [ "$sample" == "sample" ]; then continue; fi
    # do the bowtie mapping to get the SAM file:
    bowtie2 --threads 20 \
            -x databases/prochlorococcus-bowtie \
            -1 data/quality-filtered-fastqs/"$sample"-QUALITY_PASSED_R1.fastq \
            -2 data/quality-filtered-fastqs/"$sample"-QUALITY_PASSED_R2.fastq \
            --no-unal \
            -S output/"$sample".sam

    # convert the resulting SAM file to a BAM file:
    samtools view -F 4 -bS output/"$sample.sam" > output/"$sample-RAW.bam"

    # sort and index the BAM file:
    samtools sort output/"$sample"-RAW.bam -o output/"$sample".bam
    samtools index output/"$sample".bam

    # remove temporary files:
    rm output/"$sample.sam" output/"$sample"-RAW.bam
done

#create anvi'o profiles for each metagenomic sample (anvi-profile.sh)
for sample in `awk '{print $1}' data/quality-filtered-fastqs/Atlantic_samples.txt`
do
    # skip the header row of the samples table
    if [ "$sample" == "sample" ]; then continue; fi

    anvi-profile -c databases/Prochlorococcus-CONTIGS.db -i output/"$sample".bam -M 100 --skip-SNV-profiling --num-threads 20 -o databases/"$sample"
done 

#Merge all profiles into single profile
#the output directory is spelled Prochlorococcus-MERGED because the later
#commands read databases/Prochlorococcus-MERGED/PROFILE.db (paths are case-sensitive)
anvi-merge databases/A*/PROFILE.db -o databases/Prochlorococcus-MERGED -c databases/Prochlorococcus-CONTIGS.db


In [None]:
#Make collection file

# Import the collection described in final_Prochlorococcus-GENOME-COLLECTION.txt
# into the merged profile under the collection name "Genomes".
anvi-import-collection final_Prochlorococcus-GENOME-COLLECTION.txt \
                       -c databases/Prochlorococcus-CONTIGS.db \
                       -p databases/Prochlorococcus-MERGED/PROFILE.db \
                       -C Genomes

In [None]:
#Pangenome analysis

#run pangenome analysis (pangenome.sh)
#NOTE(review): the ../databases paths suggest pangenome.sh is run from the
#directory that holds internal-genomes.txt -- confirm
anvi-gen-genomes-storage -i internal-genomes.txt -o ../databases/Prochlorococcus-ISOLATE-PAN-GENOMES.db

#the project name is spelled "Prochlorococcus" and the output directory is set
#explicitly so the results land at
#databases/Prochlorococcus-ISOLATE-PAN/Prochlorococcus-ISOLATE-PAN-PAN.db,
#which is the path the display and summarize commands read
anvi-pan-genome -g ../databases/Prochlorococcus-ISOLATE-PAN-GENOMES.db \
                --use-ncbi-blast \
                --minbit 0.5 \
                --mcl-inflation 10 \
                --project-name Prochlorococcus-ISOLATE-PAN \
                --output-dir ../databases/Prochlorococcus-ISOLATE-PAN \
                --num-threads 20

#Visualize pangenome (not a figure, just to see):
#(paths here are relative to the project root, unlike the script above)
anvi-display-pan -p databases/Prochlorococcus-ISOLATE-PAN/Prochlorococcus-ISOLATE-PAN-PAN.db -g databases/Prochlorococcus-ISOLATE-PAN-GENOMES.db --server-only

In [None]:
#Generating summary files

# Summary of the merged profile for the "Genomes" collection, with gene
# coverages initialized; written to output/Prochlorococcus-SUMMARY.
anvi-summarize -c databases/Prochlorococcus-CONTIGS.db \
               -p databases/Prochlorococcus-MERGED/PROFILE.db \
               -C Genomes \
               --init-gene-coverages \
               -o output/Prochlorococcus-SUMMARY

# Summary of the pangenome for the "default" collection; written to
# output/Prochlorococcus-ISOLATE-PAN-SUMMARY.
anvi-summarize -p databases/Prochlorococcus-ISOLATE-PAN/Prochlorococcus-ISOLATE-PAN-PAN.db \
               -g databases/Prochlorococcus-ISOLATE-PAN-GENOMES.db \
               -C default \
               -o output/Prochlorococcus-ISOLATE-PAN-SUMMARY

In [None]:
#Linking pangenome to environment:

#Characterize ECGs and EAGs (anvi-meta-pan-genome.sh)
#NOTE(review): the bare database names and ../data path suggest these commands
#are run from inside the databases/ directory -- confirm
#(the genomes storage path must be a single token: a stray space inside
# "PAN-GENOMES.db" would split it into two arguments)
anvi-meta-pan-genome -p Prochlorococcus-ISOLATE-PAN/Prochlorococcus-ISOLATE-PAN-PAN.db -g Prochlorococcus-ISOLATE-PAN-GENOMES.db -i ../data/internal-genomes.txt --fraction-of-median-coverage 0.25

#Create text files to visualize distribution in EQPAC1 and MIT9314:
anvi-script-gen-distribution-of-genes-in-a-bin -c Prochlorococcus-CONTIGS.db -p Prochlorococcus-MERGED/PROFILE.db -b MIT9314 -C Genomes --fraction-of-median-coverage 0.25
anvi-script-gen-distribution-of-genes-in-a-bin -c Prochlorococcus-CONTIGS.db -p Prochlorococcus-MERGED/PROFILE.db -b EQPAC1 -C Genomes --fraction-of-median-coverage 0.25

In [None]:
#Visualizations:

#visualize Figure 2 (must be done after characterizing ECGs and EAGs)
#(the title must be closed with a plain ASCII double quote; a typographic
# closing quote leaves the shell string unterminated)
anvi-display-pan -p Prochlorococcus-ISOLATE-PAN/Prochlorococcus-ISOLATE-PAN-PAN.db -g Prochlorococcus-ISOLATE-PAN-GENOMES.db --title "Prochlorococcus Metapangenome" --server-only

#visualize Figure 3:
#run this on EQPAC1 files and MIT9314 files separately
anvi-interactive -p databases/Prochlorococcus-MERGED/PROFILE.db -c databases/Prochlorococcus-CONTIGS.db -C Genomes --gene-mode -b EQPAC1 -d databases/EQPAC1-GENE-COVs.txt -A databases/EQPAC1-ENV-DETECTION.txt --title "Prochlorococcus EQPAC1 genes across TARA Oceans Project metagenomes" --server-only
anvi-interactive -p databases/Prochlorococcus-MERGED/PROFILE.db -c databases/Prochlorococcus-CONTIGS.db -C Genomes --gene-mode -b MIT9314 -d databases/MIT9314-GENE-COVs.txt -A databases/MIT9314-ENV-DETECTION.txt --title "Prochlorococcus MIT9314 genes across TARA Oceans Project metagenomes" --server-only

#visualize Figure 4 B and C for NATL2A:
#Repeat for each genome of interest
anvi-interactive -p databases/Prochlorococcus-MERGED/PROFILE.db -c databases/Prochlorococcus-CONTIGS.db -C Genomes --gene-mode -b NATL2A --server-only
