In [None]:

# Bootstrap conda/mamba inside the Colab runtime.
# %pip (rather than !pip) installs into the environment of the running kernel,
# which is the interpreter that executes the `import` below.
%pip install -q condacolab
import condacolab
# NOTE: per the condacolab README, install() restarts the kernel when it
# finishes; continue with the next cell once the runtime is back.
condacolab.install()
     

In [None]:
# Fetch the SARS-CoV-2 reference genome.
# The URL is quoted so the shell does not treat '?' as a glob character, and
# -O writes straight to the final file name (no follow-up `mv`, and a re-run
# simply overwrites the same file).
!wget -O SARS-CoV-2.reference.fasta "https://github.com/BioinformaticaUSFQ1/Course_Bioinformatics/blob/main/Modules/SARS-CoV-2.reference.fasta?raw=true"

In [None]:
# Install the command-line tools used by the rest of the notebook.
# -y answers the confirmation prompt so "Run all" does not block here;
# conda-forge is listed before bioconda, as the Bioconda docs recommend.
!mamba install -y -c conda-forge -c bioconda minimap2 samtools bamclipper tabix bcftools bedtools

In [None]:
# Download the reads as a zip archive and unpack it.
# The mapping step reads them from Nanopore_READS/nanopore_fastq/.
# `&&` skips the unzip when the download fails; `unzip -o` overwrites without
# prompting so the cell can be re-run non-interactively.
!gdown 1rRhK7H7R9aiPooqtKtT8kugnLNsqmkkR && unzip -o Nanopore_READS.zip

In [None]:
# Map the barcode92 Nanopore reads to the reference genome (SAM output).
# The reads path is relative to the working directory, like every other file in
# this notebook (the archive is unpacked into the working directory above).
!minimap2 -a -x map-ont -t 4 -o SARSCoV2-nanopore.sam SARS-CoV-2.reference.fasta Nanopore_READS/nanopore_fastq/barcode92/barcode92.fastq.gz

In [None]:
# Turn the SAM alignment into a sorted, indexed BAM file.
# BAM is the binary form of SAM: quicker for programs to read, not meant for
# humans. The sorted BAM is what we use to visualise the mapping.
# 1) SAM -> BAM (compare the two file sizes afterwards!)
!samtools view -b -S -o SARSCoV2-nanopore.bam SARSCoV2-nanopore.sam
# 2) sort the BAM
!samtools sort -o SARSCoV2-nanopore.sorted.bam SARSCoV2-nanopore.bam
# 3) build the index for the sorted BAM
!samtools index SARSCoV2-nanopore.sorted.bam

In [None]:
# Download the primer BED file for the ARTIC V1200 scheme.
# Point the URL at another BED file if a different scheme was used.
# -O pins the output name so a re-run overwrites the file instead of leaving
# numbered copies (nCoV-2019.bed.1, ...) behind.
!wget -O nCoV-2019.bed https://raw.githubusercontent.com/replikation/poreCov/master/data/external_primer_schemes/nCoV-2019/V1200/nCoV-2019.bed


In [None]:
# Sanity check: the sequence ID in the reference FASTA header must be the same
# as the ID used in the first column of the primer BED file.
# Print the first 10 lines of each file to compare them by eye.
!head -n 10 SARS-CoV-2.reference.fasta
!head -n 10 nCoV-2019.bed

In [None]:
# BAMclipper needs the primers as a BEDPE file, so the BED file has to be
# converted. We download a Python script that does the conversion.
# pandas is installed first (presumably a dependency of that script). Using
# `python -m pip` installs it into the same interpreter that `!python` launches
# when the script is run.
!python -m pip install -q pandas
# -O pins the file name so re-running the cell overwrites the script in place.
!wget -O primerbed2bedpe.py https://raw.githubusercontent.com/hoelzer/bed2bedpe/master/primerbed2bedpe.py


In [None]:
# Run the downloaded converter on the primer BED file.
# _LEFT / _RIGHT are passed as the forward / reverse primer identifiers, and
# nCoV-2019.bedpe (given via -o) is the file handed to BAMclipper afterwards.
!python primerbed2bedpe.py nCoV-2019.bed --forward_identifier _LEFT --reverse_identifier _RIGHT -o nCoV-2019.bedpe

In [None]:
# Finally, run BAMclipper on the sorted BAM using the BEDPE primer file
# (4 threads). Later cells read SARSCoV2-nanopore.sorted.primerclipped.bam,
# which this step is expected to produce.
!bamclipper.sh -n 4 -p nCoV-2019.bedpe -b SARSCoV2-nanopore.sorted.bam

In [None]:
# Install medaka with pip (no separate conda environment is created here).
# NOTE(review): the version is not pinned. The cells below rely on the
# `medaka consensus`, `medaka variant` and `medaka tools annotate` subcommands
# and on the r941_min_hac_g507 model - confirm the installed release provides them.
!pip install medaka


In [None]:
# Step 1: run the medaka model over the primer-clipped alignment and store the
# per-position predictions in an HDF file. The model must match the basecalling
# model that was used in Guppy - change --model if yours differs!
!medaka consensus --threads 4 --model r941_min_hac_g507 --chunk_len 800 --chunk_ovlp 400 SARSCoV2-nanopore.sorted.primerclipped.bam medaka-nanopore.consensus.hdf

# Step 2: turn those predictions into variant calls (VCF) against the reference
!medaka variant SARS-CoV-2.reference.fasta medaka-nanopore.consensus.hdf medaka-nanopore.vcf

# Step 3: add read depth and related fields to the VCF so it can be filtered
!medaka tools annotate medaka-nanopore.vcf SARS-CoV-2.reference.fasta SARSCoV2-nanopore.sorted.primerclipped.bam medaka-nanopore.annotate.vcf

In [None]:
# bgzip-compress the annotated VCF; the following steps need the .vcf.gz form
# (-f overwrites an existing output file)
!bgzip -f medaka-nanopore.annotate.vcf

# build the tabix index for the compressed VCF
# (-p vcf selects the VCF preset, -f overwrites an existing index)
!tabix -p vcf -f medaka-nanopore.annotate.vcf.gz

# apply the variants to the reference and write the consensus FASTA
!bcftools consensus -f SARS-CoV-2.reference.fasta -o consensus-nanopore.fasta medaka-nanopore.annotate.vcf.gz



Mask consensus sequence

In [None]:
# Per-region depth of the primer-clipped BAM (-bga also reports zero-depth
# regions); keep only regions whose depth (4th column) is below 20 and store
# the first three columns as a BED file.
!bedtools genomecov -bga -split -ibam SARSCoV2-nanopore.sorted.primerclipped.bam | awk '$4 < 20' | cut -f1-3 > low_coverage_regions.bed


In [None]:
# Write the masked consensus (masked.fasta).
# low_coverage_regions.bed is in REFERENCE coordinates (it comes from the BAM
# aligned to the reference), whereas consensus-nanopore.fasta already has the
# called insertions/deletions applied, so its coordinates can be shifted.
# Masking that file with bedtools maskfasta could therefore replace the wrong
# bases. Instead, let bcftools mask the low-coverage regions on the reference
# (-m, masked with N by default) while it applies the variants.
!bcftools consensus -f SARS-CoV-2.reference.fasta -m low_coverage_regions.bed -o masked.fasta medaka-nanopore.annotate.vcf.gz


In [None]:
# Rename the consensus record: at this point the FASTA ID is still the
# reference accession. The pattern is anchored to the header line ('^>') and
# the dot is escaped so it only matches a literal '.'.
!sed -i 's/^>MN908947\.3/>Consensus-Nanopore/' masked.fasta