# Welcome to the GOAL Bioinformatics Bootcamp, 2025!
# To set up this notebook, please run the first cell. It performs the downloads and setup needed to participate.

In [None]:
# NGS Bioinformatics Hands On Lab

# Install required packages
!apt-get update
!apt-get install -y fastp fastqc bwa samtools tabix
!pip install pysam

# Optional: Install additional tools if not available in Colab
!wget https://github.com/biod/sambamba/releases/download/v0.8.0/sambamba-0.8.0-linux-amd64-static.gz
!gunzip sambamba-0.8.0-linux-amd64-static.gz
!chmod +x sambamba-0.8.0-linux-amd64-static
!mv sambamba-0.8.0-linux-amd64-static /usr/local/bin/sambamba

# For ABRA2
!wget https://github.com/mozack/abra2/releases/download/v2.23/abra2-2.23.jar
!chmod +x abra2-2.23.jar

# For VarScan
!wget https://github.com/dkoboldt/varscan/raw/master/VarScan.v2.4.6.jar
!chmod +x VarScan.v2.4.6.jar

# for ANNOVAR
!wget http://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz
!tar -zxvf annovar.latest.tar.gz
!rm -r annovar.latest.tar.gz
!rm -r annovar/humandb

# Create directories for data and reference files
!mkdir -p data assets/gatk_b37 assets/humandb annovar

# Download pre-indexed reference genome
!wget https://hgdownload.cse.ucsc.edu/goldenpath/hg19/bigZips/analysisSet/hg19.p13.plusMT.no_alt_analysis_set.bwa_index.tar.gz -O assets/hg19_bwa_index.tar.gz

# Extract BWA index files
!tar -xzvf /content/assets/hg19_bwa_index.tar.gz --strip-components=1 -C /content/assets/gatk_b37/

# Download reference FASTA file.
!wget https://hgdownload.cse.ucsc.edu/goldenpath/hg19/bigZips/analysisSet/hg19.p13.plusMT.no_alt_analysis_set.fa.gz -O /content/assets/gatk_b37/hg19.p13.plusMT.no_alt_analysis_set.fa.gz

# Uncompress the reference FASTA file
!gunzip -c /content/assets/gatk_b37/hg19.p13.plusMT.no_alt_analysis_set.fa.gz > /content/assets/gatk_b37/hg19.p13.plusMT.no_alt_analysis_set.fa

# Get the raw read files and the bed file.
!wget https://raw.githubusercontent.com/Eitan177/Bioinformatics_Bootcamp/main/ngs_ws_hd701_R1.fq.gz
!wget https://raw.githubusercontent.com/Eitan177/Bioinformatics_Bootcamp/main/ngs_ws_hd701_R2.fq.gz
!wget https://raw.githubusercontent.com/Eitan177/Bioinformatics_Bootcamp/main/ngs_ws.bed
!cp ngs_ws.bed /content/assets/

# Download the annovar file.

!pip install -U gdown
import gdown
file_id = "1yANVV31SMqwzRNnEFiLSktxj8eQL5CSp"
gdown.download(f"https://drive.google.com/uc?id={file_id}", quiet=False)

!unzip annotation_db_and_vcfs.zip # Unzip the annovar file.

!tar -xzvf /content/grip_course_annovar_db.tar.gz --strip-components=1 -C /content/assets/humandb/ # Add the necessary files to humandb.

In [None]:
# 1. Examine content of the FASTQ file
# Decompresses the R1 reads to stdout and prints only the first 20 lines,
# so the compressed file on disk is left untouched.
!zcat /content/ngs_ws_hd701_R1.fq.gz | head -n 20

In [None]:
# 2. Perform FASTQ processing prior to alignment
# Runs fastp on the paired R1/R2 reads downloaded by the setup cell.
# The processed pair is written to data/hd701_R{1,2}_processed.fq.gz (the
# inputs of the alignment step) and an HTML report to data/hd701_fq_qc.html.
!fastp -i /content/ngs_ws_hd701_R1.fq.gz -o data/hd701_R1_processed.fq.gz -I /content/ngs_ws_hd701_R2.fq.gz -O data/hd701_R2_processed.fq.gz -z 4 -w 2 -h data/hd701_fq_qc.html

In [None]:
# 3. Generate FASTQ quality control data
# Runs FastQC on the original (not the fastp-processed) R1/R2 files and
# writes the reports into data/; the next cell displays the R1 report.
!fastqc -o data -f fastq /content/ngs_ws_hd701_R1.fq.gz /content/ngs_ws_hd701_R2.fq.gz

In [None]:
# 4. Display QC report in Colab
# Renders the FastQC HTML report for R1 (written to data/ by the previous
# step) inline in the cell output.
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import HTML, display

with open('data/ngs_ws_hd701_R1_fastqc.html', 'r') as f:
    display(HTML(f.read()))

# Click the 3 vertical dots on the right of the **previous** cell and then click on view output fullscreen to see the QC file.

In [None]:
# 5. Align sequence to GRCh37
# Aligns the fastp-processed read pair with `bwa mem` against the reference
# FASTA prepared by the setup cell and redirects the alignments (SAM text on
# stdout) to data/hd701_raw.sam. The -R string supplies the read-group header
# line (ID/SM = hd701).
# NOTE(review): the heading says GRCh37 and the directory is named gatk_b37,
# but the reference file used here is the UCSC hg19 analysis set.

!bwa mem -M -v 1 -t 2 -R "@RG\\tID:hd701\\tSM:hd701\\tPL:ILLUMINA\\tPI:150\\tCN:lab" /content/assets/gatk_b37/hg19.p13.plusMT.no_alt_analysis_set.fa /content/data/hd701_R1_processed.fq.gz /content/data/hd701_R2_processed.fq.gz > /content/data/hd701_raw.sam

In [None]:
# 6. Convert the SAM file into a BAM file
# Reads the SAM produced by the alignment step and redirects samtools'
# output to data/hd701_raw.bam, the input of the sorting step.
!samtools view -Shu data/hd701_raw.sam > data/hd701_raw.bam

In [None]:
# 7. Sort and index raw BAM file
# Sorts data/hd701_raw.bam with sambamba and writes the result to
# data/hd701_sorted.bam (the input of duplicate marking).
!sambamba sort -p -t 2 -o data/hd701_sorted.bam data/hd701_raw.bam

In [None]:
# 8. Mark PCR duplicates in BAM file
# Input: data/hd701_sorted.bam; output: data/hd701_dedup.bam, which the
# indel-realignment step consumes.
!sambamba markdup -t 2 -p data/hd701_sorted.bam data/hd701_dedup.bam

In [None]:
# 9. Perform indel realignment - Update the reference path
!java -jar abra2-2.23.jar --in data/hd701_dedup.bam --out data/hd701_realigned.bam --ref /content/assets/gatk_b37/hg19.p13.plusMT.no_alt_analysis_set.fa \\
!--threads 2 --targets assets/ngs_ws.bed --index --tmpdir /tmp/ > data/hd701_abra.log

In [None]:
# 10. Inspect aligned sequences (show first few lines)
# Prints the first 20 lines of the realigned BAM as text; -h includes the
# header, so these lines may be mostly header records.
!samtools view -h data/hd701_realigned.bam | head -n 20

In [None]:
!# 11. Call variant - Update the reference path
!samtools mpileup -BA -q 20 -Q 30 -d 4000 -l assets/ngs_ws.bed -f /content/assets/gatk_b37/hg19.p13.plusMT.no_alt_analysis_set.fa data/hd701_realigned.bam | java -Xmx4G -jar VarScan.v2.4.6.jar mpileup2vcf --min-coverage 8 --min-var-freq 0.05 --p-value 0.05 --min-avg-qual 30 --strand-filter 1 --output-vcf 1 --variants > data/hd701_raw.vcf

In [None]:
# 12. Compress and index VCF file
!bgzip data/hd701_raw.vcf
!tabix -p vcf data/hd701_raw.vcf.gz

In [None]:
# 13. Inspect variants in a raw VCF file
# Decompresses the bgzipped VCF to stdout and shows the first 100 lines
# (header lines are included in that count).
!zcat data/hd701_raw.vcf.gz | head -n 100

In [None]:
# 14. Annotate variants (requires ANNOVAR to be set up)
# Runs ANNOVAR's table_annovar.pl on the compressed raw VCF using the
# databases unpacked into assets/humandb/ by the setup cell. Outputs use the
# prefix data/hd701_annotated; the next cell reads
# data/hd701_annotated.hg19_multianno.vcf.
# The four -protocol entries pair up, in order, with the four -operation codes.
!perl annovar/table_annovar.pl data/hd701_raw.vcf.gz assets/humandb/ -out data/hd701_annotated -buildver hg19 -remove -nastring . -otherinfo -vcfinput -thread 2 -maxgenethread 2 -protocol refGene,cytoBand,cosmic85,clinvar_20150330 -operation g,r,f,f

In [None]:
# 15. Inspect annotated VCF file
# Shows the first 100 lines of the annotated VCF written by the annotation step.
!head -n 100 data/hd701_annotated.hg19_multianno.vcf

In [None]:
# 16. 🌟 Download BAM files for visualization
# Colab-only: `google.colab.files` is used to send the file to the browser.
# The `files` name imported here is reused by the two download cells below.

from google.colab import files
files.download('data/hd701_realigned.bam') # Download the realigned bam file.


In [None]:
# Requires `files` from the previous cell (from google.colab import files).
files.download('data/hd701_realigned.bai') # Download the realigned bam index file.

In [21]:
# Download the annotated VCF; requires `files` imported from google.colab
# in the BAM download cell above.
files.download('data/hd701_annotated.hg19_multianno.vcf')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>