Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Nextflow pipeline to create GO/Phenotypes annotations for pangenomes #1070

Merged
merged 7 commits into from
Mar 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 97 additions & 0 deletions nextflow/pangenomes/bin/create_pangenomes_annotation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env python3
"""Create assembly-specific GO or Phenotypes annotation for VEP plugins.

Matches genes between an assembly-specific GTF and a reference (latest
assembly) GO/Phenotypes annotation using gene symbols, then writes a new
plugin-ready GFF (GO) or GVF (Phenotypes) file.
"""
import argparse
import csv
import os
import re

import pandas as pd

# Standard 9 columns shared by GTF/GFF/GVF files
GFF_COLUMNS = ['chr', 'source', 'feature', 'start', 'end',
               'score', 'strand', 'frame', 'attribute']


def parse_args(argv=None):
    """Parse command-line arguments; --go and --pheno are mutually exclusive.

    :param argv: argument list (defaults to sys.argv for CLI use)
    :return: argparse.Namespace with version, gtf, outdir, gene_symbols,
             go and pheno attributes
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--version', help="Release version", required=True)
    parser.add_argument('--gtf', help="Assembly-specific GFF/GTF", required=True)
    parser.add_argument('--outdir', default=".", help="Output directory")
    parser.add_argument('--gene_symbols', default=None,
                        help="Lookup table with two columns (gene symbols and Ensembl identifiers) ")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--go', help="GO terms annotation")
    group.add_argument('--pheno', help="Phenotypes annotation")
    return parser.parse_args(argv)


def build_output_filename(gtf, version, plugin, ext):
    """Derive the output file name from the GTF file name.

    Example: ``Homo_sapiens-GCA_018466855.1-2022_07-genes.gtf.gz`` becomes
    ``homo_sapiens_gca018466855v1_<version>_VEP_<plugin>_plugin.<ext>``.

    :param gtf: path to the assembly GTF (only its basename is used)
    :param version: release version to embed in the name
    :param plugin: plugin label ('GO' or 'phenotypes')
    :param ext: output extension ('gff' or 'gvf')
    :return: output file name (no directory component)
    """
    return re.sub(r'(.*)-gca_(\d+)\.(\d+).*',
                  f'\\1_gca\\2v\\3_{version}_VEP_{plugin}_plugin.{ext}',
                  os.path.basename(gtf.lower()))


def read_annotation(path):
    """Read a tab-delimited, 9-column annotation file (GTF/GFF/GVF).

    All columns are read as strings; '#' lines are treated as comments.
    """
    return pd.read_csv(path, delimiter="\t", comment="#", header=None,
                       names=GFF_COLUMNS, dtype=str)


def main():
    """Entry point: build and write the pangenome plugin annotation."""
    args = parse_args()

    if args.go:
        plugin = 'GO'
        annot = args.go
        ext = 'gff'
        feat = 'transcript'
    else:
        plugin = 'phenotypes'
        annot = args.pheno
        ext = 'gvf'
        feat = 'gene'

    output = build_output_filename(args.gtf, args.version, plugin, ext)
    # exist_ok avoids a race if the directory is created concurrently
    os.makedirs(args.outdir, exist_ok=True)
    output = os.path.join(args.outdir, output)

    # read assembly annotation and extract identifiers from the attribute column
    print(f"Preparing assembly annotation from {args.gtf}...", flush=True)
    annot_pd = read_annotation(args.gtf)
    annot_pd = annot_pd[annot_pd['feature'].str.contains(feat)]
    annot_pd = annot_pd.assign(gene=annot_pd['attribute'].str.extract(r'gene_id "(.*?)";'))
    annot_pd = annot_pd.assign(gene_symbol=annot_pd['attribute'].str.extract(r'gene_name "(.*?)";'))
    annot_pd = annot_pd.assign(transcript=annot_pd['attribute'].str.extract(r'transcript_id "(.*?)";'))

    # read GO terms or Phenotypes annotation
    print(f"Preparing {plugin} annotation from {annot}...", flush=True)
    ref_pd = read_annotation(annot)
    if args.go:
        # GO GFF encodes the gene symbol in the ID attribute
        ref_pd = ref_pd.assign(gene_symbol=ref_pd['attribute'].str.extract(r'ID=(.*?);'))
    else:
        # Phenotypes GVF encodes the Ensembl gene ID in the id attribute
        ref_pd = ref_pd.assign(gene=ref_pd['attribute'].str.extract(r'id=(.*?);'))

    # convert coordinates to numeric so sorting is positional, not lexical
    for frame in (annot_pd, ref_pd):
        frame['start'] = frame['start'].astype(int)
        frame['end'] = frame['end'].astype(int)

    if args.gene_symbols is not None:
        # join with lookup table to translate Ensembl gene IDs to gene symbols
        print(f"Preparing lookup table from {args.gene_symbols}...", flush=True)
        symbols = pd.read_csv(args.gene_symbols, delimiter="\t",
                              names=['gene_symbol', 'gene'])
        ref_pd = pd.merge(ref_pd, symbols, on='gene', how='inner')

    # join annotations based on gene symbols
    print("Joining annotation...", flush=True)
    joint = pd.merge(annot_pd, ref_pd, on='gene_symbol', how='inner')

    if args.go:
        # prepare new assembly-specific GO annotations with backwards
        # compatibility (i.e., transcript-based)
        joint = joint.assign(go_terms=joint['attribute_y'].str.extract(r';(Ontology_term=.*)'))
        joint = joint.assign(new_attribute="ID=" + joint['transcript'] + ';' + joint['go_terms'])
    else:
        # prepare new assembly-specific Phenotypes annotations (gene-based)
        joint = joint.assign(phenotype=joint['attribute_y'].str.extract(r'; (phenotype=.*)'))
        joint = joint.assign(new_attribute="id=" + joint['gene_x'] + '; ' + joint['phenotype'])

    # keep assembly coordinates (_x) with the reference source (_y), then
    # sort by genomic position as required downstream (e.g. by tabix)
    new_gtf = joint[['chr_x', 'source_y', 'feature_x', 'start_x', 'end_x',
                     'score_x', 'strand_x', 'frame_x', 'new_attribute']]
    new_gtf = new_gtf.sort_values(by=['chr_x', 'start_x', 'end_x'])

    if new_gtf.empty:
        raise Exception(f"ERROR: new pangenomes {plugin} annotation is empty (maybe no genes matched between annotations?)")

    # write to file; context manager guarantees the handle is closed
    print(f"Writing new {plugin} annotation to {output}...", flush=True)
    with open(output, 'w') as out:
        if args.go:
            out.write('##gff-version 1.10\n')
        new_gtf.to_csv(out, sep="\t", header=False, index=False,
                       quoting=csv.QUOTE_NONE)
    print("Done!", flush=True)


if __name__ == '__main__':
    main()
66 changes: 66 additions & 0 deletions nextflow/pangenomes/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl=2

// Default pipeline parameters; every parameter must be non-null
// (validated in the loop below)
params.url = 'ftp://ftp.ensembl.org/pub/rapid-release/species/Homo_sapiens/'
params.outdir = 'outdir'
// Ensembl software root, taken from the ENSEMBL_ROOT_DIR environment variable
params.sw = "${ENSEMBL_ROOT_DIR}"
params.vep = "${params.sw}/ensembl-vep/vep"

// Ensembl release version and species used by VEP in --database mode
params.version = null
params.species = "homo_sapiens"

// Database connection details for VEP --database mode (must be supplied
// on the command line; they default to null and fail validation otherwise)
params.user = null
params.host = null
params.port = null

log.info "\nCreate GO and Phenotype annotations for pangenomes"
log.info "=================================================="
for (a in params) {
    // print param
    log.info " ${a.getKey().padRight(8)} : ${a.getValue()}"
    // raise error if param is null
    if (!a.getValue()) exit 1, "ERROR: parameter --${a.getKey()} not defined"
}
log.info ""

include { fetch_gene_symbol_lookup;
          list_assemblies;
          download_pangenomes_data } from './modules/download.nf'
include { create_latest_annotation;
          filter_Phenotypes_gene_annotation;
          tabix_plugin_annotation;
          create_pangenomes_annotation } from './modules/annotation.nf'
include { tabix_gtf; decompress_fasta } from './modules/utils.nf'
include { test_annotation } from './modules/test.nf'

workflow create_go_annotations {
    // Build the GO plugin annotation for each pangenome assembly,
    // index it and smoke-test it with VEP.
    take:
        data
    main:
        // GO annotation for the latest assembly, generated via VEP
        reference = create_latest_annotation('GO', params.version, params.species,
                                             params.user, params.host, params.port)
        // Project the reference GO annotation onto each pangenome assembly;
        // '/' stands in for the unused gene-symbol lookup table
        annotated = create_pangenomes_annotation('GO', params.version, data,
                                                 reference.file, '/')
        // Compress + index the annotation, then prepare FASTA and GTF for testing
        indexed = tabix_plugin_annotation(annotated)
        with_fasta = decompress_fasta(indexed)
        ready = tabix_gtf(with_fasta)
        test_annotation('GO', ready)
}

workflow create_pheno_annotations {
    // Build the Phenotypes plugin annotation for each pangenome assembly,
    // index it and smoke-test it with VEP.
    take:
        data
    main:
        // Phenotypes annotation for the latest assembly, generated via VEP
        reference = create_latest_annotation('Phenotypes', params.version, params.species,
                                             params.user, params.host, params.port)
        // Keep only gene-level phenotype records
        gene_only = filter_Phenotypes_gene_annotation(reference.file)
        // Lookup table translating Ensembl gene IDs to gene symbols
        lookup = fetch_gene_symbol_lookup()
        annotated = create_pangenomes_annotation('Phenotypes', params.version,
                                                 data, gene_only, lookup.file)
        // Compress + index the annotation, then prepare FASTA and GTF for testing
        indexed = tabix_plugin_annotation(annotated)
        with_fasta = decompress_fasta(indexed)
        ready = tabix_gtf(with_fasta)
        test_annotation('Phenotypes', ready)
}

workflow {
    // List every assembly available at the FTP URL and download its data
    assembly_names = list_assemblies(params.url).splitCsv().flatten()
    assembly_data = download_pangenomes_data(params.url, assembly_names).transpose()
    // Build both plugin annotations for each downloaded assembly
    create_go_annotations(assembly_data)
    create_pheno_annotations(assembly_data)
}
78 changes: 78 additions & 0 deletions nextflow/pangenomes/modules/annotation.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
process create_latest_annotation {
    // Use VEP to create Phenotypes or GO annotation for latest assembly.
    //
    // VEP is run without input, so the "Cannot detect format from STDIN"
    // error is expected and swallowed by the trailing `|| echo`; the plugin
    // presumably writes its annotation file (*.gz + *.gz.tbi) into the
    // work directory as a side effect -- TODO confirm.

    // FIX: directives must appear before input/output declarations;
    // afterScript was previously placed between the output and script blocks
    afterScript 'rm *.tmp'

    input:
        val plugin
        val version
        val species
        val user
        val host
        val port
    output:
        path "*.gz", emit: file
        path "*.gz.tbi", emit: tbi

    script:
    // GO matches annotation by gene symbol; Phenotypes restricts to gene records
    def plugin_opts = plugin == 'GO' ?
                      'GO,match=gene_symbol' :
                      'Phenotypes,dir=.,include_types=Gene'
    """
    ${params.vep} --database --db_version ${version} \
    --species $species \
    --user ${user} --host ${host} --port ${port} \
    --plugin ${plugin_opts} \
    || echo "Avoid VEP error: Cannot detect format from STDIN"
    """
}

process filter_Phenotypes_gene_annotation {
    // Filter Phenotypes annotation to only contain Gene data

    input:
        // gzipped Phenotypes annotation (GVF) from create_latest_annotation
        path annotation
    output:
        // decompressed, gene-only copy (input name minus the .gz suffix)
        path annotation.baseName, emit: file

    // keep only records whose third column (feature type) is "Gene";
    // -F" " (a single space) is awk's special case for default
    // whitespace splitting, so tab-delimited input is handled fine
    """
    zcat ${annotation} | awk -F" " '\$3 == "Gene"' > ${annotation.baseName}
    """
}

process create_pangenomes_annotation {
    // Create the assembly-specific plugin annotation (GO or Phenotypes) by
    // matching genes between the assembly GTF and the reference annotation
    // using the bundled create_pangenomes_annotation.py script
    container 'docker://biocontainers/pandas:1.5.1_cv1'
    tag "${gtf.baseName}"

    input:
        val plugin
        val version
        // per-assembly gene GTF and genome FASTA
        tuple path(gtf), path(fasta)
        // reference GO (GFF) or Phenotypes (GVF) annotation
        path annotation
        // gene symbol <-> Ensembl ID lookup table; NOTE(review): the sentinel
        // check below expects a file literally named 'null', but the GO
        // workflow passes '/' as the placeholder -- confirm it is skipped
        path gene_symbols

    output:
        // pass through inputs plus the newly created *.gff/*.gvf file
        tuple path(gtf), path(fasta), path('*.g*f')

    script:
    // select the script's annotation mode and optional lookup table
    def opts = (plugin == 'GO' ? '--go' : '--pheno') + " ${annotation}"
    def lookup = (gene_symbols.name == 'null') ? '' : "--gene_symbols ${gene_symbols}"
    """
    create_pangenomes_annotation.py --version ${version} --gtf ${gtf} ${opts} ${lookup}
    """
}

process tabix_plugin_annotation {
    // Compress (bgzip) and index (tabix) the plugin annotation and publish
    // the compressed file and its index to the output directory
    tag "${annotation.baseName}"
    publishDir "${params.outdir}", mode: 'copy', pattern: '*plugin*.gz*'

    input:
        tuple path(gtf), path(fasta), path(annotation)
    output:
        tuple path(gtf), path(fasta), path('*.gz'), path('*.gz.tbi')

    // NOTE(review): tabix is invoked without an explicit preset (-p gff);
    // confirm the file format is correctly inferred here
    """
    bgzip ${annotation}
    tabix ${annotation}.gz
    """
}
48 changes: 48 additions & 0 deletions nextflow/pangenomes/modules/download.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
process fetch_gene_symbol_lookup {
    // Download HGNC gene symbol lookup table with respective Ensembl identifiers

    output:
        // two-column, tab-delimited file: columns 2 and 20 of the HGNC table
        // (presumably gene symbol and Ensembl gene ID -- per HGNC file layout)
        path 'gene_symbol_table.txt', emit: file

    // keep rows with a non-empty column 20, then drop the first remaining
    // line (the header, which passes the filter because its field is set)
    """
    # Download HGNC gene symbol table
    wget https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt
    awk -F"\t" '{if (\$20) print \$2"\t"\$20}' hgnc_complete_set.txt | awk 'NR > 1' > gene_symbol_table.txt
    """
}

process list_assemblies {
    // List available assemblies from URL (requires FTP protocol)
    input:
        val url
    output:
        // one assembly directory name per line (parsed by the caller
        // with splitCsv().flatten())
        stdout

    script:
    // trailing slash makes curl -l list the FTP directory contents
    def link = url + "/"
    """
    curl -l ${link}
    """
}

process download_pangenomes_data {
    // Download pangenomes data: the gene GTF and the unmasked genome FASTA
    // for one assembly directory on the FTP site.
    //
    // FIX: stray GitHub review-UI text ("... marked this conversation as
    // resolved." / "Show resolved Hide resolved") was embedded inside the
    // script block and would break the process; removed.
    tag "${assembly}"

    input:
        val url
        val assembly
    output:
        tuple path('*.gtf.gz'), path('*.fa.gz')

    script:
    def link = url + "/" + assembly + "/"
    """
    wget -A "*genes.gtf.gz" --no-parent -r -nd ${link}
    wget -A "*unmasked.fa.gz" --no-parent -r -nd ${link}

    # remove older GTF files
    ls -t *.gtf.gz | tail -n +2 | xargs -r rm --
    """
}

29 changes: 29 additions & 0 deletions nextflow/pangenomes/modules/test.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
process test_annotation {
    // Test annotation using the proper GTF and FASTA for the assembly
    tag "${annotation.baseName}"

    input:
        val plugin
        // indexed GTF, FASTA, and the bgzipped+indexed plugin annotation
        tuple path(gtf), path(gtf_tbi), path(fasta), path(annotation), path(annotation_tbi)

    // no output declaration: this process only validates; it fails (exit 1)
    // when VEP produces no result lines mentioning the plugin name
    """
    # create random examples based on GTF exons
    zgrep exon ${gtf} | \\
    awk '\$3 == "exon"' | \\
    head -n 10000 | \\
    shuf -n 100 | \\
    awk '{print \$1, \$4, \$4, "C/T", "+"}' > input.txt

    ${params.vep} -i input.txt \\
    -o vep.out \\
    --fasta ${fasta} --gtf ${gtf} \\
    --plugin ${plugin},file=${annotation}

    # count number of lines with plugin annotation
    count=\$(grep -v "^#" vep.out | grep -c -i ${plugin})
    if [ \${count} -eq 0 ]; then
    echo 'No results found with ${plugin} annotation'
    exit 1
    fi
    """
}
27 changes: 27 additions & 0 deletions nextflow/pangenomes/modules/utils.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
process decompress_fasta {
    // Decompress the gzipped genome FASTA to a plain .fa file, passing the
    // other tuple elements through unchanged
    tag "${fasta.baseName}"

    input:
        tuple path(gtf), path(fasta), path(annotation), path(annotation_tbi)
    output:
        tuple path(gtf), path("*.fa"), path(annotation), path(annotation_tbi)

    """
    gunzip -c ${fasta} > file.fa
    """
}

process tabix_gtf {
    // Sort the GTF by chromosome and coordinates, bgzip-compress it and
    // build a tabix index (downstream VEP --gtf usage requires this)
    tag "${gtf.baseName}"

    input:
        tuple path(gtf), path(fasta), path(annotation), path(annotation_tbi)
    output:
        tuple path("*.gtf.gz"), path("*.gtf.gz.tbi"), path(fasta), path(annotation), path(annotation_tbi)

    // NOTE(review): tabix is invoked without -p gff; confirm the GTF format
    // is correctly inferred without an explicit preset
    """
    gunzip -c ${gtf} > file.gtf
    grep -v "#" file.gtf | sort -k1,1 -k4,4n -k5,5n -t '\t' | bgzip -c > file.gtf.gz
    tabix file.gtf.gz
    """
}
43 changes: 43 additions & 0 deletions nextflow/pangenomes/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// Pipeline configuration.
//
// FIX: stray GitHub review-UI text ("... marked this conversation as
// resolved." / "Show resolved Hide resolved") was embedded inside the
// process scope and would break this config file; removed.

// Executor profiles: select with -profile lsf|slurm
profiles {
    lsf {
        process.executor = 'lsf'
    }

    slurm {
        process.executor = 'slurm'
    }
}

process {
    // ignore failed tasks (e.g. one problematic assembly) instead of
    // aborting the whole pipeline run
    errorStrategy = 'ignore'
}

// run process containers through Singularity
singularity {
    enabled = true
    autoMounts = true
}

// execution reports, all written under reports/
trace {
    enabled = true
    overwrite = true
    file = "reports/trace.txt"
    //fields = 'task_id,name,status,exit,realtime,%cpu,rss'
}

dag {
    enabled = true
    overwrite = true
    file = "reports/flowchart.html"
}

timeline {
    enabled = true
    overwrite = true
    file = "reports/timeline.html"
}

report {
    enabled = true
    overwrite = true
    file = "reports/report.html"
}