Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Nextflow pipeline to create GO/Phenotypes annotations for pangenomes #1070

Merged
merged 7 commits into from
Mar 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 97 additions & 0 deletions nextflow/pangenomes/bin/create_pangenomes_annotation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env python3
"""Create assembly-specific GO or Phenotypes annotation for VEP plugins.

Matches genes between an assembly-specific GTF and a reference (latest
assembly) GO/Phenotypes annotation using gene symbols, then writes a new
plugin-ready GFF (GO) or GVF (Phenotypes) file.
"""
import argparse
import csv
import os
import re

import pandas as pd

# Standard 9 columns shared by GTF/GFF/GVF files
GFF_COLUMNS = ['chr', 'source', 'feature', 'start', 'end',
               'score', 'strand', 'frame', 'attribute']


def parse_args(argv=None):
    """Parse command-line arguments; --go and --pheno are mutually exclusive.

    :param argv: argument list (defaults to sys.argv for CLI use)
    :return: argparse.Namespace with version, gtf, outdir, gene_symbols,
             go and pheno attributes
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--version', help="Release version", required=True)
    parser.add_argument('--gtf', help="Assembly-specific GFF/GTF", required=True)
    parser.add_argument('--outdir', default=".", help="Output directory")
    parser.add_argument('--gene_symbols', default=None,
                        help="Lookup table with two columns (gene symbols and Ensembl identifiers) ")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--go', help="GO terms annotation")
    group.add_argument('--pheno', help="Phenotypes annotation")
    return parser.parse_args(argv)


def build_output_filename(gtf, version, plugin, ext):
    """Derive the output file name from the GTF file name.

    Example: ``Homo_sapiens-GCA_018466855.1-2022_07-genes.gtf.gz`` becomes
    ``homo_sapiens_gca018466855v1_<version>_VEP_<plugin>_plugin.<ext>``.

    :param gtf: path to the assembly GTF (only its basename is used)
    :param version: release version to embed in the name
    :param plugin: plugin label ('GO' or 'phenotypes')
    :param ext: output extension ('gff' or 'gvf')
    :return: output file name (no directory component)
    """
    return re.sub(r'(.*)-gca_(\d+)\.(\d+).*',
                  f'\\1_gca\\2v\\3_{version}_VEP_{plugin}_plugin.{ext}',
                  os.path.basename(gtf.lower()))


def read_annotation(path):
    """Read a tab-delimited, 9-column annotation file (GTF/GFF/GVF).

    All columns are read as strings; '#' lines are treated as comments.
    """
    return pd.read_csv(path, delimiter="\t", comment="#", header=None,
                       names=GFF_COLUMNS, dtype=str)


def main():
    """Entry point: build and write the pangenome plugin annotation."""
    args = parse_args()

    if args.go:
        plugin = 'GO'
        annot = args.go
        ext = 'gff'
        feat = 'transcript'
    else:
        plugin = 'phenotypes'
        annot = args.pheno
        ext = 'gvf'
        feat = 'gene'

    output = build_output_filename(args.gtf, args.version, plugin, ext)
    # exist_ok avoids a race if the directory is created concurrently
    os.makedirs(args.outdir, exist_ok=True)
    output = os.path.join(args.outdir, output)

    # read assembly annotation and extract identifiers from the attribute column
    print(f"Preparing assembly annotation from {args.gtf}...", flush=True)
    annot_pd = read_annotation(args.gtf)
    annot_pd = annot_pd[annot_pd['feature'].str.contains(feat)]
    annot_pd = annot_pd.assign(gene=annot_pd['attribute'].str.extract(r'gene_id "(.*?)";'))
    annot_pd = annot_pd.assign(gene_symbol=annot_pd['attribute'].str.extract(r'gene_name "(.*?)";'))
    annot_pd = annot_pd.assign(transcript=annot_pd['attribute'].str.extract(r'transcript_id "(.*?)";'))

    # read GO terms or Phenotypes annotation
    print(f"Preparing {plugin} annotation from {annot}...", flush=True)
    ref_pd = read_annotation(annot)
    if args.go:
        # GO GFF encodes the gene symbol in the ID attribute
        ref_pd = ref_pd.assign(gene_symbol=ref_pd['attribute'].str.extract(r'ID=(.*?);'))
    else:
        # Phenotypes GVF encodes the Ensembl gene ID in the id attribute
        ref_pd = ref_pd.assign(gene=ref_pd['attribute'].str.extract(r'id=(.*?);'))

    # convert coordinates to numeric so sorting is positional, not lexical
    for frame in (annot_pd, ref_pd):
        frame['start'] = frame['start'].astype(int)
        frame['end'] = frame['end'].astype(int)

    if args.gene_symbols is not None:
        # join with lookup table to translate Ensembl gene IDs to gene symbols
        print(f"Preparing lookup table from {args.gene_symbols}...", flush=True)
        symbols = pd.read_csv(args.gene_symbols, delimiter="\t",
                              names=['gene_symbol', 'gene'])
        ref_pd = pd.merge(ref_pd, symbols, on='gene', how='inner')

    # join annotations based on gene symbols
    print("Joining annotation...", flush=True)
    joint = pd.merge(annot_pd, ref_pd, on='gene_symbol', how='inner')

    if args.go:
        # prepare new assembly-specific GO annotations with backwards
        # compatibility (i.e., transcript-based)
        joint = joint.assign(go_terms=joint['attribute_y'].str.extract(r';(Ontology_term=.*)'))
        joint = joint.assign(new_attribute="ID=" + joint['transcript'] + ';' + joint['go_terms'])
    else:
        # prepare new assembly-specific Phenotypes annotations (gene-based)
        joint = joint.assign(phenotype=joint['attribute_y'].str.extract(r'; (phenotype=.*)'))
        joint = joint.assign(new_attribute="id=" + joint['gene_x'] + '; ' + joint['phenotype'])

    # keep assembly coordinates (_x) with the reference source (_y), then
    # sort by genomic position as required downstream (e.g. by tabix)
    new_gtf = joint[['chr_x', 'source_y', 'feature_x', 'start_x', 'end_x',
                     'score_x', 'strand_x', 'frame_x', 'new_attribute']]
    new_gtf = new_gtf.sort_values(by=['chr_x', 'start_x', 'end_x'])

    if new_gtf.empty:
        raise Exception(f"ERROR: new pangenomes {plugin} annotation is empty (maybe no genes matched between annotations?)")

    # write to file; context manager guarantees the handle is closed
    print(f"Writing new {plugin} annotation to {output}...", flush=True)
    with open(output, 'w') as out:
        if args.go:
            out.write('##gff-version 1.10\n')
        new_gtf.to_csv(out, sep="\t", header=False, index=False,
                       quoting=csv.QUOTE_NONE)
    print("Done!", flush=True)


if __name__ == '__main__':
    main()
66 changes: 66 additions & 0 deletions nextflow/pangenomes/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl=2

// Default pipeline parameters; every parameter must be non-null
// (validated in the loop below)
params.url = 'ftp://ftp.ensembl.org/pub/rapid-release/species/Homo_sapiens/'
params.outdir = 'outdir'
// Ensembl software root, taken from the ENSEMBL_ROOT_DIR environment variable
params.sw = "${ENSEMBL_ROOT_DIR}"
params.vep = "${params.sw}/ensembl-vep/vep"

// Ensembl release version and species used by VEP in --database mode
params.version = null
params.species = "homo_sapiens"

// Database connection details for VEP --database mode (must be supplied
// on the command line; they default to null and fail validation otherwise)
params.user = null
params.host = null
params.port = null

log.info "\nCreate GO and Phenotype annotations for pangenomes"
log.info "=================================================="
for (a in params) {
    // print param
    log.info " ${a.getKey().padRight(8)} : ${a.getValue()}"
    // raise error if param is null
    if (!a.getValue()) exit 1, "ERROR: parameter --${a.getKey()} not defined"
}
log.info ""

include { fetch_gene_symbol_lookup;
          list_assemblies;
          download_pangenomes_data } from './modules/download.nf'
include { create_latest_annotation;
          filter_Phenotypes_gene_annotation;
          tabix_plugin_annotation;
          create_pangenomes_annotation } from './modules/annotation.nf'
include { tabix_gtf; decompress_fasta } from './modules/utils.nf'
include { test_annotation } from './modules/test.nf'

workflow create_go_annotations {
    // Build the GO plugin annotation for each pangenome assembly,
    // index it and smoke-test it with VEP.
    take:
        data
    main:
        // GO annotation for the latest assembly, generated via VEP
        reference = create_latest_annotation('GO', params.version, params.species,
                                             params.user, params.host, params.port)
        // Project the reference GO annotation onto each pangenome assembly;
        // '/' stands in for the unused gene-symbol lookup table
        annotated = create_pangenomes_annotation('GO', params.version, data,
                                                 reference.file, '/')
        // Compress + index the annotation, then prepare FASTA and GTF for testing
        indexed = tabix_plugin_annotation(annotated)
        with_fasta = decompress_fasta(indexed)
        ready = tabix_gtf(with_fasta)
        test_annotation('GO', ready)
}

workflow create_pheno_annotations {
    // Build the Phenotypes plugin annotation for each pangenome assembly,
    // index it and smoke-test it with VEP.
    take:
        data
    main:
        // Phenotypes annotation for the latest assembly, generated via VEP
        reference = create_latest_annotation('Phenotypes', params.version, params.species,
                                             params.user, params.host, params.port)
        // Keep only gene-level phenotype records
        gene_only = filter_Phenotypes_gene_annotation(reference.file)
        // Lookup table translating Ensembl gene IDs to gene symbols
        lookup = fetch_gene_symbol_lookup()
        annotated = create_pangenomes_annotation('Phenotypes', params.version,
                                                 data, gene_only, lookup.file)
        // Compress + index the annotation, then prepare FASTA and GTF for testing
        indexed = tabix_plugin_annotation(annotated)
        with_fasta = decompress_fasta(indexed)
        ready = tabix_gtf(with_fasta)
        test_annotation('Phenotypes', ready)
}

workflow {
    // List every assembly available at the FTP URL and download its data
    assembly_names = list_assemblies(params.url).splitCsv().flatten()
    assembly_data = download_pangenomes_data(params.url, assembly_names).transpose()
    // Build both plugin annotations for each downloaded assembly
    create_go_annotations(assembly_data)
    create_pheno_annotations(assembly_data)
}
78 changes: 78 additions & 0 deletions nextflow/pangenomes/modules/annotation.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
process create_latest_annotation {
    // Use VEP to create Phenotypes or GO annotation for latest assembly.
    //
    // VEP is run without input, so the "Cannot detect format from STDIN"
    // error is expected and swallowed by the trailing `|| echo`; the plugin
    // presumably writes its annotation file (*.gz + *.gz.tbi) into the
    // work directory as a side effect -- TODO confirm.

    // FIX: directives must appear before input/output declarations;
    // afterScript was previously placed between the output and script blocks
    afterScript 'rm *.tmp'

    input:
        val plugin
        val version
        val species
        val user
        val host
        val port
    output:
        path "*.gz", emit: file
        path "*.gz.tbi", emit: tbi

    script:
    // GO matches annotation by gene symbol; Phenotypes restricts to gene records
    def plugin_opts = plugin == 'GO' ?
                      'GO,match=gene_symbol' :
                      'Phenotypes,dir=.,include_types=Gene'
    """
    ${params.vep} --database --db_version ${version} \
    --species $species \
    --user ${user} --host ${host} --port ${port} \
    --plugin ${plugin_opts} \
    || echo "Avoid VEP error: Cannot detect format from STDIN"
    """
}

process filter_Phenotypes_gene_annotation {
    // Filter Phenotypes annotation to only contain Gene data

    input:
        // gzipped Phenotypes annotation (GVF) from create_latest_annotation
        path annotation
    output:
        // decompressed, gene-only copy (input name minus the .gz suffix)
        path annotation.baseName, emit: file

    // keep only records whose third column (feature type) is "Gene";
    // -F" " (a single space) is awk's special case for default
    // whitespace splitting, so tab-delimited input is handled fine
    """
    zcat ${annotation} | awk -F" " '\$3 == "Gene"' > ${annotation.baseName}
    """
}

process create_pangenomes_annotation {
    // Create the assembly-specific plugin annotation (GO or Phenotypes) by
    // matching genes between the assembly GTF and the reference annotation
    // using the bundled create_pangenomes_annotation.py script
    container 'docker://biocontainers/pandas:1.5.1_cv1'
    tag "${gtf.baseName}"

    input:
        val plugin
        val version
        // per-assembly gene GTF and genome FASTA
        tuple path(gtf), path(fasta)
        // reference GO (GFF) or Phenotypes (GVF) annotation
        path annotation
        // gene symbol <-> Ensembl ID lookup table; NOTE(review): the sentinel
        // check below expects a file literally named 'null', but the GO
        // workflow passes '/' as the placeholder -- confirm it is skipped
        path gene_symbols

    output:
        // pass through inputs plus the newly created *.gff/*.gvf file
        tuple path(gtf), path(fasta), path('*.g*f')

    script:
    // select the script's annotation mode and optional lookup table
    def opts = (plugin == 'GO' ? '--go' : '--pheno') + " ${annotation}"
    def lookup = (gene_symbols.name == 'null') ? '' : "--gene_symbols ${gene_symbols}"
    """
    create_pangenomes_annotation.py --version ${version} --gtf ${gtf} ${opts} ${lookup}
    """
}

process tabix_plugin_annotation {
    // Compress (bgzip) and index (tabix) the plugin annotation and publish
    // the compressed file and its index to the output directory
    tag "${annotation.baseName}"
    publishDir "${params.outdir}", mode: 'copy', pattern: '*plugin*.gz*'

    input:
        tuple path(gtf), path(fasta), path(annotation)
    output:
        tuple path(gtf), path(fasta), path('*.gz'), path('*.gz.tbi')

    // NOTE(review): tabix is invoked without an explicit preset (-p gff);
    // confirm the file format is correctly inferred here
    """
    bgzip ${annotation}
    tabix ${annotation}.gz
    """
}
48 changes: 48 additions & 0 deletions nextflow/pangenomes/modules/download.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
process fetch_gene_symbol_lookup {
    // Download HGNC gene symbol lookup table with respective Ensembl identifiers

    output:
        // two-column, tab-delimited file: columns 2 and 20 of the HGNC table
        // (presumably gene symbol and Ensembl gene ID -- per HGNC file layout)
        path 'gene_symbol_table.txt', emit: file

    // keep rows with a non-empty column 20, then drop the first remaining
    // line (the header, which passes the filter because its field is set)
    """
    # Download HGNC gene symbol table
    wget https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt
    awk -F"\t" '{if (\$20) print \$2"\t"\$20}' hgnc_complete_set.txt | awk 'NR > 1' > gene_symbol_table.txt
    """
}

process list_assemblies {
    // List available assemblies from URL (requires FTP protocol)
    input:
        val url
    output:
        // one assembly directory name per line (parsed by the caller
        // with splitCsv().flatten())
        stdout

    script:
    // trailing slash makes curl -l list the FTP directory contents
    def link = url + "/"
    """
    curl -l ${link}
    """
}

process download_pangenomes_data {
    // Download pangenomes data: the gene GTF and the unmasked genome FASTA
    // for one assembly directory on the FTP site.
    //
    // FIX: stray GitHub review-UI text ("... marked this conversation as
    // resolved." / "Show resolved Hide resolved") was embedded inside the
    // script block and would break the process; removed.
    tag "${assembly}"

    input:
        val url
        val assembly
    output:
        tuple path('*.gtf.gz'), path('*.fa.gz')

    script:
    def link = url + "/" + assembly + "/"
    """
    wget -A "*genes.gtf.gz" --no-parent -r -nd ${link}
    wget -A "*unmasked.fa.gz" --no-parent -r -nd ${link}

    # remove older GTF files
    ls -t *.gtf.gz | tail -n +2 | xargs -r rm --
    """
}

29 changes: 29 additions & 0 deletions nextflow/pangenomes/modules/test.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
process test_annotation {
    // Test annotation using the proper GTF and FASTA for the assembly
    tag "${annotation.baseName}"

    input:
        val plugin
        // indexed GTF, FASTA, and the bgzipped+indexed plugin annotation
        tuple path(gtf), path(gtf_tbi), path(fasta), path(annotation), path(annotation_tbi)

    // no output declaration: this process only validates; it fails (exit 1)
    // when VEP produces no result lines mentioning the plugin name
    """
    # create random examples based on GTF exons
    zgrep exon ${gtf} | \\
    awk '\$3 == "exon"' | \\
    head -n 10000 | \\
    shuf -n 100 | \\
    awk '{print \$1, \$4, \$4, "C/T", "+"}' > input.txt

    ${params.vep} -i input.txt \\
    -o vep.out \\
    --fasta ${fasta} --gtf ${gtf} \\
    --plugin ${plugin},file=${annotation}

    # count number of lines with plugin annotation
    count=\$(grep -v "^#" vep.out | grep -c -i ${plugin})
    if [ \${count} -eq 0 ]; then
    echo 'No results found with ${plugin} annotation'
    exit 1
    fi
    """
}
27 changes: 27 additions & 0 deletions nextflow/pangenomes/modules/utils.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
process decompress_fasta {
    // Decompress the gzipped genome FASTA to a plain .fa file, passing the
    // other tuple elements through unchanged
    tag "${fasta.baseName}"

    input:
        tuple path(gtf), path(fasta), path(annotation), path(annotation_tbi)
    output:
        tuple path(gtf), path("*.fa"), path(annotation), path(annotation_tbi)

    """
    gunzip -c ${fasta} > file.fa
    """
}

process tabix_gtf {
    // Sort the GTF by chromosome and coordinates, bgzip-compress it and
    // build a tabix index (downstream VEP --gtf usage requires this)
    tag "${gtf.baseName}"

    input:
        tuple path(gtf), path(fasta), path(annotation), path(annotation_tbi)
    output:
        tuple path("*.gtf.gz"), path("*.gtf.gz.tbi"), path(fasta), path(annotation), path(annotation_tbi)

    // NOTE(review): tabix is invoked without -p gff; confirm the GTF format
    // is correctly inferred without an explicit preset
    """
    gunzip -c ${gtf} > file.gtf
    grep -v "#" file.gtf | sort -k1,1 -k4,4n -k5,5n -t '\t' | bgzip -c > file.gtf.gz
    tabix file.gtf.gz
    """
}
43 changes: 43 additions & 0 deletions nextflow/pangenomes/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// Pipeline configuration.
//
// FIX: stray GitHub review-UI text ("... marked this conversation as
// resolved." / "Show resolved Hide resolved") was embedded inside the
// process scope and would break this config file; removed.

// Executor profiles: select with -profile lsf|slurm
profiles {
    lsf {
        process.executor = 'lsf'
    }

    slurm {
        process.executor = 'slurm'
    }
}

process {
    // ignore failed tasks (e.g. one problematic assembly) instead of
    // aborting the whole pipeline run
    errorStrategy = 'ignore'
}

// run process containers through Singularity
singularity {
    enabled = true
    autoMounts = true
}

// execution reports, all written under reports/
trace {
    enabled = true
    overwrite = true
    file = "reports/trace.txt"
    //fields = 'task_id,name,status,exit,realtime,%cpu,rss'
}

dag {
    enabled = true
    overwrite = true
    file = "reports/flowchart.html"
}

timeline {
    enabled = true
    overwrite = true
    file = "reports/timeline.html"
}

report {
    enabled = true
    overwrite = true
    file = "reports/report.html"
}