Skip to content

Commit

Permalink
Merge pull request #86 from Clinical-Genomics/genes_to_panel
Browse files Browse the repository at this point in the history
Download genes for a genome build from Ensembl and save them to database
  • Loading branch information
northwestwitch authored Oct 12, 2020
2 parents fd5dde3 + 10d33aa commit 14cb1d6
Show file tree
Hide file tree
Showing 8 changed files with 273 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
- Save BND variants to database (introduced additional mateName key/values)
- Query BND variants using mateName
- Documentation using `MkDocs`.
- Populate database with genes downloaded from Ensembl Biomart


## [1.1] - 2020.06.17
Expand Down
2 changes: 2 additions & 0 deletions cgbeacon2/cli/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from cgbeacon2.server import create_app
from .add import add
from .delete import delete
from .update import update


@click.version_option(__version__)
Expand All @@ -24,3 +25,4 @@ def cli(**_):

cli.add_command(add)
cli.add_command(delete)
cli.add_command(update)
35 changes: 35 additions & 0 deletions cgbeacon2/cli/update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import click
from flask.cli import with_appcontext
from cgbeacon2.utils.ensembl_biomart import EnsemblBiomartClient
from cgbeacon2.utils.update import update_genes


@click.group()
def update():
"""Update items in the database using the cli"""
pass


@update.command()
@with_appcontext
@click.option(
"-build",
type=click.Choice(["GRCh37", "GRCh38"]),
nargs=1,
help="Genome assembly (default:GRCh37)",
default="GRCh37",
)
def genes(build):
"""Update genes and gene coordinates in database"""

click.echo(f"Collecting gene names from Ensembl, genome build -> {build}")
client = EnsemblBiomartClient(build)
gene_lines = client.query_service()
# If gene query was not successful, exit command
if gene_lines is None:
return

n_inserted = update_genes(gene_lines, build)
click.echo(f"Number of inserted genes for build {build}: {len(n_inserted)}")
18 changes: 18 additions & 0 deletions cgbeacon2/utils/delete.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,24 @@
LOG = logging.getLogger(__name__)


def delete_genes(collection, build="GRCh37"):
"""Delete all genes from gene database collection
Accepts:
build(str): GRCh37 or GRCh38
Returns:
result.deleted(int): number of deleted documents
"""
query = {"build": build}
try:
result = collection.delete_many(query)
except Exception as err:
LOG.error(err)
return
return result.deleted_count


def delete_dataset(database, id):
"""Delete a dataset from dataset collection
Expand Down
116 changes: 116 additions & 0 deletions cgbeacon2/utils/ensembl_biomart.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
"""Code for downloading all genes with coordinates from Ensembl Biomart"""
import logging
import requests

BIOMART_37 = "http://grch37.ensembl.org/biomart/martservice?query="
BIOMART_38 = "http://ensembl.org/biomart/martservice?query="
CHROMOSOMES = [str(num) for num in range(1, 23)] + ["X", "Y", "MT"]
ATTRIBUTES = [
"chromosome_name",
"start_position",
"end_position",
"ensembl_gene_id",
"hgnc_symbol",
"hgnc_id",
]

LOG = logging.getLogger(__name__)


class EnsemblBiomartClient:
"""Class to handle requests to the ensembl biomart api"""

def __init__(self, build="GRCh37"):
"""Initialise a ensembl biomart client"""
self.server = BIOMART_37
if build == "GRCh38":
self.server = BIOMART_38
self.filters = {"chromosome_name": CHROMOSOMES}
self.attributes = [
"ensembl_gene_id",
"hgnc_id",
"hgnc_symbol",
"chromosome_name",
"start_position",
"end_position",
]
self.xml = self._create_biomart_xml()
self.header = True

def _create_biomart_xml(self):
"""Convert biomart query params into biomart xml query
Accepts:
filters(dict): keys are filter names and values are filter values
attributes(list): a list of attributes
Returns:
xml: a query xml file
"""
filter_lines = self._xml_filters()
attribute_lines = self._xml_attributes()
xml_lines = [
'<?xml version="1.0" encoding="UTF-8"?>',
"<!DOCTYPE Query>",
'<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows'
' = "0" count = "" datasetConfigVersion = "0.6" completionStamp = "1">',
"",
'\t<Dataset name = "hsapiens_gene_ensembl" interface = "default" >',
]
for line in filter_lines:
xml_lines.append("\t\t" + line)
for line in attribute_lines:
xml_lines.append("\t\t" + line)
xml_lines += ["\t</Dataset>", "</Query>"]

return "\n".join(xml_lines)

def _xml_filters(self):
"""Creates a filter line for the biomart xml document
Returns:
formatted_lines(list[str]): List of formatted xml filter lines
"""
formatted_lines = []
for filter_name in self.filters:
value = self.filters[filter_name]
if isinstance(value, str):
formatted_lines.append(
'<Filter name = "{0}" value = "{1}"/>'.format(filter_name, value)
)
else:
formatted_lines.append(
'<Filter name = "{0}" value = "{1}"/>'.format(filter_name, ",".join(value))
)

return formatted_lines

def _xml_attributes(self):
"""Creates an attribute line for the biomart xml document
Returns:
formatted_lines(list(str)): list of formatted xml attribute lines
"""
formatted_lines = []
for attr in self.attributes:
formatted_lines.append('<Attribute name = "{}" />'.format(attr))
return formatted_lines

def query_service(self):
"""Query the Ensembl biomart service and yield the resulting lines
Accepts:
xml(str): an xml formatted query, as described here:
https://grch37.ensembl.org/info/data/biomart/biomart_perl_api.html
Yields:
biomartline
"""
url = "".join([self.server, self.xml])
try:
with requests.get(url, stream=True) as r:
for line in r.iter_lines():
yield line.decode("utf-8")
except Exception as ex:
LOG.info("Error downloading data from biomart: {}".format(ex))
return
51 changes: 48 additions & 3 deletions cgbeacon2/utils/update.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,57 @@
# -*- coding: utf-8 -*-
from flask.cli import current_app
import datetime
import logging
from .delete import delete_genes

LOG = logging.getLogger(__name__)


def update_genes(gene_lines, build="GRCh37"):
"""Update database genes using Ensembl Biomart service
Accepts:
gene_lines(types.GeneratorType)
Returns:
inserted_ids(int): number of genes inserted
"""
gene_collection = current_app.db["gene"]
delete_genes(gene_collection, build)
gene_objects = []
for line in gene_lines:
hgnc_id = None
hgnc_symbol = None
parsed_line = line.rstrip().split("\t")

if len(parsed_line) != 6:
continue # it's probably the last line (success message)

# No HGNC ID, do not insert gene into database
if parsed_line[1] == "":
continue
if "HGNC:" in parsed_line[1]:
parsed_line[1] = parsed_line[1].split(":")[1]
hgnc_id = int(parsed_line[1])
if parsed_line[2] != "":
hgnc_symbol = parsed_line[2]

gene_obj = dict(
ensembl_id=parsed_line[0],
hgnc_id=hgnc_id,
symbol=hgnc_symbol,
build=build,
chromosome=parsed_line[3],
start=int(parsed_line[4]),
end=int(parsed_line[5]),
)
# gene_collection.insert_one(gene_obj)
gene_objects.append(gene_obj)

result = gene_collection.insert_many(gene_objects)
return result.inserted_ids


def update_event(database, dataset_id, updated_collection, add):
"""Register an event corresponding to a change in the database
Expand Down Expand Up @@ -158,9 +205,7 @@ def _samples_calls(variant_collection, dataset_id, samples):
{
"$group": {
"_id": None,
"alleles": {
"$sum": f"$datasetIds.{dataset_id}.samples.{sample}.allele_count"
},
"alleles": {"$sum": f"$datasetIds.{dataset_id}.samples.{sample}.allele_count"},
}
}
]
Expand Down
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ coveralls
mongomock
pytest>4.6
pytest-cov
responses

# documentation
mkdocs
52 changes: 52 additions & 0 deletions tests/cli/test_update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
import responses # for the sake of mocking it
from cgbeacon2.cli.commands import cli
from cgbeacon2.utils.ensembl_biomart import BIOMART_38

XML_QUERY = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" completionStamp = "1">
<Dataset name = "hsapiens_gene_ensembl" interface = "default" >
<Filter name = "chromosome_name" value = "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
<Attribute name = "ensembl_gene_id" />
<Attribute name = "hgnc_id" />
<Attribute name = "hgnc_symbol" />
<Attribute name = "chromosome_name" />
<Attribute name = "start_position" />
<Attribute name = "end_position" />
</Dataset>
</Query>"""


@responses.activate
def test_update_genes_build_38(mock_app, database):
"""Test the cli command that downloads all genes for a genome build from Ensembl using Biomart"""

# GIVEN client with a xml query for a gene
build = "GRCh38"
url = "".join([BIOMART_38, XML_QUERY])

# GIVEN a mocked response from Ensembl Biomart
response = (
b"ENSG00000171314\tHGNC:8888\tPGAM1\t10\t97426191\t97433444\n"
b"ENSG00000121236\tHGNC:16277\tTRIM6\t11\t5596109\t5612958\n"
b"ENSG00000016391\tHGNC:24288\tCHDH\t3\t53812335\t53846419\n"
b"ENSG00000232218\t\t\t22\t32386668\t32386868\n"
b"[success]"
)
responses.add(responses.GET, url, body=response, status=200, stream=True)

# test add a dataset_obj using the app cli
runner = mock_app.test_cli_runner()

# When invoking the update genes command
result = runner.invoke(cli, ["update", "genes", "-build", build])

# Then the command shouldn't return error
assert result.exit_code == 0

# And 3 genes should be found on database
assert f"Number of inserted genes for build {build}: 3" in result.output
genes = list(database["gene"].find())
assert len(genes) == 3

0 comments on commit 14cb1d6

Please sign in to comment.