-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #86 from Clinical-Genomics/genes_to_panel
Download genes for a genome build from Ensembl and save them to database
- Loading branch information
Showing
8 changed files
with
273 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
import click | ||
from flask.cli import with_appcontext | ||
from cgbeacon2.utils.ensembl_biomart import EnsemblBiomartClient | ||
from cgbeacon2.utils.update import update_genes | ||
|
||
|
||
# Top-level "update" command group; sub-commands are attached with
# the @update.command() decorator.
@click.group()
def update():
    """Update items in the database using the cli"""
|
||
|
||
@update.command()
@with_appcontext
@click.option(
    "-build",
    type=click.Choice(["GRCh37", "GRCh38"]),
    nargs=1,
    help="Genome assembly (default:GRCh37)",
    default="GRCh37",
)
def genes(build):
    """Update genes and gene coordinates in database"""
    # NOTE: the docstring above is also the text click prints for --help,
    # so implementation notes live in comments instead.
    #
    # build: genome assembly name, restricted by click to "GRCh37" or "GRCh38".

    click.echo(f"Collecting gene names from Ensembl, genome build -> {build}")
    client = EnsemblBiomartClient(build)

    # query_service() is a generator function: calling it always hands back a
    # generator object (never None), and a failed download only becomes visible
    # as an empty stream once it is consumed. Materialise the lines here so a
    # failed query can be detected before update_genes() is called.
    # `or ()` keeps the command safe should the client ever return None.
    gene_lines = list(client.query_service() or ())

    # If gene query was not successful, exit command
    if not gene_lines:
        click.echo(f"No gene data retrieved from Ensembl for build {build}, genes were not updated")
        return

    n_inserted = update_genes(gene_lines, build)
    click.echo(f"Number of inserted genes for build {build}: {len(n_inserted)}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
"""Code for downloading all genes with coordinates from Ensembl Biomart""" | ||
import logging | ||
import requests | ||
|
||
# Base URLs of the Ensembl Biomart martservice endpoint, one per genome build.
# Both end in "query=" so that a query string can be concatenated directly.
BIOMART_37 = "http://grch37.ensembl.org/biomart/martservice?query="
BIOMART_38 = "http://ensembl.org/biomart/martservice?query="

# Chromosome names as strings: "1" .. "22", then "X", "Y" and "MT".
CHROMOSOMES = [*map(str, range(1, 23)), "X", "Y", "MT"]

# Biomart attribute names for gene coordinates and identifiers.
ATTRIBUTES = [
    "chromosome_name",
    "start_position",
    "end_position",
    "ensembl_gene_id",
    "hgnc_symbol",
    "hgnc_id",
]

# Module-level logger.
LOG = logging.getLogger(__name__)
|
||
|
||
class EnsemblBiomartClient:
    """Class to handle requests to the ensembl biomart api

    On construction the client picks the biomart server for the requested
    genome build and pre-builds the XML query (stored in ``self.xml``) from
    ``self.filters`` and ``self.attributes``. ``query_service`` then sends
    that query and streams the response back line by line.
    """

    def __init__(self, build="GRCh37"):
        """Initialise a ensembl biomart client

        Accepts:
            build(str): genome build. "GRCh38" selects BIOMART_38, any other
                value falls back to BIOMART_37.
        """
        self.server = BIOMART_38 if build == "GRCh38" else BIOMART_37
        self.filters = {"chromosome_name": CHROMOSOMES}
        # The order of this list is the order of the <Attribute> elements
        # written into the query.
        self.attributes = [
            "ensembl_gene_id",
            "hgnc_id",
            "hgnc_symbol",
            "chromosome_name",
            "start_position",
            "end_position",
        ]
        self.xml = self._create_biomart_xml()
        # Not read anywhere inside this class; kept for external users.
        self.header = True

    def _create_biomart_xml(self):
        """Convert biomart query params into biomart xml query

        Reads ``self.filters`` (dict: filter name -> value or list of values)
        and ``self.attributes`` (list of attribute names).

        Returns:
            xml(str): the query document, lines joined with newlines
        """
        xml_lines = [
            '<?xml version="1.0" encoding="UTF-8"?>',
            "<!DOCTYPE Query>",
            '<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows'
            ' = "0" count = "" datasetConfigVersion = "0.6" completionStamp = "1">',
            "",
            '\t<Dataset name = "hsapiens_gene_ensembl" interface = "default" >',
        ]
        # Filters first, then attributes, each nested two tabs deep in <Dataset>.
        xml_lines.extend("\t\t" + line for line in self._xml_filters())
        xml_lines.extend("\t\t" + line for line in self._xml_attributes())
        xml_lines += ["\t</Dataset>", "</Query>"]

        return "\n".join(xml_lines)

    def _xml_filters(self):
        """Creates a filter line for the biomart xml document

        Returns:
            formatted_lines(list[str]): List of formatted xml filter lines
        """
        formatted_lines = []
        for filter_name, value in self.filters.items():
            # A non-string value is treated as a collection of strings and
            # collapsed into one comma-separated value.
            if not isinstance(value, str):
                value = ",".join(value)
            formatted_lines.append(
                '<Filter name = "{0}" value = "{1}"/>'.format(filter_name, value)
            )

        return formatted_lines

    def _xml_attributes(self):
        """Creates an attribute line for the biomart xml document

        Returns:
            formatted_lines(list(str)): list of formatted xml attribute lines
        """
        return ['<Attribute name = "{}" />'.format(attr) for attr in self.attributes]

    def query_service(self):
        """Query the Ensembl biomart service and yield the resulting lines

        Takes no arguments: the request URL is ``self.server`` followed by
        the pre-built query in ``self.xml``.

        This is a generator function, so calling it never returns None. If
        the request fails (connection problem, HTTP error status, undecodable
        line) the error is logged and the generator simply stops, which for
        an early failure means it yields nothing at all.

        Yields:
            biomartline(str): one utf-8 decoded line of the response
        """
        url = "".join([self.server, self.xml])
        try:
            with requests.get(url, stream=True) as r:
                # Without this check the body of an HTTP error page would be
                # handed to the caller as if it were gene data.
                r.raise_for_status()
                for line in r.iter_lines():
                    yield line.decode("utf-8")
        except Exception as ex:
            # Best effort by design: report the failure and end the stream.
            LOG.error("Error downloading data from biomart: %s", ex)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ coveralls | |
mongomock | ||
pytest>4.6 | ||
pytest-cov | ||
responses | ||
|
||
# documentation | ||
mkdocs |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
# -*- coding: utf-8 -*- | ||
import responses # for the sake of mocking it | ||
from cgbeacon2.cli.commands import cli | ||
from cgbeacon2.utils.ensembl_biomart import BIOMART_38 | ||
|
||
XML_QUERY = """<?xml version="1.0" encoding="UTF-8"?> | ||
<!DOCTYPE Query> | ||
<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" completionStamp = "1"> | ||
<Dataset name = "hsapiens_gene_ensembl" interface = "default" > | ||
<Filter name = "chromosome_name" value = "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/> | ||
<Attribute name = "ensembl_gene_id" /> | ||
<Attribute name = "hgnc_id" /> | ||
<Attribute name = "hgnc_symbol" /> | ||
<Attribute name = "chromosome_name" /> | ||
<Attribute name = "start_position" /> | ||
<Attribute name = "end_position" /> | ||
</Dataset> | ||
</Query>""" | ||
|
||
|
||
@responses.activate
def test_update_genes_build_38(mock_app, database):
    """Test the cli command that downloads all genes for a genome build from Ensembl using Biomart"""

    # GIVEN the url the command is expected to request for build GRCh38
    build = "GRCh38"
    url = "".join([BIOMART_38, XML_QUERY])

    # GIVEN a mocked Biomart response: four gene lines (the last one has no
    # HGNC id or symbol) followed by a "[success]" line
    mocked_body = (
        b"ENSG00000171314\tHGNC:8888\tPGAM1\t10\t97426191\t97433444\n"
        b"ENSG00000121236\tHGNC:16277\tTRIM6\t11\t5596109\t5612958\n"
        b"ENSG00000016391\tHGNC:24288\tCHDH\t3\t53812335\t53846419\n"
        b"ENSG00000232218\t\t\t22\t32386668\t32386868\n"
        b"[success]"
    )
    responses.add(responses.GET, url, body=mocked_body, status=200, stream=True)

    # WHEN the update genes command is invoked through the app cli runner
    cli_runner = mock_app.test_cli_runner()
    result = cli_runner.invoke(cli, ["update", "genes", "-build", build])

    # THEN the command shouldn't return error
    assert result.exit_code == 0

    # AND the command reports 3 inserted genes
    # (presumably the line lacking HGNC info is the one left out)
    assert f"Number of inserted genes for build {build}: 3" in result.output

    # AND 3 genes should be found on database
    stored_genes = list(database["gene"].find())
    assert len(stored_genes) == 3