Skip to content

Commit

Permalink
Merge pull request #52 from moonso/annotate-vcf
Browse files Browse the repository at this point in the history
Annotate vcf
  • Loading branch information
Måns Magnusson committed Oct 31, 2018
2 parents ea7908e + 808c6ba commit 56d640c
Show file tree
Hide file tree
Showing 21 changed files with 625 additions and 44 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,12 @@
[2.2]

- Adds CLI function to annotate variants
- Adds CLI functionality to dump and restore a database

[2.1]

- Fix bug with inserting variants

[2.0]

- Adds structural variants to loqus
6 changes: 6 additions & 0 deletions docs/admin-guide/README.md
@@ -0,0 +1,6 @@
# Admin Guide

This section will include documentation for admins and developers

* [Dump database](./dump.md)
* [Restore database](./restore.md)
17 changes: 17 additions & 0 deletions docs/admin-guide/dump.md
@@ -0,0 +1,17 @@
# Dumping a database

Loqus have a command to dump a database to a zipped file.
This command just wraps the mongo command dump with some defaults.

```bash
$loqusdb dump --help
Usage: loqusdb dump [OPTIONS]

Dump the database to a zipped file.

Default filename is loqusdb.<todays date>.gz (e.g loqusdb.19971004.gz)

Options:
-f, --filename PATH If custom named file is to be used
--help Show this message and exit.
```
25 changes: 25 additions & 0 deletions docs/admin-guide/restore.md
@@ -0,0 +1,25 @@
# Restore a database

It is possible to load all information from a dumped loqusdb instance.
This command just wraps the mongo command `mongo restore`
Loqusdb is shipped with a database instance that includes structural variants from ~740 1000G individuals.
Those have been called with [manta][manta], [tiddit][tiddit] and [CNVnator][cnvnator]
and the calls are merged with [svdb][svdb].
If no file is pointed at `loqusdb restore` will use this database as default.

```bash
Usage: loqusdb restore [OPTIONS]

Restore the database from a zipped file.

Default is to restore from db dump in loqusdb/resources/

Options:
-f, --filename PATH If custom named file is to be used
--help Show this message and exit.
```

[manta]: https://github.com/Illumina/manta
[tiddit]: https://github.com/SciLifeLab/TIDDIT
[svdb]: https://github.com/J35P312/SVDB
[cnvnator]: https://github.com/abyzovlab/CNVnator
3 changes: 2 additions & 1 deletion docs/user-guide/README.md
Expand Up @@ -21,4 +21,5 @@ This is to avoid the situation where a large family could have a great impact on
* [Usage](./usage.md)
* [Loading](./loading.md)
* [Deleting](./deleting.md)
* [Exporting](./exporting.md)
* [Exporting](./exporting.md)
* [Annotating](./annotating.md)
15 changes: 15 additions & 0 deletions docs/user-guide/annotating.md
@@ -0,0 +1,15 @@
# Annotating variants

Loqusdb can annotate a VCF with observations from an instance.

Use command:

```bash
Usage: loqusdb annotate [OPTIONS] <vcf_file>

Annotate the variants in a VCF

Options:
--sv
--help Show this message and exit.
```
106 changes: 68 additions & 38 deletions loqusdb/build_models/variant.py
Expand Up @@ -75,54 +75,41 @@ def is_greater(a,b):
return False


def build_variant(variant, case_obj, case_id=None, gq_treshold=None):
"""Return a Variant object
Take a cyvcf2 formated variant line and return a models.Variant.
If criterias are not fullfilled, eg. variant have no gt call or quality
is below gq treshold then return None.
def get_coords(variant):
"""Returns a dictionary with position information
Args:
variant(cyvcf2.Variant)
case_obj(Case): We need the case object to check individuals sex
case_id(str): The case id
gq_treshold(int): Genotype Quality treshold
Return:
formated_variant(models.Variant): A variant dictionary
Returns:
coordinates(dict)
"""
variant_obj = None

sv = False
# Let cyvcf2 tell if it is a Structural Variant or not
if variant.var_type == 'sv':
sv = True

coordinates = {
'chrom': None,
'end_chrom': None,
'sv_length': None,
'sv_type': None,
'pos': None,
'end': None,
}
chrom = variant.CHROM
if chrom.startswith(('chr', 'CHR', 'Chr')):
chrom = chrom[3:]
coordinates['chrom'] = chrom
end_chrom = chrom

pos = int(variant.POS)

alt = variant.ALT[0]

# Get the end position
# This will be None for non-svs
end_pos = variant.INFO.get('END')
if end_pos:
end = int(end_pos)
else:
end = int(variant.end)

# chrom_pos_ref_alt
variant_id = get_variant_id(variant)

ref = variant.REF
# ALT is an array in cyvcf2
alt = variant.ALT[0]

# Default end chrom is chrom
end_chrom = chrom

coordinates['end'] = end

sv_type = variant.INFO.get('SVTYPE')
length = variant.INFO.get('SVLEN')
if length:
Expand All @@ -141,7 +128,6 @@ def build_variant(variant, case_obj, case_id=None, gq_treshold=None):

#Set 'infinity' to length if translocation
sv_len = float('inf')
sv_type = 'BND'

# Insertions often have length 0 in VCF
if (sv_len == 0 and alt != '<INS>'):
Expand All @@ -160,7 +146,50 @@ def build_variant(variant, case_obj, case_id=None, gq_treshold=None):

chrom = end_position.chrom
pos = end_position.pos


coordinates['end_chrom'] = end_chrom
coordinates['pos'] = pos
coordinates['end'] = end
coordinates['sv_length'] = sv_len
coordinates['sv_type'] = sv_type

return coordinates

def build_variant(variant, case_obj, case_id=None, gq_treshold=None):
"""Return a Variant object
Take a cyvcf2 formated variant line and return a models.Variant.
If criterias are not fullfilled, eg. variant have no gt call or quality
is below gq treshold then return None.
Args:
variant(cyvcf2.Variant)
case_obj(Case): We need the case object to check individuals sex
case_id(str): The case id
gq_treshold(int): Genotype Quality treshold
Return:
formated_variant(models.Variant): A variant dictionary
"""
variant_obj = None

sv = False
# Let cyvcf2 tell if it is a Structural Variant or not
if variant.var_type == 'sv':
sv = True

# chrom_pos_ref_alt
variant_id = get_variant_id(variant)

ref = variant.REF
# ALT is an array in cyvcf2
# We allways assume splitted and normalized VCFs
alt = variant.ALT[0]

coordinates = get_coords(variant)
chrom = coordinates['chrom']
pos = coordinates['pos']

# These are integers that will be used when uploading
found_homozygote = 0
Expand Down Expand Up @@ -197,16 +226,17 @@ def build_variant(variant, case_obj, case_id=None, gq_treshold=None):
found_homozygote = 1

if found_variant:

variant_obj = Variant(
variant_id=variant_id,
chrom=chrom,
pos=pos,
end=end,
end=coordinates['end'],
ref=ref,
alt=alt,
end_chrom=end_chrom,
sv_type = sv_type,
sv_len = sv_len,
end_chrom=coordinates['end_chrom'],
sv_type = coordinates['sv_type'],
sv_len = coordinates['sv_length'],
case_id = case_id,
homozygote = found_homozygote,
hemizygote = found_hemizygote,
Expand Down
3 changes: 3 additions & 0 deletions loqusdb/commands/__init__.py
Expand Up @@ -10,3 +10,6 @@
from .view import index as index_command
from .migrate import migrate as migration_command
from .identity import identity as identity_command
from .annotate import annotate as annotate_command
from .dump import dump as dump_command
from .restore import restore as restore_command
64 changes: 64 additions & 0 deletions loqusdb/commands/annotate.py
@@ -0,0 +1,64 @@
import os
import logging
import click

from pprint import pprint as pp

from datetime import datetime

from loqusdb.exceptions import (VcfError)
from loqusdb.utils.load import load_database
from loqusdb.utils.vcf import (get_file_handle, check_vcf, add_headers)
from loqusdb.utils.annotate import (annotate_snvs, annotate_svs)

from . import base_command

LOG = logging.getLogger(__name__)

@base_command.command('annotate', short_help="Annotate a VCF with observations")
@click.argument('variant-file',
type=click.Path(exists=True),
metavar='<vcf_file>',
)
@click.option('--sv', is_flag=True)
@click.pass_context
def annotate(ctx, variant_file, sv):
"""Annotate the variants in a VCF
"""
adapter = ctx.obj['adapter']

variant_path = os.path.abspath(variant_file)

expected_type = 'snv'
if sv:
expected_type = 'sv'

if 'sv':
nr_cases = adapter.nr_cases(sv_cases=True)
else:
nr_cases = adapter.nr_cases(snv_cases=True)
LOG.info("Found {0} {1} cases in database".format(nr_cases, expected_type))

vcf_obj = get_file_handle(variant_path)
add_headers(vcf_obj, nr_cases=nr_cases, sv=sv)
# Print the headers
for header_line in vcf_obj.raw_header.split('\n'):
if len(header_line) == 0:
continue
click.echo(header_line)

start_inserting = datetime.now()

if sv:
annotated_variants = annotate_svs(adapter, vcf_obj)
else:
annotated_variants = annotate_snvs(adapter, vcf_obj)
# try:
for variant in annotated_variants:
click.echo(str(variant).rstrip())
# except (Exception) as error:
# LOG.warning(error)
# ctx.abort()


48 changes: 48 additions & 0 deletions loqusdb/commands/dump.py
@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
import os
import logging
import subprocess

from datetime import datetime

import click

from . import base_command

LOG = logging.getLogger(__name__)

@base_command.command('dump', short_help="Dump the database")
@click.option('-f' ,'--filename',
help='If custom named file is to be used',
type=click.Path(exists=False)
)
@click.pass_context
def dump(ctx, filename):
"""Dump the database to a zipped file.
Default filename is loqusdb.<todays date>.gz (e.g loqusdb.19971004.gz)
"""

if not filename:
filename = "loqusdb.{}.gz".format(datetime.now().strftime('%Y%m%d'))

if os.path.isfile(filename):
LOG.warning("File {} already exists. Please remove file or change name with '--filename'".format(filename))
ctx.abort()

call = ['mongodump', '--gzip', '--db', 'loqusdb', '--archive={}'.format(filename)]

LOG.info('dumping database...')
start_time = datetime.now()
try:
completed = subprocess.run(call, check=True)
except subprocess.CalledProcessError as err:
LOG.warning(err)
LOG.info("Deleting dump..")
os.path.remove(filename)
ctx.abort()

LOG.info('Database dumped succesfully')
LOG.info('Time to dump database: {0}'.format(datetime.now()-start_time))


0 comments on commit 56d640c

Please sign in to comment.