Skip to content

Commit

Permalink
Fixes based on comments
Browse files Browse the repository at this point in the history
  • Loading branch information
adrosenbaum committed Mar 12, 2019
1 parent 86d7066 commit 9774198
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 41 deletions.
135 changes: 94 additions & 41 deletions mutacc_auto/parse/parse_scout.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@
from datetime import datetime, timedelta

from mutacc_auto.commands.scout_command import ScoutExportCases
from mutacc_auto.parse.vcf_constants import *
from mutacc_auto.parse.vcf_constants import (SCOUT_TO_FORMAT,
SCOUT_TO_INFO,
HEADER,
NEWLINE,
TAB,
COLUMN_NAMES,
SCOUT_TO_COLUMNS)

#The timestamp in the scout database seems to be given with
#millisecond precision; it is therefore necessary to divide the
Expand Down Expand Up @@ -63,75 +69,122 @@ def get_vcf_from_json(scout_vcf_output):

#Write header of vcf
for header_line in HEADER:
vcf_string += header_line + '\n'
vcf_string += header_line + NEWLINE

#Get samples
samples = [sample['sample_id'] for sample in scout_vcf_output[0]['samples']]
samples = '\t'.join(samples)

vcf_string += f"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{samples}\n"
#Append sample names to the COLUMN_NAMES list
column_names = COLUMN_NAMES + samples
column_names = TAB.join(column_names)

vcf_string += column_names + NEWLINE

#Write variants
for variant in scout_vcf_output:

entry= []
entry.append(str(variant['chromosome'])) #CHROM
entry.append(str(variant['position'])) #POS
entry.append(str(variant['dbsnp_id'] or '.')) #ID
entry.append(str(variant['reference'])) #REF
entry.append(str(variant['alternative'])) #ALT
entry.append(str(variant['quality'])) #QUAL
entry.append('PASS') #FILTER
#Write column values
record = get_columns(variant)

#write INFO
info = []
for ID in SCOUT_TO_INFO.keys():
info = get_info(variant)
record.append(info)

info_string = f"{SCOUT_TO_INFO[ID]}={int(variant[ID])}"
info.append(info_string)
#Write the FORMAT column
format = ':'.join([SCOUT_TO_FORMAT[ID] for ID in SCOUT_TO_FORMAT.keys()])
record.append(format)

#write genotypes for each sample
samples = get_genotypes(variant)
record.append(samples)

record = TAB.join(record) + NEWLINE

#Add variant record to vcf_string
vcf_string += record

return vcf_string

def get_columns(variant):
    """
    Given a variant object from scout, collect the values of the fixed
    vcf columns listed in SCOUT_TO_COLUMNS (CHROM - FILTER).

    Missing values (None, an empty string or an empty list) are written
    as '.', the vcf missing-value marker, instead of 'None' or ''.

    Args:
        variant (dict): dictionary of scout variant object
    Returns:
        record (list): one string per column, in SCOUT_TO_COLUMNS order
    Raises:
        KeyError: if variant lacks one of the SCOUT_TO_COLUMNS keys
    """
    record = []

    for column in SCOUT_TO_COLUMNS:

        value = variant[column]

        if isinstance(value, list):
            #FILTER is a semicolon separated list in vcf, other
            #multi-valued columns (e.g. ALT) are comma separated
            separator = ';' if column == 'filters' else ','
            column_value = separator.join(str(element) for element in value)

        elif value is None:
            column_value = ''

        else:
            column_value = str(value)

        #Fall back to the vcf missing-value marker for empty fields
        record.append(column_value or '.')

    return record

entry.append(info)
def get_info(variant):
    """
    Build the INFO column of a vcf record from a scout variant object.

    Every key in SCOUT_TO_INFO is looked up in the variant, converted
    with int() and written as '<INFO id>=<value>'. The variant's sub
    category is appended as TYPE for snv variants and as SVTYPE for
    every other category.

    Args:
        variant (dict): dictionary of scout variant object
    Returns:
        info (str): INFO entries joined by ';'
    """
    #Integer valued fields mapped from scout keys to vcf INFO ids
    entries = [
        f"{info_id}={int(variant[scout_key])}"
        for scout_key, info_id in SCOUT_TO_INFO.items()
    ]

    #The category decides which INFO id carries the sub category
    is_snv = variant['category'].lower() == 'snv'
    if is_snv:
        entries.append(f"TYPE={variant['sub_category']}")
    else:
        entries.append(f"SVTYPE={variant['sub_category']}")

    return ';'.join(entries)

else:
ID_value = str(sample[ID])
def get_genotypes(variant):
    """
    Given a variant object from scout, write the genotype columns, one
    per sample.

    For each sample the values of the SCOUT_TO_FORMAT keys are joined
    by ':'; list values are joined by ','. Missing values (None, an
    empty string or an empty list) are written as '.', the vcf
    missing-value marker, instead of 'None' or ''.

    Args:
        variant (dict): dictionary of scout variant object
    Returns:
        samples (str): genotype fields of all samples, separated by tab
    Raises:
        KeyError: if a sample lacks one of the SCOUT_TO_FORMAT keys
    """
    samples = []
    for sample in variant['samples']:

        gt_calls = []
        for scout_key in SCOUT_TO_FORMAT:

            value = sample[scout_key]

            if isinstance(value, list):
                #Missing elements inside a list are also written as '.'
                ID_value = ','.join(
                    '.' if element is None else str(element)
                    for element in value
                )

            elif value is None:
                ID_value = ''

            else:
                ID_value = str(value)

            #Fall back to the vcf missing-value marker for empty fields
            gt_calls.append(ID_value or '.')

        samples.append(':'.join(gt_calls))

    return TAB.join(samples)
28 changes: 28 additions & 0 deletions mutacc_auto/parse/vcf_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,31 @@
'##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">'

)

#Line terminator and field separator used when writing the vcf
NEWLINE = '\n'
TAB = '\t'

#Names of the fixed vcf columns, in the order they are written
COLUMN_NAMES = [
    '#CHROM', 'POS', 'ID',
    'REF', 'ALT', 'QUAL',
    'FILTER', 'INFO', 'FORMAT',
]

#Scout variant keys holding the values of the columns CHROM - FILTER,
#listed in column order
SCOUT_TO_COLUMNS = (
    'chromosome',   #CHROM
    'position',     #POS
    'dbsnp_id',     #ID
    'reference',    #REF
    'alternative',  #ALT
    'quality',      #QUAL
    'filters',      #FILTER
)

0 comments on commit 9774198

Please sign in to comment.