-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Recreate vcf from variants in scout #6
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,7 @@ | |
from datetime import datetime, timedelta | ||
|
||
from mutacc_auto.commands.scout_command import ScoutExportCases | ||
from mutacc_auto.parse.vcf_constants import * | ||
|
||
#The timestamp in the scout database seems to be given with | ||
#millisecond precision, it is therefore necessary to divide the | ||
|
@@ -19,7 +20,7 @@ def get_cases_from_scout(scout_output, days_ago=None): | |
scout_output (str): output from scout command | ||
days_ago (int): number of days since case updated | ||
|
||
Returns (list(dict)): list of dictionaries representing the cases | ||
Returns (list(dict)): list of dictionaries representing the cases | ||
""" | ||
|
||
cases = json.loads(scout_output) | ||
|
@@ -39,3 +40,98 @@ def get_cases_from_scout(scout_output, days_ago=None): | |
recent_cases.append(case) | ||
|
||
return recent_cases | ||
|
||
|
||
|
||
|
||
|
||
#Separators used when assembling the vcf text
_TAB = '\t'
_NEWLINE = '\n'

#Fixed columns of the vcf column header line, in the order they are written.
#The sample IDs are appended after these.
_VCF_COLUMNS = ('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')


def _get_fixed_fields(variant):

    """
        Builds the CHROM, POS, ID, REF, ALT, QUAL and FILTER fields of a record

        Args:
            variant (dict): scout variant object

        Returns:
            fields (list(str)): the seven first fields of the vcf record
    """

    return [
        str(variant['chromosome']),       #CHROM
        str(variant['position']),         #POS
        str(variant['dbsnp_id'] or '.'),  #ID, '.' when scout has no dbsnp id
        str(variant['reference']),        #REF
        str(variant['alternative']),      #ALT
        str(variant['quality']),          #QUAL
        'PASS'                            #FILTER, not read from the variant
    ]


def _get_info_field(variant):

    """
        Builds the INFO field of a record

        Args:
            variant (dict): scout variant object

        Returns:
            info (str): ';'-separated 'KEY=value' pairs
    """

    #Every scout field listed in SCOUT_TO_INFO is written as an integer
    info = [
        f"{vcf_id}={int(variant[scout_id])}"
        for scout_id, vcf_id in SCOUT_TO_INFO.items()
    ]

    #snv variants get a TYPE key, all other categories get SVTYPE
    type_id = 'TYPE' if variant['category'].lower() == 'snv' else 'SVTYPE'
    info.append(f"{type_id}={variant['sub_category']}")

    return ';'.join(info)


def _get_sample_field(sample):

    """
        Builds the genotype field for one sample of a record

        Args:
            sample (dict): element of the 'samples' list of a scout variant

        Returns:
            gt_calls (str): ':'-separated values, ordered as SCOUT_TO_FORMAT
    """

    gt_calls = []
    for scout_id in SCOUT_TO_FORMAT:

        value = sample[scout_id]

        #List values (e.g. allele depths) are written comma separated
        if isinstance(value, list):
            gt_calls.append(','.join(str(element) for element in value))

        else:
            gt_calls.append(str(value))

    return ':'.join(gt_calls)


def get_vcf_from_json(scout_vcf_output):

    """
        Reconstructs vcf from scout variant object

        Args:
            scout_vcf_output (str): string returned by command 'scout export variants --json'

        Returns:
            vcf_string (str): string with vcf content

        Raises:
            KeyError: if a variant or sample lacks one of the fields read here
    """

    variants = json.loads(scout_vcf_output)

    #Collect all lines and join them once at the end
    lines = list(HEADER)

    if variants:
        #The sample IDs are taken from the first variant only
        sample_ids = _TAB.join(sample['sample_id'] for sample in variants[0]['samples'])
        lines.append(_TAB.join(_VCF_COLUMNS) + _TAB + sample_ids)

    else:
        #Without variants there are no samples to read, so the FORMAT
        #column is left out and only the header is returned
        lines.append(_TAB.join(_VCF_COLUMNS[:-1]))

    #The FORMAT field is the same for every record
    format_field = ':'.join(SCOUT_TO_FORMAT.values())

    #Write variants
    for variant in variants:

        record = _get_fixed_fields(variant)
        record.append(_get_info_field(variant))
        record.append(format_field)
        record.append(_TAB.join(_get_sample_field(sample) for sample in variant['samples']))

        lines.append(_TAB.join(record))

    return ''.join(line + _NEWLINE for line in lines)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
#Scout sample field name -> vcf FORMAT ID
SCOUT_TO_FORMAT = dict(
    genotype_call='GT',
    allele_depths='AD',
    read_depth='DP',
    genotype_quality='GQ',
)

#Scout variant field name -> vcf INFO ID
SCOUT_TO_INFO = dict(
    end='END',
    rank_score='RankScore',
)

#Meta-information lines of the vcf header, grouped by kind
_FILEFORMAT_LINES = (
    '##fileformat=VCFv4.2',
)

_INFO_LINES = (
    '##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">',
    '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
    '##INFO=<ID=TYPE,Number=A,Type=String,Description="The type of allele, either snp, mnp, ins, del, or complex.">',
    '##INFO=<ID=RankScore,Number=.,Type=String,Description="The rank score for this variant in this family. family_id:rank_score.">',
)

_FORMAT_LINES = (
    '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
    '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">',
    '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
    '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">',
)

#Full header: file format line first, then INFO and FORMAT definitions
HEADER = _FILEFORMAT_LINES + _INFO_LINES + _FORMAT_LINES
Large diffs are not rendered by default.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this mean import all? If so, it is usually better to be explicit. Otherwise your namespace might be polluted without you knowing it.