Skip to content

Commit

Permalink
cleanup sanger calls
Browse files Browse the repository at this point in the history
  • Loading branch information
danielecook committed Nov 1, 2016
1 parent d0b043d commit dc654e1
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 63 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Expand Up @@ -15,4 +15,6 @@ proj.*
.coverage
*__pycache__*
.genome_directory
.eggs
.eggs

site
85 changes: 53 additions & 32 deletions vcfkit/call.py
Expand Up @@ -11,13 +11,14 @@
"""
from docopt import docopt
import vk
from utils.reference import resolve_reference_genome
from Bio import SeqIO
from utils.blastn import blast, blast_variant
from utils.vcf import *
from subprocess import Popen
import sys
from collections import defaultdict
from clint.textui import colored, puts, indent, puts_err
from clint.textui import colored, puts, puts_err, indent
import os
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE, SIG_DFL)
Expand All @@ -26,16 +27,36 @@


def seq_type(filename):
"""
Resolves sequence filetype using extension.
"""
filename, ext = os.path.splitext(filename.lower())
if ext in [".fasta", ".fa"]:
return 'fasta'
extension = 'fasta'
elif ext in [".fastq",".fq"]:
return 'fastq'
extension = 'fastq'
elif ext in [".ab1"]:
return 'ab1'
extension = 'ab1'
else:
raise Exception("Unknown sequence file type: " + filename)

with indent(4):
puts_err(colored.green("\nReading sequences as %s\n" % extension.upper()))
return extension


def resolve_sample_from_fasta_line(samples, fasta_line):
"""
Resolves sample names by splitting fasta line
on non-word characters.
"""
fasta_line = re.split("\W",fasta_line)
matched_sample = [x for x in samples if x in fasta_line]
if len(matched_sample) == 1:
return matched_sample[0]
return ""


def format_gt(gt):
# Return homozygous calls as single bases.
gt = list(set(re.split("[\|\/]", gt)))
Expand All @@ -60,10 +81,10 @@ def format_args(args, add_missing_stdin = False):
argv=args,
options_first=False)

# Reference path - check that it exists

module_path = os.path.split(os.path.realpath(__file__))[0]
reference = module_path + "/reference/" + args["--ref"] + ".fa.gz"
handle = open(args["<seq>"], "rU")
reference = resolve_reference_genome(args["--ref"])

if args["<vcf>"]:
concordance = True
Expand All @@ -75,57 +96,55 @@ def format_args(args, add_missing_stdin = False):
with indent(4):
exit(puts_err(colored.red("\nMust specify <vcf> with --vcf-targets\n")))

# Open fasta and read
# Setup reference for blast call
b = blast(reference)

# Set file type:
sequence_file_type = seq_type(args["<seq>"])

# Output header
print("\t".join(blast_variant.output_order + ["classification", "sample", "description"]))
print("\t".join(blast_variant.output_order))
for record in SeqIO.parse(handle, sequence_file_type):
rec_split = re.split("[ \|]{1}", record.name, 1)
sample = record.name.strip(">")
description = record.description.strip(">")
# Resolve sample within fasta line
sample = resolve_sample_from_fasta_line(v.samples, record.name)
blast_results = b.blast_call(record)
classification = ""
for n, variant in enumerate(blast_results):
output_line = False
if variant is None:
puts_err(colored.red("No Results for " + sample + " " + description))
puts_err(colored.red("No Results for " + sample + " " + record.description))
continue
if args["<vcf>"]:
if n == 0:
vcf_variants = []
for vcf_variant in v(variant.region()):
# Deal with phasing
gt = format_gt(vcf_variant.gt_bases[v.samples.index(sample)])
vcf_variants.append([vcf_variant.CHROM,
vcf_variant.POS,
gt])
vcf_variant_positions = [x[0:2] for x in vcf_variants]
if sample:
gt = format_gt(vcf_variant.gt_bases[v.samples.index(sample)])
vcf_variants.append([vcf_variant.CHROM,
vcf_variant.POS,
gt])
vcf_variant_positions = [x[0:2] for x in vcf_variants]

chrom_pos = variant.chrom_pos_allele()[0:2]
vcf_variant_match = [x for x in vcf_variants if x[0:2] == chrom_pos]
if vcf_variant_match:
vcf_variant_match = vcf_variant_match[0]
variant.vcf_gt = vcf_variant_match[2]
if variant.REF == variant.gt and variant.gt == variant.vcf_gt:
classification = "TN"
elif variant.REF != variant.gt and variant.gt == variant.vcf_gt:
classification = "TP"
elif variant.REF == variant.gt and variant.gt != variant.vcf_gt:
classification = "FP"
elif variant.REF != variant.gt and variant.gt != variant.vcf_gt:
classification = "FN"
if variant.REF == variant.seq_gt and variant.seq_gt == variant.vcf_gt:
variant.classification = "TN"
elif variant.REF != variant.seq_gt and variant.seq_gt == variant.vcf_gt:
variant.classification = "TP"
elif variant.REF == variant.seq_gt and variant.seq_gt != variant.vcf_gt:
variant.classification = "FP"
elif variant.REF != variant.seq_gt and variant.seq_gt != variant.vcf_gt:
variant.classification = "FN"
else:
if variant.REF != variant.gt:
classification = "FN"
if variant.REF != variant.seq_gt:
variant.classification = "FN"
else:
classification = ""
variant.classification = ""

#print args["--vcf-targets"] and variant.chrom_pos_allele()[0:2] in vcf_variant_positions
if args["--vcf-targets"] and classification != "":
if args["--vcf-targets"] and variant.classification != "":
output_line = True
elif args["--all-sites"] is True:
output_line = True
Expand All @@ -135,4 +154,6 @@ def format_args(args, add_missing_stdin = False):
elif variant.is_variant:
output_line = True
if output_line:
print '\t'.join([str(variant), classification, sample, description])
variant.sample = sample
variant.description = record.description
print '\t'.join([str(variant)])
55 changes: 36 additions & 19 deletions vcfkit/genome.py
Expand Up @@ -26,15 +26,38 @@
import os
import urllib
from sys import exit # Used by exit(); don't remove.

from Bio import Entrez


def fetch_chrom_name(id):
try:
if not id.startswith("NC_"):
return id
Entrez.email = "vcf-kit@vcf-kit.com"
chrom = Entrez.read(Entrez.efetch(db="nuccore", id=id, rettype="gb", retmode = "xml"))
gb_feature_quals = chrom[0]["GBSeq_feature-table"][0]["GBFeature_quals"]
features = dict([x.values() for x in gb_feature_quals])
if "organelle" in features:
if features["organelle"] == "mitochondrion":
return "MtDNA"
else:
chrom_name = features["chromosome"]
return chrom_name
except:
return id

def download_genomes(genome_db):
with indent(2):
puts(colored.blue('\nDownloading list of reference genomes\n'))
r = requests.get("http://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt")
genome_file = open(genome_db, "w")
with genome_file as f:
f.write(r.text.encode('utf-8').strip())
if os.path.isfile(genome_db):
fileTime = os.path.getmtime(genome_db)
else:
fileTime = 0
if (time() - fileTime) > (3 * 30 * 24 * 60 * 60) or is_non_zero_file(genome_db) is False:
with indent(2):
puts(colored.blue('\nDownloading list of reference genomes\n'))
r = requests.get("http://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt")
genome_file = open(genome_db, "w")
with genome_file as f:
f.write(r.text.encode('utf-8').strip())


def is_non_zero_file(fpath):
Expand Down Expand Up @@ -70,7 +93,6 @@ def main(debug=None):

with indent(2):
puts(genome_directory)
#puts(colored.blue("\nGenome Directory: " + genome_directory + "\n"))

if args["location"] and not args["<path>"]:
return genome_directory
Expand All @@ -90,12 +112,7 @@ def main(debug=None):
##################
elif args["--search"]:
# Download and cache a list of genomes from NCBI for searching
if os.path.isfile(genome_db):
fileTime = os.path.getctime(genome_db)
else:
fileTime = 0
if fileTime < (time() - 60 * 60 * 24 * 2) or is_non_zero_file(genome_db):
download_genomes(genome_db)
download_genomes(genome_db)

# Cache result
header = ["assembly_accession", # 0
Expand All @@ -120,15 +137,14 @@ def main(debug=None):
with indent(2):
puts(colored.blue('\nTo download a genome and setup for use:'))
with indent(4):
puts(colored.green("\nvk genome --ref=<asm_name>\n"))
puts(colored.green("\nvk genome ncbi --ref=<asm_name>\n"))
return results
elif args["--ref"]:
# reference name.
reference_name = args["--ref"]

# Ensure genome db is available
if is_non_zero_file(genome_db) is False:
download_genomes(genome_db)
download_genomes(genome_db)

# reference directory
if not args["--directory"]:
Expand Down Expand Up @@ -184,9 +200,10 @@ def main(debug=None):
for line in f:
outline = line
if line.startswith(">"):
chrom_name = re.match(".*[c|C]hromosome ([A-Za-z0-9]+)[W]*", line)
acc = line.split(" ")[0].strip(">")
chrom_name = fetch_chrom_name(acc)
if chrom_name is not None:
outline = ">" + chrom_name.group(1) + "\n"
outline = ">" + chrom_name + "\n"
elif line.lower().find("mitochon") > 0:
outline = ">MtDNA\n"
puts(colored.blue(line.strip("\n>")) + " --> " + colored.blue(outline.strip("\n>")))
Expand Down
30 changes: 21 additions & 9 deletions vcfkit/utils/blastn.py
Expand Up @@ -42,9 +42,12 @@ class blast_variant:
output_order = ["CHROM",
"POS",
"REF",
"gt",
"ALT",
"seq_gt",
"vcf_gt",
"sample",
"variant_type",
"classification",
"index",
"alignment_start",
"alignment_end",
Expand All @@ -55,7 +58,8 @@ class blast_variant:
"evalue",
"bitscore",
"phred_quality",
"phred_quality_window"]
"phred_quality_window",
"description"]

def __init__(self,
blast_result,
Expand All @@ -71,9 +75,13 @@ def __init__(self,

self.CHROM = blast_result["sacc"]
self.POS = POS
posp1 = self.POS + 1
self.CHROM_POS = "{self.CHROM}:{self.POS}-{posp1}".format(**locals())
self.classification = None
self.REF = ref_out.strip("-")
self.gt = alt_out.strip("-")
self.is_variant = (self.REF != self.gt)
self.ALT = None
self.seq_gt = alt_out.strip("-")
self.is_variant = (self.REF != self.seq_gt)
self.vcf_gt = ""
self.alignment_start = blast_result["sstart"]
self.alignment_end = blast_result["send"]
Expand All @@ -84,23 +92,27 @@ def __init__(self,
self.index = index
self.evalue = blast_result["evalue"]
self.bitscore = blast_result["bitscore"]
if self.REF != self.gt:
if len(self.REF) == len(self.gt):
if self.REF != self.seq_gt:
if len(self.REF) == len(self.seq_gt):
self.variant_type = "snp"
elif len(self.REF) > len(self.gt):
elif len(self.REF) > len(self.seq_gt):
self.variant_type = "deletion"
elif len(self.REF) < len(self.gt):
elif len(self.REF) < len(self.seq_gt):
self.variant_type = "insertion"
else:
self.variant_type = ""
self.phred_quality = phred_quality
self.phred_quality_window = phred_quality_window

def classify_variant(self):
# Determines whether variant is TP, TN, FP, FN
pass

def chrom_pos_allele(self):
"""
Simplified representation of variant
"""
return [self.CHROM, self.POS, self.gt]
return [self.CHROM, self.POS, self.seq_gt]

def region(self):
return "{self.CHROM}:{self.alignment_start}-{self.alignment_end}".format(**locals())
Expand Down
7 changes: 5 additions & 2 deletions vcfkit/utils/reference.py
@@ -1,4 +1,4 @@
from clint.textui import colored, puts, indent
from clint.textui import colored, puts, puts_err, indent
import os


Expand Down Expand Up @@ -41,7 +41,10 @@ def resolve_reference_genome(loc):
return loc
else:
if loc in get_genome_list():
return "{gd}/{loc}/{loc}.fa.gz".format(gd = get_genome_directory(), loc = loc)
reference_location = "{gd}/{loc}/{loc}.fa.gz".format(gd = get_genome_directory(), loc = loc)
with indent(4):
puts_err(colored.green("\nUsing reference located at %s\n" % reference_location))
return reference_location
else:
with indent(4):
puts_err(colored.red("\nGenome does not exist\n"))

0 comments on commit dc654e1

Please sign in to comment.