cleanup sanger calls

AndersenLab · Nov 1, 2016 · dc654e1 · dc654e1
1 parent d0b043d
commit dc654e1
Show file tree

Hide file tree

Showing 5 changed files with 118 additions and 63 deletions.
diff --git a/.gitignore b/.gitignore
@@ -15,4 +15,6 @@ proj.*
 .coverage
 *__pycache__*
 .genome_directory
-.eggs
+.eggs
+
+site
diff --git a/vcfkit/call.py b/vcfkit/call.py
@@ -11,13 +11,14 @@
 """
 from docopt import docopt
 import vk
+from utils.reference import resolve_reference_genome
 from Bio import SeqIO
 from utils.blastn import blast, blast_variant
 from utils.vcf import *
 from subprocess import Popen
 import sys
 from collections import defaultdict
-from clint.textui import colored, puts, indent, puts_err
+from clint.textui import colored, puts, puts_err, indent
 import os
 from signal import signal, SIGPIPE, SIG_DFL
 signal(SIGPIPE, SIG_DFL)
@@ -26,16 +27,36 @@
 
 
 def seq_type(filename):
+    """
+        Resolves sequence filetype using extension.
+    """
     filename, ext = os.path.splitext(filename.lower())
     if ext in [".fasta", ".fa"]:
-        return 'fasta'
+        extension = 'fasta'
     elif ext in [".fastq",".fq"]:
-        return 'fastq'
+        extension = 'fastq'
     elif ext in [".ab1"]:
-        return 'ab1'
+        extension = 'ab1'
     else:
         raise Exception("Unknown sequence file type: " + filename)
 
+    with indent(4):
+        puts_err(colored.green("\nReading sequences as %s\n" % extension.upper()))
+    return extension
+
+
+def resolve_sample_from_fasta_line(samples, fasta_line):
+    """
+        Resolves sample names by splitting fasta line
+        on non-word characters.
+    """
+    fasta_line = re.split("\W",fasta_line)
+    matched_sample = [x for x in samples if x in fasta_line]
+    if len(matched_sample) == 1:
+        return matched_sample[0]
+    return ""
+
+
 def format_gt(gt):
     # Return homozygous calls as single bases.
     gt = list(set(re.split("[\|\/]", gt)))
@@ -60,10 +81,10 @@ def format_args(args, add_missing_stdin = False):
                   argv=args,
                   options_first=False)
 
-    # Reference path - check that it exists
+
     module_path = os.path.split(os.path.realpath(__file__))[0]
-    reference = module_path + "/reference/" + args["--ref"] + ".fa.gz"
     handle = open(args["<seq>"], "rU")
+    reference = resolve_reference_genome(args["--ref"])
 
     if args["<vcf>"]:
         concordance = True
@@ -75,57 +96,55 @@ def format_args(args, add_missing_stdin = False):
         with indent(4):
             exit(puts_err(colored.red("\nMust specify <vcf> with --vcf-targets\n")))
 
-    # Open fasta and read
+    # Setup reference for blast call
     b = blast(reference)
 
     # Set file type:
     sequence_file_type = seq_type(args["<seq>"])
 
     # Output header
-    print("\t".join(blast_variant.output_order + ["classification", "sample", "description"]))
+    print("\t".join(blast_variant.output_order))
     for record in SeqIO.parse(handle, sequence_file_type):
-        rec_split = re.split("[ \|]{1}", record.name, 1)
-        sample = record.name.strip(">")
-        description = record.description.strip(">")
+        # Resolve sample within fasta line
+        sample = resolve_sample_from_fasta_line(v.samples, record.name)
         blast_results = b.blast_call(record)
         classification = ""
         for n, variant in enumerate(blast_results):
             output_line = False
             if variant is None:
-                puts_err(colored.red("No Results for " + sample + " " + description))
+                puts_err(colored.red("No Results for " + sample + " " + record.description))
                 continue
             if args["<vcf>"]:
                 if n == 0:
                     vcf_variants = []
                     for vcf_variant in v(variant.region()):
-                        # Deal with phasing
-                        gt = format_gt(vcf_variant.gt_bases[v.samples.index(sample)])
-                        vcf_variants.append([vcf_variant.CHROM,
-                                             vcf_variant.POS,
-                                             gt])
-                        vcf_variant_positions = [x[0:2] for x in vcf_variants]
+                        if sample:
+                            gt = format_gt(vcf_variant.gt_bases[v.samples.index(sample)])
+                            vcf_variants.append([vcf_variant.CHROM,
+                                                 vcf_variant.POS,
+                                                 gt])
+                            vcf_variant_positions = [x[0:2] for x in vcf_variants]
 
                 chrom_pos =  variant.chrom_pos_allele()[0:2]
                 vcf_variant_match = [x for x in vcf_variants if x[0:2] == chrom_pos]
                 if vcf_variant_match:
                     vcf_variant_match = vcf_variant_match[0]
                     variant.vcf_gt = vcf_variant_match[2]
-                    if variant.REF == variant.gt and variant.gt == variant.vcf_gt:
-                        classification = "TN"
-                    elif variant.REF != variant.gt and variant.gt == variant.vcf_gt:
-                        classification = "TP"
-                    elif variant.REF == variant.gt and variant.gt != variant.vcf_gt:
-                        classification = "FP" 
-                    elif variant.REF != variant.gt and variant.gt != variant.vcf_gt:
-                        classification = "FN"
+                    if variant.REF == variant.seq_gt and variant.seq_gt == variant.vcf_gt:
+                        variant.classification = "TN"
+                    elif variant.REF != variant.seq_gt and variant.seq_gt == variant.vcf_gt:
+                        variant.classification = "TP"
+                    elif variant.REF == variant.seq_gt and variant.seq_gt != variant.vcf_gt:
+                        variant.classification = "FP" 
+                    elif variant.REF != variant.seq_gt and variant.seq_gt != variant.vcf_gt:
+                        variant.classification = "FN"
                 else:
-                    if variant.REF != variant.gt:
-                        classification = "FN"
+                    if variant.REF != variant.seq_gt:
+                        variant.classification = "FN"
                     else:
-                        classification = ""
+                        variant.classification = ""
 
-                #print args["--vcf-targets"] and variant.chrom_pos_allele()[0:2] in vcf_variant_positions
-                if args["--vcf-targets"] and classification != "":
+                if args["--vcf-targets"] and variant.classification != "":
                     output_line = True
                 elif args["--all-sites"] is True:
                     output_line = True
@@ -135,4 +154,6 @@ def format_args(args, add_missing_stdin = False):
                 elif variant.is_variant:
                     output_line = True
             if output_line:
-                print '\t'.join([str(variant), classification, sample, description])
+                variant.sample = sample
+                variant.description = record.description
+                print '\t'.join([str(variant)])
diff --git a/vcfkit/genome.py b/vcfkit/genome.py
@@ -26,15 +26,38 @@
 import os
 import urllib
 from sys import exit # Used by exit(); don't remove.
-
+from Bio import Entrez
+
+
+def fetch_chrom_name(id):
+    try:
+        if not id.startswith("NC_"):
+            return id
+        Entrez.email = "vcf-kit@vcf-kit.com"
+        chrom = Entrez.read(Entrez.efetch(db="nuccore", id=id, rettype="gb", retmode = "xml"))
+        gb_feature_quals = chrom[0]["GBSeq_feature-table"][0]["GBFeature_quals"]
+        features = dict([x.values() for x in gb_feature_quals])
+        if "organelle" in features:
+            if features["organelle"] == "mitochondrion":
+                return "MtDNA"
+        else:
+            chrom_name = features["chromosome"]
+            return chrom_name
+    except:
+        return id
 
 def download_genomes(genome_db):
-    with indent(2):
-        puts(colored.blue('\nDownloading list of reference genomes\n'))
-    r = requests.get("http://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt")
-    genome_file = open(genome_db, "w")
-    with genome_file as f:
-        f.write(r.text.encode('utf-8').strip())
+    if os.path.isfile(genome_db):
+        fileTime = os.path.getmtime(genome_db)
+    else:
+        fileTime = 0
+    if (time() - fileTime) >  (3 * 30 * 24 * 60 * 60) or is_non_zero_file(genome_db) is False:
+        with indent(2):
+            puts(colored.blue('\nDownloading list of reference genomes\n'))
+        r = requests.get("http://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt")
+        genome_file = open(genome_db, "w")
+        with genome_file as f:
+            f.write(r.text.encode('utf-8').strip())
 
 
 def is_non_zero_file(fpath):  
@@ -70,7 +93,6 @@ def main(debug=None):
 
     with indent(2):
         puts(genome_directory)
-        #puts(colored.blue("\nGenome Directory: " + genome_directory + "\n"))
 
     if args["location"] and not args["<path>"]:
         return genome_directory
@@ -90,12 +112,7 @@ def main(debug=None):
     ##################
     elif args["--search"]:
         # Download and cache a list of genomes from NCBI for searching
-        if os.path.isfile(genome_db):
-            fileTime = os.path.getctime(genome_db)
-        else:
-            fileTime = 0
-        if fileTime < (time() - 60 * 60 * 24 * 2) or is_non_zero_file(genome_db):
-            download_genomes(genome_db)
+        download_genomes(genome_db)
 
         # Cache result
         header = ["assembly_accession",  # 0
@@ -120,15 +137,14 @@ def main(debug=None):
         with indent(2):
             puts(colored.blue('\nTo download a genome and setup for use:'))
         with indent(4):
-            puts(colored.green("\nvk genome --ref=<asm_name>\n"))
+            puts(colored.green("\nvk genome ncbi --ref=<asm_name>\n"))
         return results
     elif args["--ref"]:
         # reference name.
         reference_name = args["--ref"]
 
         # Ensure genome db is available
-        if is_non_zero_file(genome_db) is False:
-            download_genomes(genome_db)
+        download_genomes(genome_db)
 
         # reference directory
         if not args["--directory"]:
@@ -184,9 +200,10 @@ def main(debug=None):
                     for line in f:
                         outline = line
                         if line.startswith(">"):
-                            chrom_name = re.match(".*[c|C]hromosome ([A-Za-z0-9]+)[W]*", line)
+                            acc = line.split(" ")[0].strip(">")
+                            chrom_name = fetch_chrom_name(acc)
                             if chrom_name is not None:
-                                outline = ">" + chrom_name.group(1) + "\n"
+                                outline = ">" + chrom_name + "\n"
                             elif line.lower().find("mitochon") > 0:
                                 outline = ">MtDNA\n"
                             puts(colored.blue(line.strip("\n>")) + " --> " + colored.blue(outline.strip("\n>")))

diff --git a/vcfkit/utils/blastn.py b/vcfkit/utils/blastn.py
@@ -42,9 +42,12 @@ class blast_variant:
     output_order = ["CHROM",
                     "POS",
                     "REF",
-                    "gt",
+                    "ALT",
+                    "seq_gt",
                     "vcf_gt",
+                    "sample",
                     "variant_type",
+                    "classification",
                     "index",
                     "alignment_start",
                     "alignment_end",
@@ -55,7 +58,8 @@ class blast_variant:
                     "evalue",
                     "bitscore",
                     "phred_quality",
-                    "phred_quality_window"]
+                    "phred_quality_window",
+                    "description"]
 
     def __init__(self,
                  blast_result,
@@ -71,9 +75,13 @@ def __init__(self,
 
         self.CHROM = blast_result["sacc"]
         self.POS = POS
+        posp1 = self.POS + 1
+        self.CHROM_POS = "{self.CHROM}:{self.POS}-{posp1}".format(**locals())
+        self.classification = None
         self.REF = ref_out.strip("-")
-        self.gt = alt_out.strip("-")
-        self.is_variant = (self.REF != self.gt)
+        self.ALT = None
+        self.seq_gt = alt_out.strip("-")
+        self.is_variant = (self.REF != self.seq_gt)
         self.vcf_gt = ""
         self.alignment_start = blast_result["sstart"]
         self.alignment_end = blast_result["send"]
@@ -84,23 +92,27 @@ def __init__(self,
         self.index = index
         self.evalue = blast_result["evalue"]
         self.bitscore = blast_result["bitscore"]
-        if self.REF != self.gt:
-            if len(self.REF) == len(self.gt):
+        if self.REF != self.seq_gt:
+            if len(self.REF) == len(self.seq_gt):
                 self.variant_type = "snp"
-            elif len(self.REF) > len(self.gt):
+            elif len(self.REF) > len(self.seq_gt):
                 self.variant_type = "deletion"
-            elif len(self.REF) < len(self.gt):
+            elif len(self.REF) < len(self.seq_gt):
                 self.variant_type = "insertion"
         else:
             self.variant_type = ""
         self.phred_quality = phred_quality
         self.phred_quality_window = phred_quality_window
 
+    def classify_variant(self):
+        # Determines whether variant is TP, TN, FP, FN
+        pass
+
     def chrom_pos_allele(self):
         """
             Simplified representation of variant
         """
-        return [self.CHROM, self.POS, self.gt]
+        return [self.CHROM, self.POS, self.seq_gt]
 
     def region(self):
         return "{self.CHROM}:{self.alignment_start}-{self.alignment_end}".format(**locals())

diff --git a/vcfkit/utils/reference.py b/vcfkit/utils/reference.py
@@ -1,4 +1,4 @@
-from clint.textui import colored, puts, indent
+from clint.textui import colored, puts, puts_err, indent
 import os
 
 
@@ -41,7 +41,10 @@ def resolve_reference_genome(loc):
         return loc
     else:
         if loc in get_genome_list():
-            return "{gd}/{loc}/{loc}.fa.gz".format(gd = get_genome_directory(), loc = loc)
+            reference_location = "{gd}/{loc}/{loc}.fa.gz".format(gd = get_genome_directory(), loc = loc)
+            with indent(4):
+                puts_err(colored.green("\nUsing reference located at %s\n" % reference_location))
+            return reference_location
         else:
             with indent(4):
                 puts_err(colored.red("\nGenome does not exist\n"))