In [64]:
import sys
import os
from Bio.Blast import NCBIXML
from Bio import SeqIO


def DansBlastParse(blast_xml_output, best_hit_criteria, Eval_thresh):
    """
    Print the confidently placed queries found in a BLAST XML output file.

    Usage: BlastParseExtra.py  <blast_xml_output>  <best_hit_criteria>  <Evalue_threshold>

    <blast_xml_output>  -  Path to the BLAST XML output file (an absolute path is recommended)
    <best_hit_criteria> -  Factor applied to the e-value of the second-best hit. A query with
                           several hits is kept only if
                           best e-value < <best_hit_criteria> * second-best e-value,
                           e.g. 1e-5 requires the best hit to be 5 orders of magnitude better.
    <Evalue_threshold>  -  Maximum e-value for a query with exactly one hit (e.g. 1e-20).
                           There is no built-in default; the value must be supplied.

    For every query in the file:
      * no hits           -  the query is skipped.
      * several hits      -  reported as "Multi" when the best hit passes the
                             <best_hit_criteria> test above. The <Evalue_threshold> is NOT
                             applied to these queries.
      * exactly one hit   -  reported as "Unique" when its e-value is below <Evalue_threshold>.

    Only the first HSP of each hit is examined. Each reported query is printed as one
    tab-separated line:

        Multi|Unique  query  hit_definition  e-value  subject_start  subject_end

    followed, at the end, by the number of reported queries and the number of "Unique" ones.
    Nothing is returned.

    """

    good_blast_counts = 0
    uniq_counts = 0

    ## NCBIXML.parse is consumed lazily, so the file has to stay open for the whole loop.
    ## The "with" block closes it afterwards, including when parsing raises an exception.
    with open(blast_xml_output, "r") as blast_handle:
        for record in NCBIXML.parse(blast_handle):
            alignments = record.alignments
            if not alignments:
                continue  ## Query without any hit

            best_alignment = alignments[0]
            best_hsp = best_alignment.hsps[0]

            if len(alignments) > 1:
                ## From Alan's script: keep the best hit only when its e-value is better than
                ## the second-best one by the requested factor.
                second_hsp = alignments[1].hsps[0]
                if not best_hsp.expect < best_hit_criteria * second_hsp.expect:
                    continue
                hit_type = "Multi"
            else:
                ## Single-hit case - this wasn't in Alan's original script.
                if not float(best_hsp.expect) < Eval_thresh:
                    continue
                hit_type = "Unique"
                uniq_counts += 1

            subject = str(best_alignment.hit_def)
            Evalue = float(best_hsp.expect)
            Hit_start_coord = int(best_hsp.sbjct_start)
            Hit_end_coord = int(best_hsp.sbjct_end)

            ## Single pre-formatted argument: prints the same text under Python 2 and Python 3
            print("%s\t%s\t%s\t%s\t%s\t%s" % (hit_type, record.query, subject, Evalue, Hit_start_coord, Hit_end_coord))
            good_blast_counts += 1

    print("Number of good blast hits: %s" % good_blast_counts)
    print("Number of unique mappings: %s" % uniq_counts)


In [None]:
## Command-line arguments

if len(sys.argv) == 4:
    ## Exactly three user arguments: input path, multi-hit factor and e-value threshold
    in_path = sys.argv[1]
    muti_hit_criteria = float(sys.argv[2])
    Eval = float(sys.argv[3])

    DansBlastParse(in_path, muti_hit_criteria, Eval) ## RUN!

elif len(sys.argv) == 1:
    ## No arguments at all: show the usage text stored in the function docstring
    print(DansBlastParse.__doc__)

elif len(sys.argv) < 4:
    ## Some arguments, but not enough of them: stop with an error message
    sys.exit("\n##Error, not enough arguments, run script with no arguments to see help message\n")

## NOTE: when more than three arguments are supplied none of the branches runs.

In [67]:
## Batch run: parse every file under parent_dir whose name contains "Male_L6_Xenchunks_"
muti_hit_criteria = 1e-5
Eval = 1e-20

parent_dir = "/home/djeffrie/Data/Ribe_LM/"

for root, dirs, files in os.walk(parent_dir):
    for fil in files:
        if "Male_L6_Xenchunks_" in fil:
            ## os.walk yields sub-directory roots without a trailing separator, so plain
            ## string concatenation would glue the directory and file names together.
            ## os.path.join adds the separator only where it is missing.
            in_path = os.path.join(root, fil)
            print("\nProcessing %s" % in_path)
            DansBlastParse(in_path, muti_hit_criteria, Eval)


Processing /home/djeffrie/Data/Ribe_LM/Male_L6_Xenchunks_1000.xml
Multi	scaffold37.1|size456371	Chr02	3.17932e-59	19134200	19133764
Unique	scaffold490.1|size258825	Chr03	1.9842e-21	58781379	58781449
Multi	scaffold5790.1|size82980	Chr03	7.25741e-11	97846769	97846856
Unique	scaffold11278.1|size59203	Chr04	4.62233e-28	51687691	51687588
Unique	scaffold79227.1|size18324	Chr03	4.64487e-28	21583106	21582976
Number of good blast hits: 5
Number of unique mappings: 3

Processing /home/djeffrie/Data/Ribe_LM/Male_L6_Xenchunks_500.xml
Multi	scaffold37.1|size456371	Chr05	4.47788e-15	54673628	54673567
Multi	scaffold5790.1|size82980	Chr03	3.40915e-11	97846769	97846856
Number of good blast hits: 2
Number of unique mappings: 0

Processing /home/djeffrie/Data/Ribe_LM/Male_L6_Xenchunks_2000.xml
Unique	scaffold490.1|size258825	Chr03	4.0951e-21	58781379	58781449
Multi	scaffold5790.1|size82980	Chr03	1.49577e-10	97846769	97846856
Multi	scaffold29037.1|size50135	Chr10	5.09457e-80	38015018	38015563
Unique	scaf

In [53]:
#### Test args: run the parser on a single BLAST XML file

muti_hit_criteria = 1e-5
Eval = 1e-20
in_path = "/home/djeffrie/Data/Ribe_LM/Male_LG3_Rtemp_blasthits.xml"

DansBlastParse(blast_xml_output=in_path,
               best_hit_criteria=muti_hit_criteria,
               Eval_thresh=Eval)

Multi	745	scaffold46199.1|size35026	2.99753e-32	34048	34139
Multi	2711	scaffold93399.1|size15147	8.45302e-23	8301	8233
Unique	4139	scaffold93399.1|size15147	8.45302e-23	8301	8233
Unique	5134	scaffold93399.1|size15147	8.45302e-23	8301	8233
Multi	6126	scaffold311071.1|size1376	6.48852e-29	655	744
Multi	8038	scaffold3905.1|size92276	1.0705e-36	22990	23080
Multi	8350	scaffold74752.1|size45057	1.38478e-35	27243	27152
Multi	8519	scaffold1012.1|size167646	6.44277e-34	51516	51425
Unique	8771	scaffold1012.1|size167646	6.44277e-34	51516	51425
Unique	9622	scaffold1012.1|size167646	6.44277e-34	51516	51425
Multi	12788	scaffold18042.1|size52226	5.05154e-25	44200	44134
Unique	13609	scaffold18042.1|size52226	5.05154e-25	44200	44134
Multi	13938	scaffold14468.1|size53227	3.85022e-36	32644	32558
Multi	14202	scaffold77651.1|size18722	2.9764e-37	8043	8134
Unique	14313	scaffold77651.1|size18722	2.9764e-37	8043	8134
Multi	15436	scaffold21629.1|size42593	2.9764e-37	32904	32812
Multi	15437	scaffold107114.1|siz