## Here I want to filter the tblastx hits from mapping M. annua transcripts to the M. huetii genome

Test, is the highest scoring hsp of an alignment 5e10 higher in its evalue to the next one, and is the alignment > 70% of the query length. 

In [119]:
from Bio.Blast import NCBIXML
import sys

def Blast_filter(in_path, eval_thresh = 1e-100, len_thresh = 0.1, out_path = None):

    """
    
    \nUsage: Blast_filter.py <full_in_path> [eval_thresh] [len_thresh] [out_path]
    
    eval_thresh -  The threshold value for the blast hit (Default: 1e-100)
    len_thresh  -  The percentage of the query length which must match (Default 0.1)
    
    """
    
    Kept = {}

    blastouts = NCBIXML.parse(open(in_path, 'r'))

    if out_path == None:
        output_path = "%s/Kept_blast_hits.txt" % in_path.rpartition("/")[0]
    
    #eval_thresh = 1e-100
    #len_thresh = 0.1
    
    eval_passes = 0
    length_passes = 0
    N_records = 0 

    for record in blastouts:

        N_records += 1

        ID = record.query
        query_length = record.query_length

        eval_test = False
        align_length_test = False

        if len(record.alignments) == 1:
            evalue_1 = record.alignments[0].hsps[0].expect

            if evalue_1 < eval_thresh:
                eval_test = True
                eval_passes += 1

        elif len(record.alignments) > 1:

            evalue_2 = record.alignments[1].hsps[0].expect*1e-10

            if evalue_1 < evalue_2*1e-10 and evalue_1 < eval_thresh:
                eval_test = True
                eval_passes += 1

        query_length = record.query_length
        align_length=  record.alignments[0].hsps[0].align_length

        if align_length > len_thresh*query_length:
            align_length_test = True
            align_length_test += 1

        if eval_test and align_length_test:
            Kept[ID] = record.alignments[0].hit_def

    print "%s Blast hits passed the filters" % len(Kept)
    
    ## output the kept query-subject pairs to a file
    
    print "Writing to file"
    
    outfile = open(output_path, 'w')
    
    for query in Kept:
        outfile.write("%s\t%s\n" % (query, Kept[query]))
                      
    outfile.close()
                      
    
    

In [118]:
help(Blast_filter)

Help on function Blast_filter in module __main__:

Blast_filter(in_path, eval_thresh=1e-100, len_thresh=0.1, out_path=None)
        
    Usage: Blast_filter.py <full_in_path> <eval_thresh> <len_thresh> [out_path]
        
        eval_thresh -  The threshold value for the blast hit (Default: 1e-100)
        len_thresh  -  The percentage of the query length which must match (Default 0.1)



In [120]:
## Cline

#input_path = "/Users/danieljeffries/Data/M_huetii_genome/M_annua_homologs/Transcript_subset_to_huetii_purged_tblastx.xml"

if len(sys.argv) == 1:
    sys.exit("Only one argument given\n%s" %  Blast_filter.__doc__)

elif len(sys.argv) == 2:
    in_path = sys.argv[1]
    
    Blast_filter(in_path)
    
elif len(sys.argv) == 4:
    in_path = sys.argv[1]
    expect_thresh = sys.argv[2]
    length_thresh = sys.argv[3]
    
    Blast_filter(in_path, expect_thresh, length_thresh)
    
elif len(sys.argv) == 5:
    in_path = sys.argv[1]
    expect_thresh = sys.argv[2]
    length_thresh = sys.argv[3]
    output_path = sys.argv[4]
    
    Blast_filter(in_path, expect_thresh, length_thresh, output_path)

else:
    sys.exit("Too many arguments\n%s" %  Blast_filter.__doc__)




SystemExit: Too many arguments

    
    
Usage: Blast_filter.py <full_in_path> [eval_thresh] [len_thresh] [out_path]
    
    eval_thresh -  The threshold value for the blast hit (Default: 1e-100)
    len_thresh  -  The percentage of the query length which must match (Default 0.1)
    
    

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [106]:
Kept

{u'comp16_c0_seq1_m.2': u'ctg110',
 u'comp40_c0_seq1_m.3': u'ctg1431',
 u'comp44_c0_seq1_m.4': u'ctg321',
 u'comp46_c0_seq1_m.7': u'ctg791'}