In [153]:
def _count_co_best_hits(alignments, good_eval_thresh, eval_diff_thresh):
    """Count how many of the leading hits (at most three) are jointly 'best'.

    Only the first HSP of each hit is consulted. Walking the hits in the
    order given, a hit is counted while

    * its e-value is below ``good_eval_thresh``, and
    * the previous hit's e-value is not more than a factor of
      ``eval_diff_thresh`` smaller than its own
      (``previous >= eval_diff_thresh * current``).

    Counting stops at the first hit that fails either test.

    Returns an int between 0 and 3.
    """
    n_co_best = 0
    previous_expect = None
    for hit in alignments[:3]:
        expect = hit.hsps[0].expect
        if not (expect < good_eval_thresh):
            break
        if previous_expect is not None and not (previous_expect >= eval_diff_thresh * expect):
            # The previous hit is clearly better than this one, so this hit
            # (and everything after it) is not a co-best hit.
            break
        n_co_best += 1
        previous_expect = expect
    return n_co_best


def summarise_alignments(alignments_path, log_handle, good_eval_thresh=1e-20, eval_diff_thresh=1e-5):
    """Parse a BLAST XML file and keep records with one or two good hits.

    Each record is classified by its number of hits (one, two, three or more)
    and by how many of its leading hits are jointly "best" (see
    ``_count_co_best_hits``). Records with no hits, with no good first hit, or
    with three co-best hits are counted but not kept.

    Parameters
    ----------
    alignments_path : str
        Path to the BLAST XML output to parse.
    log_handle : object with a ``write(str)`` method
        Receives the same summary lines that are printed.
    good_eval_thresh : float, optional
        A hit is "good" when the e-value of its first HSP is strictly below
        this value. Defaults to the previously hard-coded 1e-20.
    eval_diff_thresh : float, optional
        Two consecutive good hits are treated as equally good when the better
        e-value is at least ``eval_diff_thresh`` times the worse one.
        Defaults to the previously hard-coded 1e-5.

    Returns
    -------
    dict
        ``{"One_good_hit": [records], "Two_good_hits": [records]}`` holding
        the parsed record objects in file order.
    """
    # Imported at call time, as in the original cell, so that defining the
    # function does not itself require Biopython.
    from Bio.Blast import NCBIXML

    dict_to_keep = {"One_good_hit": [], "Two_good_hits": []}

    alignment_counter = 0
    unique_alignments = 0      # single-hit records whose hit is good
    two_alignments_only = 0    # records with exactly two hits (good or not)
    multi_alignments = 0       # records with three or more hits (good or not)
    one_best_alignment = 0
    two_best_alignments = 0
    more_than_two_best = 0

    # The records are consumed inside the ``with`` block so the XML handle is
    # closed as soon as parsing finishes (the original never closed it).
    with open(alignments_path, 'r') as alignments_file:

        blast_alignments = NCBIXML.parse(alignments_file)

        banner = "\nAlignments parsed, summarising hits, filtering no-hit and multi-good-hit markers . . . \n"
        log_handle.write(banner)
        print(banner)

        for record in blast_alignments:

            alignment_counter += 1

            n_hits = len(record.alignments)
            if n_hits == 0:
                continue

            n_co_best = _count_co_best_hits(record.alignments, good_eval_thresh, eval_diff_thresh)

            if n_hits == 1:
                # Single-hit records are only tallied when the hit is good.
                if n_co_best == 1:
                    unique_alignments += 1
                    dict_to_keep["One_good_hit"].append(record)
                continue

            if n_hits == 2:
                two_alignments_only += 1
            else:
                multi_alignments += 1

            if n_co_best == 1:
                one_best_alignment += 1
                dict_to_keep["One_good_hit"].append(record)
            elif n_co_best == 2:
                two_best_alignments += 1
                dict_to_keep["Two_good_hits"].append(record)
            elif n_co_best == 3:
                # Three or more equally good hits: counted, never kept.
                more_than_two_best += 1

    total_processed = unique_alignments + two_alignments_only + multi_alignments

    summary_lines = [
        "Total alignments %s" % alignment_counter,
        "\nUnique_alignments %s" % unique_alignments,
        "Two_alignments_only %s" % two_alignments_only,
        "Multi_alignments %s" % multi_alignments,
        "\nOne_best_alignment %s" % one_best_alignment,
        "Two_best_alignments %s" % two_best_alignments,
        "More_than_two_best %s" % more_than_two_best,
        "\nTotal_alignments_processed %s" % total_processed,
    ]

    # Same text goes to the log and to stdout; the log is written in full first.
    for line in summary_lines:
        log_handle.write(line + "\n")

    for line in summary_lines:
        print(line)

    return dict_to_keep

In [154]:
def _parse_alt_parent(hit_def):
    """Pull the ``ALT_Parent:`` annotation apart.

    The text after ``ALT_Parent:`` is split exactly as the original code did:
    everything before the first ``(`` is the parent scaffold, then
    ``chrom:start-stop)`` follows.
    NOTE(review): the header layout ``ALT_Parent:<scaffold>(<chrom>:<start>-<stop>)``
    is inferred from these splits -- confirm against the reference FASTA headers.

    Returns ``(scaffold, chrom, start, stop)`` as strings. Raises IndexError
    when the annotation is missing or incomplete.
    """
    parent_field = hit_def.split("ALT_Parent:")[1]
    bracket_parts = parent_field.split("(")
    scaffold = bracket_parts[0]
    location_parts = bracket_parts[1].split(":")
    chrom = location_parts[0]
    span_parts = location_parts[1].split("-")
    start = span_parts[0]
    stop = span_parts[1].split(")")[0]
    return scaffold, chrom, start, stop


def _is_haplotig_with_anchor(candidate_haplotig, candidate_anchor):
    """True when the first hit is an anchored ALT hit and the second mentions a chromosome."""
    return all(["ALT" in candidate_haplotig.hit_def,
                "unanchored" not in candidate_haplotig.hit_def,
                "chr" in candidate_anchor.hit_def])


def Filter_haplotig_alignments(kept_alignments, log_handle):
    """Resolve two-hit markers where one hit is a haplotig of the other's locus.

    For every record in ``kept_alignments["Two_good_hits"]`` the two leading
    hits are inspected. A pair is confirmed when one hit carries an ``ALT``
    annotation (and is not ``unanchored``), the other hit's definition contains
    ``chr``, the haplotig's parent chromosome equals the name of the other hit,
    and the other hit's subject start lies inside the parent's anchored range.

    Parameters
    ----------
    kept_alignments : dict
        Must contain the key ``"Two_good_hits"`` mapping to an iterable of
        records exposing ``query`` and ``alignments`` (each alignment exposing
        ``hit_def`` and ``hsps[0].sbjct_start``).
    log_handle : object with a ``write(str)`` method
        Receives per-marker details and the final summary.

    Returns
    -------
    dict
        ``{marker: {"CHROM": chrom, "STRT_POS": int}}`` for confirmed pairs,
        where ``STRT_POS`` is the parent's anchored start plus the subject
        start of the haplotig hit.
    """
    print("\nFiltering haplotig alignments where possible . . . ")

    confirmed_pair = 0
    no_pair_no_haps = 0
    no_pair_no_anchored = 0
    no_pair_mismatch = 0
    total_alignments = 0

    kept_multi_alignments = {}

    for alignment in kept_alignments["Two_good_hits"]:

        total_alignments += 1

        hit_1 = alignment.alignments[0]
        hit_2 = alignment.alignments[1]

        log_handle.write("\nMarker: %s\n" % alignment.query)
        log_handle.write("Hit_1: %s, Strt: %s\n" % (hit_1.hit_def, hit_1.hsps[0].sbjct_start))
        log_handle.write("Hit_2: %s, Strt: %s\n" % (hit_2.hit_def, hit_2.hsps[0].sbjct_start))

        ## Without ALT information on either hit there is nothing to pair up.
        if "ALT" not in hit_1.hit_def and "ALT" not in hit_2.hit_def:
            no_pair_no_haps += 1
            log_handle.write("NO PAIR: No haplotigs hit\n")
            continue

        ## Work out which hit plays the haplotig role and which the anchored role.
        if _is_haplotig_with_anchor(hit_1, hit_2):
            haplotig, anchored = hit_1, hit_2
            role_message = "Hit_2 is anchored, hit_1 is a known haplotig\n"
        elif _is_haplotig_with_anchor(hit_2, hit_1):
            haplotig, anchored = hit_2, hit_1
            role_message = "Hit_1 is anchored, hit_2 is a known haplotig\n"
        else:
            log_handle.write("NO PAIR: Missing anchored alignments\n")
            no_pair_no_anchored += 1
            continue

        alt_scaff, alt_chrom, alt_chrom_strt, alt_chrom_stop = _parse_alt_parent(haplotig.hit_def)
        log_handle.write(role_message)

        haplotig_name = haplotig.hit_def.split()[0]
        anchored_name = anchored.hit_def.split()[0]
        anchored_start = anchored.hsps[0].sbjct_start

        if alt_chrom != anchored_name:
            log_handle.write("NO PAIR: Haplotig hit does not match the chromosome-anchored hit\n")
            no_pair_mismatch += 1
            continue

        if not (int(alt_chrom_strt) <= int(anchored_start) <= int(alt_chrom_stop)):
            log_handle.write("NO PAIR: Haplotig alignment does not match the anchroed alignment position\n")
            no_pair_mismatch += 1
            continue

        confirmed_pair += 1
        log_handle.write("CONFIRMED PAIR: Marker %s hits %s at position %s and also hits %s which\n" % (alignment.query, anchored_name, anchored_start, haplotig_name))
        log_handle.write("is a haplotig of scaffold %s which is anchored on %s at position %s-%s\n" % (alt_scaff, alt_chrom, alt_chrom_strt, alt_chrom_stop))

        # Always built from the haplotig hit of THIS record. The original's
        # second branch reused the hit_1_* names here, which were undefined or
        # left over from an earlier marker.
        kept_multi_alignments[alignment.query] = {
            "CHROM": alt_chrom,
            "STRT_POS": int(alt_chrom_strt) + int(haplotig.hsps[0].sbjct_start),
        }

    summary_lines = [
        "\nConfirmed alignment pairs: %s" % confirmed_pair,
        "Pairs discarded due to no haplotigs hit: %s" % no_pair_no_haps,
        "Pairs discarded due to no anchored hit: %s" % no_pair_no_anchored,
        "Pairs discarded due to mismatch between haplotig and anchored alignment: %s" % no_pair_mismatch,
        "Total alignments processed: %s" % total_alignments,
    ]

    log_handle.write("\n### SUMMARY ###\n")
    for line in summary_lines:
        log_handle.write(line + "\n")

    print("\n## Haplotig filtering summary")
    for line in summary_lines:
        print(line)

    return kept_multi_alignments

In [156]:
## Driver cell: summarise the BLAST hits, resolve haplotig pairs and write the
## marker/chromosome/position table next to the input XML.

## NOTE(review): absolute, machine-specific path -- consider moving it to a
## configuration cell.
my_alignments_path = "/home/djeffrie/Data/RADseq/Dufresnes_rana/Clines_to_Rtemp.xml"

## Both outputs share the input path minus its extension.
output_prefix = my_alignments_path.rpartition(".")[0]

## ``with`` guarantees the log is flushed and closed even if a step raises.
with open("%s_alignment_filtering.log" % output_prefix, 'w') as my_log_handle:

    ## summarise the alignments
    my_kept_alignments = summarise_alignments(my_alignments_path, my_log_handle)

    ## Filter haplotig alignments. The section header is written first so it
    ## precedes the per-marker entries in the log (it used to be written after them).
    my_log_handle.write("## FILTERING multi alignments to haplotigs\n\n")
    my_kept_multi_alignments = Filter_haplotig_alignments(my_kept_alignments, my_log_handle)

## make the output table
with open("%s_filered_aligments.tsv" % output_prefix, 'w') as filtered_alignments:

    filtered_alignments.write("#MARKER\tCHROM\tPOS\n")

    ## Markers with a single good hit: report that hit's name and subject start.
    for alignment in my_kept_alignments["One_good_hit"]:
        marker = alignment.query
        scaff = alignment.alignments[0].hit_def.split()[0]
        pos = alignment.alignments[0].hsps[0].sbjct_start
        filtered_alignments.write("%s\t%s\t%s\n" % (marker, scaff, pos))

    ## Markers rescued by the haplotig filter: report the stored position.
    for marker in my_kept_multi_alignments:
        scaff = my_kept_multi_alignments[marker]["CHROM"]
        pos = my_kept_multi_alignments[marker]["STRT_POS"]
        filtered_alignments.write("%s\t%s\t%s\n" % (marker, scaff, pos))



Alignments parsed, summarising hits, filtering no-hit and multi-good-hit markers . . . 

Total alignments 403

Unique_alignments 56
Two_alignments_only 56
Multi_alignments 280

One_best_alignment 131
Two_best_alignments 128
More_than_two_best 76

Total_alignments_processed 392

Filtering haplotig alignments where possible . . . 

## Haplotig filtering summary

Confirmed alignment pairs: 54
Pairs discarded due to no haplotigs hit: 57
Pairs discarded due to no anchored hit: 11
Pairs discarded due to mismatch between haplotig and anchored alignment: 6
Total alignments processed: 128


In [144]:
## Preview of the rows that make up the output table (printed, not written).
## The former first line, ``filtered_alignments = open("%s/")``, was dropped:
## its placeholder was never filled in, the open fails, and the handle was
## not used anywhere in this cell.

for alignment in my_kept_alignments["One_good_hit"]:
    marker = alignment.query
    scaff = alignment.alignments[0].hit_def.split()[0]
    pos = alignment.alignments[0].hsps[0].sbjct_start
    print("%s %s %s" % (marker, scaff, pos))

for marker in my_kept_multi_alignments:
    scaff = my_kept_multi_alignments[marker]["CHROM"]
    pos = my_kept_multi_alignments[marker]["STRT_POS"]

    print("%s %s %s" % (marker, scaff, pos))
    

126440 chr10 57156373
82673 chr02 411527875
43496 chr06 99164530
110288 chr04 514888495
92806 chr01 693614538
72607 chr06 21062033
56772 chr04 374869048
19509 chr10 307051708
51692 chr02 199375177
11882 chr02 199300997
42448 chr03 88209422
32019 chr09 108714699
48077 chr10 38360595
216945 chr01 337002320
60911 chr09 165747649
110563 chr04 491729870
59181 chr01 318332980
126035 chr06 98980983
115747 chr07 66148350
95072 chr02 199329053
69992 chr01 54028589
99338 chr01 78945000
57439 chr04 256402457
43747 chr02 478807101
232769 chr09 100251586
27089 chr04 412617129
32733 chr04 432670176
62683 chr06 99000299
25166 chr06 139732455
114316 chr01 240283461
17351 chr01 326451882
94381 chr02 231728634
108855 chr04 437913342
60027 chr02 199359132
65991 chr12 2805811
121150 chr01 191426647
76850 chr12 66308735
123564 chr01 753936481
92118 chr08 102739125
41723 chr01 272089275
88058 chr03 323551402
102915 chr05 398080622
115929 chr01 337026169
81096 chr06 174680381
94622 chr08 92414009
107061 chr0

In [134]:
## Display the dict returned by summarise_alignments (keys "One_good_hit"
## and "Two_good_hits", each a list of BLAST record objects).
my_kept_alignments

{'One_good_hit': [<Bio.Blast.Record.Blast at 0x7f75bef7bc50>,
  <Bio.Blast.Record.Blast at 0x7f75d96c5610>,
  <Bio.Blast.Record.Blast at 0x7f75d96c5810>,
  <Bio.Blast.Record.Blast at 0x7f75ca88f890>,
  <Bio.Blast.Record.Blast at 0x7f75c550fed0>,
  <Bio.Blast.Record.Blast at 0x7f75c553b190>,
  <Bio.Blast.Record.Blast at 0x7f75c553b290>,
  <Bio.Blast.Record.Blast at 0x7f75c5524310>,
  <Bio.Blast.Record.Blast at 0x7f75c63d8d90>,
  <Bio.Blast.Record.Blast at 0x7f75ca084890>,
  <Bio.Blast.Record.Blast at 0x7f75bedad510>,
  <Bio.Blast.Record.Blast at 0x7f75bedad810>,
  <Bio.Blast.Record.Blast at 0x7f75cbe1b6d0>,
  <Bio.Blast.Record.Blast at 0x7f75ca4cf250>,
  <Bio.Blast.Record.Blast at 0x7f75ca4f9110>,
  <Bio.Blast.Record.Blast at 0x7f75c0d72890>,
  <Bio.Blast.Record.Blast at 0x7f75c0d72b90>,
  <Bio.Blast.Record.Blast at 0x7f75c6337f50>,
  <Bio.Blast.Record.Blast at 0x7f75c1214490>,
  <Bio.Blast.Record.Blast at 0x7f75caa2c2d0>,
  <Bio.Blast.Record.Blast at 0x7f75c9fc58d0>,
  <Bio.Blast.Recor

In [133]:
## Display the marker -> {"CHROM", "STRT_POS"} mapping returned by
## Filter_haplotig_alignments.
my_kept_multi_alignments

{u'100716': {'CHROM': u'chr02', 'STRT_POS': 460805023},
 u'102915': {'CHROM': u'chr05', 'STRT_POS': 398080622},
 u'107061': {'CHROM': u'chr06', 'STRT_POS': 98969853},
 u'108855': {'CHROM': u'chr04', 'STRT_POS': 437913342},
 u'110288': {'CHROM': u'chr04', 'STRT_POS': 514888495},
 u'110563': {'CHROM': u'chr04', 'STRT_POS': 491729870},
 u'114316': {'CHROM': u'chr01', 'STRT_POS': 240283461},
 u'115747': {'CHROM': u'chr07', 'STRT_POS': 66148350},
 u'115929': {'CHROM': u'chr01', 'STRT_POS': 337026169},
 u'118398': {'CHROM': u'chr05', 'STRT_POS': 223616091},
 u'11882': {'CHROM': u'chr02', 'STRT_POS': 199300997},
 u'120135': {'CHROM': u'chr05', 'STRT_POS': 405022106},
 u'120744': {'CHROM': u'chr05', 'STRT_POS': 177446773},
 u'121150': {'CHROM': u'chr01', 'STRT_POS': 191426647},
 u'123564': {'CHROM': u'chr01', 'STRT_POS': 753936481},
 u'126035': {'CHROM': u'chr06', 'STRT_POS': 98980983},
 u'126440': {'CHROM': u'chr10', 'STRT_POS': 57156373},
 u'14933': {'CHROM': u'chr01', 'STRT_POS': 339382232}