```
python ~/sag_mg_recruit.py mg_template.csv sag_template.csv --outdir recruit_results/ --cores 20 --log recruit_test.log
```

```
python ~/sag_mg_recruit.py 160722_maria_mgs.csv 160722_maria_sags.csv --outdir 1609_smr_maria/ --cores 20 --log SMRTEST_922.txt --minlen 74
```

In [4]:
from __future__ import print_function
from __future__ import division
import pysam
import os.path as op
import logging

logger = logging.getLogger(__name__)

In [5]:
def read_overlap_pctid(l, pctid, min_len, overlap=0):
    real_len = l.infer_query_length()
    aln_len = l.query_alignment_length
    mismatch = l.get_tag("NM")

    aln_overlap = (aln_len / real_len) * 100
    aln_pctid = ((aln_len - mismatch) / aln_len) * 100
    # print("{0} >= {1}".format(aln_overlap, overlap))
    # print("{0} >= {1}".format(aln_pctid, pctid))
    # print("{0} >= {1}".format(aln_len, min_len))
    if aln_overlap >= overlap and aln_pctid >= pctid and aln_len >= min_len:
        return True
    else:
        return False

In [6]:
bam = "../sept_data/LVP2_vs_AAA036-G05-Sulfurovum.bam"
outbam = "../sept_data/LVP2_vs_AAA036-G05-Sulfurovum_OUTBAM.bam"
overlap = 0
pctid = 95
minlen = 75

In [11]:
with pysam.AlignmentFile(bam, "rb", check_sq=False) as ih, pysam.AlignmentFile(outbam, "wb", template=ih) as oh:
    good = 0
    total = 0
    name = op.basename(outbam).split(".")[0]
    outfile = ".".join(outbam.split(".")[:-1]) + ".aln_count"
    for i, l in enumerate(ih):
        if l.is_duplicate:
            continue
        if i == 20:
            break
        total += 1
        print("total: {}".format(total))
        #md = l.get_tag("MD")
        #match = _match_len(md)
        #pct_match = (match)/l.rlen * 100

        #if pct_match > pctid:
        #    good += 1
        #    oh.write(l)
        if read_overlap_pctid(l, overlap, pctid, minlen):
            good += 1
        print("good: {}".format(good))

total: 1
43.1279620853 >= 75
87.9120879121 >= 0
91 >= 95
good: 0
total: 2
43.1279620853 >= 75
87.9120879121 >= 0
91 >= 95
good: 0
total: 3
43.1279620853 >= 75
87.9120879121 >= 0
91 >= 95
good: 0
total: 4
28.640776699 >= 75
91.5254237288 >= 0
59 >= 95
good: 0
total: 5
32.2404371585 >= 75
91.5254237288 >= 0
59 >= 95
good: 0
total: 6
28.640776699 >= 75
91.5254237288 >= 0
59 >= 95
good: 0
total: 7
97.2972972973 >= 75
91.6666666667 >= 0
144 >= 95
good: 1
total: 8
97.2972972973 >= 75
91.6666666667 >= 0
144 >= 95
good: 2
total: 9
28.8343558282 >= 75
93.6170212766 >= 0
47 >= 95
good: 2
total: 10
24.3523316062 >= 75
93.6170212766 >= 0
47 >= 95
good: 2
total: 11
28.8343558282 >= 75
93.6170212766 >= 0
47 >= 95
good: 2
total: 12
24.3523316062 >= 75
93.6170212766 >= 0
47 >= 95
good: 2
total: 13
24.3523316062 >= 75
93.6170212766 >= 0
47 >= 95
good: 2
total: 14
28.8343558282 >= 75
93.6170212766 >= 0
47 >= 95
good: 2
total: 15
24.3523316062 >= 75
93.6170212766 >= 0
47 >= 95
good: 2
total: 16
24.352331

In [7]:
def filter_bam(bam, outbam, overlap=95, pctid=95, minlen=150):
    with pysam.AlignmentFile(bam, "rb", check_sq=False) as ih, pysam.AlignmentFile(outbam, "wb", template=ih) as oh:
        good = 0
        total = 0
        name = op.basename(outbam).split(".")[0]
        outfile = ".".join(outbam.split(".")[:-1]) + ".aln_count"
        for i, l in enumerate(ih):
            if l.is_duplicate:
                continue

            total += 1
            #md = l.get_tag("MD")
            #match = _match_len(md)
            #pct_match = (match)/l.rlen * 100

            #if pct_match > pctid:
            #    good += 1
            #    oh.write(l)
            if read_overlap_pctid(l, overlap, pctid, minlen):
                good += 1
                oh.write(l)

        with open(outfile, "w") as oh:
            print(name, good, file=oh)
        logger.info("for %s, there were %s good read alignments out of %s total alignments" % (bam, good, total))
    return outbam

In [8]:
new = filter_bam(bam, outbam, overlap=0, pctid=95, minlen=50)

In [9]:
new

'../sept_data/LVP2_vs_AAA036-G05-Sulfurovum_OUTBAM.bam'

In [11]:
!open ../sept_data/LVP2_vs_AAA036-G05-Sulfurovum_OUTBAM.bam -a textedit