In [1]:
import pandas as pd
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

In [2]:
# Forward-strand motif of interest.
pattern = "GGTTGGCGCTGGGGGAAACCAAGCTGGACAGGGTGG"

# Reverse complement of `pattern`; this is the sequence the reads are
# actually aligned against in get_alignments.
rpattern = "CCACCCTGTCCAGCTTGGTTTCCCCCAGCGCCAACC"

# Both orientations, forward first.
patterns = [pattern, rpattern]

In [9]:
def process_file(filename):
    """Load tab-separated read records from ``filename`` into a DataFrame.

    For every non-blank line, the 0-based fields 2, 3, 7 and 9 become the
    ``chr``, ``start``, ``stop`` and ``seq`` columns; a ``len`` column holding
    the length of ``seq`` is appended.

    Parameters
    ----------
    filename : str
        Path of the tab-separated text file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``chr`` (str), ``start`` (int), ``stop`` (int), ``seq`` (str)
        and ``len`` (int), one row per non-blank input line, default index.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than 10 fields.
    ValueError
        If field 3 or field 7 is not an integer.
    """
    # NOTE(review): if the input is SAM-formatted text, field 7 is the mate
    # position (PNEXT), not the end coordinate of the read -- confirm that
    # "stop" is really what is wanted here.
    records = []
    with open(filename, "r") as handle:
        # Stream the file instead of materialising every line up front.
        for line in handle:
            if not line.strip():
                # Blank lines (e.g. a trailing newline at EOF) carry no record.
                continue
            # Drop the line terminator first: when the sequence is the last
            # field on the line it would otherwise keep the "\n" and inflate
            # the "len" column by one.
            fields = line.rstrip("\r\n").split("\t")
            records.append((fields[2], int(fields[3]), int(fields[7]), fields[9]))
    df = pd.DataFrame(records, columns=["chr", "start", "stop", "seq"])
    df["len"] = df["seq"].str.len()
    return df

In [4]:
def get_alignments(df, min_score=30, offset=11):
    """Classify each read by its local alignment against ``rpattern``.

    Every sequence in ``df["seq"]`` is locally aligned to the module-level
    ``rpattern`` with ``pairwise2.align.localxs`` (identical characters score
    1, everything else 0; gap open and gap extend penalties are both -5).

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``seq`` column of strings. Rows are addressed as
        ``df["seq"][r]`` for ``r`` in ``range(len(df))``, so the frame is
        expected to carry the default 0..n-1 index.
    min_score : int or float, optional
        Minimum alignment score for a unique alignment to be accepted
        (default 30, the previously hard-coded threshold).
    offset : int, optional
        Value added to the alignment start to obtain the reported position
        (default 11, the previously hard-coded offset).

    Returns
    -------
    list
        One entry per row, in row order:
        ``"MUL"`` if more than one best alignment was returned,
        ``"BAD"`` if no alignment was returned or the single alignment scores
        below ``min_score``, otherwise the integer ``start + offset``.
    """
    # NOTE(review): pairwise2 reports ``start`` as an index into the gapped
    # alignment strings; it coincides with an index into the raw read only
    # when no gap columns precede it -- confirm this holds for the data.
    muts = []
    for r in range(len(df)):
        alignments = pairwise2.align.localxs(rpattern, df["seq"][r], -5, -5)
        if not alignments:
            # pairwise2 can hand back an empty list (e.g. for an empty read);
            # indexing alignments[0] would raise IndexError, so treat it as
            # a read with no usable match.
            muts.append("BAD")
        elif len(alignments) > 1:
            # Several equally good placements: ambiguous, do not pick one.
            muts.append("MUL")
        elif alignments[0].score < min_score:
            muts.append("BAD")
        else:
            muts.append(alignments[0].start + offset)
    return muts

In [5]:
def aa_trans(seq):
    """Map a three-letter triplet to its amino-acid label.

    Recognised triplets are ``AGG`` -> ``"pro"``, ``AGC`` -> ``"ala"``,
    ``ACC`` -> ``"gly"`` and ``ACG`` -> ``"arg"``; any other value yields
    ``"other"``.

    NOTE(review): these look like reverse-strand triplets (e.g. AGG is the
    reverse complement of CCT) -- confirm against the intended reading frame.
    """
    known_triplets = (
        ("AGG", "pro"),
        ("AGC", "ala"),
        ("ACC", "gly"),
        ("ACG", "arg"),
    )
    for triplet, label in known_triplets:
        if seq == triplet:
            return label
    return "other"

In [6]:
def get_aas(df, muts):
    """Turn per-read positions into amino-acid labels.

    ``muts`` holds one entry per row of ``df``. String entries are passed
    through unchanged; any other entry is used as a start index, and the
    three characters of ``df["seq"][row]`` from that index are translated
    with ``aa_trans``.

    Returns a list with one label per row of ``df``.
    """
    def label_for(row):
        position = muts[row]
        if type(position) is str:
            # Status markers are reported as-is.
            return position
        triplet = df["seq"][row][position:position + 3]
        return aa_trans(triplet)

    return [label_for(row) for row in range(len(df))]

In [26]:
def calc_final():
    """Run the whole pipeline over the sample files and print the results.

    For every sample number 101..120 the input path is assembled (the ``S``
    number in the file name is the sample number plus 20), printed, loaded
    with ``process_file``, aligned with ``get_alignments`` and translated
    with ``get_aas``; the resulting label list is printed. Returns ``None``.
    """
    path_head = "target_region/CAP_18170X"
    path_mid = "_200925_A00421_0244_AHKML5DSXY_S"
    path_tail = "_L002_R1_001.rmdup.bam.temp"
    for sample_no in range(101, 121):
        path = "".join(
            (path_head, str(sample_no), path_mid, str(sample_no + 20), path_tail)
        )
        print(path)
        reads = process_file(path)
        hit_positions = get_alignments(reads)
        print(get_aas(reads, hit_positions))

In [27]:
# Run the pipeline over every sample file; for each file this prints the path
# followed by the list of per-read labels.
calc_final()

target_region/CAP_18170X101_200925_A00421_0244_AHKML5DSXY_S121_L002_R1_001.rmdup.bam.temp
['BAD', 'MUL', 'MUL', 'pro', 'pro', 'pro', 'pro', 'pro', 'BAD']
target_region/CAP_18170X102_200925_A00421_0244_AHKML5DSXY_S122_L002_R1_001.rmdup.bam.temp
['BAD', 'pro', 'BAD', 'BAD', 'BAD', 'BAD', 'BAD', 'pro']
target_region/CAP_18170X103_200925_A00421_0244_AHKML5DSXY_S123_L002_R1_001.rmdup.bam.temp
['other', 'BAD']
target_region/CAP_18170X104_200925_A00421_0244_AHKML5DSXY_S124_L002_R1_001.rmdup.bam.temp
['BAD', 'MUL', 'MUL', 'pro', 'pro', 'pro', 'pro', 'pro', 'other', 'BAD']
target_region/CAP_18170X105_200925_A00421_0244_AHKML5DSXY_S125_L002_R1_001.rmdup.bam.temp
['BAD', 'ala', 'ala', 'ala', 'ala', 'BAD', 'BAD']
target_region/CAP_18170X106_200925_A00421_0244_AHKML5DSXY_S126_L002_R1_001.rmdup.bam.temp
['ala', 'ala', 'ala', 'ala', 'other', 'BAD']
target_region/CAP_18170X107_200925_A00421_0244_AHKML5DSXY_S127_L002_R1_001.rmdup.bam.temp
['BAD', 'pro', 'gly']
target_region/CAP_18170X108_200925_A00421_