In [1]:
from Bio import SeqIO
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *

In [2]:
def listtostr(strlist):
    """Join the strings in ``strlist`` into one string separated by underscores.

    An empty ``strlist`` produces an empty string; no trailing underscore is
    ever left on the result.
    """
    return "_".join(strlist)
def simplifyname(orgname):
    """Abbreviate an underscore-separated name to the initial of its first token.

    The first underscore-delimited token is cut down to its first character and
    every following token is kept unchanged, e.g. ``"Escherichia_coli"`` becomes
    ``"E_coli"``.

    Parameters
    ----------
    orgname : str
        Underscore-separated name.

    Returns
    -------
    str
        The abbreviated name. An empty first token (empty input or a leading
        underscore) is kept as an empty token instead of raising ``IndexError``.
    """
    # str.split always returns at least one element, so this unpacking cannot fail
    first, *rest = orgname.split("_")
    # Slicing instead of indexing tolerates an empty first token
    return "_".join([first[:1], *rest])

In [3]:
def getHistPercent(protstring):
    """Return the fraction (0.0 to 1.0) of residues in ``protstring`` that are "H".

    Parameters
    ----------
    protstring : str
        Protein sequence in one-letter code.

    Returns
    -------
    float
        Number of "H" residues divided by the sequence length. An empty
        sequence returns 0.0 instead of raising ``ZeroDivisionError``.
    """
    if len(protstring) == 0:
        return 0.0
    Hcount = sum(1 for AA in protstring if AA == "H")
    return Hcount / len(protstring)

def findHistChains(protstring):
    """Count runs of consecutive "H" residues in ``protstring``.

    Only runs of two or more "H" are reported. The result maps the run length,
    as a string, to the number of runs of exactly that length, e.g.
    ``"HHAHH"`` gives ``{"2": 2}``.
    """
    run_counts = {}
    run = 0
    for residue in protstring:
        if residue == "H":
            run += 1
            continue
        # A non-H residue closes whatever run was in progress
        if run > 1:
            key = str(run)
            run_counts[key] = run_counts.get(key, 0) + 1
        run = 0
    # A run that reaches the end of the sequence is closed here
    if run > 1:
        key = str(run)
        run_counts[key] = run_counts.get(key, 0) + 1
    return run_counts

def HistScore(protstring):
    """Score how histidine-rich ``protstring`` is.

    The score is the sum of three contributions:

    * +2 each time the trailing 4-residue window is at least 50% "H";
    * +6 each time the trailing 8-residue window is at least 37.5% "H";
    * for every run of consecutive "H" found by ``findHistChains``, the run
      length squared.

    After a window scores, its cooldown counter is set to the window size and
    decremented once per following residue; the window can score again only
    when the counter is below 1.

    A window is evaluated only once it holds its full 4 or 8 residues.
    Otherwise a single "H" among the first residues (e.g. "MH...") counts as
    50-100% of a partial window and collects both bonuses.

    Parameters
    ----------
    protstring : str
        Protein sequence in one-letter code.

    Returns
    -------
    int
        The accumulated score (0 for an empty sequence).
    """
    last4 = ""
    last8 = ""
    CD4 = 0
    CD8 = 0
    score = 0
    for AA in protstring:
        # Slide both windows forward by one residue, keeping at most 4 / 8 residues
        last4 = (last4 + AA)[-4:]
        last8 = (last8 + AA)[-8:]

        # Scan for non-chain H groupings, full windows only
        if len(last4) == 4 and getHistPercent(last4) >= 0.5 and CD4 < 1:
            score += 2
            CD4 = 4
        else:
            CD4 -= 1
        if len(last8) == 8 and getHistPercent(last8) >= 0.375 and CD8 < 1:
            score += 6
            CD8 = 8
        else:
            CD8 -= 1

    # Scan for chains: each run of consecutive H adds its length squared
    histChains = findHistChains(protstring)
    for key in histChains:
        score += histChains[key] * (int(key) ** 2)
    return score

In [4]:
#Create a selector for the intended chromatography media.
#Each label maps to a sign (-1 or 1); the filtering cell multiplies that sign with a
#protein's net charge and sends the protein to the wash when the product is negative.
#NOTE(review): the mapped values are charge signs, so this looks like ion-exchange
#media rather than "size exclusion" media as originally stated - confirm the wording.
geldict = {
    "Q Media (Triethylamine +)" : -1,
    "S Media (Sulfite -)" : 1,
}
gelselect = Dropdown(options=geldict.keys(),description="Media")
display(gelselect)

#Slider for the pH at which protein charges are evaluated (0 to 14 in 0.1 steps, default 7.0)
pHslider = widgets.FloatSlider(value=7.0,min=0,max=14,step=0.1)
display(pHslider)

#Disabled draft of a "Confirm" button that would read the widget values on click
#testbutton = widgets.Button(description="Confirm")
#output = widgets.Output()
#display(testbutton, output)
#def on_button_clicked(b):
#    with output:
#        runpH = pHslider.value
#        gelcharge = int(geldict[gelselect.value])
    
#testbutton.on_click(on_button_clicked)



Dropdown(description='Media', options=('Q Media (Triethylamine +)', 'S Media (Sulfite -)'), value='Q Media (Tr…

FloatSlider(value=7.0, max=14.0)

In [56]:
#Generate a file path for the correct .fasta file
#Surrounding whitespace is stripped so a stray space typed at the prompt cannot corrupt the paths
input_filename = input("Enter Input Name").strip()
output_name = input("Enter Output Name").strip()
if not input_filename or not output_name:
    raise ValueError("Both an input file name and an output name are required")
datafile = os.path.join('data', input_filename)
#Fail here with a clear message instead of later when the file is opened
if not os.path.isfile(datafile):
    raise FileNotFoundError(f"Input FASTA file not found: {datafile}")
#organism_name = input_filename.replace(".faa","")

Enter Input Name E_coli.faa
Enter Output Name E_coli_QpH7


In [52]:
#Folder that receives every FASTA file written for this run
data_folder = os.path.join('data', output_name)

#exist_ok=True makes the cell safe to re-run; missing parent folders are created as well
os.makedirs(data_folder, exist_ok=True)

In [53]:
#Split the input proteins into "wash" and "fractions" by net charge.
#The widget state is read once so every record is classified with the same pH and media sign.
run_pH = pHslider.value
gel_sign = int(geldict[gelselect.value])

unfiltered = []
wash = []
fractions = []
with open(datafile) as protfile:
    for record in SeqIO.parse(protfile,"fasta"):
        unfiltered.append(record)
        #"X" residues are replaced by "Q" before the charge calculation
        #NOTE(review): presumably to stand in for unknown residues with an uncharged one - confirm
        sequence = str(record.seq).replace("X","Q")
        protparams = PA(sequence)
        #A negative product of net charge and media sign sends the protein to the wash
        if protparams.charge_at_pH(run_pH)*gel_sign < 0:
            wash.append(record)
        else:
            fractions.append(record)

#data_name = gelselect.value[0] + str(pHslider.value).replace(".","_") + "_" + simplifyname(organism_name)
data_name = output_name
SeqIO.write(unfiltered,os.path.join(data_folder,data_name+"_total.faa"),"fasta")
SeqIO.write(wash,os.path.join(data_folder,data_name+"_wash.faa"),"fasta")
#Last expression: the cell displays the number of records written to the fractions file
SeqIO.write(fractions,os.path.join(data_folder,data_name+"_fractions.faa"),"fasta")


393

In [54]:
#Converting modified fasta into a pd dataframe to show filter at work, preset file name used for simplicity

#Read the slider once so the column label and every charge value use the same pH
run_pH = pHslider.value
charge_col = f'charge at ph {run_pH}'

data = {
    'sequence' : [],
    'description' : [],
    charge_col : [],
    'ID' : []
}

with open(os.path.join(data_folder,data_name+"_fractions.faa")) as protfile:
    for record in SeqIO.parse(protfile,"fasta"):
        sequence = str(record.seq)
        protparams = PA(sequence)
        charge = round(protparams.charge_at_pH(run_pH),2)
        data['sequence'].append(sequence)
        data['description'].append(record.description)
        data[charge_col].append(charge)
        data['ID'].append(record.id)

#Sort ascending when the selected media maps to -1, descending otherwise
sort_ascending = int(geldict[gelselect.value]) == -1
df = (
    pd.DataFrame.from_dict(data)
    .sort_values(by=[charge_col], ascending=sort_ascending)
    .reset_index(drop=True)
)
df

Unnamed: 0,sequence,description,charge at ph 7.0,ID
0,MIAKHVPMRSLGKSDFAGLANYITDAQSKDHRLGHVQATNCEAGSI...,sp|Q00192|TRAI5_ECOLX Protein TraI OS=Escheric...,41.11,sp|Q00192|TRAI5_ECOLX
1,MGLSHKERREYGRSYKRDNNRTAPFSCWPRWIRPGGWSGHGGMSGT...,sp|P08098|MOBA1_ECOLX Mobilization protein A O...,33.68,sp|P08098|MOBA1_ECOLX
2,MIAKHVPMRSIKKSDFAELVKYITDEQGKTERLGHVRVTNCEANTL...,sp|Q00191|TRAI6_ECOLX Protein TraI OS=Escheric...,33.36,sp|Q00191|TRAI6_ECOLX
3,MTKTSKLDALRAATSREDLAKILDVKLVFLTNVLYRIGSDNQYTQF...,sp|P21325|RT67_ECOLX Retron Ec67 protein OS=Es...,31.11,sp|P21325|RT67_ECOLX
4,MSHADMSDSSGFNEAAAAFSWNGPKKAINPYLDPAEVAPFSALSNL...,sp|P21312|ENDOR_ECOLX Probable replication end...,30.42,sp|P21312|ENDOR_ECOLX
...,...,...,...,...
388,MKKVNHWINGKNVAGNDYFLTTNPATGEVLADVASGGEAEINQAVA...,sp|P42269|HPCC_ECOLX 5-carboxymethyl-2-hydroxy...,0.21,sp|P42269|HPCC_ECOLX
389,MNRTVMMALVIIFLDAMGIGIIMPVLPALLREFVGKANVAENYGVL...,sp|Q07282|TCR5_ECOLX Tetracycline resistance p...,0.19,sp|Q07282|TCR5_ECOLX
390,MSTILKWAGNKTAIMSELKKHLPAGPRLVEPFAGSCAVMMATDYPS...,sp|P21311|DMA7_ECOLX Retron Ec67 DNA adenine m...,0.17,sp|P21311|DMA7_ECOLX
391,MKLFKSILLIAACHAAQASAAIDINADPNLTGAAPLTGILNGQQSD...,sp|Q03961|KPSD1_ECOLX Polysialic acid transpor...,0.06,sp|Q03961|KPSD1_ECOLX


In [55]:
def param_of_interest(protdata):
    """Return True when the HistScore of ``protdata`` is at least 4, otherwise False."""
    return HistScore(protdata) >= 4

def _write_fraction(fraction_df, fraction_number):
    """Write one fraction to its FASTA file and return the row labels of its hits.

    Every row of ``fraction_df`` is turned into a SeqRecord and written to
    ``<data_name>_fraction<fraction_number>.faa`` inside ``data_folder``. A row
    is a hit when ``param_of_interest`` returns True for its sequence.
    """
    records = []
    hit_indices = []
    rows = zip(
        fraction_df.index,
        fraction_df['sequence'],
        fraction_df['ID'],
        fraction_df['description'],
    )
    for index, sequence, seq_id, description in rows:
        records.append(SeqRecord(Seq(sequence),id=str(seq_id),description=description))
        if param_of_interest(sequence):
            hit_indices.append(index)
    SeqIO.write(records,os.path.join(data_folder,data_name+"_fraction"+ str(fraction_number) + ".faa"),"fasta")
    return hit_indices

nfractions = 7
fraclen = round(len(df)/nfractions)
#seqhits maps fraction number -> [hit count, [row labels of the hits]]; fractions without hits get no entry
seqhits = {}
#Each of the first nfractions-1 fractions is widened by this share of fraclen on both sides
noise = 0.10
for n in range(nfractions - 1):
    #Generate min/max range values, clamped to the bounds of df
    fuzzymin = max(0, int(round(n*fraclen-noise*fraclen,0)))
    fuzzymax = min(len(df), int(round((n+1)*fraclen + noise*fraclen,0)))
    hits = _write_fraction(df.iloc[fuzzymin:fuzzymax], n+1)
    if hits:
        seqhits[n+1] = [len(hits), hits]

#The last fraction takes every remaining row
#NOTE(review): unlike the other fractions it has no fuzzy lower bound - confirm this is intended
tempdf = df.iloc[(nfractions-1)*fraclen:]
hits = _write_fraction(tempdf, nfractions)
if hits:
    seqhits[nfractions] = [len(hits), hits]
tempdf

Unnamed: 0,sequence,description,charge at ph 7.0,ID
336,IQNDTGLPEESICSFRFLWRSTSVDDAVQIHWANGNIQVIRPVRGI...,sp|P04481|YPCB_ECOLX Uncharacterized protein i...,1.35,sp|P04481|YPCB_ECOLX
337,MVLPDIKKGKDMINILPFEIISRNTKTLLITYISSVDITHEGMKKV...,sp|P23184|MCBB_ECOLX Microcin B17-processing p...,1.34,sp|P23184|MCBB_ECOLX
338,MNMNELVFIDDFDNHVVIMSEVVMRLNSYRQTHYTSTESGGTLIGE...,sp|P0DTF3|CAP3_ECOLX CD-NTase-associated prote...,1.31,sp|P0DTF3|CAP3_ECOLX
339,MKRINKTAEDQFLINFKAQNPNGTWDEFRNHEQGILYKRLKQHICN...,sp|P0DV93|HNH83_ECOLX Retron Ec83 putative HNH...,1.23,sp|P0DV93|HNH83_ECOLX
340,MSKHELSLVEVTHYTDPEVLAIVKDFHVRGNFASLPEFAERTFVSA...,sp|P23185|MCBC_ECOLX Microcin B17-processing p...,1.1,sp|P23185|MCBC_ECOLX
341,MGSACSAESAHSCQHVLPVAPGCGAMRMHDRLHQNVDVWKLGNQTW...,sp|P21322|YR7H_ECOLX Protein ORFh in retron Ec...,1.07,sp|P21322|YR7H_ECOLX
342,MVDCTKPIAESNNFIILDKYNPDWKITESYQSEGDLERELIQDLVN...,sp|P17224|T1RP_ECOLX Type I restriction enzyme...,1.06,sp|P17224|T1RP_ECOLX
343,MLNIIHRLKSGMFPALFFLTSASVLAHPLTIPPGHWLEGMAVGVTE...,sp|P33784|FAEJ_ECOLX K88 minor fimbrial subuni...,1.06,sp|P33784|FAEJ_ECOLX
344,MSNKKQSNRLTEQHKLSQGVIGIFGDYAKAHDLAVGEVSKLVKKAL...,sp|P00642|T2E1_ECOLX Type II restriction enzym...,1.05,sp|P00642|T2E1_ECOLX
345,MNNTDTLEKIIRHQKNKDPAYPFQEHLLMQLCIRANKRMQDNISEF...,sp|P62088|PRSX_ECOLX HTH-type transcriptional ...,1.03,sp|P62088|PRSX_ECOLX


In [None]:
#Print the hit count of every fraction that has hits, followed by the row label of each hit
for fraction_number, (hit_count, hit_indices) in seqhits.items():
    print(f'There are {hit_count} hits in fraction {fraction_number}')
    for hit in hit_indices:
        print(f'\tHit on index {hit}')

In [None]:
#Display the raw hit table: fraction number -> [hit count, [row labels of the hits]]
seqhits