In [3]:
from Bio import SeqIO
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *

In [4]:
def listtostr(strlist):
    """Join the strings in ``strlist`` with underscores.

    No trailing underscore is left behind; an empty ``strlist`` gives "".
    """
    return "_".join(strlist)
def simplifyname(orgname):
    """Abbreviate an underscore-separated organism name to its first initial.

    The first underscore-delimited token is cut down to its first character
    and the remaining tokens are kept unchanged, e.g.
    ``"Yersinia_pestis[632]"`` becomes ``"Y_pestis[632]"``.

    Args:
        orgname: Name whose tokens are separated by underscores.

    Returns:
        The abbreviated name. When the first token is empty (an empty
        ``orgname`` or one starting with "_") it stays empty instead of
        raising IndexError.
    """
    first, *rest = orgname.split("_")
    # Slicing rather than indexing so an empty first token does not raise.
    return "_".join([first[:1], *rest])

In [5]:
def getHistPercent(protstring):
    """Return the fraction of residues in ``protstring`` that are histidine ("H").

    Despite the name, the result is a fraction between 0 and 1, not a
    percentage.

    Args:
        protstring: Protein sequence given as one-letter amino-acid codes.

    Returns:
        Number of "H" residues divided by the sequence length. An empty
        sequence returns 0.0 instead of raising ZeroDivisionError.
    """
    if not protstring:
        return 0.0
    return protstring.count("H") / len(protstring)

def findHistChains(protstring):
    """Count runs of two or more consecutive "H" residues, grouped by length.

    Returns a dict whose keys are run lengths as strings (e.g. "3") and whose
    values are how many runs of that length occur in ``protstring``. Single,
    isolated "H" residues are not reported.
    """
    # First pass: length of every stretch of "H" between non-"H" residues.
    run_lengths = []
    current = 0
    for residue in protstring:
        if residue == "H":
            current += 1
        else:
            run_lengths.append(current)
            current = 0
    # A run that reaches the end of the sequence is closed here.
    run_lengths.append(current)

    # Second pass: tally only the runs longer than one residue.
    chain_counts = {}
    for length in run_lengths:
        if length > 1:
            label = str(length)
            chain_counts[label] = chain_counts.get(label, 0) + 1
    return chain_counts

def HistScore(protstring):
    """Score a protein sequence for histidine-rich regions.

    The score is the sum of three contributions:
      * +2 each time at least half of the trailing window of up to 4 residues
        is "H" (as measured by getHistPercent) while that window's cooldown
        is below 1; scoring resets the cooldown to 4.
      * +6 each time at least 0.375 of the trailing window of up to 8 residues
        is "H" while that window's cooldown is below 1; scoring resets the
        cooldown to 8.
      * for every run length reported by findHistChains, the number of runs
        times the run length squared.
    """
    window4 = ""
    window8 = ""
    cooldown4 = 0
    cooldown8 = 0
    total = 0
    for residue in protstring:
        # Slide both windows forward by one residue. Near the start of the
        # sequence they hold fewer than 4/8 residues and are evaluated as-is.
        window4 = (window4 + residue)[-4:]
        window8 = (window8 + residue)[-8:]

        # A window scores only once its cooldown has dropped below 1;
        # in every other case the cooldown ticks down by one.
        if cooldown4 < 1 and getHistPercent(window4) >= 0.5:
            total += 2
            cooldown4 = 4
        else:
            cooldown4 -= 1

        if cooldown8 < 1 and getHistPercent(window8) >= 0.375:
            total += 6
            cooldown8 = 8
        else:
            cooldown8 -= 1

    # Consecutive-histidine runs contribute quadratically in their length.
    chain_bonus = sum(
        count * int(length) ** 2
        for length, count in findHistChains(protstring).items()
    )
    return total + chain_bonus

In [6]:
# Create a selector for the chromatography media.
# NOTE(review): the original comment called these "size exclusion" media, but the
# results folder is suffixed "_ion_exchange" and each value here is a charge sign,
# so ion-exchange media are presumably meant — confirm.
# Each label maps to a sign that is multiplied with a protein's net charge when
# the input file is filtered; a negative product sends that protein to the wash.
geldict = {
    "Q Media (Triethylamine +)" : -1,
    "S Media (Sulfite -)" : 1,
}
gelselect = Dropdown(options=geldict.keys(),description="Media")
display(gelselect)

# pH at which protein charge is evaluated: 0 to 14 in steps of 0.1, starting at 7.0.
# Later cells read pHslider.value directly, so results depend on the widget state.
pHslider = widgets.FloatSlider(value=7.0,min=0,max=14,step=0.1)
display(pHslider)

# Disabled prototype: a confirm button that would capture the pH and media sign on click.
#testbutton = widgets.Button(description="Confirm")
#output = widgets.Output()
#display(testbutton, output)
#def on_button_clicked(b):
#    with output:
#        runpH = pHslider.value
#        gelcharge = int(geldict[gelselect.value])
    
#testbutton.on_click(on_button_clicked)



Dropdown(description='Media', options=('Q Media (Triethylamine +)', 'S Media (Sulfite -)'), value='Q Media (Tr…

FloatSlider(value=7.0, max=14.0)

In [62]:
#Generate a file path for the correct .fasta file
# Surrounding whitespace is stripped so a padded entry still resolves to the file.
input_filename = input("Enter File Name").strip()
datafile = os.path.join('data', input_filename)
# Remove only a trailing ".faa"; str.replace would also delete ".faa" occurring
# in the middle of the name.
if input_filename.endswith(".faa"):
    organism_name = input_filename[:-len(".faa")]
else:
    organism_name = input_filename

Enter File Name Yersinia_pestis[632].faa


In [69]:
# Results for this organism are written to data/<organism_name>_ion_exchange.
data_folder = os.path.join('data', organism_name +"_ion_exchange")

# makedirs with exist_ok replaces the check-then-create pair (no race between the
# test and the mkdir) and also creates the 'data' parent if it is missing.
os.makedirs(data_folder, exist_ok=True)

In [70]:
# Read the widget settings once so every record is judged against the same values.
run_pH = pHslider.value
media_sign = int(geldict[gelselect.value])

unfiltered, wash, fractions = [], [], []
with open(datafile) as protfile:
    for record in SeqIO.parse(protfile, "fasta"):
        unfiltered.append(record)
        net_charge = PA(str(record.seq)).charge_at_pH(run_pH)
        # A negative product of charge and media sign sends the protein to the
        # wash; zero or positive keeps it in the fractions.
        bucket = wash if net_charge * media_sign < 0 else fractions
        bucket.append(record)

# File prefix: first letter of the media label, the pH with "." replaced by "_",
# then the abbreviated organism name.
pH_tag = str(run_pH).replace(".", "_")
data_name = f"{gelselect.value[0]}{pH_tag}_{simplifyname(organism_name)}"

for suffix, records in (("_total.faa", unfiltered), ("_wash.faa", wash)):
    SeqIO.write(records, os.path.join(data_folder, data_name + suffix), "fasta")
# Kept as the cell's last expression so the notebook displays the number of
# records written to the fractions file.
SeqIO.write(fractions, os.path.join(data_folder, data_name + "_fractions.faa"), "fasta")


2273

In [71]:
#Converting modified fasta into a pd dataframe to show filter at work, preset file name used for simplicity

# The charge column's label embeds the slider pH; build it once so the dict key,
# the appends and the sort all use the same string.
charge_col = f'charge at ph {pHslider.value}'

data = {
    'sequence' : [],
    'description' : [],
    charge_col : [],
    'ID' : []
}

# These lists are not referenced anywhere in the visible cells; they are kept
# because later cells of the notebook may be out of view.
protcharge, prot_desc, prot_name, prot_id = [], [], [], []

# Re-read the fractions file written by the filtering cell, one row per record.
with open(os.path.join(data_folder, data_name + "_fractions.faa")) as protfile:
    for record in SeqIO.parse(protfile, "fasta"):
        sequence = str(record.seq)
        charge = round(PA(sequence).charge_at_pH(pHslider.value), 2)
        data['sequence'].append(sequence)
        data['description'].append(record.description)
        data[charge_col].append(charge)
        data['ID'].append(record.id)

# Sort by charge (ascending when the selected media's sign is -1, descending
# otherwise) and renumber the rows 0..n-1 in that order.
df = (
    pd.DataFrame(data)
    .sort_values(by=[charge_col], ascending=geldict[gelselect.value] == -1)
    .reset_index(drop=True)
)
df

Unnamed: 0,sequence,description,charge at ph 6.0,ID
0,MGSVIMRLMLTLFVLLFTQLFLNLAHASPQAHSSVEQKKGHVSQAS...,tr|Q0WED7|Q0WED7_YERPE Putative exported prote...,43.70,tr|Q0WED7|Q0WED7_YERPE
1,MAIVKCKPTSPGRRHVVKVVNPELHKGKPYAPLLEKLSKSGGRNNN...,sp|P60436|RL2_YERPE 50S ribosomal protein L2 O...,40.18,sp|P60436|RL2_YERPE
2,MNILLISQCHKRALNETRRILDQFAERKGDRTWQTAITQEGLNTLR...,tr|Q7CJ41|Q7CJ41_YERPE Predicted helicases OS=...,36.53,tr|Q7CJ41|Q7CJ41_YERPE
3,MSISLIQPERDLFSYQPYWAECYGTAPFLPMSREEMDILGWDSCDI...,sp|Q8CZS4|Y674_YERPE Putative UPF0313 protein ...,33.72,sp|Q8CZS4|Y674_YERPE
4,MSLTDAKIRTLKPSDKPFKVSDSHGLYLLVKPGGSRHWYLKYRISG...,tr|Q9Z3B4|Q9Z3B4_YERPE CP4-like integrase OS=Y...,33.05,tr|Q9Z3B4|Q9Z3B4_YERPE
...,...,...,...,...
2268,MKNVGFIGWRGMVGSVLMQRMIEERDFDGIRPVFFSTSQHGQAAPA...,tr|Q9ETB0|Q9ETB0_YERPE Aspartate-semialdehyde ...,0.02,tr|Q9ETB0|Q9ETB0_YERPE
2269,MEQLKAELSVILGESITRLERISEQPYAHLYAMYNRQDQAMPLLAK...,tr|Q0WH14|Q0WH14_YERPE Putative uncharacterize...,0.01,tr|Q0WH14|Q0WH14_YERPE
2270,MQISSPMGQLTNDIQQARQAYQNQMAAVNINDPEQMLTSQFTMNQY...,tr|Q74XK4|Q74XK4_YERPE Putative type III secre...,0.01,tr|Q74XK4|Q74XK4_YERPE
2271,MINEIRKDAEVRMEKCLEAFQNHISKIRTGRASPSILDGIQVEYYG...,sp|Q8ZH63|RRF_YERPE Ribosome-recycling factor ...,0.01,sp|Q8ZH63|RRF_YERPE


In [72]:
def param_of_interest(protdata):
    """Return True when the sequence's HistScore is at least 4, else False."""
    return HistScore(protdata) >= 4

nfractions = 7
fraclen = round(len(df)/nfractions)
seqhits = {}
noise = 0.10


def _write_fraction(frame, fraction_number):
    """Write the rows of ``frame`` to one fraction's FASTA file and log hits.

    Every row becomes a SeqRecord in
    ``<data_name>_fraction<fraction_number>.faa`` inside ``data_folder``.
    Rows whose sequence satisfies param_of_interest are tallied in the
    module-level ``seqhits`` under ``fraction_number`` as
    ``[hit count, [row index labels]]``.
    """
    records = []
    rows = zip(frame.index, frame['sequence'], frame['ID'], frame['description'])
    for row_label, sequence, seq_id, description in rows:
        records.append(SeqRecord(Seq(sequence), id=str(seq_id), description=description))
        if param_of_interest(sequence):
            tally = seqhits.setdefault(fraction_number, [0, []])
            tally[0] += 1
            tally[1].append(row_label)
    SeqIO.write(records, os.path.join(data_folder, data_name + "_fraction" + str(fraction_number) + ".faa"), "fasta")


# Number of rows by which each of the first nfractions-1 windows is widened on either side.
overlap = noise * fraclen
for n in range(nfractions - 1):
    # Widen the nominal window [n*fraclen, (n+1)*fraclen) and clamp it to the frame.
    fuzzymin = max(0, int(round(n*fraclen - overlap, 0)))
    fuzzymax = min(len(df), int(round((n+1)*fraclen + overlap, 0)))
    _write_fraction(df.iloc[fuzzymin:fuzzymax], n + 1)

# The last fraction takes every row from its nominal start to the end of the frame.
# NOTE(review): unlike the earlier fractions, its lower bound is not widened by
# the noise margin — confirm that this asymmetry is intended.
tempdf = df.iloc[(nfractions-1)*fraclen:]
_write_fraction(tempdf, nfractions)
tempdf

Unnamed: 0,sequence,description,charge at ph 6.0,ID
1950,MPQISRSALVPFSVKQMYQLVNDVRSYPEFLPGCTGSRVLDATENE...,tr|Q0WHU3|Q0WHU3_YERPE Putative uncharacterize...,1.18,tr|Q0WHU3|Q0WHU3_YERPE
1951,MQHITVLEEDIKHDPTAIKQRKEILARGRRLITCQLSLLQTPENYQ...,tr|Q7CKS4|Q7CKS4_YERPE Putative type III secre...,1.18,tr|Q7CKS4|Q7CKS4_YERPE
1952,MSLSLLGIYLYNLSRDNLRPEEYERIVSAYAAWTRVCREYEFNDGY...,tr|Q7CFZ1|Q7CFZ1_YERPE Putative uncharacterize...,1.17,tr|Q7CFZ1|Q7CFZ1_YERPE
1953,MLISHGVKCMKHSIDSLKELGRYGDALAMAQDLLSRSPDNASLLYK...,tr|Q7CJI7|Q7CJI7_YERPE TPR-repeat-containing p...,1.16,tr|Q7CJI7|Q7CJI7_YERPE
1954,MTILATAEALSGELESASQSKDWPRLLLLDERVAHLLVSIAKQKLS...,tr|Q74RI2|Q74RI2_YERPE Putative uncharacterize...,1.16,tr|Q74RI2|Q74RI2_YERPE
...,...,...,...,...
2268,MKNVGFIGWRGMVGSVLMQRMIEERDFDGIRPVFFSTSQHGQAAPA...,tr|Q9ETB0|Q9ETB0_YERPE Aspartate-semialdehyde ...,0.02,tr|Q9ETB0|Q9ETB0_YERPE
2269,MEQLKAELSVILGESITRLERISEQPYAHLYAMYNRQDQAMPLLAK...,tr|Q0WH14|Q0WH14_YERPE Putative uncharacterize...,0.01,tr|Q0WH14|Q0WH14_YERPE
2270,MQISSPMGQLTNDIQQARQAYQNQMAAVNINDPEQMLTSQFTMNQY...,tr|Q74XK4|Q74XK4_YERPE Putative type III secre...,0.01,tr|Q74XK4|Q74XK4_YERPE
2271,MINEIRKDAEVRMEKCLEAFQNHISKIRTGRASPSILDGIQVEYYG...,sp|Q8ZH63|RRF_YERPE Ribosome-recycling factor ...,0.01,sp|Q8ZH63|RRF_YERPE


In [68]:
# Report the hit count for each fraction, followed by the row index of every hit.
for key, (hit_count, hit_rows) in seqhits.items():
    print(f'There are {hit_count} hits in fraction {key}')
    for hit in hit_rows:
        print(f'\tHit on index {hit}')

There are 111 hits in fraction 1
	Hit on index 1
	Hit on index 2
	Hit on index 3
	Hit on index 4
	Hit on index 7
	Hit on index 8
	Hit on index 9
	Hit on index 12
	Hit on index 14
	Hit on index 16
	Hit on index 17
	Hit on index 18
	Hit on index 19
	Hit on index 20
	Hit on index 21
	Hit on index 28
	Hit on index 29
	Hit on index 30
	Hit on index 33
	Hit on index 36
	Hit on index 37
	Hit on index 42
	Hit on index 47
	Hit on index 48
	Hit on index 55
	Hit on index 56
	Hit on index 57
	Hit on index 61
	Hit on index 62
	Hit on index 64
	Hit on index 70
	Hit on index 71
	Hit on index 73
	Hit on index 80
	Hit on index 82
	Hit on index 83
	Hit on index 84
	Hit on index 86
	Hit on index 88
	Hit on index 94
	Hit on index 95
	Hit on index 98
	Hit on index 99
	Hit on index 100
	Hit on index 101
	Hit on index 102
	Hit on index 104
	Hit on index 105
	Hit on index 106
	Hit on index 107
	Hit on index 108
	Hit on index 113
	Hit on index 114
	Hit on index 117
	Hit on index 120
	Hit on index 125
	Hit on i

In [49]:
# Display the raw hit table: fraction key -> [hit count, [row index labels of the hits]].
seqhits

{'1': [24,
  [0,
   1,
   4,
   5,
   6,
   7,
   10,
   15,
   19,
   21,
   24,
   27,
   30,
   37,
   39,
   41,
   44,
   47,
   50,
   53,
   55,
   57,
   60,
   68]],
 '2': [21,
  [83,
   84,
   89,
   91,
   94,
   96,
   98,
   102,
   110,
   113,
   114,
   115,
   125,
   130,
   131,
   132,
   136,
   143,
   148,
   150,
   151]],
 '3': [19,
  [152,
   154,
   155,
   156,
   163,
   172,
   176,
   180,
   181,
   183,
   186,
   194,
   199,
   202,
   210,
   214,
   219,
   222,
   225]],
 '4': [16,
  [228,
   230,
   231,
   235,
   237,
   258,
   263,
   271,
   273,
   278,
   279,
   280,
   284,
   286,
   287,
   296]],
 '5': [27,
  [304,
   311,
   314,
   316,
   320,
   321,
   323,
   325,
   328,
   329,
   330,
   335,
   339,
   341,
   343,
   344,
   347,
   348,
   352,
   353,
   355,
   356,
   360,
   363,
   371,
   373,
   377]],
 '6': [25,
  [381,
   382,
   390,
   391,
   393,
   394,
   396,
   398,
   400,
   402,
   405,
   406,
   407,
 