In [1]:
from Bio import SeqIO
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *

In [46]:
# Create a selector for the intended size-exclusion media.
# geldict maps each medium's display label to a two-element list [lower, upper].
# The labels quote the range in kDa while the stored numbers are 1000x larger
# (i.e. presumably daltons); later cells compare them against protein molecular weights.
# NOTE(review): the labels mix "kDa" and "kDA"; they are used verbatim as dropdown
# options and as lookup keys, so they are left untouched here.
geldict = {
    "Bio-P 0.1-1.8 kDa" : [100,1800],
    "Bio-P 0.8-4.0 kDa" : [800,4000],
    "Bio-P 1.0-6.0 kDa" : [1000,6000],
    "Bio-P 1.5-20.0 kDa" : [1500,20000],
    "Bio-P 2.5-40.0 kDA" : [2500,40000],
    "Bio-P 3.0-60.0 kDa" : [3000,60000],
    "Bio-P 5.0-100 kDa" : [5000,100000],
    "S-X 0.4-14.0 kDa" : [400,14000],
    "S-X <2.0 kDA" : [0,2000],
    "S-X <0.4 kDA" : [0,400],
    "Bio-A 10.0 - 500 kDA" : [10000,500000],
    "Bio-A 10.0 - 1500 kDA" : [10000,1500000],
}
# The dropdown offers the dictionary keys; gelselect.value is read by a later cell
# to look the chosen range back up in geldict.
gelselect = Dropdown(options=geldict.keys(),description="Media")
display(gelselect)



Dropdown(description='Media', options=('Bio-P 0.1-1.8 kDa', 'Bio-P 0.8-4.0 kDa', 'Bio-P 1.0-6.0 kDa', 'Bio-P 1…

In [3]:
def listtostr(strlist):
    """Join the strings in `strlist` with underscores.

    Args:
        strlist: iterable of strings.

    Returns:
        A single string with "_" between consecutive items and no trailing
        separator; "" for an empty input.
    """
    # str.join is linear and needs no trailing-separator trimming.
    return "_".join(strlist)
def simplifyname(orgname):
    """Abbreviate the first underscore-separated token of `orgname` to its initial.

    Example: "Escherichia_coli" -> "E_coli".

    Args:
        orgname: underscore-separated name string.

    Returns:
        The name with its first token reduced to one character and the
        remaining tokens unchanged. An empty first token (empty string or a
        leading underscore) stays empty instead of raising IndexError.
    """
    first, *rest = orgname.split("_")
    # Slicing ([:1]) rather than indexing ([0]) tolerates an empty first token.
    return "_".join([first[:1]] + rest)
def keytoindex(inputdict, string):
    """Return the zero-based position of the key `string` within `inputdict`.

    Keys are visited in the dictionary's iteration order. Returns None when
    no key equals `string`.
    """
    for position, candidate in enumerate(inputdict.keys()):
        if candidate == string:
            return position
    return None

In [4]:
def getHistPercent(protstring):
    """Return the fraction of residues in `protstring` that are histidine ("H").

    Despite the name, the result is a fraction in [0, 1], not a percentage.

    Args:
        protstring: sized sequence of one-letter residue codes.

    Returns:
        Number of "H" items divided by the sequence length; 0.0 for an empty
        sequence (previously a ZeroDivisionError).
    """
    total = len(protstring)
    if total == 0:
        return 0.0
    return sum(1 for AA in protstring if AA == "H") / total

def findHistChains(protstring):
    """Count runs of two or more consecutive "H" residues in `protstring`.

    Returns:
        dict mapping the run length (as a string, e.g. "3") to the number of
        runs of exactly that length. Isolated single "H" residues are not
        counted.
    """
    run_counts = {}

    def _record(run_length):
        # Only runs of at least two histidines qualify as a chain.
        if run_length > 1:
            label = str(run_length)
            run_counts[label] = run_counts.get(label, 0) + 1

    current_run = 0
    for residue in protstring:
        if residue == "H":
            current_run += 1
            continue
        # A non-H residue closes whatever run was in progress.
        _record(current_run)
        current_run = 0

    # A run that reaches the end of the sequence is closed here.
    _record(current_run)
    return run_counts

def HistScore(protstring):
    """Score a protein sequence for histidine richness.

    The score is the sum of three contributions:

    * +2 each time at least half (>= 0.5) of the 4-residue sliding window is "H";
    * +6 each time at least 3/8 (>= 0.375) of the 8-residue sliding window is "H";
    * chain_length ** 2 for every run of consecutive "H" reported by
      findHistChains (times the number of runs of that length).

    Each window has its own cooldown counter: when the window scores, the
    counter is set to the window size and is decremented on every later
    residue; the window can only score again once the counter is below 1.

    The histidine fraction is taken against the nominal window size (4 or 8),
    not against the number of residues seen so far. Previously the first few
    residues were divided by the partial window length, so a single "H" among
    the first two residues (e.g. "MH...") satisfied both thresholds and added
    8 points on its own.

    Args:
        protstring: string of one-letter residue codes.

    Returns:
        Integer score.
    """
    last4 = ""
    last8 = ""
    CD4 = 0
    CD8 = 0
    score = 0
    for AA in protstring:
        # Slide both windows forward by one residue.
        last4 = (last4 + AA)[-4:]
        last8 = (last8 + AA)[-8:]

        # Scan for non-chain H groupings; divide by the nominal window size so
        # a partially filled window cannot reach the threshold with one H.
        if last4.count("H") / 4 >= 0.5 and CD4 < 1:
            score += 2
            CD4 = 4
        else:
            CD4 -= 1
        if last8.count("H") / 8 >= 0.375 and CD8 < 1:
            score += 6
            CD8 = 8
        else:
            CD8 -= 1

    # Scan for chains: each run contributes its length squared.
    histChains = findHistChains(protstring)
    for key in histChains:
        score += histChains[key] * (int(key) ** 2)
    return score

In [102]:
# Generate a file path for the input .fasta file.
# Both values are typed in interactively: input_filename is looked up inside the
# relative 'data' directory, and output_name is reused by later cells as the
# output folder name and as the prefix of every written file.
input_filename = input("Enter Input Name")
output_name = input("Enter Output Name")
datafile = os.path.join('data', input_filename)
# Disabled: organism name derived from the input file name.
#organism_name = input_filename.replace(".faa","")

Enter Input Name E_coli.faa
Enter Output Name E_coli_BioA_10000-1500000Da


In [103]:
# Folder that receives every file written for this run: data/<output_name>.
data_folder = os.path.join('data', output_name)

# makedirs with exist_ok=True is a no-op when the folder already exists and,
# unlike mkdir, also creates the parent 'data' directory if it is missing.
os.makedirs(data_folder, exist_ok=True)

In [104]:
# Split the input FASTA into three bins by molecular weight, using the
# [lower, upper] range of the medium currently selected in the dropdown:
#   too_small : weight <  size_min
#   fractions : size_min <= weight <= size_max
#   wash      : weight >  size_max
size_min, size_max = geldict[gelselect.value]
unfiltered = []
wash = []
fractions = []
too_small = []
with open(datafile) as protfile:
    for record in SeqIO.parse(protfile,"fasta"):
        unfiltered.append(record)
        # "X" is swapped for "Q" before analysis (presumably so the unknown
        # residue does not break the weight calculation); the record that is
        # written out keeps its original sequence.
        sequence = str(record.seq).replace("X","Q")
        protparams = PA(sequence)
        # Compute the weight once per record instead of once per comparison.
        mol_weight = protparams.molecular_weight()
        if mol_weight < size_min:
            too_small.append(record)
        elif mol_weight <= size_max:
            fractions.append(record)
        else:
            wash.append(record)

#data_name = f"Media{keytoindex(geldict,gelselect.value)}"+f"_{simplifyname(organism_name)}"   
data_name = output_name
SeqIO.write(unfiltered,os.path.join(data_folder,data_name+"_total.faa"),"fasta")
SeqIO.write(wash,os.path.join(data_folder,data_name+"_wash.faa"),"fasta")
SeqIO.write(fractions,os.path.join(data_folder,data_name+"_fractions.faa"),"fasta")
# Last expression of the cell: its return value (records written) is the cell output.
SeqIO.write(too_small,os.path.join(data_folder,data_name+"_unpassed.faa"),"fasta")

95

In [105]:
# Convert the filtered "_fractions" FASTA into a pandas DataFrame to show the
# filter at work; the file name written by the filtering cell is reused.

# NOTE(review): `pHslider` is not defined in any cell visible in this notebook;
# it must exist (e.g. an ipywidgets slider) before this cell runs — confirm.
# The value is read once so the column label and every computed charge use the
# same pH even if the slider is moved while the loop is running.
ph_value = pHslider.value
charge_column = f'charge at ph {ph_value}'

# One list per output column (a missing comma after the charge entry previously
# made this dict literal a SyntaxError).
data = {
    'sequence' : [],
    'MW' : [],
    charge_column : [],
    'description' : [],
    'ID' : [],
}

with open(os.path.join(data_folder,data_name+"_fractions.faa")) as protfile:
    for record in SeqIO.parse(protfile,"fasta"):
        # Same X -> Q substitution as in the filtering step, so the weights match.
        sequence = str(record.seq).replace("X","Q")
        protparams = PA(sequence)
        data['sequence'].append(sequence)
        data['MW'].append(protparams.molecular_weight())
        data[charge_column].append(round(protparams.charge_at_pH(ph_value),2))
        data['description'].append(record.description)
        data['ID'].append(record.id)

# Heaviest protein first, with a fresh 0..n-1 index so that row position and
# index label agree in the fractionation cell.
df = (
    pd.DataFrame.from_dict(data)
    .sort_values(by=["MW"], ascending=False)
    .reset_index(drop=True)
)
df

Unnamed: 0,sequence,MW,description,ID
0,MHQPPVRFTYRLLSYLVSAIIAGQPLLPAVGAVITPQNGAGMDKAA...,318899.6992,sp|Q3YL96|CDIA_ECOLX Toxin CdiA OS=Escherichia...,sp|Q3YL96|CDIA_ECOLX
1,MLSFSVVKSAGSAGNYYTDKDNYYVLGSMGERWAGQGAEQLGLQGS...,191678.9117,sp|P22706|TRAI2_ECOLX Multifunctional conjugat...,sp|P22706|TRAI2_ECOLX
2,MPITKAEAQGVTRAFVRDYPGALELAYKFREDAAELYGPRAAEVPA...,158949.0376,sp|P27190|TRAC5_ECOLX DNA primase TraC OS=Esch...,sp|P27190|TRAC5_ECOLX
3,MNRIYSLRYSAVARGFIAVSEFARKCVHKSVRRLCFPVLLLIPVLF...,148255.1456,sp|O88093|HBP_ECOLX Hemoglobin-binding proteas...,sp|O88093|HBP_ECOLX
4,MNRIYSLRYSAVARGFIAVSEFARKCVHKSVRRLCFPVLLLIPVLF...,148225.0765,sp|Q47692|TSH_ECOLX Temperature-sensitive hema...,sp|Q47692|TSH_ECOLX
...,...,...,...,...
610,MTKIKTVTFVNTYPGGSMKNLLDTEGTVLFPFQTEIHFIWTIFSTV...,10122.0332,sp|P18024|ABI_ECOLX Abortive infection protein...,sp|P18024|ABI_ECOLX
611,MNKMAMIDLAKLFLASKITAIEFSERICVERRRLYGVKDLSPNILN...,10057.6077,sp|P11899|IMMD_ECOLX Colicin-D immunity protei...,sp|P11899|IMMD_ECOLX
612,MSQTENAVTSSLSQKRFVRRGKPMTDSEKQMAAVARKRLTHKEIKV...,10037.6500,sp|P62536|COPB3_ECOLX Replication regulatory p...,sp|P62536|COPB3_ECOLX
613,MKRLLLQNVCDLSNNMNGLRFPCFVKSGNAEVSALHHYVPDLHRRM...,10034.6266,sp|P03850|YPB2_ECOLX Uncharacterized 10.0 kDa ...,sp|P03850|YPB2_ECOLX


In [106]:
def param_of_interest(protdata):
    """Return True when the sequence's HistScore is at least 4, else False."""
    is_hit = HistScore(protdata) >= 4
    return bool(is_hit)

# Split the weight-sorted DataFrame into `nfractions` consecutive slices, write
# each slice to its own FASTA file and record which rows satisfy
# param_of_interest.
nfractions = 7
fraclen = round(len(df)/nfractions)
# seqhits maps the fraction number (as a string) to [hit count, [row indices]].
seqhits = {}
# Every fraction except the last is widened on both sides by this share of
# fraclen, so neighbouring fractions overlap.
noise = 0.10


def _export_fraction(fraction_df, fraction_number):
    """Write one fraction to '<data_name>_fraction<N>.faa' and tally its hits in seqhits."""
    label = str(fraction_number)
    records = []
    for index in fraction_df.index:
        sequence = fraction_df.loc[index, 'sequence']
        records.append(
            SeqRecord(
                Seq(sequence),
                id=str(fraction_df.loc[index, "ID"]),
                description=fraction_df.loc[index, "description"],
            )
        )
        if param_of_interest(sequence):
            hit_entry = seqhits.setdefault(label, [0, []])
            hit_entry[0] += 1
            hit_entry[1].append(index)
    SeqIO.write(records, os.path.join(data_folder, data_name + "_fraction" + label + ".faa"), "fasta")


for n in range(nfractions - 1):
    # Widened slice bounds, clamped to the valid row range.
    fuzzymin = max(0, int(round(n*fraclen - noise*fraclen, 0)))
    fuzzymax = min(len(df), int(round((n+1)*fraclen + noise*fraclen, 0)))
    tempdf = df.iloc[fuzzymin:fuzzymax]
    _export_fraction(tempdf, n + 1)

# The final fraction takes every remaining row (no widening). Its hits are
# recorded under `nfractions`; previously they were keyed with the stale loop
# variable (n + 1) and therefore credited to the preceding fraction.
tempdf = df.iloc[(nfractions-1)*fraclen:]
_export_fraction(tempdf, nfractions)
tempdf

Unnamed: 0,sequence,MW,description,ID
528,MPKSYTDELAEWVESRAAKKRRRDEAAVAFLAVRADVEAALASGYA...,14716.6333,sp|P17910|TRAK4_ECOLX Protein TraK OS=Escheric...,sp|P17910|TRAK4_ECOLX
529,MPKIQTYVNNNVYEQITDLVTIRKQEGIEEASLSNVSSMLLELGLR...,14664.5223,sp|P33788|TRAM8_ECOLX Relaxosome protein TraM ...,sp|P33788|TRAM8_ECOLX
530,MPKTYPEELAEWVKGREAKKPRQDKHVVAFLAVKSDVQAALDAGYA...,14621.6465,sp|P17908|TRAK5_ECOLX Protein TraK OS=Escheric...,sp|P17908|TRAK5_ECOLX
531,MAKVNLYISNDAYEKINAIIEKRRQEGAREKDVSFSATASMLLELG...,14555.4019,sp|P18808|TRAM6_ECOLX Relaxosome protein TraM ...,sp|P18808|TRAM6_ECOLX
532,MARVNLYISNEVHEKINMIVEKRRQEGARDKDISLSGTASMLLELG...,14542.4313,sp|P18807|TRAM9_ECOLX Relaxosome protein TraM ...,sp|P18807|TRAM9_ECOLX
...,...,...,...,...
610,MTKIKTVTFVNTYPGGSMKNLLDTEGTVLFPFQTEIHFIWTIFSTV...,10122.0332,sp|P18024|ABI_ECOLX Abortive infection protein...,sp|P18024|ABI_ECOLX
611,MNKMAMIDLAKLFLASKITAIEFSERICVERRRLYGVKDLSPNILN...,10057.6077,sp|P11899|IMMD_ECOLX Colicin-D immunity protei...,sp|P11899|IMMD_ECOLX
612,MSQTENAVTSSLSQKRFVRRGKPMTDSEKQMAAVARKRLTHKEIKV...,10037.6500,sp|P62536|COPB3_ECOLX Replication regulatory p...,sp|P62536|COPB3_ECOLX
613,MKRLLLQNVCDLSNNMNGLRFPCFVKSGNAEVSALHHYVPDLHRRM...,10034.6266,sp|P03850|YPB2_ECOLX Uncharacterized 10.0 kDa ...,sp|P03850|YPB2_ECOLX


In [24]:
# Report, for each fraction that has hits, the hit count followed by the
# DataFrame index of every hit.
for fraction_label, (hit_count, hit_indices) in seqhits.items():
    print(f'There are {hit_count} hits in fraction {fraction_label}')
    for hit_index in hit_indices:
        print(f'\tHit on index {hit_index}')

There are 1 hits in fraction 2
	Hit on index 15
There are 2 hits in fraction 3
	Hit on index 15
	Hit on index 20
There are 1 hits in fraction 4
	Hit on index 25
There are 1 hits in fraction 5
	Hit on index 38


In [None]:
# Display the DataFrame of in-range proteins again so that the row indices
# printed in the hit report can be looked up.
df