In [2]:
from Bio import SeqIO
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *

In [3]:
# Size-exclusion media offered to the user. Each label maps to the
# [lower, upper] molecular-weight limits (in Da) that later cells use to
# sort proteins into "too small", "fractions" and "wash".
geldict = {
    "Bio-P 0.1-1.8 kDa":     [100, 1_800],
    "Bio-P 0.8-4.0 kDa":     [800, 4_000],
    "Bio-P 1.0-6.0 kDa":     [1_000, 6_000],
    "Bio-P 1.5-20.0 kDa":    [1_500, 20_000],
    "Bio-P 2.5-40.0 kDA":    [2_500, 40_000],
    "Bio-P 3.0-60.0 kDa":    [3_000, 60_000],
    "Bio-P 5.0-100 kDa":     [5_000, 100_000],
    "S-X 0.4-14.0 kDa":      [400, 14_000],
    "S-X <2.0 kDA":          [0, 2_000],
    "S-X <0.4 kDA":          [0, 400],
    "Bio-A 10.0 - 1500 kDA": [10_000, 1_500_000],
}

# Dropdown listing the media labels; later cells read the choice from gelselect.value.
gelselect = widgets.Dropdown(description="Media", options=list(geldict))
display(gelselect)



Dropdown(description='Media', options=('Bio-P 0.1-1.8 kDa', 'Bio-P 0.8-4.0 kDa', 'Bio-P 1.0-6.0 kDa', 'Bio-P 1…

In [5]:
def listtostr(strlist):
    """Join the strings in ``strlist`` into one underscore-separated string.

    Args:
        strlist: Iterable of strings.

    Returns:
        The items separated by "_" (no trailing separator); an empty string
        when ``strlist`` is empty.
    """
    # str.join replaces the repeated string concatenation and the manual
    # removal of the trailing "_".
    return "_".join(strlist)
def simplifyname(orgname):
    """Shorten the first underscore-separated word of ``orgname`` to its initial.

    Example: ``"Yersinia_pestis"`` becomes ``"Y_pestis"``.

    Args:
        orgname: Name made of words separated by "_".

    Returns:
        The name with its first word reduced to one character and the
        remaining words unchanged. An empty name yields an empty string.
    """
    first, *rest = orgname.split("_")
    # Slicing (first[:1]) instead of indexing (first[0]) avoids an IndexError
    # when the name is empty or begins with "_".
    return "_".join([first[:1]] + rest)
def keytoindex(inputdict, string):
    """Return the zero-based position of key ``string`` in ``inputdict``.

    Positions follow the dictionary's iteration order. Returns ``None`` when
    the key is not present.
    """
    for position, candidate in enumerate(inputdict.keys()):
        if candidate == string:
            return position
    return None

In [11]:
def getHistPercent(protstring):
    """Return the fraction of residues in ``protstring`` that are histidine ("H").

    Despite the name, the result is a ratio between 0.0 and 1.0, not a
    percentage.

    Args:
        protstring: Sequence of one-letter residue codes.

    Returns:
        Number of "H" residues divided by the sequence length, or 0.0 for an
        empty sequence.
    """
    total = len(protstring)
    if total == 0:
        # An empty sequence has no histidines; avoid dividing by zero.
        return 0.0
    return sum(1 for AA in protstring if AA == "H") / total

def findHistChains(protstring):
    """Count runs of two or more consecutive histidines ("H") by run length.

    Args:
        protstring: Sequence of one-letter residue codes.

    Returns:
        Dict mapping the run length (as a string) to how many runs of exactly
        that length occur. Isolated single "H" residues are not counted.
    """
    chainDict = {}
    run = 0  # length of the histidine run currently being read
    for residue in protstring:
        if residue == "H":
            run += 1
            continue
        # A non-H residue closes any open run; only runs longer than 1 are recorded.
        if run > 1:
            length_key = str(run)
            chainDict[length_key] = chainDict.get(length_key, 0) + 1
        run = 0
    # A run that reaches the end of the sequence is closed here.
    if run > 1:
        length_key = str(run)
        chainDict[length_key] = chainDict.get(length_key, 0) + 1
    return chainDict

def HistScore(protstring):
    """Score a protein sequence for histidine clustering.

    The score is the sum of three contributions:

    * +2 each time the trailing window of up to 4 residues is at least 50% "H";
      after a hit this check is skipped for the next 4 residues.
    * +6 each time the trailing window of up to 8 residues is at least 37.5%
      "H"; after a hit this check is skipped for the next 8 residues.
    * For every run of consecutive histidines reported by ``findHistChains``,
      the run length squared.

    At the start of the sequence the windows hold fewer than 4/8 residues and
    the histidine fraction is computed over that shorter window.

    Args:
        protstring: Sequence of one-letter residue codes.

    Returns:
        The integer score.
    """
    window4 = ""
    window8 = ""
    cooldown4 = 0
    cooldown8 = 0
    total = 0
    for residue in protstring:
        # Slide both trailing windows forward by one residue, capped at 4 and 8 characters.
        window4 = window4 + residue if len(window4) < 4 else window4[1:] + residue
        window8 = window8 + residue if len(window8) < 8 else window8[1:] + residue

        # Short-window grouping: score it, then start its cooldown.
        if getHistPercent(window4) >= 0.5 and cooldown4 < 1:
            total += 2
            cooldown4 = 4
        else:
            cooldown4 -= 1

        # Long-window grouping: same scheme with its own cooldown.
        if getHistPercent(window8) >= 0.375 and cooldown8 < 1:
            total += 6
            cooldown8 = 8
        else:
            cooldown8 -= 1

    # Contiguous histidine chains add (number of chains) * (chain length squared).
    for length, count in findHistChains(protstring).items():
        total += count * (int(length) ** 2)
    return total

In [21]:
#Generate a file path for the correct .fasta file
# Strip surrounding whitespace so a padded entry still resolves to the intended file.
input_filename = input("Enter File Name").strip()
datafile = os.path.join('data', input_filename)
# Remove only a trailing ".faa" extension; str.replace would also delete
# ".faa" if it appeared in the middle of the name.
if input_filename.endswith(".faa"):
    organism_name = input_filename[:-len(".faa")]
else:
    organism_name = input_filename

Enter File Name Yersinia_pestis[632].faa


In [22]:
# Output folder for this organism/media combination, placed under 'data'.
# NOTE(review): some media labels contain "<" and spaces, which end up in the
# folder name — confirm this is acceptable on the target filesystem.
data_folder = os.path.join('data', organism_name+"_size_exclusion_"+gelselect.value)

# makedirs with exist_ok=True replaces the isdir check + mkdir pair: it is a
# no-op when the folder already exists and also creates a missing parent.
os.makedirs(data_folder, exist_ok=True)

In [23]:
# Molecular-weight limits of the selected media.
size_min = geldict[gelselect.value][0]
size_max = geldict[gelselect.value][1]

unfiltered = []  # every record read from the input file
wash = []        # molecular weight above size_max
fractions = []   # molecular weight within [size_min, size_max]
too_small = []   # molecular weight below size_min
with open(datafile) as protfile:
    for record in SeqIO.parse(protfile,"fasta"):
        unfiltered.append(record)
        protparams = PA(str(record.seq))
        # Compute the molecular weight once per record rather than once per comparison.
        weight = protparams.molecular_weight()
        if weight < size_min:
            too_small.append(record)
        elif weight <= size_max:
            fractions.append(record)
        else:
            wash.append(record)

# File-name prefix: position of the media in geldict plus the abbreviated organism name.
data_name = f"Media{keytoindex(geldict,gelselect.value)}_{simplifyname(organism_name)}"
SeqIO.write(unfiltered,os.path.join(data_folder,data_name+"_total.faa"),"fasta")
SeqIO.write(wash,os.path.join(data_folder,data_name+"_wash.faa"),"fasta")
SeqIO.write(fractions,os.path.join(data_folder,data_name+"_fractions.faa"),"fasta")
# Kept as the last expression so the cell still shows the value returned by this write.
SeqIO.write(too_small,os.path.join(data_folder,data_name+"_unpassed.faa"),"fasta")

448

In [18]:
# Read the "_fractions.faa" file written for the selected media back in and
# tabulate it as a DataFrame to show the filter at work.

# Column-oriented buffer: one list per DataFrame column, filled in file order.
data = {'sequence': [], 'MW': [], 'description': [], 'ID': []}

# These lists are not used by the code in this cell; they are kept in case
# other cells rely on them.
protMW, prot_desc, prot_name, prot_id = [], [], [], []

fractions_path = os.path.join(data_folder, data_name + "_fractions.faa")
with open(fractions_path) as protfile:
    for record in SeqIO.parse(protfile, "fasta"):
        sequence = str(record.seq)
        protparams = PA(sequence)
        data['sequence'].append(sequence)
        data['MW'].append(protparams.molecular_weight())
        data['description'].append(record.description)
        data['ID'].append(record.id)

# Heaviest proteins first, with the index renumbered from 0.
df = pd.DataFrame.from_dict(data)
df = df.sort_values(by=["MW"], ascending=False).reset_index(drop=True)
df

Unnamed: 0,sequence,MW,description,ID
0,MKIRHWSALSLFVLPALAQAEALTGEVHRQPLNIQAIVMFLLFVGG...,59309.4662,sp|Q8ZJ73|ACTP_YERPE Cation/acetate symporter ...,sp|Q8ZJ73|ACTP_YERPE
1,MLAAGLIVAVALLAFAAIWHHAPTADWQSVWHDRYLWHVIRFTFWQ...,57891.0740,tr|Q0WJE3|Q0WJE3_YERPE Thiamine transport syst...,tr|Q0WJE3|Q0WJE3_YERPE
2,MANQSVSRTGTEVKESWVPMITIALAQILMSFNVASLPVALGGMVK...,57813.7572,tr|Q9ZC50|Q9ZC50_YERPE Putative membrane prote...,tr|Q9ZC50|Q9ZC50_YERPE
3,MTVRIVSNAVNALISGADDNVKRLVQEMLSYEVEAGDWKGTSTMFN...,57682.0573,tr|Q7ARE3|Q7ARE3_YERPE Putative uncharacterize...,tr|Q7ARE3|Q7ARE3_YERPE
4,MKSLIPVWNRAINYVFRPMIGLLLGLSMSAMAGPAQDFVAANRAQQ...,57219.7468,tr|Q0WHL2|Q0WHL2_YERPE Putative Branched-chain...,tr|Q0WHL2|Q0WHL2_YERPE
...,...,...,...,...
257,MPVIKVRENEPFDVALRRFKRSCEKAGVLAEVRRREFYEKPTTERK...,8499.8547,sp|P68686|RS21_YERPE 30S ribosomal protein S21...,sp|P68686|RS21_YERPE
258,MMLIHAKAKQPKLNDNKSLKHEQTTKIAYVNRVKKPTSAETRLHSL...,8064.6120,tr|Q7CKC1|Q7CKC1_YERPE Putative uncharacterize...,tr|Q7CKC1|Q7CKC1_YERPE
259,MPKIKTVRGAAKRFKKTANGGFKRKHANLRHILTKKATKRKRHLRP...,7318.8106,sp|Q8ZDW7|RL35_YERPE 50S ribosomal protein L35...,sp|Q8ZDW7|RL35_YERPE
260,MQVLSSLRSAKNRHPDCKIVRRRGRVYVICKSNPRFKAVQGGTHKKR,5463.4317,sp|Q8ZC86|RL362_YERPE 50S ribosomal protein L3...,sp|Q8ZC86|RL362_YERPE


In [24]:
def param_of_interest(protdata, threshold=4):
    """Tell whether a sequence counts as a hit based on its histidine score.

    Args:
        protdata: Protein sequence passed to ``HistScore``.
        threshold: Minimum ``HistScore`` value that counts as a hit.
            Defaults to 4, the value previously hard-coded here.

    Returns:
        True when ``HistScore(protdata)`` is at least ``threshold``.
    """
    return bool(HistScore(protdata) >= threshold)

nfractions = 7                        # number of fraction files to write
fraclen = round(len(df)/nfractions)   # nominal number of df rows per fraction
seqhits = {}                          # str(fraction number) -> [hit count, [df index labels of the hits]]
noise = 0.10                          # share of fraclen by which the first nfractions-1 windows are widened on each side


def _export_fraction(fraction_df, fraction_number):
    """Write one fraction to a FASTA file and record its hits in ``seqhits``.

    Args:
        fraction_df: Slice of ``df`` holding the rows of this fraction.
        fraction_number: 1-based number of the fraction; used both in the
            output file name and as the ``seqhits`` key.
    """
    key = str(fraction_number)
    records = []
    rows = zip(fraction_df.index, fraction_df['sequence'], fraction_df['ID'], fraction_df['description'])
    for index, sequence, seq_id, description in rows:
        records.append(SeqRecord(Seq(sequence), id=str(seq_id), description=description))
        if param_of_interest(sequence):
            hits = seqhits.setdefault(key, [0, []])
            hits[0] += 1
            hits[1].append(index)
    SeqIO.write(records, os.path.join(data_folder, data_name + "_fraction" + key + ".faa"), "fasta")


for n in range(nfractions - 1):
    #Generate min/max range values, clamped to the bounds of df
    fuzzymin = max(0, int(round(n*fraclen - noise*fraclen, 0)))
    fuzzymax = min(len(df), int(round((n+1)*fraclen + noise*fraclen, 0)))
    _export_fraction(df.iloc[fuzzymin:fuzzymax], n + 1)

# The last fraction takes every remaining row. Its hits are filed under
# nfractions; the earlier code reused the leaked loop variable n, which
# filed them under the previous fraction and checked a different key than
# the one it updated.
tempdf = df.iloc[(nfractions-1)*fraclen:]
_export_fraction(tempdf, nfractions)
tempdf

Unnamed: 0,sequence,MW,description,ID
222,MLQPKRTKFRKMHKGRNRGLAQGTDVSFGEFGLKACGRCRLTARQI...,15273.9899,sp|Q8ZJA5|RL16_YERPE 50S ribosomal protein L16...,sp|Q8ZJA5|RL16_YERPE
223,MLIISNSVTLASGEIELTAIRAQGAGGQHVNKTSTAIHLRFDIKAS...,15240.6293,tr|Q0WHX8|Q0WHX8_YERPE Putative uncharacterize...,tr|Q0WHX8|Q0WHX8_YERPE
224,MRLNTLSPAEGAKHAPKRVGRGIGSGLGKTAGRGHKGQNSRSGGGV...,15206.4718,sp|Q8ZJ93|RL15_YERPE 50S ribosomal protein L15...,sp|Q8ZJ93|RL15_YERPE
225,MQFCRLLYFGLAFVRKPTKVYSSIWMLYNDYWINSYSELIRLRGVF...,15125.6008,tr|Q0WEU9|Q0WEU9_YERPE Putative uncharacterize...,tr|Q0WEU9|Q0WEU9_YERPE
226,MAENQYYGTGRRKSSSARVFLKPGSGKIVINQRSLEVYFGRETARM...,14768.9187,sp|Q8ZB62|RS9_YERPE 30S ribosomal protein S9 O...,sp|Q8ZB62|RS9_YERPE
227,MRHRKSGRQLNRNSSHRQAMFRNMAGSLVRHEIIKTTLPKAKELRR...,14531.6307,sp|Q7CFS8|RL17_YERPE 50S ribosomal protein L17...,sp|Q7CFS8|RL17_YERPE
228,MGNKKQQRWLWYAWESRLKRIIAHVFGRRSKKTFRQLLGLLSGFNI...,14398.5391,tr|Q0WI79|Q0WI79_YERPE Insertion sequence prot...,tr|Q0WI79|Q0WI79_YERPE
229,MNKHVGWRKPHGITTQRRYNIALIKGYQLAKFIRVVMSHSGQAIDK...,14092.2973,tr|Q0WE89|Q0WE89_YERPE Putative membrane prote...,tr|Q0WE89|Q0WE89_YERPE
230,MSKHFRPTLHIYQCRCLLHGLSDHIGRGDIFNPQRRINGFMFFSLS...,14086.2832,tr|Q9RIC2|Q9RIC2_YERPE Putative uncharacterize...,tr|Q9RIC2|Q9RIC2_YERPE
231,MHHDYQRVYAACPAGDYFLLRNKWAIISQNSSPHITHICIVTDTNC...,13996.1212,tr|Q7CHY7|Q7CHY7_YERPE Putative uncharacterize...,tr|Q7CHY7|Q7CHY7_YERPE


In [20]:
# Print, for each fraction, the number of hits followed by the df index label of every hit.
for fraction, (hit_count, hit_indices) in seqhits.items():
    print(f'There are {hit_count} hits in fraction {fraction}')
    for hit in hit_indices:
        print(f'\tHit on index {hit}')

There are 10 hits in fraction 1
	Hit on index 1
	Hit on index 11
	Hit on index 13
	Hit on index 16
	Hit on index 19
	Hit on index 20
	Hit on index 27
	Hit on index 28
	Hit on index 30
	Hit on index 36
There are 11 hits in fraction 2
	Hit on index 41
	Hit on index 42
	Hit on index 50
	Hit on index 52
	Hit on index 53
	Hit on index 57
	Hit on index 59
	Hit on index 62
	Hit on index 63
	Hit on index 65
	Hit on index 67
There are 15 hits in fraction 3
	Hit on index 79
	Hit on index 81
	Hit on index 87
	Hit on index 88
	Hit on index 92
	Hit on index 96
	Hit on index 97
	Hit on index 98
	Hit on index 100
	Hit on index 102
	Hit on index 104
	Hit on index 105
	Hit on index 107
	Hit on index 108
	Hit on index 109
There are 18 hits in fraction 4
	Hit on index 111
	Hit on index 112
	Hit on index 114
	Hit on index 119
	Hit on index 121
	Hit on index 123
	Hit on index 125
	Hit on index 126
	Hit on index 127
	Hit on index 128
	Hit on index 133
	Hit on index 134
	Hit on index 135
	Hit on index 139
	H

In [11]:
# Display the DataFrame currently held in df.
# NOTE(review): the saved output under this cell lists rows by ascending MW,
# while the cell that builds df sorts descending — this looks like output
# from an earlier run; re-run to confirm.
df

Unnamed: 0,sequence,MW,description,ID
0,MNRVQFNHHHHHHPD,1943.0746,sp|Q8D079|LPHI_YERPE his operon leader peptide...,sp|Q8D079|LPHI_YERPE
1,MAKEDNIEMQGTVLDTLPNTMFRVELENGHVVTAHISGKMRKNYIR...,8235.4593,sp|P65115|IF1_YERPE Translation initiation fac...,sp|P65115|IF1_YERPE
2,MLCAIYRSPKRDQTYLYIEKKDDFSRVPAELLASFGKPQFAMLLAL...,10226.8684,sp|Q7CID0|Y2080_YERPE YcgL domain-containing p...,sp|Q7CID0|Y2080_YERPE
3,MAWIILVIAGLLEVIWAIGLKYSHGFSRLTPSIITLVAMAASVFLL...,10650.8591,sp|Q8D1E4|GDX_YERPE Guanidinium exporter OS=Ye...,sp|Q8D1E4|GDX_YERPE
4,MKSSHFDEYDKTLKQAELAIADSDHRAKLLQEMCADIGLTPEAVMK...,11019.5549,sp|P69957|LCRG_YERPE Low calcium response locu...,sp|P69957|LCRG_YERPE
5,MFEQRVNSDVLTVATVNSQDQVTQKPLRDSVKQALKNYFAQLNGQD...,11196.7883,sp|Q8ZAX8|FIS_YERPE DNA-binding protein Fis OS...,sp|Q8ZAX8|FIS_YERPE
6,MLEFEGRIIDTDAQGYLKNSTDWSEALAPVLAEQEGITLTEPHWEV...,12394.1662,sp|Q8ZG65|TUSE_YERPE Sulfurtransferase TusE OS...,sp|Q8ZG65|TUSE_YERPE
7,MDNASKPTFQDVLEFVRMFRRKNKLQREIVDNEKKIRDNQKRVLLL...,12461.2426,sp|Q8ZFW5|Y1560_YERPE UPF0265 protein YPO1560/...,sp|Q8ZFW5|Y1560_YERPE
8,MSQRDTGAHYENLARRHLERAGLVFQAANVAFRGGEIDLIMRDGDA...,13076.6017,sp|Q8ZB75|Y3549_YERPE UPF0102 protein YPO3549/...,sp|Q8ZB75|Y3549_YERPE
9,MTAIDVMWVGLGGGIGSLLRWWIGLSIGKVYKGNFPLGTFLINISG...,13344.5506,sp|Q8ZDB1|CRCB3_YERPE Putative fluoride ion tr...,sp|Q8ZDB1|CRCB3_YERPE
