In [3]:
from Bio import SeqIO
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *

In [8]:
# Create a selector for the intended size-exclusion media.
# Each key is the label shown in the dropdown; each value is the
# [lower, upper] molecular-weight limit for that medium in daltons
# (the labels quote the same limits in kDa).
# NOTE(review): the labels mix "kDa" and "kDA"; the selected label is later
# embedded in the output folder name, so the spelling is left untouched here.
geldict = {
    "Bio-P 0.1-1.8 kDa" : [100,1800],
    "Bio-P 0.8-4.0 kDa" : [800,4000],
    "Bio-P 1.0-6.0 kDa" : [1000,6000],
    "Bio-P 1.5-20.0 kDa" : [1500,20000],
    "Bio-P 2.5-40.0 kDA" : [2500,40000],
    "Bio-P 3.0-60.0 kDa" : [3000,60000],
    "Bio-P 5.0-100 kDa" : [5000,100000],
    "S-X 0.4-14.0 kDa" : [400,14000],
    "S-X <2.0 kDA" : [0,2000],
    "S-X <0.4 kDA" : [0,400],
    "Bio-A 10.0 - 1500 kDA" : [10000,1500000],
}
# The dropdown lists the labels in the dict's key order; the chosen label is
# read back in later cells through gelselect.value.
gelselect = Dropdown(options=geldict.keys(),description="Media")
display(gelselect)



Dropdown(description='Media', options=('Bio-P 0.1-1.8 kDa', 'Bio-P 0.8-4.0 kDa', 'Bio-P 1.0-6.0 kDa', 'Bio-P 1…

In [20]:
def listtostr(strlist):
    """Join the strings in ``strlist`` into one underscore-separated string.

    Parameters
    ----------
    strlist : iterable of str
        The pieces to join, in order.

    Returns
    -------
    str
        The pieces separated by single underscores, e.g.
        ``["Y", "pestis"]`` -> ``"Y_pestis"``. An empty iterable gives ``""``.
    """
    # str.join builds the result in one pass and needs no trailing-separator trim.
    return "_".join(strlist)
def simplifyname(orgname):
    """Abbreviate the first underscore-separated word of ``orgname`` to its initial.

    Parameters
    ----------
    orgname : str
        Underscore-separated name, e.g. ``"Yersinia_pestis"``.

    Returns
    -------
    str
        The name with its first word reduced to one character and every
        following word kept as is, e.g. ``"Yersinia_pestis"`` -> ``"Y_pestis"``.
        A name with no underscore yields just its first character. An empty
        first word (empty name, or a name starting with ``"_"``) is kept
        empty instead of raising ``IndexError``.
    """
    # partition splits at the first underscore only, so the remainder keeps
    # all of its own underscores untouched.
    first_word, separator, remainder = orgname.partition("_")
    # Slicing ([:1]) rather than indexing ([0]) tolerates an empty first word.
    return first_word[:1] + separator + remainder
def keytoindex(inputdict, string):
    """Return the zero-based position of ``string`` among the keys of ``inputdict``.

    Keys are visited in the order ``inputdict.keys()`` yields them. When no
    key equals ``string`` the result is ``None``.
    """
    matching_positions = (
        position
        for position, candidate in enumerate(inputdict.keys())
        if candidate == string
    )
    # First match wins; None signals that the key is absent.
    return next(matching_positions, None)

['Bio-P 0.1-1.8 kDa',
 'Bio-P 0.8-4.0 kDa',
 'Bio-P 1.0-6.0 kDa',
 'Bio-P 1.5-20.0 kDa',
 'Bio-P 2.5-40.0 kDA',
 'Bio-P 3.0-60.0 kDa',
 'Bio-P 5.0-100 kDa',
 'S-X 0.4-14.0 kDa',
 'S-X <2.0 kDA',
 'S-X <0.4 kDA',
 'Bio-A 10.0 - 1500 kDA']

In [56]:
#Generate a file path for the correct .fasta file
# Strip stray whitespace so a padded entry still resolves to the intended file.
input_filename = input("Enter File Name").strip()
datafile = os.path.join('data', input_filename)
# Drop only a trailing ".faa" extension; str.replace would also delete any
# ".faa" that happens to occur in the middle of the name.
fasta_suffix = ".faa"
if input_filename.endswith(fasta_suffix):
    organism_name = input_filename[:-len(fasta_suffix)]
else:
    organism_name = input_filename

Enter File Name Yersinia_pestis.faa


In [57]:
# Output folder for this organism / medium combination, created under 'data'.
# NOTE(review): some media labels contain '<' and spaces, which end up in the
# folder name; '<' is not a legal path character on Windows — confirm the
# target platform.
data_folder = os.path.join('data', organism_name+"_size_exclusion_"+gelselect.value)

# exist_ok=True makes re-running the cell safe without a separate isdir()
# check, and makedirs also creates a missing parent folder.
os.makedirs(data_folder, exist_ok=True)

In [58]:
# Molecular-weight window [size_min, size_max] of the selected medium.
size_min, size_max = geldict[gelselect.value]
unfiltered = []   # every record read from the input file
wash = []         # records lighter than size_min
fractions = []    # records within the window (size_max inclusive)
unpassed = []     # records heavier than size_max
with open(datafile) as protfile:
    for record in SeqIO.parse(protfile,"fasta"):
        unfiltered.append(record)
        protparams = PA(str(record.seq))
        # Compute the weight once per record instead of once per comparison.
        # NOTE(review): molecular_weight() presumably raises on ambiguous
        # residue letters — confirm the input proteome contains none.
        mol_weight = protparams.molecular_weight()
        if mol_weight < size_min:
            wash.append(record)
        elif mol_weight <= size_max:
            fractions.append(record)
        else:
            unpassed.append(record)

# File-name prefix: "Media<position of the medium in geldict>_<abbreviated organism name>".
data_name = f"Media{keytoindex(geldict,gelselect.value)}_{simplifyname(organism_name)}"
SeqIO.write(unfiltered,os.path.join(data_folder,data_name+"_total.faa"),"fasta")
SeqIO.write(wash,os.path.join(data_folder,data_name+"_wash.faa"),"fasta")
SeqIO.write(fractions,os.path.join(data_folder,data_name+"_fractions.faa"),"fasta")
# Kept as the last expression so the cell still displays the value returned by this write.
SeqIO.write(unpassed,os.path.join(data_folder,data_name+"_unpassed.faa"),"fasta")

771

In [66]:
#Converting modified fasta into a pd dataframe to show filter at work, preset file name used for simplicity

# Column-oriented accumulator: one list per DataFrame column, filled in
# file order so the four lists stay aligned row by row.
data = {
    'sequence' : [],
    'MW' : [],
    'description' : [],
    'ID' : [],
}

# Read back the "_fractions" file written by the size filter and recompute
# each sequence's molecular weight.
with open(os.path.join(data_folder,data_name+"_fractions.faa")) as protfile:
    for record in SeqIO.parse(protfile,"fasta"):
        sequence = str(record.seq)
        data['sequence'].append(sequence)
        protparams = PA(sequence)
        data['MW'].append(protparams.molecular_weight())
        data['description'].append(record.description)
        data['ID'].append(record.id)

# Order the proteins from lightest to heaviest and renumber the rows 0..n-1;
# the steps are chained rather than applied in place.
df = (
    pd.DataFrame.from_dict(data)
    .sort_values(by=["MW"])
    .reset_index(drop=True)
)
df

Unnamed: 0,sequence,MW,description,ID
0,MNRVQFNHHHHHHPD,1943.0746,sp|Q8D079|LPHI_YERPE his operon leader peptide...,sp|Q8D079|LPHI_YERPE
1,MRYISLNTTIITTTETTGYGAG,2364.6268,sp|Q8CKC6|LPT_YERPE thr operon leader peptide ...,sp|Q8CKC6|LPT_YERPE
2,MKAILQVINLVLISVVVIIIPPCGAALGRRKA,3370.2549,sp|Q8D1L4|LPID_YERPE ilv operon leader peptide...,sp|Q8D1L4|LPID_YERPE
3,MSQISTKHRTVLFRRWMAIICCLIINIAYLVY,3857.7223,sp|O68693|YSCA_YERPE Yop proteins translocatio...,sp|O68693|YSCA_YERPE
4,MKVRASVKKLCRNCKIVKRNGVVRVICSAEPKHKQRQG,4349.2785,sp|Q8ZJ91|RL361_YERPE 50S ribosomal protein L3...,sp|Q8ZJ91|RL361_YERPE
...,...,...,...,...
251,MYHVIAATTNPAKINAITLAFDDVYGPGQYRIEGVNVDSGVPLQPI...,19694.2785,sp|Q8ZIP1|NCPP_YERPE Inosine/xanthosine tripho...,sp|Q8ZIP1|NCPP_YERPE
252,MIGILNRWRQFGRRYFWPHLLLGMVAASLGVPLNLSGVPDHAALAN...,19846.5564,sp|Q8ZIE1|SECM_YERPE Secretion monitor OS=Yers...,sp|Q8ZIE1|SECM_YERPE
253,MMSLWIAIGALSTLALVSGVVLGFAARRFQVDEDPVVEQVDAILPQ...,19886.8811,sp|Q8ZEC9|RNFB_YERPE Ion-translocating oxidore...,sp|Q8ZEC9|RNFB_YERPE
254,MFIGDASILKPIQWCATEHPELPADIADWLMELGSMTRRFEQHCQR...,19915.6190,sp|Q8ZJ20|UBIC_YERPE Chorismate pyruvate-lyase...,sp|Q8ZJ20|UBIC_YERPE


In [67]:
# Number of fractions to split the MW-sorted table into.
nfractions = 7
# Rows per fraction; the last fraction also takes whatever rows remain.
fraclen = round(len(df)/nfractions)
# Motif searched for in every sequence.
seqofinterest = "HHHHHH"
# Maps the 1-based fraction number (as a string) to
# [hit count, [row labels of the matching sequences]].
seqhits = {}
for n in range(nfractions):
    fraction_label = str(n + 1)
    start = n * fraclen
    # The final fraction is open-ended so rounding of fraclen never drops rows.
    stop = None if n == nfractions - 1 else start + fraclen
    tempdf = df.iloc[start:stop]
    fracrecords = []
    for index in tempdf.index:
        row = tempdf.loc[index]
        fracrecords.append(
            SeqRecord(Seq(row['sequence']),id=str(row["ID"]),description=row["description"])
        )
        if seqofinterest in row['sequence']:
            # Look up and update the SAME key (the current fraction), so a
            # second hit in a fraction is accumulated rather than overwriting
            # the first, and hits in the last fraction are credited to it.
            hit_entry = seqhits.setdefault(fraction_label, [0, []])
            hit_entry[0] += 1
            hit_entry[1].append(index)
    SeqIO.write(fracrecords,os.path.join(data_folder,data_name+"_fraction"+fraction_label+".faa"),"fasta")

# Display the last fraction (tempdf still holds it after the loop).
tempdf

Unnamed: 0,sequence,MW,description,ID
222,MKYQQLENLESGWKWAYLVKKHREGEAITRHIENSAAQDAVEQLMK...,17933.3086,sp|Q8ZG78|MATP_YERPE Macrodomain Ter protein O...,sp|Q8ZG78|MATP_YERPE
223,MNPRRKSRLYLAMVVLIGISLTTTLVLYALRSNIDLFYTPGEILQG...,18032.4853,sp|Q8ZD54|CCME_YERPE Cytochrome c-type biogene...,sp|Q8ZD54|CCME_YERPE
224,MLNIVLFEPEIPPNTGNIIRLCANTGCQLHLIKPLGFTWDDKRLRR...,18233.64,sp|Q74Y93|TRML_YERPE tRNA (cytidine(34)-2'-O)-...,sp|Q74Y93|TRML_YERPE
225,MKKWLCAASLGLALAASASVQAADKIAIVNVSSIFQQLPAREAVAK...,18278.6191,sp|P58607|SKP_YERPE Chaperone protein Skp OS=Y...,sp|P58607|SKP_YERPE
226,MTVTLNRGSITSLMSSSQAVSTLQPVASELKTQLENKLKSESAEKT...,18347.7399,sp|P68590|YSCH_YERPE Yop proteins translocatio...,sp|P68590|YSCH_YERPE
227,MPSFDIVSEIDMQEVRNAVENATRDLANRWDFRNVPASFELNEKNE...,18349.6449,sp|Q8ZC52|Y3170_YERPE UPF0234 protein YPO3170/...,sp|Q8ZC52|Y3170_YERPE
228,MRMSTTTEIIAHHWAFAVFLIGAVGLCGLMLLGAYFLGGRAQARAK...,18376.116,sp|Q0WDX2|NUOA_YERPE NADH-quinone oxidoreducta...,sp|Q0WDX2|NUOA_YERPE
229,MTKKKAYKPGSATIAQNKRARHEYFIEEEFEAGLALQGWEVKSLRA...,18407.1084,sp|Q8ZH14|SSRP_YERPE SsrA-binding protein OS=Y...,sp|Q8ZH14|SSRP_YERPE
230,MFDVLIYLFETYMHNEPEMLVDQDKITDDLADAGFYREDINNALNW...,18471.8873,sp|Q8ZJ77|SMG_YERPE Protein Smg OS=Yersinia pe...,sp|Q8ZJ77|SMG_YERPE
231,MKAKSLTLISITVMFFLFLIYSFNDLFFYSEVKYGDIHEHLDLRMQ...,18528.9512,sp|Q56978|PSAF_YERPE Protein PsaF OS=Yersinia ...,sp|Q56978|PSAF_YERPE


In [68]:
# Print, fraction by fraction, the row label of every sequence that matched.
for fraction_number, hit_details in seqhits.items():
    print(f'Hits in fraction {fraction_number}')
    # hit_details[1] is the list of row labels recorded for this fraction.
    for row_label in hit_details[1]:
        print(f'\tHit on index {row_label}')

Hits in fraction 1
	Hit on index 0


In [65]:
# Display the fractions DataFrame (sorted by MW, with a fresh 0..n-1 index, where it is built).
df

Unnamed: 0,sequence,MW,description,ID
0,MNRVQFNHHHHHHPD,1943.0746,sp|Q8D079|LPHI_YERPE his operon leader peptide...,sp|Q8D079|LPHI_YERPE
1,MRYISLNTTIITTTETTGYGAG,2364.6268,sp|Q8CKC6|LPT_YERPE thr operon leader peptide ...,sp|Q8CKC6|LPT_YERPE
2,MKAILQVINLVLISVVVIIIPPCGAALGRRKA,3370.2549,sp|Q8D1L4|LPID_YERPE ilv operon leader peptide...,sp|Q8D1L4|LPID_YERPE
3,MSQISTKHRTVLFRRWMAIICCLIINIAYLVY,3857.7223,sp|O68693|YSCA_YERPE Yop proteins translocatio...,sp|O68693|YSCA_YERPE
4,MKVRASVKKLCRNCKIVKRNGVVRVICSAEPKHKQRQG,4349.2785,sp|Q8ZJ91|RL361_YERPE 50S ribosomal protein L3...,sp|Q8ZJ91|RL361_YERPE
...,...,...,...,...
251,MYHVIAATTNPAKINAITLAFDDVYGPGQYRIEGVNVDSGVPLQPI...,19694.2785,sp|Q8ZIP1|NCPP_YERPE Inosine/xanthosine tripho...,sp|Q8ZIP1|NCPP_YERPE
252,MIGILNRWRQFGRRYFWPHLLLGMVAASLGVPLNLSGVPDHAALAN...,19846.5564,sp|Q8ZIE1|SECM_YERPE Secretion monitor OS=Yers...,sp|Q8ZIE1|SECM_YERPE
253,MMSLWIAIGALSTLALVSGVVLGFAARRFQVDEDPVVEQVDAILPQ...,19886.8811,sp|Q8ZEC9|RNFB_YERPE Ion-translocating oxidore...,sp|Q8ZEC9|RNFB_YERPE
254,MFIGDASILKPIQWCATEHPELPADIADWLMELGSMTRRFEQHCQR...,19915.6190,sp|Q8ZJ20|UBIC_YERPE Chorismate pyruvate-lyase...,sp|Q8ZJ20|UBIC_YERPE
