In [1]:
from Bio import SeqIO
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *

In [2]:
# Size-exclusion media choices: label -> [lower, upper] molecular-weight bounds.
# Each pair is the label's kDa range written out in Da (e.g. 0.1-1.8 kDa -> 100, 1800).
geldict = {
    "Bio-P 0.1-1.8 kDa": [100, 1800],
    "Bio-P 0.8-4.0 kDa": [800, 4000],
    "Bio-P 1.0-6.0 kDa": [1000, 6000],
    "Bio-P 1.5-20.0 kDa": [1500, 20000],
    "Bio-P 2.5-40.0 kDA": [2500, 40000],
    "Bio-P 3.0-60.0 kDa": [3000, 60000],
    "Bio-P 5.0-100 kDa": [5000, 100000],
    "S-X 0.4-14.0 kDa": [400, 14000],
    "S-X <2.0 kDA": [0, 2000],
    "S-X <0.4 kDA": [0, 400],
    "Bio-A 10.0 - 1500 kDA": [10000, 1500000],
}
# Dropdown offering the media labels; later cells read the choice via gelselect.value.
gelselect = widgets.Dropdown(description="Media", options=tuple(geldict))
display(gelselect)



Dropdown(description='Media', options=('Bio-P 0.1-1.8 kDa', 'Bio-P 0.8-4.0 kDa', 'Bio-P 1.0-6.0 kDa', 'Bio-P 1…

In [4]:
def listtostr(strlist):
    """Join the strings in ``strlist`` into one underscore-separated string.

    Args:
        strlist: Iterable of strings.

    Returns:
        The items joined with "_" (no trailing underscore); "" for empty input.
    """
    # str.join replaces the manual concatenate-then-trim loop.
    return "_".join(strlist)
def simplifyname(orgname):
    """Abbreviate the first underscore-separated token of ``orgname`` to its initial.

    For example ``"Yersinia_pestis"`` becomes ``"Y_pestis"``. Every token after
    the first is kept unchanged.

    Args:
        orgname: Underscore-separated name string.

    Returns:
        The name with its first token reduced to its first character. An empty
        first token (empty name or leading underscore) stays empty.
    """
    head, *rest = orgname.split("_")
    # head[:1] rather than head[0]: an empty first token yields "" instead of
    # raising IndexError.
    return "_".join([head[:1], *rest])
def keytoindex(inputdict, string):
    """Return the zero-based position of ``string`` among the keys of ``inputdict``.

    Positions follow the dictionary's iteration order. ``None`` is returned
    when no key equals ``string``.
    """
    for position, candidate in enumerate(inputdict):
        if candidate == string:
            return position
    return None

In [5]:
#Generate a file path for the correct .fasta file
# Strip stray whitespace so a pasted name with a trailing space or newline
# still resolves to the file on disk.
input_filename = input("Enter File Name").strip()
datafile = os.path.join('data', input_filename)
# Drop only a trailing ".faa" extension; str.replace would also remove ".faa"
# occurring in the middle of the name.
if input_filename.endswith(".faa"):
    organism_name = input_filename[:-len(".faa")]
else:
    organism_name = input_filename

Enter File Name Y_pestis_ionexchange.faa


In [6]:
# Output folder for this run: data/<organism>_size_exclusion_<media label>.
# NOTE(review): the media label is used verbatim, so labels containing "<"
# give a folder name that Windows rejects — confirm the target platforms.
data_folder = os.path.join('data', organism_name+"_size_exclusion_"+gelselect.value)

# makedirs(exist_ok=True) also creates a missing 'data' parent and makes the
# cell safe to re-run without a separate existence check.
os.makedirs(data_folder, exist_ok=True)

In [7]:
# Partition every record of the input FASTA by molecular weight against the
# [size_min, size_max] bounds of the selected media:
#   MW < size_min              -> wash
#   size_min <= MW <= size_max -> fractions
#   MW > size_max              -> unpassed
size_min, size_max = geldict[gelselect.value]
unfiltered = []
wash = []
fractions = []
unpassed = []
with open(datafile) as protfile:
    for record in SeqIO.parse(protfile, "fasta"):
        unfiltered.append(record)
        # Compute the weight once per record; both comparisons below need it.
        mol_weight = PA(str(record.seq)).molecular_weight()
        if mol_weight < size_min:
            wash.append(record)
        elif mol_weight <= size_max:
            fractions.append(record)
        else:
            unpassed.append(record)

# File prefix: "Media<position of the selected media in geldict>_<abbreviated organism name>".
data_name = f"Media{keytoindex(geldict, gelselect.value)}_{simplifyname(organism_name)}"
SeqIO.write(unfiltered, os.path.join(data_folder, data_name + "_total.faa"), "fasta")
SeqIO.write(wash, os.path.join(data_folder, data_name + "_wash.faa"), "fasta")
SeqIO.write(fractions, os.path.join(data_folder, data_name + "_fractions.faa"), "fasta")
# Last expression of the cell: its return value is what the cell displays.
SeqIO.write(unpassed, os.path.join(data_folder, data_name + "_unpassed.faa"), "fasta")

58

In [8]:
#Load the "_fractions.faa" file from data_folder into a pandas DataFrame
#(sequence, MW, description, ID) to show the filter at work

# Column-wise accumulator: one list entry per record in the fractions FASTA.
data = {
    'sequence' : [],
    'MW' : [],
    'description' : [],
    'ID' : [],
}

with open(os.path.join(data_folder, data_name + "_fractions.faa")) as protfile:
    for record in SeqIO.parse(protfile, "fasta"):
        sequence = str(record.seq)
        protparams = PA(sequence)
        data['sequence'].append(sequence)
        data['MW'].append(protparams.molecular_weight())
        data['description'].append(record.description)
        data['ID'].append(record.id)

# Sort by molecular weight and renumber the rows 0..n-1 so that index labels
# match row positions in the sorted table. Chaining returns a new frame
# instead of mutating one in place.
df = (
    pd.DataFrame.from_dict(data)
    .sort_values(by=["MW"])
    .reset_index(drop=True)
)
df

Unnamed: 0,sequence,MW,description,ID
0,MNRVQFNHHHHHHPD,1943.0746,sp|Q8D079|LPHI_YERPE his operon leader peptide...,sp|Q8D079|LPHI_YERPE
1,MAKEDNIEMQGTVLDTLPNTMFRVELENGHVVTAHISGKMRKNYIR...,8235.4593,sp|P65115|IF1_YERPE Translation initiation fac...,sp|P65115|IF1_YERPE
2,MLCAIYRSPKRDQTYLYIEKKDDFSRVPAELLASFGKPQFAMLLAL...,10226.8684,sp|Q7CID0|Y2080_YERPE YcgL domain-containing p...,sp|Q7CID0|Y2080_YERPE
3,MAWIILVIAGLLEVIWAIGLKYSHGFSRLTPSIITLVAMAASVFLL...,10650.8591,sp|Q8D1E4|GDX_YERPE Guanidinium exporter OS=Ye...,sp|Q8D1E4|GDX_YERPE
4,MKSSHFDEYDKTLKQAELAIADSDHRAKLLQEMCADIGLTPEAVMK...,11019.5549,sp|P69957|LCRG_YERPE Low calcium response locu...,sp|P69957|LCRG_YERPE
5,MFEQRVNSDVLTVATVNSQDQVTQKPLRDSVKQALKNYFAQLNGQD...,11196.7883,sp|Q8ZAX8|FIS_YERPE DNA-binding protein Fis OS...,sp|Q8ZAX8|FIS_YERPE
6,MLEFEGRIIDTDAQGYLKNSTDWSEALAPVLAEQEGITLTEPHWEV...,12394.1662,sp|Q8ZG65|TUSE_YERPE Sulfurtransferase TusE OS...,sp|Q8ZG65|TUSE_YERPE
7,MDNASKPTFQDVLEFVRMFRRKNKLQREIVDNEKKIRDNQKRVLLL...,12461.2426,sp|Q8ZFW5|Y1560_YERPE UPF0265 protein YPO1560/...,sp|Q8ZFW5|Y1560_YERPE
8,MSQRDTGAHYENLARRHLERAGLVFQAANVAFRGGEIDLIMRDGDA...,13076.6017,sp|Q8ZB75|Y3549_YERPE UPF0102 protein YPO3549/...,sp|Q8ZB75|Y3549_YERPE
9,MTAIDVMWVGLGGGIGSLLRWWIGLSIGKVYKGNFPLGTFLINISG...,13344.5506,sp|Q8ZDB1|CRCB3_YERPE Putative fluoride ion tr...,sp|Q8ZDB1|CRCB3_YERPE


In [9]:
# Split the MW-sorted table into nfractions consecutive slices, write each
# slice to its own FASTA file, and record which rows contain seqofinterest.
nfractions = 7
# NOTE(review): round() may round up, in which case the earlier fractions can
# consume every row and leave the last fraction empty.
fraclen = round(len(df)/nfractions)
seqofinterest = "HHHHHH"
# seqhits maps the 1-based fraction number (as a string) to
# [hit count, [df index labels of the hits]].
seqhits = {}
for fraction_no in range(1, nfractions + 1):
    start = (fraction_no - 1) * fraclen
    # The last fraction takes every remaining row.
    stop = fraction_no * fraclen if fraction_no < nfractions else None
    tempdf = df.iloc[start:stop]
    fracrecords = []
    for row in tempdf.itertuples():
        fracrecords.append(
            SeqRecord(Seq(row.sequence), id=str(row.ID), description=row.description)
        )
        if seqofinterest in row.sequence:
            # Look up and update the SAME key. The earlier code tested
            # str(n) but wrote str(n+1), which dropped repeat hits within a
            # fraction and raised KeyError on hits in consecutive fractions.
            hits = seqhits.setdefault(str(fraction_no), [0, []])
            hits[0] += 1
            hits[1].append(row.Index)
    SeqIO.write(
        fracrecords,
        os.path.join(data_folder, data_name + "_fraction" + str(fraction_no) + ".faa"),
        "fasta",
    )

# Display the final (remainder) fraction.
tempdf

Unnamed: 0,sequence,MW,description,ID


In [10]:
# Report every fraction that contains the motif, listing the df index of each hit.
for key, entry in seqhits.items():
    print(f'Hits in fraction {key}')
    # entry is [hit count, [index labels]]; only the labels are printed.
    for hit in entry[1]:
        print(f'\tHit on index {hit}')

Hits in fraction 1
	Hit on index 0


In [11]:
# Show the MW-sorted table so the hit indices printed above can be looked up.
df

Unnamed: 0,sequence,MW,description,ID
0,MNRVQFNHHHHHHPD,1943.0746,sp|Q8D079|LPHI_YERPE his operon leader peptide...,sp|Q8D079|LPHI_YERPE
1,MAKEDNIEMQGTVLDTLPNTMFRVELENGHVVTAHISGKMRKNYIR...,8235.4593,sp|P65115|IF1_YERPE Translation initiation fac...,sp|P65115|IF1_YERPE
2,MLCAIYRSPKRDQTYLYIEKKDDFSRVPAELLASFGKPQFAMLLAL...,10226.8684,sp|Q7CID0|Y2080_YERPE YcgL domain-containing p...,sp|Q7CID0|Y2080_YERPE
3,MAWIILVIAGLLEVIWAIGLKYSHGFSRLTPSIITLVAMAASVFLL...,10650.8591,sp|Q8D1E4|GDX_YERPE Guanidinium exporter OS=Ye...,sp|Q8D1E4|GDX_YERPE
4,MKSSHFDEYDKTLKQAELAIADSDHRAKLLQEMCADIGLTPEAVMK...,11019.5549,sp|P69957|LCRG_YERPE Low calcium response locu...,sp|P69957|LCRG_YERPE
5,MFEQRVNSDVLTVATVNSQDQVTQKPLRDSVKQALKNYFAQLNGQD...,11196.7883,sp|Q8ZAX8|FIS_YERPE DNA-binding protein Fis OS...,sp|Q8ZAX8|FIS_YERPE
6,MLEFEGRIIDTDAQGYLKNSTDWSEALAPVLAEQEGITLTEPHWEV...,12394.1662,sp|Q8ZG65|TUSE_YERPE Sulfurtransferase TusE OS...,sp|Q8ZG65|TUSE_YERPE
7,MDNASKPTFQDVLEFVRMFRRKNKLQREIVDNEKKIRDNQKRVLLL...,12461.2426,sp|Q8ZFW5|Y1560_YERPE UPF0265 protein YPO1560/...,sp|Q8ZFW5|Y1560_YERPE
8,MSQRDTGAHYENLARRHLERAGLVFQAANVAFRGGEIDLIMRDGDA...,13076.6017,sp|Q8ZB75|Y3549_YERPE UPF0102 protein YPO3549/...,sp|Q8ZB75|Y3549_YERPE
9,MTAIDVMWVGLGGGIGSLLRWWIGLSIGKVYKGNFPLGTFLINISG...,13344.5506,sp|Q8ZDB1|CRCB3_YERPE Putative fluoride ion tr...,sp|Q8ZDB1|CRCB3_YERPE
