In [88]:
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *
import time
import stat
import math

pd.set_option('display.max_rows', 150)

In [53]:
#Additional columns can be added by including an attribute name and lambda function to define it, as long as includeDefaults=True the default attributes don't need to be redefined
def fasta_to_dataframe(fastaPath,includeDefaults = True, pH = 7.0, attributes=None):
    """Build a DataFrame holding one row per record of a FASTA file.

    Parameters
    ----------
    fastaPath : path of the FASTA file; passed straight to ``SeqIO.parse``.
    includeDefaults : when True the columns 'Sequence', 'ID', 'Description',
        'Molecular Weight' and 'Charge at pH <pH>' are generated automatically.
    pH : pH used for the default charge column (it also appears in that column's name).
    attributes : optional mapping of column name -> function(record) returning the cell value.
        These columns are added after the defaults; an entry that reuses a default
        column name replaces that default. The mapping is never modified.

    Returns
    -------
    pandas.DataFrame with one column per attribute function.

    Raises
    ------
    ValueError (a subclass of Exception) when there is no column to compute,
    i.e. includeDefaults is False and no attribute functions were supplied.
    """
    #Work on a private copy: the caller's dict (and the shared default) must never be cleared or extended
    columnFunctions = {}
    if includeDefaults:
        #'X' (any residue) is swapped for 'Q' before analysis, the same substitution the separation step uses
        def _analysis(record):
            return PA(str(record.seq).replace('X','Q'))

        columnFunctions['Sequence'] = lambda record : str(record.seq)
        columnFunctions['ID'] = lambda record : record.id
        columnFunctions['Description'] = lambda record : record.description
        columnFunctions['Molecular Weight'] = lambda record : _analysis(record).molecular_weight()
        columnFunctions['Charge at pH {}'.format(pH)] = lambda record : _analysis(record).charge_at_pH(pH)
    if attributes:
        columnFunctions.update(attributes)
    if not columnFunctions:
        raise ValueError('No attribute functions, set includeDefaults = True or add attribute functions')

    #from_dict expects {column name: list of values}, so start every column with an empty list
    dataDict = {name: [] for name in columnFunctions}
    #SeqIO.parse opens and closes the file itself when given a path
    for record in SeqIO.parse(fastaPath,'fasta'):
        for name, function in columnFunctions.items():
            dataDict[name].append(function(record))
    return pd.DataFrame.from_dict(dataDict)
                
def fractionate(df,numFractions,noise=0.0,newIndices = False):
    """Split a dataframe into numFractions consecutive, evenly sized fraction dataframes.

    Parameters
    ----------
    df : dataframe to split; rows are taken by position, in their current order.
    numFractions : number of fractions to produce (must be at least 1).
    noise : how far each fraction bleeds past its exact boundaries, as a share of
        the fraction width (0.1 = 10% more data in either direction).
    newIndices : when True every fraction gets a fresh 0..n-1 index.

    Returns
    -------
    list of numFractions dataframes. The last fraction always runs to the end
    of df so no trailing rows are lost.

    Raises
    ------
    ValueError if numFractions is smaller than 1.
    """
    if numFractions < 1:
        raise ValueError('numFractions must be at least 1')

    total = len(df)
    #Keep the width fractional and round each boundary individually; rounding the width
    #first pushes the whole remainder into the last fraction (10 rows / 7 -> 1,1,1,1,1,1,4)
    fracLen = total / numFractions
    fuzz = fracLen * noise #How many additional entries will be added as "noise"
    dfFractions = []
    for n in range(numFractions):
        fuzzyMin = max(0,int(round(n*fracLen - fuzz))) #first row of this fraction
        if n == numFractions - 1:
            fuzzyMax = total #the last fraction always reaches the end of the data
        else:
            fuzzyMax = min(total,int(round((n+1)*fracLen + fuzz))) #one past the last row

        output = df.iloc[fuzzyMin:fuzzyMax]
        if newIndices:
            #reset_index returns a new frame, so the slice of df is not modified in place
            output = output.reset_index(drop=True)

        dfFractions.append(output)
    return dfFractions

In [95]:
#Each dictionary below maps a dropdown label to a predicate that receives a protein-analysis
#object and answers whether the chosen media separates that protein.
def _weight_between(lower, upper):
    """Return a predicate that is True when lower < x.molecular_weight() < upper (both bounds exclusive)."""
    def _in_range(x):
        return bool(lower < x.molecular_weight() < upper)
    return _in_range

def _binds_q_media(x):
    """True when the charge at the slider's current pH is -0.01 or lower."""
    return bool(x.charge_at_pH(pHslider.value) <= -0.01)

def _binds_s_media(x):
    """True when the charge at the slider's current pH is 0.01 or higher."""
    return bool(x.charge_at_pH(pHslider.value) >= 0.01)

#Size-exclusion media, keyed by label; bounds are molecular weights
sizeDict = {
    #"Bio-P 0.1-1.8 kDa" : _weight_between(100, 1800),  #Currently disabled because the size range is too small to be useful
    "Bio-P 0.8-4.0 kDa" : _weight_between(800, 4000),
    "Bio-P 1.0-6.0 kDa" : _weight_between(1000, 6000),
    "Bio-P 1.5-20.0 kDa" : _weight_between(1500, 20000),
    "Bio-P 2.5-40.0 kDA" : _weight_between(2500, 40000),
    "Bio-P 3.0-60.0 kDa" : _weight_between(3000, 60000),
    "Bio-P 5.0-100 kDa" : _weight_between(5000, 100000),
    "S-X 0.4-14.0 kDa" : _weight_between(400, 14000),
    #"S-X <2.0 kDA" : _weight_between(0, 2000),         #Currently disabled because the size range is too small to be useful
    #"S-X <0.4 kDA" : _weight_between(0, 400),          #Currently disabled because the size range is too small to be useful
    "Bio-A 10.0 - 500 kDA" : _weight_between(10000, 500000),
    "Bio-A 10.0 - 1500 kDA" : _weight_between(10000, 1500000),
}

#Ion-exchange media; the pH is read from pHslider every time a predicate is called
ionDict = {
    "Q Media (Triethylamine +)" : _binds_q_media,
    "S Media (Sulfite -)" : _binds_s_media,
}

#No affinity media are defined yet
affinityDict = {}

#Method name -> media dictionary, used by the method selection menu
methodDict = {
    'Size Exclusion' : sizeDict,
    'Ion Exchange' : ionDict,
    'Affinity Chromatography' : affinityDict,
}

#Method name -> dataframe column the separated proteins are sorted by
sortColumn = {
    'Size Exclusion' : 'Molecular Weight',
    'Ion Exchange' : 'Charge at pH 7.0',
    'Affinity Chromatography' : '',
}


#Names offered as input choices: every entry of the data folder except Jupyter's checkpoint directory.
#Filtering (instead of list.remove) avoids a ValueError when no checkpoint directory exists.
previousInputs = [entry for entry in os.listdir('data') if entry != '.ipynb_checkpoints']
style = {'description_width': 'initial'} #shorthand for widget appearance: description width
autoLayout = Layout(width='auto')

In [96]:
def confirmMethod(_):
    """Button callback: swap the media dropdown's options to those of the selected method."""
    mediaSelect.options = list(methodDict[methodSelect.value].keys())

methodSelect = widgets.Dropdown(options=list(methodDict.keys()),description='Method:')
methodButton = widgets.Button(description='Confirm Method')
methodButton.on_click(confirmMethod)
mediaSelect = widgets.Dropdown(options=list(methodDict[methodSelect.value].keys()),description='Media')
pHslider = widgets.FloatSlider(value=7.0,min=0,max=14,step=0.1,description='pH',style=style)
#Derive the ion-exchange sort column from the slider instead of hard-coding pH 7.0,
#so re-running this cell always leaves the two in agreement
sortColumn['Ion Exchange'] = 'Charge at pH {}'.format(pHslider.value)
selectDisplay = TwoByTwoLayout(top_left=methodSelect,top_right=methodButton,bottom_left=mediaSelect,bottom_right=pHslider,layout=Layout(width='50%'))
display(selectDisplay)

#Output widget that captures anything raised or printed by the slider callback
ionDictValue = widgets.Output()

def foo(_): #Very temporary fix to a problem I found, will refine later
    """Slider callback: keep sortColumn's ion-exchange entry in step with the selected pH."""
    with ionDictValue:
        sortColumn['Ion Exchange'] = 'Charge at pH {}'.format(pHslider.value)

#names='value' limits the callback to actual pH changes; without it every trait update triggers it
pHslider.observe(foo, names='value')
ionDictValue
            

TwoByTwoLayout(children=(Dropdown(description='Method:', layout=Layout(grid_area='top-left'), options=('Size E…

Output()

In [97]:
def confirmInput(_):
    """Button callback: validate the typed file name and make it the current input.

    A name is accepted only when it is an entry of the data folder and a regular file
    (so sub-folders such as .ipynb_checkpoints are rejected). Accepted names are
    remembered in previousInputs; rejected names produce an error message.
    """
    chosen = inputFile.value
    if chosen in os.listdir('data') and os.path.isfile(os.path.join('data',chosen)):
        if chosen not in previousInputs:
            previousInputs.append(chosen)
        currentInput.value = chosen
        inputFile.options = previousInputs
        errorText.value = ''
    else:
        errorText.value = 'Error: "{}" not found'.format(chosen)

boxLayout = Layout(width='30%')

#Fall back to an empty entry when the data folder holds no files (previousInputs[0] would raise IndexError)
initialInput = previousInputs[0] if previousInputs else ''
inputFile = widgets.Combobox(value=initialInput,placeholder='Enter a file to be separated',options=previousInputs,description='Unseparated data',style = style)
inputButton = widgets.Button(description='Confirm File')
currentInput = widgets.HTML(value=inputFile.value,description='Current input:',style = style)
errorText = widgets.HTML(value=None,description='\t',style={'text_color':'#CC0000','font_size':'16px'})

inputButton.on_click(confirmInput)

#Top row: file entry and confirm button; bottom row: confirmed file and error message
inputTop = Box(children=[inputFile,inputButton],layout=boxLayout)
inputBottom = Box(children=[currentInput,errorText],layout=boxLayout)
inputDisplay=VBox([inputTop,inputBottom])

display(inputDisplay)




VBox(children=(Box(children=(Combobox(value='Bacillus_subtilis.fasta', description='Unseparated data', options…

In [6]:
#Output widget hosting the folder-name entry and, when needed, the overwrite prompt
out = widgets.Output()

def checkFolder(_):
    """Button callback: create the output folder, or ask before reusing an existing one.

    Either way the name entry and its button are locked afterwards.
    """
    targetFolder = os.path.join('outputs',outputFile.value)
    with out:
        if os.path.isdir(targetFolder):
            header.description=f'"{outputFile.value}" already exists. Some files may be replaced. Continue?'
            outputFile.disabled = outputButton.disabled = True
            display(overwriteCheck)
        else:
            #makedirs also creates the 'outputs' parent when it is missing (os.mkdir would raise FileNotFoundError)
            os.makedirs(targetFolder, exist_ok=True)
            outputFile.disabled = outputButton.disabled = True


def confirmOverwrite(_):
    """'Yes' callback: dismiss the prompt and keep the (still locked) folder name."""
    with out:
        out.clear_output()
        display(outputDisplay)

def denyOverwrite(_):
    """'No' callback: dismiss the prompt, clear the name and unlock the entry for a new one."""
    with out:
        out.clear_output()
        display(outputDisplay)
        outputFile.value = ''
        outputFile.disabled = outputButton.disabled = False

#Generate the output name entry; the default label is the current input's name up to its first '.'
outputFile = widgets.Text(placeholder='Enter a label for the output folder.', style=style,disabled=False,value=currentInput.value.partition('.')[0])
outputButton = widgets.Button(description='Confirm',disabled=False)
outputDisplay = Box(children=[outputFile,outputButton])

#Generate the overwrite prompt
confirmButton = widgets.Button(description='Yes',layout=Layout(width='auto'))
denyButton = widgets.Button(description='No',layout=Layout(width='auto'))
header = widgets.HTML(description=f'"{outputFile.value}" already exists. Continue?',layout=Layout(justify_content='center',width='auto'),style=style)
YesNo = Box(children=[confirmButton, denyButton],layout=Layout())
overwriteCheck = VBox([header,YesNo],layout=Layout(justify_content='center',),)

#Define button functionality
outputButton.on_click(checkFolder)
confirmButton.on_click(confirmOverwrite)
denyButton.on_click(denyOverwrite)

with out:
    display(outputDisplay)

out

Output()

In [92]:
#Bind user inputs to named variables for simplicity
inFile = os.path.join('data',inputFile.value)
outFolder = os.path.join('outputs',outputFile.value)
method = methodDict[methodSelect.value][mediaSelect.value]

#Create lists of SeqRecord objects that biopython can transform into .fasta files
totals = []
wash = []
separated = []

#Separate proteins by the parameter
with open(inFile):
    for record in SeqIO.parse(inFile,'fasta'):
        totals.append(record)
        sequence = str(record.seq)
        
        #X is a notation for any residue, biopython doesn't accept it though so for the protien analysis we substituted Glutamine since it's close to the average for the properties we care about.
        protparams = PA(sequence.replace("X","Q"))
        if method(protparams):
            separated.append(record)
        else:
            wash.append(record)

SeqIO.write(totals,os.path.join(outFolder,'{}_total'.format(outputFile.value)),'fasta')
SeqIO.write(separated,os.path.join(outFolder,'{}_separated'.format(outputFile.value)),'fasta')
SeqIO.write(wash,os.path.join(outFolder,'{}_wash'.format(outputFile.value)),'fasta')

main_df = fasta_to_dataframe(os.path.join(outFolder,'{}_separated'.format(outputFile.value)),includeDefaults=True,pH=pHslider.value)
main_df.index.name = 'Protein Index #'
main_df.sort_values(by=sortColumn[methodSelect.value],inplace=True)
fraction_dfs = fractionate(main_df,7)
for n in range(len(fraction_dfs)):
    records = []
    for index in range(len(fraction_dfs[n])):
        single_frac = fraction_dfs[n]
        records.append(SeqRecord(seq = Seq(single_frac.iloc[index]['Sequence']),id=single_frac.iloc[index]['ID'],description=single_frac.iloc[index]['Description']))
    SeqIO.write(records,os.path.join(outFolder,'{}_fraction_{}'.format(outputFile.value,n+1)),'fasta')
                                     
    

In [30]:
#Output widget that hosts the paging menu and the currently visible page of main_df
outPages = widgets.Output()

#Column titles that can be ignored as sortable values
_unsortableColumns = ('ID','Description','Sequence')

def _pageCount():
    """Number of pages needed to show main_df at the current page size (never less than 1)."""
    return max(1, math.ceil(len(main_df)/pageSize.value))

def _currentPage():
    """Rows of main_df for the selected page, ordered by the chosen column and direction."""
    ordered = main_df.sort_values(by=sortBy.value,ascending=bool(ascButton.icon == 'sort-amount-asc'))
    firstRow = (pageNum.value-1)*pageSize.value
    return ordered.iloc[firstRow:firstRow+pageSize.value]

def refresh(_):
    """Redraw the menu and the current page, and enable/disable Prev/Next to match the page number."""
    with outPages:
        prevButton.disabled = pageNum.value <= pageNum.min
        nextButton.disabled = pageNum.value >= pageNum.max
        outPages.clear_output()
        display(dfMenu)
        display(_currentPage())

def pageUp(_):
    """'Next' callback: advance one page, staying on the last page when already there."""
    if pageNum.value < pageNum.max:
        pageNum.value += 1 #the observer on pageNum redraws
    else:
        refresh(None)

def pageDown(_):
    """'Prev' callback: go back one page, staying on the first page when already there."""
    if pageNum.value > pageNum.min:
        pageNum.value -= 1 #the observer on pageNum redraws
    else:
        refresh(None)

def adjustMax(_):
    """Page-size callback: recompute the page count and jump back to the first page."""
    pageNum.value = 1 #page 1 is valid for every page count, so set it before shrinking max
    pageNum.max = _pageCount()
    refresh(None) #final redraw, after max is updated, so the Next button reflects the new count

def changeSortBy(_):
    """Sort-column callback: redraw with the newly selected column."""
    refresh(None)

def swapSort(_):
    """Sort-direction callback: toggle ascending/descending and redraw."""
    if ascButton.icon == 'sort-amount-asc':
        ascButton.icon = 'sort-amount-desc'
    else:
        ascButton.icon = 'sort-amount-asc'
    refresh(None)


pageSize = widgets.BoundedIntText(value=25,min=5,max=100,step=5,description='Items per Page',style=style,layout=Layout(width='30%'))
pageNum = widgets.BoundedIntText(value=1,min=1,max=_pageCount(),description='Page:',style=style,layout=Layout(width='18%'))
#names='value' keeps the callbacks from firing on unrelated trait changes
pageSize.observe(adjustMax, names='value')
pageNum.observe(refresh, names='value') #also covers page numbers typed directly into the box

#define navigation buttons
prevButton = widgets.Button(description='Prev',style=style,layout=Layout(width='10%'),disabled=True)
prevButton.on_click(pageDown)
nextButton = widgets.Button(description='Next',style=style,layout=Layout(width='10%'))
nextButton.on_click(pageUp)

#Categories it makes sense to sort by: the index plus every column that is not free text.
#New columns are picked up automatically.
sortCategories = [main_df.index.name] + [column for column in main_df.keys() if column not in _unsortableColumns]

sortBy = widgets.Dropdown(options = sortCategories,description='Sort By:')
sortBy.observe(changeSortBy, names='value')
ascButton = widgets.Button(icon='sort-amount-asc',layout=Layout(max_width='35px'))
ascButton.on_click(swapSort)

pageBox = Box(children=[prevButton,pageNum,nextButton,pageSize],layout=Layout(width='35%'))
dfMenu = Box(children=[pageBox,sortBy,ascButton])

#Initial draw; this also disables Next when everything fits on a single page
refresh(None)

outPages

Output()

In [None]:
#Debug check: shows whether the 'Next' paging button is currently disabled
nextButton.disabled

In [29]:
#Debug check: shows the list of sortable categories built in the paging cell
sortCategories

['Protein Index #', 'charge at pH 7.0', 'Molecular Weight']

In [None]:
#NOTE(review): iteratePage is not defined anywhere in the visible notebook, so this presumably
#raises NameError on a fresh kernel - confirm and remove if it is a leftover
type(iteratePage)

In [None]:
#Debug helper: wipes everything currently shown in the paging output widget
with outPages:
    outPages.clear_output()

In [None]:
#Debug check: displays the sort-column dropdown on its own
sortBy

In [122]:
#Scratch cell: rebinds sortCategories to a new empty list, then displays main_df ordered by molecular weight
#NOTE(review): after this runs, sortCategories no longer holds the categories built in the paging cell
sortCategories = []

main_df.sort_values(by='Molecular Weight')

In [93]:
#Debug check: prints the sort column registered for the selected method, then displays the whole main_df
print(sortColumn[methodSelect.value])
main_df

Charge at pH 5.2


Unnamed: 0_level_0,Sequence,ID,Description,Molecular Weight,Charge at pH 5.2
Protein Index #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
71,MNEQTTTNTAAHDEPLAVLPPVDDDAAGREAVREKMADALTPGFQV...,sp|P27192|TRAD5_ECOLX,sp|P27192|TRAD5_ECOLX Protein TraD OS=Escheric...,13600.4916,-18.813572
72,MNDPKTVQQDDFAPFDDTANAAAALREKLADAMTPGFQVEFDPEEA...,sp|P27193|TRAD4_ECOLX,sp|P27193|TRAD4_ECOLX Protein TraD OS=Escheric...,9218.68,-15.70934
75,MDNAILNSELIAIQAGNIIVYNYDGGNREYISASTEYLAVGVGIPA...,sp|Q47427|TFAB_ECOLX,sp|Q47427|TFAB_ECOLX Tail fiber assembly prote...,22198.2712,-12.947646
68,MNNIPPIPQLGIYVSKIDPTLRITVTDVDIVDGEDDSPDDELFYLV...,sp|P21321|YR7G_ECOLX,sp|P21321|YR7G_ECOLX Protein ORFg in retron Ec...,12017.1133,-12.588549
12,MREAVIAEVSTQLSEVVGVIERHLEPTLLAVHLYGSAVDGGLKPHS...,sp|P0AG05|S3AD_ECOLX,sp|P0AG05|S3AD_ECOLX Aminoglycoside (3'') (9) ...,29331.9272,-10.581448
7,MGLKLDLTWFDKSTEDFKGEEYSKDFGDDGSVMESLGVPFKDNVNN...,sp|P02984|IMM3_ECOLX,sp|P02984|IMM3_ECOLX Colicin-E3 immunity prote...,9903.6998,-10.370982
10,MTAAQAKTTKKNTAAAAQEAAGAAQPSGLGLDSIGDLSSLLDAPAA...,sp|P07674|KORB2_ECOLX,sp|P07674|KORB2_ECOLX Transcriptional represso...,39010.6766,-10.050167
11,MDTTQVTLIHKILAAADERNLPLWIGGGWAIDARLGRVTRKHDDID...,sp|P0AE04|AADB_ECOLX,sp|P0AE04|AADB_ECOLX 2''-aminoglycoside nucleo...,19873.1785,-9.542451
21,MAELNLSNLTEADIITKCVMPAILNAGWDNTTQIRQEVKLRDGKVI...,sp|Q07736|T1RA_ECOLX,sp|Q07736|T1RA_ECOLX Type I restriction enzyme...,92070.0585,-9.232618
58,MSSRSELLLDRFAEKIGIGSISFNENRLCSFAIDEIYYISLSDAND...,sp|Q47015|CEST_ECOLX,sp|Q47015|CEST_ECOLX Tir chaperone OS=Escheric...,17746.804,-9.183543


In [34]:
#Debug check: displays the seventh fraction (index 6) ordered by molecular weight
fraction_dfs[6].sort_values(by='Molecular Weight')

Unnamed: 0_level_0,charge at pH 7.0,Sequence,ID,Description,Molecular Weight
Protein Index #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
487,-0.515034,MPGKVQDFFLCSLLLRIVSAGWCD,sp|P62550|REPL2_ECOLX,sp|P62550|REPL2_ECOLX Positive regulator of Re...,2699.2169
450,0.472565,MGISPPCYGNWPTLDETVKNVFTCLNFVVVWRVIFNPQRQGSWISC,sp|P19755|YPC4_ECOLX,sp|P19755|YPC4_ECOLX Uncharacterized 5.3 kDa p...,5304.1105
442,5.840035,MGGRFSGRVGIEKGGHPPSAADHSAGHLGPVCRFFRHPVITTRFNI...,sp|P14504|YP54_ECOLX,sp|P14504|YP54_ECOLX Uncharacterized 5.4 kDa p...,5472.3023
443,6.503207,MKAVGRGATPLGGWEAALAAGWGSRRGGTPLRRPIILRGTLGRFAA...,sp|P14505|YP55_ECOLX,sp|P14505|YP55_ECOLX Uncharacterized 5.5 kDa p...,5511.3783
448,2.593177,MQSLAQFKSSGLWVTTHAWLNDRFLLPESQQKNLAELKRSFLDPAL...,sp|P18352|YPC3_ECOLX,sp|P18352|YPC3_ECOLX Uncharacterized 6.6 kDa p...,6581.5586
...,...,...,...,...,...
475,-6.989848,MKITDHKLSEGIALTFRVPEGNIKHPLIILCHGFCGIRNVLLPCFA...,sp|P29368|YPT1_ECOLX,sp|P29368|YPT1_ECOLX Uncharacterized 31.7 kDa ...,31758.9649
454,5.102524,MGKSKKNRAAATNQLKHKSQTSAEAFSFGDPVPVLDRRELLDYVEC...,sp|P21314|YR75_ECOLX,sp|P21314|YR75_ECOLX Protein ORF5 in retron Ec...,36818.5742
463,6.048823,MSELVVFKANELAISRYDLTEHETKLILCCVALLNPTIENPTRKER...,sp|P22308|REPY_ECOLX,sp|P22308|REPY_ECOLX Replication initiation pr...,37354.7330
503,6.211080,MGAIHEETANRSPIPDGHQGAGDRAADHRHSARRAGRWPAPGGVCH...,sp|Q52312|INCC1_ECOLX,sp|Q52312|INCC1_ECOLX Protein IncC OS=Escheric...,38246.0355


In [71]:
#Scratch demo of widgets.interactive_output: three sliders whose product is printed live
a = widgets.IntSlider(description='a')
b = widgets.IntSlider(description='b')
c = widgets.IntSlider(description='c')
def f(a, b, c):
    """Print the product of the three slider values."""
    print('{}*{}*{}={}'.format(a, b, c, a*b*c))

#Bound to productOut rather than out: the output-folder cell stores its Output widget in the
#global name out, and its button callbacks would otherwise start writing into this demo widget
productOut = widgets.interactive_output(f, {'a': a, 'b': b, 'c': c})

widgets.HBox([widgets.VBox([a, b, c]), productOut])

HBox(children=(VBox(children=(IntSlider(value=0, description='a'), IntSlider(value=0, description='b'), IntSli…

In [None]:
attributes={'Sequence' : lambda record : str(record.seq),
                                                                         'ID' : lambda record : record.id,
                                                                         'Description' : lambda record : record.description,
                                                                         'Molecular Weight' : lambda record : PA(record.seq).molecular_weight(),
                                                                         'Charge': lambda record : PA(record.seq.replace('X','Q')).charge_at_pH()