In [30]:
'''
PACKAGES
'''
#Bio packages for reading protein data
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

import os #File reading and manipulation
import pandas as pd #Dataframes to keep track of, and display data
import ipywidgets as widgets #Interactable elements
from ipywidgets import *
import math #Simplifying certain operations

#Raise pandas' 'display.max_rows' option to 150 so longer dataframes are shown without being cut short
pd.set_option('display.max_rows', 150)

'''
FUNCTIONS
'''
def fasta_to_dataframe(fastaPath,includeDefaults = True, pH = 7.0, attributes=None): 
    '''
    Reads a FASTA formatted file of a proteome and returns a pandas dataframe with one row per record and one column per attribute.
    
    fastaPath - File path to the FASTA file being read
    includeDefaults - Bool value on whether or not to include the predefined columns: Sequence, ID, Description, Molecular Weight and
                      'Charge at pH <pH>'. Sequence, ID and Description are the ones needed for rewriting proteins into another FASTA file.
    pH - pH used to compute the default charge column; it is also written into that column's label
    attributes - Optional extra columns, formatted as a dictionary where each entry key is the column label and the entry is a function
                 that takes a record and returns the data for that column. The dictionary is only read, never modified. An entry that
                 shares its label with a default column replaces that default.
    
    Raises ValueError (a subclass of Exception) when includeDefaults is False and no attribute functions were supplied.
    '''
    #Work on a private dictionary: the caller's dictionary must not be emptied or extended, and no state may leak between calls
    columnFunctions = {}
    if includeDefaults:
        #'X' residues are swapped for 'Q' before analysis - presumably because ProteinAnalysis cannot handle the unknown residue code
        analyse = lambda record : PA(record.seq.replace('X','Q'))
        columnFunctions['Sequence'] = lambda record : str(record.seq)
        columnFunctions['ID'] = lambda record : record.id
        columnFunctions['Description'] = lambda record : record.description
        columnFunctions['Molecular Weight'] = lambda record : analyse(record).molecular_weight()
        columnFunctions['Charge at pH {}'.format(pH)] = lambda record : analyse(record).charge_at_pH(pH)
    if attributes:
        columnFunctions.update(attributes)
    if not columnFunctions:
        raise ValueError('No attribute functions, set includeDefaults = True or add attribute functions')
    
    #The dataframe is built from a dictionary where each key is a column label bound to the list of that column's values
    dataDict = {label: [] for label in columnFunctions}
    with open(fastaPath) as handle: #Parse from the opened handle so the file is opened once and closed by the with block
        for record in SeqIO.parse(handle,'fasta'):
            for label, function in columnFunctions.items(): #Append each attribute function's output to its column
                dataDict[label].append(function(record))
    return pd.DataFrame.from_dict(dataDict)
                
def fractionate(df,numFractions,noise=0.0,newIndices = False):
    '''
    Cuts a dataframe into consecutive row ranges and returns them as a list of sub-dataframes.
    
    df - Dataframe being split
    numFractions - Int value, how many sub-dataframes are produced
    noise - Float value, the share of the base fraction length by which every fraction is widened at both ends, so that it overlaps its
            neighbours (Ex: noise=0.1 widens each fraction by 10% of the base length in either direction)
    newIndices - When True each fraction's index is reset to start from 0, otherwise the rows keep the index they had in df
    
    The base length is round(len(df) / numFractions); the last fraction always runs to the end of df, so it absorbs any remainder.
    '''
    totalRows = len(df)
    baseSize = round(totalRows / numFractions) #Length of every fraction except the last, before widening
    overlap = baseSize * noise #Rows added on each side as "noise"
    lastPosition = numFractions - 1
    pieces = []
    for position in range(numFractions):
        start = max(0,int(round(position*baseSize - overlap))) #Clamped so the first fraction cannot start before row 0
        if position == lastPosition:
            stop = totalRows #The final fraction takes everything that is left
        else:
            stop = min(totalRows,int(round((position+1)*baseSize + overlap))) #Clamped so no fraction runs past the end
            
        piece = df.iloc[start:stop]
        if newIndices:
            piece.reset_index(drop=True, inplace=True)
        pieces.append(piece)
    return pieces



'''
GLOBAL VARIABLES
'''
#Lookup tables for the dropdowns: every media label maps to a boolean test that tells whether the analysed sequence handed to it is separated properly by that media
def _massWindow(lower, upper):
    '''Returns a test that is True when lower < molecular_weight() < upper, both bounds excluded.'''
    return lambda x : bool(lower < x.molecular_weight() < upper)

sizeDict = {
    #"Bio-P 0.1-1.8 kDa" : _massWindow(100, 1800),  #Currently disabled because the size range is too small to be useful
    "Bio-P 0.8-4.0 kDa" : _massWindow(800, 4000),
    "Bio-P 1.0-6.0 kDa" : _massWindow(1000, 6000),
    "Bio-P 1.5-20.0 kDa" : _massWindow(1500, 20000),
    "Bio-P 2.5-40.0 kDA" : _massWindow(2500, 40000),
    "Bio-P 3.0-60.0 kDa" : _massWindow(3000, 60000),
    "Bio-P 5.0-100 kDa" : _massWindow(5000, 100000),
    "S-X 0.4-14.0 kDa" : _massWindow(400, 14000),
    #"S-X <2.0 kDA" : _massWindow(0, 2000),         #Currently disabled because the size range is too small to be useful
    #"S-X <0.4 kDA" : _massWindow(0, 400),          #Currently disabled because the size range is too small to be useful
    "Bio-A 10.0 - 500 kDA" : _massWindow(10000, 500000),
    "Bio-A 10.0 - 1500 kDA" : _massWindow(10000, 1500000),
}
#The pH is read from the slider every time a test runs, so these always use the slider's current value
ionDict = {
    "Q Media (Triethylamine +)" : lambda analysis : bool(analysis.charge_at_pH(pHslider.value) <= -0.01),
    "S Media (Sulfite -)" : lambda analysis : bool(analysis.charge_at_pH(pHslider.value) >= 0.01),
}
affinityDict = dict() #Empty for now until more affinity separations are researched and implemented

#Method name -> media table, used by the method selection menu
methodDict = dict([
    ('Size Exclusion', sizeDict),
    ('Ion Exchange', ionDict),
    ('Affinity Chromatography', affinityDict),
])
#Method name -> label of the dataframe column associated with that method
sortColumn = dict([
    ('Size Exclusion', 'Molecular Weight'),
    ('Ion Exchange', 'Charge at pH 7.0'),
    ('Affinity Chromatography', ''),
])
#Names of the entries in the 'data' folder, kept as a list of previous files for convenience
#The '.ipynb_checkpoints' entry is filtered out; unlike list.remove, filtering does not raise ValueError when that entry is absent
data = [entry for entry in os.listdir('data') if entry != '.ipynb_checkpoints']
style = {'description_width': 'initial'} #Shorthand for the widget description-width style
autoLayout = Layout(width='auto') #Shorthand for a layout with automatic width


'''
MEDIA & METHOD SELECTION
'''
def confirmMethod(_): #Swaps media selection to match method
    '''
    Callback that replaces the media dropdown's options with the media labels registered for the currently selected method.
    
    _ - Unused callback argument
    '''
    with init:
        mediaForMethod = methodDict[methodSelect.value]
        mediaSelect.options = mediaForMethod.keys()
        
def pHUpdate(_):
    '''
    Callback that rewrites the 'Ion Exchange' entry of sortColumn so its label carries the pH slider's current value.
    
    _ - Unused callback argument
    '''
    with init:
        chargeLabel = 'Charge at pH {}'.format(pHslider.value)
        sortColumn['Ion Exchange'] = chargeLabel
        
#Output widget the callbacks run inside (they all use 'with init:')
init = widgets.Output()
methodSelect = widgets.Dropdown(options=methodDict.keys(),description='Method:')
#names='value' limits the callback to changes of the selection; a bare observe() also fires for every other trait that changes
methodSelect.observe(confirmMethod, names='value')
mediaSelect = widgets.Dropdown(options=methodDict[methodSelect.value].keys(),description='Media')
pHslider = widgets.FloatSlider(value=7.0,min=0,max=14,step=0.1,description='pH',style=style)
pHslider.observe(pHUpdate, names='value') #Same restriction: only a new slider value needs to update the sort column label
#Grid holding the method dropdown (top left), media dropdown (bottom left) and pH slider (bottom right)
selectDisplay = TwoByTwoLayout(top_left=methodSelect,bottom_left=mediaSelect,bottom_right=pHslider,layout=Layout(width='50%'))


'''
FASTA INPUT SELECTION
'''
def confirmInput(_):
    '''
    Callback that checks the file name held by inputFile and, when its extension is a known FASTA extension, makes it the current input.
    
    The extension is the text after the LAST '.', compared case-insensitively, so names such as 'my.proteome.fasta' or 'GENOME.FA' are
    accepted, and a name without any '.' is reported through errorText instead of raising IndexError.
    
    _ - Unused callback argument
    '''
    with init:
        file = inputFile.value
        fastaEnds = ('fasta', 'fna', 'ffn', 'faa', 'frn', 'fa')
        #rsplit with maxsplit=1 isolates the final extension; the '.' check rejects names that have no extension at all
        fileEnd = file.rsplit('.', 1)[-1].lower() if '.' in file else ''
        if fileEnd in fastaEnds:
            #NOTE(review): previousInputs is not defined in the visible part of this file - confirm it is created elsewhere
            if file not in previousInputs:
                previousInputs.append(file)
            currentInput.value = file
            inputFile.options = previousInputs
            errorText.value = ''
        else:
            errorText.value = 'Error: \"{}\" is not recognized as a FASTA file'.format(file)

boxLayout = Layout(width='30%')

#Pre-fill the box with the first known file; fall back to an empty box when the data list is empty instead of raising IndexError
inputFile = widgets.Combobox(value=data[0] if data else '',placeholder='Enter a file to be separated',options=data,description='Unseparated data',style = style)

inputButton = widgets.Button(description='Confirm File')
currentInput = widgets.HTML(value=inputFile.value,description='Current input:',style = style)
errorText = widgets.HTML(value=None,description='\t',style={'text_color':'#CC0000','font_size':'16px'})


inputButton.on_click(confirmInput)
inputFile.continuous_update = False
#names='value' keeps the callback from re-running when it assigns inputFile.options itself, or when any other trait changes
inputFile.observe(confirmInput, names='value')



#Top row: file box and confirm button; bottom row: the accepted file name and any error message
inputTop = Box(children=[inputFile,inputButton],layout=boxLayout)
inputBottom = Box(children=[currentInput,errorText],layout=boxLayout)
inputDisplay=VBox([inputTop,inputBottom])

    
    
    
    
    
    
    
'''
INITIALIZATION
'''
#Display the method/media panel and the file input panel inside the init Output widget
with init:
    display(selectDisplay,inputDisplay)
#Bare expression: as the last statement of a notebook cell this presumably makes the notebook show the Output widget itself
init

Output()