In [None]:
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
import pandas as pd

def fasta_to_dataframe(fastaPath, includeDefaults=True, pH=7.0, attributes=None):
    """Build a pandas DataFrame with one row per record of a FASTA file.

    Every column is produced by an "attribute function": a callable that
    receives one parsed record and returns the value stored for it.

    Args:
        fastaPath: Path of the FASTA file to read.
        includeDefaults: When True, the columns 'Sequence', 'ID',
            'Description', 'Molecular Weight' and 'Charge at pH <pH>' are
            included.
        pH: pH passed to charge_at_pH() and used in the name of the default
            charge column.
        attributes: Optional mapping of column name -> function(record).
            The mapping is never modified. Entries are added after the
            defaults and replace a default that has the same name.

    Returns:
        A DataFrame whose columns are the attribute names and whose rows
        follow the record order of the file.

    Raises:
        ValueError: If includeDefaults is False and no attribute functions
            were supplied.
    """
    # Work on a private dict: the previous implementation cleared the
    # (shared, mutable default) argument, discarding any caller-supplied
    # functions and emptying the caller's own dictionary.
    columns = {}
    if includeDefaults:
        # 'X' residues are swapped for 'Q' before the protein analysis.
        columns['Sequence'] = lambda record: str(record.seq)
        columns['ID'] = lambda record: record.id
        columns['Description'] = lambda record: record.description
        columns['Molecular Weight'] = lambda record: PA(str(record.seq).replace('X', 'Q')).molecular_weight()
        columns['Charge at pH {}'.format(pH)] = lambda record: PA(str(record.seq).replace('X', 'Q')).charge_at_pH(pH)
    if attributes:
        columns.update(attributes)
    if not columns:
        raise ValueError('No attribute functions, set includeDefaults = True or add attribute functions')

    # from_dict() expects {column name: list of values}; start each column empty.
    dataDict = {name: [] for name in columns}
    # Parse from the handle we opened so the file is read once and closed on exit.
    with open(fastaPath) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            for name, func in columns.items():
                dataDict[name].append(func(record))
    return pd.DataFrame.from_dict(dataDict)

def fractionate(df, numFractions, noise=0.0, newIndices=False):
    """Split a DataFrame into consecutive row-wise fractions of near-equal size.

    Fraction boundaries are taken from the exact (unrounded) fraction length,
    so rounding error is spread over all fractions instead of accumulating in
    the last one. When len(df) is a multiple of numFractions the boundaries
    are the same as with a pre-rounded fraction length.

    Args:
        df: DataFrame to split; rows are taken in their current order.
        numFractions: Number of fractions to return (at least 1).
        noise: How far each fraction bleeds past its exact boundaries, as a
            share of the fraction length (0.1 = 10% more rows in either
            direction). Boundaries are clamped to the DataFrame.
        newIndices: When True, every fraction gets a fresh 0..n-1 index.

    Returns:
        A list of numFractions DataFrames. The last fraction always runs to
        the end of df so no trailing rows are dropped.

    Raises:
        ValueError: If numFractions is smaller than 1.
    """
    if numFractions < 1:
        raise ValueError('numFractions must be at least 1')

    total = len(df)
    fuzz = total / numFractions * noise  # extra rows added on each side as "noise"
    dfFractions = []
    for n in range(numFractions):
        fuzzyMin = max(0, int(round(n * total / numFractions - fuzz)))
        if n == numFractions - 1:
            # The last fraction always reaches the end of the data.
            fuzzyMax = total
        else:
            fuzzyMax = min(total, int(round((n + 1) * total / numFractions + fuzz)))

        output = df.iloc[fuzzyMin:fuzzyMax]
        if newIndices:
            # Re-index a new frame rather than modifying the slice in place.
            output = output.reset_index(drop=True)

        dfFractions.append(output)
    return dfFractions

# Entries of the 'data' folder offered in the data-set menu, for convenience;
# Jupyter's checkpoint folder is left out.
data = [entry for entry in os.listdir('data') if entry != '.ipynb_checkpoints']
'''
SELECT INPUT METHOD
'''
# Show the numbered separation methods and keep prompting until the user
# enters the index of one of them; the choice ends up in methodSelect.
print('Select separation method')
methods = ['Size Exclusion', 'Ion Exchange']
for n, method in enumerate(methods):
    print('{} - {}'.format(n, method))

validMethod = False
while not validMethod:
    methodIndex = input()
    # isdecimal() only accepts characters int() can parse; isdigit() also
    # accepts e.g. a superscript two, which made int() raise ValueError.
    if methodIndex.isdecimal():
        if int(methodIndex) < len(methods):
            methodSelect = methods[int(methodIndex)]
            validMethod = True
        else:
            print('Number out of index range')
    else:
        print('Please enter only the index number of intended method')
'''
SELECT INPUT MEDIA
'''    
def _weight_window(low, high):
    """Return a test that is True when low < x.molecular_weight() < high."""
    return lambda x: bool(low < x.molecular_weight() < high)


# (media label, lower bound, upper bound); the bounds are the kDa range named
# in the label multiplied by 1000.
_SIZE_MEDIA = (
    ("Bio-P 0.8-4.0 kDa", 800, 4000),
    ("Bio-P 1.0-6.0 kDa", 1000, 6000),
    ("Bio-P 1.5-20.0 kDa", 1500, 20000),
    ("Bio-P 2.5-40.0 kDA", 2500, 40000),
    ("Bio-P 3.0-60.0 kDa", 3000, 60000),
    ("Bio-P 5.0-100 kDa", 5000, 100000),
    ("S-X 0.4-14.0 kDa", 400, 14000),
    ("Bio-A 10.0 - 500 kDA", 10000, 500000),
    ("Bio-A 10.0 - 1500 kDA", 10000, 1500000),
)

# Size-exclusion media: label -> test applied to a protein analysis object.
sizeDict = {label: _weight_window(low, high) for label, low, high in _SIZE_MEDIA}


def _retained_by_q_media(x):
    """True when the charge at the global pHvalue is -0.01 or lower."""
    return bool(x.charge_at_pH(pHvalue) <= -0.01)


def _retained_by_s_media(x):
    """True when the charge at the global pHvalue is 0.01 or higher."""
    return bool(x.charge_at_pH(pHvalue) >= 0.01)


# Ion-exchange media; pHvalue is looked up when the test is called, so it may
# be assigned after this table is built.
ionDict = {
    "Q Media (Triethylamine +)": _retained_by_q_media,
    "S Media (Sulfite -)": _retained_by_s_media,
}

# Separation method -> table of media available for it (drives the media menu).
methodDict = {'Size Exclusion': sizeDict, 'Ion Exchange': ionDict}

# Separation method -> DataFrame column the results are sorted by.
sortColumn = {'Size Exclusion': 'Molecular Weight', 'Ion Exchange': 'Charge at pH 7.0'}
# Show the media available for the chosen method and keep prompting until the
# user enters a valid index; mediaFunc is the test that goes with the choice.
print('Select separation media')
mediaDict = methodDict[methodSelect]
mediaList = list(mediaDict.keys())
for n, media in enumerate(mediaList):
    print('{} - {}'.format(n, media))

validMedia = False
while not validMedia:
    mediaIndex = input()
    # isdecimal() only accepts characters int() can parse; isdigit() also
    # accepts e.g. a superscript two, which made int() raise ValueError.
    if mediaIndex.isdecimal():
        if int(mediaIndex) < len(mediaList):
            mediaSelect = mediaList[int(mediaIndex)]
            mediaFunc = mediaDict[mediaSelect]
            validMedia = True
        else:
            print('Number out of index range')
    else:
        print('Please enter only the index number of intended media')
        
        
'''
Select Input File
'''    
# Show the files found in the 'data' folder and keep prompting until the user
# enters the index of one of them; the choice ends up in inputFile.
if not data:
    # With nothing to choose from, no index could ever be valid and the
    # prompt below would never end.
    raise FileNotFoundError("No input files found in the 'data' folder")

print('Select data set to separate')
for n, fileName in enumerate(data):
    print('{} - {}'.format(n, fileName))

validInput = False
while not validInput:
    inputIndex = input()
    # isdecimal() only accepts characters int() can parse; isdigit() also
    # accepts e.g. a superscript two, which made int() raise ValueError.
    if inputIndex.isdecimal():
        if int(inputIndex) < len(data):
            inputFile = data[int(inputIndex)]
            validInput = True
        else:
            print('Number out of index range')
    else:
        print('Please enter only the index number of intended file')

# Ask for the simulation pH until a number between 0.0 and 14.0 is entered,
# then point the ion-exchange sort column at the charge column for that pH.
print('Enter simulation pH')
pHvalid = False
while not pHvalid:
    pHentry = input()
    try:
        pHvalue = float(pHentry)
    except ValueError:
        # Only a failed conversion is handled here; a bare except around
        # input() also swallowed KeyboardInterrupt, so the prompt could not
        # be interrupted.
        print('Enter a valid floating point number')
        continue
    if 0.0 <= pHvalue <= 14.0:
        pHvalid = True
        # Stored as a plain column name, like the other sortColumn entries.
        sortColumn['Ion Exchange'] = 'Charge at pH {}'.format(pHvalue)
    else:
        print('Enter a number within the range 0.0-14.0')

# Name the run's output folder after the input file, media and pH; dots are
# replaced with commas in the folder name.
outFolder = '{} {} at pH {}'.format(inputFile.split('.')[0], mediaSelect, pHvalue)
outFolder = outFolder.replace('.', ',')
# makedirs also creates the parent 'outputs' folder when it is missing
# (os.mkdir failed in that case) and accepts an already existing folder.
os.makedirs(os.path.join('outputs', outFolder), exist_ok=True)

# Lists of SeqRecord objects that Biopython can write out as .fasta files:
# every record, the records rejected by the media, and the records it retains.
totals = []
wash = []
separated = []

# Separate proteins by the selected media's test. The records are parsed from
# the handle opened here, so the file is read once and closed afterwards.
with open(os.path.join('data', inputFile)) as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        totals.append(record)
        sequence = str(record.seq)

        # X is a notation for any residue; Biopython doesn't accept it, so for
        # the protein analysis glutamine is substituted since it is close to
        # the average for the properties we care about.
        protparams = PA(sequence.replace("X", "Q"))
        if mediaFunc(protparams):
            separated.append(record)
        else:
            wash.append(record)

# Build every output path with os.path.join: the hard-coded backslashes only
# worked on Windows and contained invalid escape sequences ('\{', '\s', '\w').
outDir = os.path.join('outputs', outFolder)
separatedPath = os.path.join(outDir, 'separated.faa')

# Write the complete input, the retained records and the washed-out records.
SeqIO.write(totals, os.path.join(outDir, 'total.faa'), 'fasta')
SeqIO.write(separated, separatedPath, 'fasta')
SeqIO.write(wash, os.path.join(outDir, 'wash.faa'), 'fasta')

# Reload the retained records as a table, order them by the column that
# belongs to the chosen method and cut the result into 7 fractions.
main_df = fasta_to_dataframe(separatedPath, includeDefaults=True, pH=pHvalue)
main_df.sort_values(by=sortColumn[methodSelect], inplace=True)
fraction_dfs = fractionate(main_df, 7)
for n, single_frac in enumerate(fraction_dfs):
    # Rebuild one SeqRecord per row; walking the three columns together
    # avoids three positional row lookups per record.
    records = [
        SeqRecord(seq=Seq(sequence), id=identifier, description=description)
        for sequence, identifier, description in zip(
            single_frac['Sequence'], single_frac['ID'], single_frac['Description']
        )
    ]
    SeqIO.write(records, os.path.join(outDir, 'fraction_{}.faa'.format(n + 1)), 'fasta')

print('Fractions successfully generated in "outputs/{}"'.format(outFolder))

Select separation method
0 - Size Exclusion
1 - Ion Exchange


In [5]:
import platform
# Show the name of the operating system the notebook runs on; the recorded
# cell output for this run is 'Windows'.
platform.system()

'Windows'