In [1]:
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *

In [2]:
def listtostr(strlist):
    output = ""
    for word in strlist:
        output += word + "_"
    return(output[:len(output)-1])
def simplifyname(orgname):
    firstname = [orgname.split("_")[0][0]]
    lastname = orgname.split("_")[1:]
    fullname = firstname + lastname
    return(listtostr(fullname))

In [28]:
#Generate a file path for the correct .fasta file
input_filename = input("Enter Input Name")
output_name = input("Enter Output Name")
datafile = os.path.join('data', input_filename)
#organism_name = input_filename.replace(".faa","")

Enter Input Name E_coli.faa
Enter Output Name E_coli_affinity


In [29]:
data_folder = os.path.join('data', output_name)

if not os.path.isdir(data_folder):
    os.mkdir(data_folder)

In [30]:
def getHistPercent(protstring):
    Hcount = 0
    for AA in protstring:
        if AA == "H":
            Hcount += 1
    return(Hcount/len(protstring))

In [31]:
def findHistChains(protstring):
    chainDict = {}
    chainlen = 1
    lastAA = ""
    for AA in protstring:
        if AA == "H":
            if lastAA == "H":
                chainlen += 1
        elif chainlen > 1 and lastAA == "H":
            #print(f"Chain found, current length {chainlen}")
            if str(chainlen) in chainDict:
                chainDict[str(chainlen)] = chainDict[str(chainlen)] + 1
            else:
                chainDict[str(chainlen)] = 1
            chainlen = 1
        lastAA = AA
    if chainlen > 1:
        if str(chainlen) in chainDict:
            chainDict[str(chainlen)] = chainDict[str(chainlen)] + 1
        else:
            chainDict[str(chainlen)] = 1
    return(chainDict)

In [32]:
def HistScore(protstring):
    last4 = ""
    last8 = ""
    CD4 = 0
    CD8 = 0
    score = 0
    for AA in protstring:
        #iterate last4 & last8
        if len(last4) < 4:
            last4 += AA
        else:
            last4 = last4[1:] + AA
        if len(last8) < 8:
            last8 += AA
        else:
            last8 = last8[1:] + AA
        
        #scan for non-chain H groupings 
        if getHistPercent(last4) >= 0.5 and CD4 < 1:
            score += 2
            CD4 = 4
        else:
            CD4 -= 1
        if getHistPercent(last8) >= 0.375 and CD8 < 1:
            score += 6
            CD8 = 8
        else:
            CD8 -= 1
    
    #scan for chains
    histChains = findHistChains(protstring)
    for key in histChains:
        score += histChains[key] * (int(key) ** 2)
    return(score)

In [33]:
unfiltered = []
wash = []
fractions = []
with open(datafile) as protfile:
    for record in SeqIO.parse(protfile,"fasta"):
        unfiltered.append(record)
        protparams = PA(str(record.seq))
        if HistScore(record.seq) < 8:
            wash.append(record)
        else:
            fractions.append(record)    

#data_name = "Affinity_chromatography_" + simplifyname(organism_name)
data_name = output_name
SeqIO.write(unfiltered,os.path.join(data_folder,data_name+"_total.faa"),"fasta")
SeqIO.write(wash,os.path.join(data_folder,data_name+"_wash.faa"),"fasta")
SeqIO.write(fractions,os.path.join(data_folder,data_name+"_fractions.faa"),"fasta")


71

In [34]:
datadict = {
    'sequence' : [],
    'length' : [],
    'isoelectric_point' : [],
    'hist_score' : [],
    'max_chain_length': [],
    'ID' : [],
    'description' : []
}

with open(os.path.join(data_folder,data_name+"_fractions.faa"),"r") as prot_file:
    for record in SeqIO.parse(prot_file,"fasta"):
        sequence = str(record.seq)  #It is not strictly necessary to record the sequence as a string, but it is easier to work with and displays better. (Ex:len() does not work on Seq objects)
        parameters = PA(sequence)
        datadict['sequence'].append(sequence)
        datadict['length'].append(len(sequence))
        datadict['isoelectric_point'].append(parameters.isoelectric_point())
        datadict['hist_score'].append(HistScore(sequence))
        datadict['max_chain_length'].append(max(findHistChains(sequence).keys(),default = '0'))
        datadict['ID'].append(record.id)
        datadict['description'].append(record.description)

df = pd.DataFrame.from_dict(datadict)
df.sort_values('hist_score',inplace = True)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,sequence,length,isoelectric_point,hist_score,max_chain_length,ID,description
0,MHLKYYLHNLPESLIPWILILIFNDNDNTPLLFIFISSIHVLLYPY...,96,9.380875,8,0,sp|Q47503|IMMK_ECOLX,sp|Q47503|IMMK_ECOLX Colicin-K immunity protei...
1,MKKLAIVFTMLLIAGCSSSQDSANNQIDELGKENNSLFTFRNIQSG...,181,6.887885,8,0,sp|Q46670|CDTC_ECOLX,sp|Q46670|CDTC_ECOLX Cytolethal distending tox...
2,MKNRNNAVGPQIRAKKPKASKTVPILAGLSLGAGLQTATQYFAHSF...,635,8.918379,8,2,sp|Q00185|TRAG4_ECOLX,sp|Q00185|TRAG4_ECOLX Conjugal transfer protei...
3,MKLIVGMTGATGAPLGVALLQALREMPNVETHLVMSKWAKTTIELE...,197,6.441529,8,2,sp|P69774|PADL_ECOLX,sp|P69774|PADL_ECOLX Probable UbiX-like flavin...
4,MKTATAPLPPLRSVKVLDQLRERIRYLHYSLRTEQAYVHWVRAFIR...,337,10.248749,8,2,sp|P62590|INT2_ECOLX,sp|P62590|INT2_ECOLX Integrase/recombinase OS=...
...,...,...,...,...,...,...,...
66,MTKDLNTLVSELPEIYQTIFGHPEWDGDAARDCNQRLDLITEQYDN...,708,5.864841,22,2,sp|Q47592|WBDD2_ECOLX,sp|Q47592|WBDD2_ECOLX O-antigen chain terminat...
67,MSHADMSDSSGFNEAAAAFSWNGPKKAINPYLDPAEVAPFSALSNL...,865,9.292747,22,2,sp|P21312|ENDOR_ECOLX,sp|P21312|ENDOR_ECOLX Probable replication end...
68,MNKIYSLKYSHITGGLIAVSELSGRVSSRATGKKKHKRILALCFLG...,1300,6.364910,24,2,sp|O32591|ESPP_ECOLX,sp|O32591|ESPP_ECOLX Serine protease EspP OS=E...
69,MKQRLSLAQSALEKLSARRGNTWYPIFHLAPPAGWMNDPNGLIYFN...,476,5.733770,26,2,sp|P16553|RAFD_ECOLX,sp|P16553|RAFD_ECOLX Raffinose invertase OS=Es...


In [35]:
def param_of_interest(protdata):
    output = bool(HistScore(protdata) >= 4)
    return(output)

nfractions = 7
fraclen = round(len(df)/nfractions)
seqofinterest = "HHHHHH"
seqhits = {}
noise = 0.10
for n in range(nfractions - 1):
    fracrecords = []
    #Generate min/max range values
    fuzzymin = int(round(n*fraclen-noise*fraclen,0))
    fuzzymax = int(round((n+1)*fraclen + noise*fraclen,0))
    if fuzzymin < 0:
        fuzzymin = 0
    if fuzzymax > len(df):
        fuzzymax = len(df)
    tempdf = df.iloc[fuzzymin:fuzzymax]
    for index  in tempdf.index:
        record = SeqRecord(Seq(tempdf['sequence'][index]),id=str(tempdf["ID"][index]),description=tempdf["description"][index])
        fracrecords.append(record)
        if param_of_interest(tempdf['sequence'][index]) == True:
            if n+1 in seqhits:
                seqhits[n+1][0] += 1
                seqhits[n+1][1].append(index)
            else:
                seqhits[n+1] = [1,[index]]
    SeqIO.write(fracrecords,os.path.join(data_folder,data_name+"_fraction"+ str(n+1) + ".faa"),"fasta")

tempdf = df.iloc[int(round((nfractions-1)*fraclen-noise*fraclen,0)):]
remainderrecord = []
for index  in tempdf.index:
        record = SeqRecord(Seq(tempdf['sequence'][index]),id=str(tempdf["ID"][index]),description=tempdf["description"][index])
        remainderrecord.append(record)
        if param_of_interest(tempdf['sequence'][index]) == True:
            if nfractions in seqhits:
                seqhits[nfractions][0] += 1
                seqhits[nfractions][1].append(index)
            else:
                seqhits[nfractions] = [1,[index]]
SeqIO.write(remainderrecord,os.path.join(data_folder,data_name+"_fraction"+ str(nfractions) + ".faa"),"fasta")
tempdf

Unnamed: 0,sequence,length,isoelectric_point,hist_score,max_chain_length,ID,description
59,MYVVSTKQMLNNAQRGGYAVPAFNIHNLETMQVVVETAANLHAPVI...,284,5.870183,18,2,sp|P0C8J7|GATY_ECOLX,"sp|P0C8J7|GATY_ECOLX D-tagatose-1,6-bisphospha..."
60,MTKDLNTLVSELPEIYQTIFGHPEWDGDAARDCNQRLDLITEQYDN...,708,5.839149,20,2,sp|J7I4B7|WBDD_ECOLX,sp|J7I4B7|WBDD_ECOLX O-antigen chain terminato...
61,MSRLKQPIFLKKIKKVINTIPRLEEQIFACRNKKRSDNPLLFIDRK...,342,9.526832,20,2,sp|P18809|FINQ_ECOLX,sp|P18809|FINQ_ECOLX Protein FinQ OS=Escherich...
62,MTMITPSFPGNSLAVVLQRRDWENPGVTQLNRLAAHPPFASWRNSE...,1029,5.304294,20,2,sp|Q8VNN2|BGAL_ECOLX,sp|Q8VNN2|BGAL_ECOLX Beta-galactosidase OS=Esc...
63,MLLKTSRRTFLKGLTLSGVAGSLGVWSFNARSSLSLPVAASLQGTQ...,605,5.872571,20,0,sp|Q47452|PCOA_ECOLX,sp|Q47452|PCOA_ECOLX Copper resistance protein...
64,MQDDNIKRRNDAIVAGRLFSGSVQDRETERCHHCGELLHPFPEPEY...,461,6.579079,20,2,sp|P52605|KLCB2_ECOLX,sp|P52605|KLCB2_ECOLX Protein KlcB OS=Escheric...
65,MQGNALTVLLSGKKYLLLQGPMGPFFSDVAEWLESLGRNAVNVVFN...,389,9.72533,22,2,sp|P42218|KPSS5_ECOLX,sp|P42218|KPSS5_ECOLX Capsule polysaccharide e...
66,MTKDLNTLVSELPEIYQTIFGHPEWDGDAARDCNQRLDLITEQYDN...,708,5.864841,22,2,sp|Q47592|WBDD2_ECOLX,sp|Q47592|WBDD2_ECOLX O-antigen chain terminat...
67,MSHADMSDSSGFNEAAAAFSWNGPKKAINPYLDPAEVAPFSALSNL...,865,9.292747,22,2,sp|P21312|ENDOR_ECOLX,sp|P21312|ENDOR_ECOLX Probable replication end...
68,MNKIYSLKYSHITGGLIAVSELSGRVSSRATGKKKHKRILALCFLG...,1300,6.36491,24,2,sp|O32591|ESPP_ECOLX,sp|O32591|ESPP_ECOLX Serine protease EspP OS=E...


In [17]:
seqhits
for key in seqhits:
    print(f'Hits in fraction {key}')
    for hit in seqhits[key][1]:
        print(f'\tHit on index {hit}')

Hits in fraction 1
	Hit on index 0
	Hit on index 1
	Hit on index 2
	Hit on index 3
	Hit on index 4
	Hit on index 5
	Hit on index 6
	Hit on index 7
	Hit on index 8
	Hit on index 9
	Hit on index 10
	Hit on index 11
	Hit on index 12
	Hit on index 13
	Hit on index 14
	Hit on index 15
	Hit on index 16
	Hit on index 17
	Hit on index 18
	Hit on index 19
	Hit on index 20
	Hit on index 21
	Hit on index 22
	Hit on index 23
	Hit on index 24
	Hit on index 25
	Hit on index 26
	Hit on index 27
	Hit on index 28
	Hit on index 29
	Hit on index 30
	Hit on index 31
	Hit on index 32
	Hit on index 33
	Hit on index 34
	Hit on index 35
	Hit on index 36
	Hit on index 37
	Hit on index 38
	Hit on index 39
	Hit on index 40
	Hit on index 41
	Hit on index 42
	Hit on index 43
	Hit on index 44
	Hit on index 45
	Hit on index 46
	Hit on index 47
	Hit on index 48
	Hit on index 49
	Hit on index 50
	Hit on index 51
	Hit on index 52
	Hit on index 53
	Hit on index 54
	Hit on index 55
	Hit on index 56
	Hit on index 57
	Hit 

In [1]:
df.iloc[10]['sequence']

NameError: name 'df' is not defined

In [None]:
data_folder