# Creating new 1DE protein sets
The purpose of this notebook is to create new protein sets for separation on the 1DE simulation. Code will be created that analyzes proteins for molecular weight, isoelectric point and hydrophobicity for subsequent simulated separations by gel filtration, ion exchange and hydrophobic interaction chromatography.
For this notebook, we will need several libraries: os, Bio.SeqUtils, pandas, re, Bio.SeqIO, and collections.defaultdict.

| Library | Uses | Abbreviation |
| :------- | :----: | :------------: |
| os | file management in operating systems | os |
| pandas | data management | pd |
| Bio.SeqIO | parsing FASTA files | SeqIO |
| collections.defaultdict | not sure why | defaultdict |
| Bio.SeqUtils.ProtParam.ProteinAnalysis | protein calculations | PA |

In [72]:
# Import required libraries
from Bio.SeqUtils.IsoelectricPoint import IsoelectricPoint as IP
import os

In [73]:
protein = IP('MKQAVYVASPDSQQIHVWQLDSAGELTLLQTVDVPGQVQPMAISPNQRHLYVGVRPDFGIVSYHIADDGTLTAAGMAPLPGSPTHIDTDRQGRFLFSASYSFNCVSISPIDTHGVVQAPIQQLDDLPAPHSANIDPTNQILLVPCLKEDKVRLFDLSAEGQLTPHAQADITVAAGAGPRHMAFHPNHQVAYCVNELNSSVDVYQISNNGQEYHLVQSLDAMPADFTGTRWAADIHITPNGRYLYISDRTANLLGIFTVSEDGRVISLVGHHLTEAQPRGFNIDHSGNFLIASGQKSDHIEVYRIDQNTGELTTLKRYPVGKGPMWVSIRGAQNS')
print(protein.sequence)
print(protein.pi())
print(f'IEP of protein {protein.sequence} is {protein.pi():.2f}')
print(f'The charge on the protein at pH 7 is {protein.charge_at_pH(5.55):.2f}')

MKQAVYVASPDSQQIHVWQLDSAGELTLLQTVDVPGQVQPMAISPNQRHLYVGVRPDFGIVSYHIADDGTLTAAGMAPLPGSPTHIDTDRQGRFLFSASYSFNCVSISPIDTHGVVQAPIQQLDDLPAPHSANIDPTNQILLVPCLKEDKVRLFDLSAEGQLTPHAQADITVAAGAGPRHMAFHPNHQVAYCVNELNSSVDVYQISNNGQEYHLVQSLDAMPADFTGTRWAADIHITPNGRYLYISDRTANLLGIFTVSEDGRVISLVGHHLTEAQPRGFNIDHSGNFLIASGQKSDHIEVYRIDQNTGELTTLKRYPVGKGPMWVSIRGAQNS
5.553590202331542
IEP of protein MKQAVYVASPDSQQIHVWQLDSAGELTLLQTVDVPGQVQPMAISPNQRHLYVGVRPDFGIVSYHIADDGTLTAAGMAPLPGSPTHIDTDRQGRFLFSASYSFNCVSISPIDTHGVVQAPIQQLDDLPAPHSANIDPTNQILLVPCLKEDKVRLFDLSAEGQLTPHAQADITVAAGAGPRHMAFHPNHQVAYCVNELNSSVDVYQISNNGQEYHLVQSLDAMPADFTGTRWAADIHITPNGRYLYISDRTANLLGIFTVSEDGRVISLVGHHLTEAQPRGFNIDHSGNFLIASGQKSDHIEVYRIDQNTGELTTLKRYPVGKGPMWVSIRGAQNS is 5.55
The charge on the protein at pH 7 is 0.04


In [74]:
datafile = os.path.join('data', 'yersinia_pestis[632].fasta')

In [75]:
datafile = os.path.join('data', 'yersinia_pestis[632].fasta')
print(datafile)

data/yersinia_pestis[632].fasta


In [76]:
import pandas as pd
from Bio import SeqIO
from collections import defaultdict


data = defaultdict(list)
datalen = []
with open(datafile) as fp:
  for record in SeqIO.parse(fp,"fasta"):
    
    sequence = str(record.seq)
    length = len(sequence)
    # data['organism'].append(organism)
    data['sequence'].append(sequence)
    datalen.append(length) 

df = pd.DataFrame.from_dict(data)
# df['lengths'] = data2
df['lengths'] = datalen
# print(data2)
df

Unnamed: 0,sequence,lengths
0,MKQAVYVASPDSQQIHVWQLDSAGELTLLQTVDVPGQVQPMAISPN...,334
1,MSTFSLKIIRVGITVLVVVLAVIAIFNVWAFYTESPWTRDAKFTAD...,311
2,MTHPSFIRLRFAFKLSFAIVAALFLGFHLQLETPRWSVLTAAIVSA...,651
3,MSLLPVMVIFGLSFPPIFLELLISLALFFVVRRILQPTGIYEFVWH...,67
4,MAYRLLRALFRGLFRVTIDGVTDQFKHEKLIITPNHVSFLDGALLA...,718
...,...,...
3903,MNKLAIAVALAATVLSGCANNTASGDTFTASQARQVQTVTYGSIVS...,155
3904,MSDRINALPWQPWSLKDFASQSEAPLSESMPDISLLFPNEPMEATA...,238
3905,MSYQQLPKDFLWGGAVAAHQVEGGWDKGGKGVSIADVLSGGSHGVD...,478
3906,MTNARPPESTSRITGENVFFAKRILQTMALLVGMTIIIAAVGIIYI...,65


In [67]:
import pandas as pd
import re
from Bio import SeqIO
from collections import defaultdict
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA

data = defaultdict(list)
datalen = []
protpI = []
protMW = []
prot_charge_at_pH = []
with open(datafile) as fp:
  for record in SeqIO.parse(fp,"fasta"):
    
    sequence = str(record.seq)
    length = len(sequence)
    # data['organism'].append(organism)
    data['sequence'].append(sequence)
    datalen.append(length)
    protparams = PA(sequence)
    protpI.append(protparams.isoelectric_point())
    protMW.append(protparams.molecular_weight())
    prot_charge_at_pH.append(protparams.charge_at_pH(7))

df = pd.DataFrame.from_dict(data)

df['length'] = datalen

df['IEP'] = protpI

df['MW'] = protMW

df['charge at pH 7'] = prot_charge_at_pH

df

Unnamed: 0,sequence,length,IEP,MW,charge at pH 7
0,MKQAVYVASPDSQQIHVWQLDSAGELTLLQTVDVPGQVQPMAISPN...,334,5.553590,36445.4226,-12.099492
1,MSTFSLKIIRVGITVLVVVLAVIAIFNVWAFYTESPWTRDAKFTAD...,311,6.185810,34314.6216,-2.841953
2,MTHPSFIRLRFAFKLSFAIVAALFLGFHLQLETPRWSVLTAAIVSA...,651,9.467134,72436.4605,18.575468
3,MSLLPVMVIFGLSFPPIFLELLISLALFFVVRRILQPTGIYEFVWH...,67,7.824544,7790.4029,0.570321
4,MAYRLLRALFRGLFRVTIDGVTDQFKHEKLIITPNHVSFLDGALLA...,718,9.458237,79385.5252,16.933366
...,...,...,...,...,...
3903,MNKLAIAVALAATVLSGCANNTASGDTFTASQARQVQTVTYGSIVS...,155,11.113979,15211.7968,4.498460
3904,MSDRINALPWQPWSLKDFASQSEAPLSESMPDISLLFPNEPMEATA...,238,4.524861,25932.0103,-16.039771
3905,MSYQQLPKDFLWGGAVAAHQVEGGWDKGGKGVSIADVLSGGSHGVD...,478,5.540801,54703.4435,-14.122328
3906,MTNARPPESTSRITGENVFFAKRILQTMALLVGMTIIIAAVGIIYI...,65,9.687616,7124.3098,1.508560


In [69]:
df.iloc[3906,0]

'MTNARPPESTSRITGENVFFAKRILQTMALLVGMTIIIAAVGIIYIASQLNEQASVQSRFLIEKA'

In [71]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis as PA
datapI = PA(df.iloc[3906,0])
print(datapI.sequence)
print(datapI.isoelectric_point())
print(datapI.charge_at_pH(7))
print(datapI.molecular_weight())

MTNARPPESTSRITGENVFFAKRILQTMALLVGMTIIIAAVGIIYIASQLNEQASVQSRFLIEKA
9.687615776062014
1.5085595328371788
7124.309799999999
