In [1]:
import sys
sys.path.append(r'/home/martinha/propythia/propythia/src/propythia/')
sys.path.append(r'/home/martinha/propythia/propythia/src/')

# Protein physicochemical descriptors and encodings 
This jupyter notebook will demonstrate how to obtain protein sequence-based features and protein encoding wth Propythia.

In [2]:
import pandas as pd
import numpy as np

from propythia.protein.sequence import ReadSequence
from propythia.protein.descriptors import ProteinDescritors
from propythia.protein.encoding import Encoding

2023-09-04 09:45:47.977969: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


The first step is to get the data.
Protein data can be held using dataframes, lists or strings. They can also be retrieved using a fasta file. 

This example will use a pandas dataframe composed by 2 columns (one which contains protein sequences), a list of protein sequences and a single sequence.

In [3]:
data = {'sequence': 
        {0: 'MSYKPIAPAPSSTPGSSTPGPGTPVPTGSVPSPSGSVPGAGAPFRPLFNDFGPPSMGYVQAMKPPGAQGSQSTYTDLLSVIEEMGKEIRPTYAGSKSAMERLKRGIIHARALVRECLAETERNART',                           1:'MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDMPESPKIVELLQGAYINYFHCCQIIEILRDTEKDTKNFLGFYSSQRMKDWQEIEGMYKKDNVYLAEAAQILQRLAQYEIPALRKQISKMDQSVTDAIRKHSEYGKQAEDGRKQFEKEISRMQLKGVHLRKELLELAADLPAFYEKITAEIRKISAARDYFQAFRDYMSLGAAPKDAAPILPIIGLIGERGLDVTTYEWKYNQKPDKVEKPNFEMLLTAAEDSDEIDFGGGDEIDFGIAAEDDAVIDFSAVVDLVADDTGAVGEAIASGQDALHLLENSEAQKAVKHELIELLAFLSMRLDDETRETTADVLIRGAEKRPDGVAAVTEKRLKTWITEVEGILKELENPQKVHLFKIRGSPQYVEQVVEELEKKRDMEHRYKRLQTLMTENQETARQSVTKSNVELKTIVESTRVLQKQIEAEISKKYNGRRVNLMGGINQALGGN',                                
         2:'MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQKTSRDPVMALSEAQVQEALAALERLALVFENSGYRSPRWEHNFQRGAGVPEQSAVLLGLLMLRGPQTAAELRTNAERWYRFADISSVEAFLDELQQRSADKGGPLAVPLPRSPGTREQRWAHLLCGPVDAGRSNAGVEPVPAGVETLQERIGTLESELASLRATVQWLCQELGITPAPASMPQPGLPAGNGSPGS',                              
         3:'MIHFTKMHGLGNDFMVVDGVTQNVFFSPEQIRRLADRNFGIGFDQLLLVEPPYDPDLDFHYRIFNADGSEVEQCGNGARCFARFVRNKGLTQKNKIRVSTNSGKITLRIERDGNVTVNMGVPVIEPSQIPFKAKKSEKTYLLQTPMQTYLCGAISMGNPHCVIQVEDVQTVNVDEIGSSLTRHERFPKGVNVGFMQVINPGHIKLRVYERGAAETLACGTGACAAAAVGQLQDKLDKQVRVDLPGGSLIINWEGEGKPLWMTGPAEHVYDGQIQL',                             
         4:'MGSSTTEPDVGTTSNIETTTTLQNKNVNEVDQNKKSEQSNPSFKEVVLKDIGLGEATDLENVSDDVFNNYLAIRLERERKEIELLKESNLEKLSVIIKNCIECNSFTDETMKRLINLVDNNYNNSVHFSPKSKRRKLESTSPPMSSSSVPNKETNNIQQSNSYQLRNNYDEEQENQKSQTQGSKSLLSRPNIYSFPPKQTQPASQQHVQLAAIVQRQSTLTTPLSSTYGSNSNNSMNTQLPLSDKSLRSNVQEKIVQQGSMSQRDIINGSNMSSQYSSQVYPPGYYQTRYGQQMVVVYPDSDSPQINQTSTIQHQQQLPHTYPPHYHQQQQLHQNQLVPQQHQQLQQQSISKHQLFGQKNPMSPQSHYLPNNESQNLGVINHRRSFSSGTYPVVNSRSKSPDRSMPLTVQKQMNFLIHTPKHPPPT',                               
         5:'MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLVALKMRGETIDEISGAADAMRAAAKPFPCPERNNNPLHNGIVDIVGTGGDGFNTINISTTAAFVAAAAGAKVAKHGNRSVSSKSGSSDLLAQFGIDLTMSPETASRCLDALNLCFLFAPHYHGGVKHAVPVRQALKTRTLFNVLGPLINPARPEFMLLGVYSPELVLPIAKVLKALGTKRAMVVHGSGLDEVALHGNTQVAELKDGDIVEYQLTPADLGVPLAQITDLEGGEPAQNALITEAILKGRGTEAHANAVAINAGCALYVCGIADSVKAGTLLALATIQSGKAFELLSQLAKVSGEALVNGQEKGR',                                
         6:'MPQRFIVVTGGVLSGIGKGIFSASLARILKDSGVNVNILKIDPYLNVDAGTMNPNQHGEVFVTDDGYEADLDLGHYERFLGINVSRKNNITAGQIYYSVIKREREGKYLGSTVQIVPHVTSEIKDRIKTMDGDLLVIEIGGTVGDIEGEVFLEAVRELAFEIGREKFHFVHVTYVPYLRTTNEFKTKPTQQSVQLLRRIGIHPDTIIVRTEMPIDANSLFKVSLFSGVPRNRVINLPDASNVYEVPDVLHSLNLHKLIAKELDIDINDRFNWSYPKSFELLKIGIVGKYLGTDDAYKSIIESIYLSGAQKPIVIDAQELEDMTDEQIKNYLDDFDALIIPGGFGRRGIEGKIKAIKYARENKKPILGICLGMQLMAIEFARNVGKLEGANSTEFDENTPYPVVNMMESQKEVLNLGGTMRLGAQKTQIMKGTLLSRIYDGQEVVYERHRHRYEVDAEAFPQLFKNPGEEGYKLTISARSDFVEAVELDDHPFFVGIQYHPEYKSKVGKPHPIFKWLVKAAGGKIND',                              
         7:'MTSLADLPVDVSPRHEGERIRSGDMYVELAGPKSFGAELFKVVDPDEIEPDKVEVIGPDIDEMEEGGRYPFAIYVKAAGEELEEDVEGVLERRIHEFCNYVEGFMHLNQRDQIWCRVSKNVTEKGFRLEHLGIALRELYKEEFGNVIDSVEVTIMTDEEKVEEFLEYARRVYKKRDERAKGLSEEDVNEFYVCLMCQSFAPTHVCVITPDRPSLCGSITWHDAKAAYKIDPEGPIFPIEKGECLDPEAGEYEGVNEAVKEHSQGTVERVYLHSCLEYPHTSCGCFQAVVFYIPEVDGFGIVDREYPGETPIGLPFSTMAGEASGGEQQPGFVGVSYGYMESDKFLQYDGGWERVVWMPKALKERMKHAIPDELYDKIATEEDATTVEELREFLEKVEHPVVERWAEEEEEEEEKAPEEEAPAEEPTMEVKELPIAPGGGLNVKIVLKNAKIYAEKVIIKRADREDKS',        
         8:'MGSADDRRFEVLRAIVADFVATKEPIGSKTLVERHNLGVSSATVRNDMAVLEAEGYITQPHTSSGRVPTEKGYREFVDRIDNVKPLSSSERRAILNFLESGVDLDDVLRRAVRLLAQLTRQVAIVQYPTLSTSSVRHLEVVALTPARLLLVVITDTGRVDQRIVELGDAIDEHELSKLRDMLGQAMEGKPLAQASIAVSDLASHLNGSDRLGDAVGRAATVLVETLVEHTEERLLLGGTANLTRNTADFGGSLRSVLEALEEQVVVLRLLAAQQEAGKVTVRIGHETEAEQMAGASVVSTAYGSSGKVYGGMGVVGPTRMDYPGTIANVAAVALYIGEVLGSR',
         9:'MDNIRNFSIIAHIDHGKSTLADRIIQLCGGLSDREMEAQVLDSMDIEKERGITIKAQTAALSYKARDGKVYNLNLIDTPGHVDFSYEVSRSLSACEGALLVVDASQGVEAQTVANCYTAIELGVEVVPVLNKIDLPAADPDNAIQEIEDVIGIDAADATRCSAKTGEGVADVLEALIAKVPAPKGDPAAPLQALIIDSWFDNYVGVVMLVRVVNGTLRAKDKVLLMATGAQHLVEQVGVFSPKSVPRESLSAGQVGFVIAGIKELKAAKVGDTITHVAPRKAEAPLPGFKEVKPQVFAGLYPVEANQYEALRESLEKLKLNDASLQYEPEVSQALGFGFRCGFLGLLHMEIVQERLEREFDMDLITTAPTVVYQVQLRDGTMVQVENPAKMPADPSKIEAILEPIVTVNLYMPQEYVGAVITLCEQKRGSQINMSYHGRQVQLTYEIPMGEIVLDFFDRLKSVSRGYASMDYEFKEYRVSDVVKVDILINGDKVDALSIIVHRSNSTYRGREVAAKMREIIPRQMYDVAIQAAIGANVIARENVKALRKNVLAKCYGGDISRKKKLLEKQKEGKKRMKQVGTVEIPQEAFLAILRVEEK'},
        'Is_transporter': 
        {0: '0', 1: '0', 2: '0', 3: '0', 4: '0', 5: '0', 6: '0', 7: '0', 8: '0', 9: '0'}}

dataframe_data = pd.DataFrame(data)

list_data= ['MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDMPESPKIVELLQGAYINYFHCCQIIEILRDTEKDTKNFLGFYSSQRMKDWQEIEGMYKKDNVYLAEAAQILQRLAQYEIPALRKQISKMDQSVTDAIRKHSEYGKQAEDGRKQFEKEISRMQLKGVHLRKELLELAADLPAFYEKITAEIRKISAARDYFQAFRDYMSLGAAPKDAAPILPIIGLIGERGLDVTTYEWKYNQKPDKVEKPNFEMLLTAAEDSDEIDFGGGDEIDFGIAAEDDAVIDFSAVVDLVADDTGAVGEAIASGQDALHLLENSEAQKAVKHELIELLAFLSMRLDDETRETTADVLIRGAEKRPDGVAAVTEKRLKTWITEVEGILKELENPQKVHLFKIRGSPQYVEQVVEELEKKRDMEHRYKRLQTLMTENQETARQSVTKSNVELKTIVESTRVLQKQIEAEISKKYNGRRVNLMGGINQALGGN','MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQKTSRDPVMALSEAQVQEALAALERLALVFENSGYRSPRWEHNFQRGAGVPEQSAVLLGLLMLRGPQTAAELRTNAERWYRFADISSVEAFLDELQQRSADKGGPLAVPLPRSPGTREQRWAHLLCGPVDAGRSNAGVEPVPAGVETLQERIGTLESELASLRATVQWLCQELGITPAPASMPQPGLPAGNGSPGS','MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLVALKMRGETIDEISGAADAMRAAAKPFPCPERNNNPLHNGIVDIVGTGGDGFNTINISTTAAFVAAAAGAKVAKHGNRSVSSKSGSSDLLAQFGIDLTMSPETASRCLDALNLCFLFAPHYHGGVKHAVPVRQALKTRTLFNVLGPLINPARPEFMLLGVYSPELVLPIAKVLKALGTKRAMVVHGSGLDEVALHGNTQVAELKDGDIVEYQLTPADLGVPLAQITDLEGGEPAQNALITEAILKGRGTEAHANAVAINAGCALYVCGIADSVKAGTLLALATIQSGKAFELLSQLAKVSGEALVNGQEKGR']

sequence_data = 'MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLVALKMRGETIDEISGAADAMRAAAKPFPCPERNNNPLHNGIVDIVGTGGDGFNTINISTTAAFVAAAAGAKVAKHGNRSVSSKSGSSDLLAQFGIDLTMSPETASRCLDALNLCFLFAPHYHGGVKHAVPVRQALKTRTLFNVLGPLINPARPEFMLLGVYSPELVLPIAKVLKALGTKRAMVVHGSGLDEVALHGNTQVAELKDGDIVEYQLTPADLGVPLAQITDLEGGEPAQNALITEAILKGRGTEAHANAVAINAGCALYVCGIADSVKAGTLLALATIQSGKAFELLSQLAKVSGEALVNGQEKGRBZUQJX'

The module ReadSequence contains functions built to preprocess the protein sequences, by replacing or remove certain amino acids. 

The function "par_preprocessing" was designed to deal with pandas dataframes (it is required to specify the atribute dataset and the column of protein sequences) while the function "get_preprocessing" was designed to process only one sequence.

The preprocessing phase may be required to calculate certain descriptors features or encodings.

In [4]:
read_seqs = ReadSequence()
res = read_seqs.par_preprocessing(dataset= dataframe_data, col = 'sequence', B ='N', Z = 'Q', U = 'C', O = 'K', J = 'I', X = '')
res

Unnamed: 0,sequence,Is_transporter
0,MSYKPIAPAPSSTPGSSTPGPGTPVPTGSVPSPSGSVPGAGAPFRP...,0
1,MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDM...,0
2,MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQK...,0
3,MIHFTKMHGLGNDFMVVDGVTQNVFFSPEQIRRLADRNFGIGFDQL...,0
4,MGSSTTEPDVGTTSNIETTTTLQNKNVNEVDQNKKSEQSNPSFKEV...,0
5,MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...,0
6,MPQRFIVVTGGVLSGIGKGIFSASLARILKDSGVNVNILKIDPYLN...,0
7,MTSLADLPVDVSPRHEGERIRSGDMYVELAGPKSFGAELFKVVDPD...,0
8,MGSADDRRFEVLRAIVADFVATKEPIGSKTLVERHNLGVSSATVRN...,0
9,MDNIRNFSIIAHIDHGKSTLADRIIQLCGGLSDREMEAQVLDSMDI...,0


In [5]:
res = read_seqs.get_preprocessing(ProteinSequence= sequence_data, B ='N', Z = 'Q', U = 'C', O = 'K', J = 'I', X = '')
res

'MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLVALKMRGETIDEISGAADAMRAAAKPFPCPERNNNPLHNGIVDIVGTGGDGFNTINISTTAAFVAAAAGAKVAKHGNRSVSSKSGSSDLLAQFGIDLTMSPETASRCLDALNLCFLFAPHYHGGVKHAVPVRQALKTRTLFNVLGPLINPARPEFMLLGVYSPELVLPIAKVLKALGTKRAMVVHGSGLDEVALHGNTQVAELKDGDIVEYQLTPADLGVPLAQITDLEGGEPAQNALITEAILKGRGTEAHANAVAINAGCALYVCGIADSVKAGTLLALATIQSGKAFELLSQLAKVSGEALVNGQEKGRNQCQI'

The module ProteinDescriptors is responsible to perform the calculation of various sequence-based descriptors.

The module accepts as dataset a pandas dataframe, list of sequences and a sequence (string) as dataset. The parameter col is the name of the column to store the sequences, or the column where the sequences are present (pandas dataframe).

In [6]:
descriptors_df = ProteinDescritors(dataset= dataframe_data ,  col= 'sequence')
descriptors_df.dataset

Unnamed: 0,sequence,Is_transporter
0,MSYKPIAPAPSSTPGSSTPGPGTPVPTGSVPSPSGSVPGAGAPFRP...,0
1,MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDM...,0
2,MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQK...,0
3,MIHFTKMHGLGNDFMVVDGVTQNVFFSPEQIRRLADRNFGIGFDQL...,0
4,MGSSTTEPDVGTTSNIETTTTLQNKNVNEVDQNKKSEQSNPSFKEV...,0
5,MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...,0
6,MPQRFIVVTGGVLSGIGKGIFSASLARILKDSGVNVNILKIDPYLN...,0
7,MTSLADLPVDVSPRHEGERIRSGDMYVELAGPKSFGAELFKVVDPD...,0
8,MGSADDRRFEVLRAIVADFVATKEPIGSKTLVERHNLGVSSATVRN...,0
9,MDNIRNFSIIAHIDHGKSTLADRIIQLCGGLSDREMEAQVLDSMDI...,0


In [7]:
descriptors_list = ProteinDescritors(dataset= list_data ,  col= 'seqs')
descriptors_list.dataset

Unnamed: 0,seqs
0,MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDM...
1,MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQK...
2,MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...


In [8]:
descriptors_str = ProteinDescritors(dataset= sequence_data ,  col= 'seq')
descriptors_str.dataset

Unnamed: 0,seq
0,MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...


 # calculate Descriptors 
 
 Machine and Deep learning requires numerical inputs. One needs to transform the aminoacid sequence into numerical features. This can be done using descriptors that describe the entire sequence (usually indepedent of the protein length) in terms of its properties. This include the length, physicochemical features such as charge, bonds, molecular weight, aromacity, isoeletric point and others. They can also include aminoacid and pseudoaminoacid composition, dipeptide and tripeptide composition, correlation descriptors. 
 
Descriptors are handcrafted features, manually calculated that can be fed to the model. 

This module comprehends functions to computing different types of Protein descriptors. It receives a sequence object (from previous module) and retrieves a dataframe with name of feature and value. The user can calculate individual descriptors and also calculate all descriptors. 

The implemented descriptors can be found below:


| |Function number|name|parameters|description|function name|
|:----|:----|:----|:----|:----|:----|
|physicochemical features|1|length| |length|get_lenght()|
| |2|charge|ph = 7.4, amide = False|charge|get_charge()|
| |3|charge density|ph = 7.4, amide = False|charge density|get_charge_density()|
| |4|formula|amide = False|calculates number of C,H,N,O and S of the protein sequence|get_formula()|
| |5|bond| |total number of hydrogen,single, double and aromatic bonds|get_bond()|
| |6|mw - molecular weigth| | |get_mw()|
| |7|gravy| |gravy from a sequence ( accordingly to biopython)|get_gravy()|
| |8|aromacity| |aromacity ( accordingly to biopython)|get_aromacity()|
| |9|isoelectric point| |isoelectric ( accordingly to biopython)|get_isoelectric_point()|
| |10|instability index| |instability ( accordingly to biopython)|get_instability_index()|
| |11|secondary structure| |fraction of aa that tend to be in helix, turn or sheet|get_sec_struct()|
| |12|molar extinction coefficient| |value of reduced cysteins and oxidized (with disulfid bridges)|get_molar_extinction_coefficient|
| |13|flexibility| |flexibility according to Vihinen, 1994 (return proteinsequencelenght-9 values ) from biopython|get_flexibility()|
| |14|aliphatic index| |aliphatic index of sequence (1 value) from modlamp|get_aliphatic_index()|
| |15|boman index| |boman index of sequence (1 value) from modlamp|get_boman_index()|
| |16|hydrophobic ratio| |hydrophobic ratio from modlamp|get_hydrophobic_ratio()|
| |17|all physicochemical features| |all functions from 1-16|get_all_physicochemical()|
|aminoacid composition|18|aminoacid composition| |aminoacid composition|get_aa_comp()|
| |19|dipeptide composition| |dipeptide compostion|get_dp_comp()|
| |20|tripeptide composition| |tripeptide composition|get_tp_comp()|
| |21|get all aminoacid composition| |get all aac - functions 18,19 and 20|get_all_aac()|
|Pseudo Aminoacid composition|22|pseudo aac (PAAC)|lamda = 10, weight = 0.05|Type I Pseudo amino acid composition (default is 30, depends on lamda) from pydpi|get_paac()|
| |23|PAAC propertie|lamda = 10, weight = 0.05, AAP = [list of properties]|Type I Pseudo amino acid composition for a given property (default is 30, depends on lamda) from pydpi|get_paac_p()|
| |24|amphiphilic PAAC|lamda = 10, weight = 0.05|Type II Pseudo amino acid composition - Amphiphilic (default is 30, depends on lamda) from pydpi|get_apaac()|
| |25|all PAAC|lamda = 10, weight = 0.05|all PAAC - Pseudo and amphiphilic pseudo aminoacid composition|get_all_paac|
|autocorrelation|26|Moreau broto autocorrelation| |Normalized Moreau-Broto autocorrelation (240 values)|get_moreau_broto_auto()|
| |27|moran autocorrelation| |Moran autocorrelation (240 values)|get_moran_auto()|
| |28|Geary autocorrelation| |Geary autocorrelation (240 values)|get_geary_auto()|
| |29|All autocorrelation| |Get all autocorrelation - functions 26,27 and 28|get_all_correlation()|
| |30|CTD| |Composition Transition Distribution descriptors (147 values)|get_ctd()|
| |31|Conjoint triad| |Conjoint Triad descriptors (343 descriptors)|get_conj_t()|
|sequence order|32|SOCN|maxlag = 45 (<len protein)|Sequence order coupling numbers  (retrieves 90 values by default)|get_socn()|
| |33|SOCN_p|maxlag = 45, distancematrix|SOCN with a user specified distance matrix|get_socn_p()|
| |34|QSO|maxlag = 30, weight = 0.1|Quasi sequence order  (retrieves 100 values by default)|get_qso()|
| |35|QSO_p|maxlag = 30, weight = 0.1, distancematrix|QSO with a user specified distance matrix|get_qso_p()|
| |36|all Sequence order|maxlag_socn, maxlag_qso, wigth_qso|SOCN and QSO functions 32 and 34 respectively|get_all_sequenceorder|
|base class from modlamp take significan time to run |37|autocorrealtion|window= 7, scalename = 'Eisenberg'|autocorrelation of amino acid values for a given descriptor scale from modlamp|calculate_autocorr|
| |38|cross correlation|window= 7, scalename = 'Eisenberg'|cross correlation of amino acid values for a given descriptor scale from modlamp|calculate_crosscorr|
| |39|moment|window= 1000, angle= 100, modality = 'max',scalename = 'Eisenberg'|moment of sequence (1 value) from modlamp|calculate_moment|
| |40|global|window = 1000, modality= 'max', scalename= 'Eisenberg'| global / window averaging descriptor value of a given AA scale of sequence (1 value) from modlamp|calculate_global|
| |41|profile|prof_type= 'uH', window = 7, scalename = 'Eisenberg'|hydrophobicity or hydrophobic moment profiles for given sequences and fitting for slope and intercep (2 values) from modlamp|calculate_profile|
| |42|arc|modality= "max", scalename = 'peparc'|arcs as seen in the helical wheel plot. Use for binary amino acid scales only (5 values) from modlamp|calculate_arc|
| |43|all base class|window, scalename, scalename_arc, angle, modality,prof_type|all base class descriptors - 37-42 . They take a significant time to run! |get_all_base_class|
| |44|all |ph, amide, lamda_paac,weight_paac, lamda_apaac, weight_apaac, maxlag_socn,maxlag_qso, weight_qso, window, scalename,scalename_arc, angle, modality,prof_type, tricomp|calculates all descriptors. Tricomposition =False |get_all|
| | | | | | |
| | |in all functions: |n_jobs = 4| | |


To calculate a descriptor, one can be call a specific function as get_length, get_aa_comp, or others.

In [9]:
descriptors_df.get_lenght(n_jobs=4)

Unnamed: 0,sequence,length
0,MSYKPIAPAPSSTPGSSTPGPGTPVPTGSVPSPSGSVPGAGAPFRP...,126.0
1,MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDM...,489.0
2,MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQK...,240.0
3,MIHFTKMHGLGNDFMVVDGVTQNVFFSPEQIRRLADRNFGIGFDQL...,275.0
4,MGSSTTEPDVGTTSNIETTTTLQNKNVNEVDQNKKSEQSNPSFKEV...,426.0
5,MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...,357.0
6,MPQRFIVVTGGVLSGIGKGIFSASLARILKDSGVNVNILKIDPYLN...,526.0
7,MTSLADLPVDVSPRHEGERIRSGDMYVELAGPKSFGAELFKVVDPD...,467.0
8,MGSADDRRFEVLRAIVADFVATKEPIGSKTLVERHNLGVSSATVRN...,343.0
9,MDNIRNFSIIAHIDHGKSTLADRIIQLCGGLSDREMEAQVLDSMDI...,599.0


In [10]:
descriptors_df.get_aa_comp(n_jobs=4)

Unnamed: 0,sequence,A,R,N,D,C,E,Q,G,H,...,L,K,M,F,P,S,T,W,Y,V
0,MSYKPIAPAPSSTPGSSTPGPGTPVPTGSVPSPSGSVPGAGAPFRP...,9.524,6.349,1.587,1.587,0.794,5.556,2.381,11.111,0.794,...,4.762,3.968,3.968,2.381,14.286,11.905,7.143,0.0,3.175,4.762
1,MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDM...,8.589,5.726,2.863,7.362,0.613,10.02,5.317,5.317,2.045,...,9.407,8.589,2.454,2.658,2.658,4.908,4.09,1.022,3.067,5.726
2,MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQK...,12.083,7.5,2.917,2.917,1.25,7.5,5.833,8.333,0.833,...,12.5,1.25,2.5,2.083,9.583,8.333,4.583,1.667,1.25,5.833
3,MIHFTKMHGLGNDFMVVDGVTQNVFFSPEQIRRLADRNFGIGFDQL...,5.455,5.455,5.455,5.455,2.182,5.455,5.818,10.182,2.545,...,7.273,5.091,2.909,4.727,5.091,3.636,5.091,0.727,2.182,9.091
4,MGSSTTEPDVGTTSNIETTTTLQNKNVNEVDQNKKSEQSNPSFKEV...,1.174,3.756,9.624,3.286,0.469,5.164,12.207,3.052,3.052,...,7.277,5.634,2.347,1.878,7.042,13.615,6.573,0.0,3.756,5.634
5,MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...,14.006,3.641,4.762,3.922,1.401,5.602,3.361,10.084,2.241,...,12.045,4.762,2.521,2.801,4.762,5.322,5.322,0.0,1.401,7.003
6,MPQRFIVVTGGVLSGIGKGIFSASLARILKDSGVNVNILKIDPYLN...,4.943,4.943,4.753,6.274,0.19,7.414,3.232,8.745,2.471,...,8.365,7.034,2.091,4.373,4.373,4.753,4.373,0.38,4.183,7.985
7,MTSLADLPVDVSPRHEGERIRSGDMYVELAGPKSFGAELFKVVDPD...,6.424,4.925,1.713,5.782,2.141,16.488,1.713,7.709,2.355,...,5.782,6.21,2.355,3.854,5.996,3.854,3.212,1.071,4.069,8.994
8,MGSADDRRFEVLRAIVADFVATKEPIGSKTLVERHNLGVSSATVRN...,10.496,8.163,2.332,5.539,0.0,7.58,3.207,8.746,2.041,...,11.662,2.332,2.041,1.458,2.624,7.289,6.706,0.0,2.041,11.953
9,MDNIRNFSIIAHIDHGKSTLADRIIQLCGGLSDREMEAQVLDSMDI...,10.017,5.175,3.172,6.01,1.169,7.679,4.674,6.678,1.336,...,8.681,6.678,2.671,2.504,4.174,5.008,3.506,0.167,3.172,10.351


It also can be called function to get all descriptors of a set of descriptors (for example, get all physico chemical descriptors) or the function get_all which calculates all descriptors available in the ProPythia package.

In [11]:
descriptors_df.get_all_physicochemical(ph=7, amide=False, n_jobs=4)

Unnamed: 0,sequence,Is_transporter,length,charge,chargedensity,formulaC,formulaH,formulaN,formulaO,formulaS,...,IsoelectricPoint,Instability_index,SecStruct_helix,SecStruct_turn,SecStruct_sheet,Molar_extinction_coefficient_reduced,Molar_extinction_coefficient_oxidized,aliphatic_index,bomanindex,hydrophobic_ratio
0,MSYKPIAPAPSSTPGSSTPGPGTPVPTGSVPSPSGSVPGAGAPFRP...,0,126.0,4.034,0.000308,569,905,159,181,6,...,9.5131,62.319841,0.190476,0.388889,0.238095,5960,5960,57.380952,1.478254,0.301587
1,MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDM...,0,489.0,-14.152,-0.000254,2438,3878,654,785,15,...,5.365737,41.241145,0.294479,0.157464,0.304703,49850,49975,91.390593,2.071166,0.370143
2,MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQK...,0,240.0,-3.983,-0.000155,1109,1754,312,360,9,...,5.342945,64.9775,0.245833,0.291667,0.345833,26470,26595,82.625,1.527125,0.375
3,MIHFTKMHGLGNDFMVVDGVTQNVFFSPEQIRRLADRNFGIGFDQL...,0,275.0,-0.697,-2.3e-05,1330,2079,365,413,14,...,6.656551,32.444,0.301818,0.243636,0.210909,19940,20315,84.290909,1.526327,0.378182
4,MGSSTTEPDVGTTSNIETTTTLQNKNVNEVDQNKKSEQSNPSFKEV...,0,426.0,5.166,0.000107,2025,3145,565,746,12,...,8.780546,71.051408,0.230047,0.333333,0.159624,23840,23965,63.286385,2.633779,0.232394
5,MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...,0,357.0,-3.524,-9.5e-05,1626,2632,446,510,14,...,6.082648,27.845378,0.282913,0.2493,0.341737,7450,7700,100.952381,0.82507,0.448179
6,MPQRFIVVTGGVLSGIGKGIFSASLARILKDSGVNVNILKIDPYLN...,0,526.0,-7.741,-0.000131,2650,4156,694,800,12,...,5.887803,24.372624,0.344106,0.226236,0.228137,43780,43780,96.311787,1.523992,0.370722
7,MTSLADLPVDVSPRHEGERIRSGDMYVELAGPKSFGAELFKVVDPD...,0,467.0,-51.492,-0.000971,2346,3596,600,752,21,...,4.536968,50.438158,0.291221,0.192719,0.310493,55810,56435,75.931478,1.921692,0.349036
8,MGSADDRRFEVLRAIVADFVATKEPIGSKTLVERHNLGVSSATVRN...,0,343.0,-8.274,-0.000225,1586,2605,457,519,7,...,5.396658,25.243732,0.309038,0.209913,0.317784,10430,10430,105.422741,1.590554,0.413994
9,MDNIRNFSIIAHIDHGKSTLADRIIQLCGGLSDREMEAQVLDSMDI...,0,599.0,-10.627,-0.000161,2893,4656,768,909,23,...,5.498911,30.279299,0.320534,0.190317,0.290484,33810,34185,101.886477,1.387679,0.42571


In [12]:
descriptors_df.get_all()

Unnamed: 0,sequence,Is_transporter,length_x,charge_x,chargedensity_x,formulaC_x,formulaH_x,formulaN_x,formulaO_x,formulaS_x,...,crosscorr_6,moment,global,profile_0,profile_1,arc_0,arc_1,arc_2,arc_3,arc_4
0,MSYKPIAPAPSSTPGSSTPGPGTPVPTGSVPSPSGSVPGAGAPFRP...,0,126.0,4.034,0.000308,569,905,159,181,6,...,-0.023589,0.927272,0.52,0.003859,0.09695,120,180,60,60,60
1,MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDM...,0,489.0,-14.152,-0.000254,2438,3878,654,785,15,...,-0.051833,0.865172,1.0,-6.9e-05,0.415926,140,260,60,80,20
2,MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQK...,0,240.0,-3.983,-0.000155,1109,1754,312,360,9,...,-0.100935,0.850756,0.945714,-0.000378,0.405118,140,160,40,60,40
3,MIHFTKMHGLGNDFMVVDGVTQNVFFSPEQIRRLADRNFGIGFDQL...,0,275.0,-0.697,-2.3e-05,1330,2079,365,413,14,...,0.017756,1.027452,0.724286,-0.000369,0.409831,100,160,80,40,20
4,MGSSTTEPDVGTTSNIETTTTLQNKNVNEVDQNKKSEQSNPSFKEV...,0,426.0,5.166,0.000107,2025,3145,565,746,12,...,0.032734,0.839183,0.727143,-0.000248,0.353142,100,300,60,60,40
5,MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...,0,357.0,-3.524,-9.5e-05,1626,2632,446,510,14,...,0.050793,0.799997,0.841429,-0.000279,0.385699,160,120,60,40,40
6,MPQRFIVVTGGVLSGIGKGIFSASLARILKDSGVNVNILKIDPYLN...,0,526.0,-7.741,-0.000131,2650,4156,694,800,12,...,0.009954,0.968823,0.892857,3.2e-05,0.352503,120,160,80,80,40
7,MTSLADLPVDVSPRHEGERIRSGDMYVELAGPKSFGAELFKVVDPD...,0,467.0,-51.492,-0.000971,2346,3596,600,752,21,...,-0.024954,1.014423,0.828571,-0.000166,0.389549,160,220,60,120,40
8,MGSADDRRFEVLRAIVADFVATKEPIGSKTLVERHNLGVSSATVRN...,0,343.0,-8.274,-0.000225,1586,2605,457,519,7,...,-0.081282,1.131478,0.978571,-0.000272,0.439268,140,140,60,40,20
9,MDNIRNFSIIAHIDHGKSTLADRIIQLCGGLSDREMEAQVLDSMDI...,0,599.0,-10.627,-0.000161,2893,4656,768,909,23,...,0.015365,0.922217,0.954286,9.1e-05,0.333468,200,220,80,80,40


To obtain a certain set of descriptors it can be used the function get_adaptable which calculates the descriptors present in the list_of_functions. The list in the example calculates all psycho chemical descriptors, AAC, DPC, PAAC, APAAC, CTD with the default parameters. All functions have assigned default parameters.

In [13]:
descriptors_df.get_adaptable([17, 18, 19, 22, 24, 30])

Unnamed: 0,sequence,Is_transporter,length_x,charge_x,chargedensity_x,formulaC_x,formulaH_x,formulaN_x,formulaO_x,formulaS_x,...,_HydrophobicityD2001_y,_HydrophobicityD2025_y,_HydrophobicityD2050_y,_HydrophobicityD2075_y,_HydrophobicityD2100_y,_HydrophobicityD3001_y,_HydrophobicityD3025_y,_HydrophobicityD3050_y,_HydrophobicityD3075_y,_HydrophobicityD3100_y
0,MSYKPIAPAPSSTPGSSTPGPGTPVPTGSVPSPSGSVPGAGAPFRP...,0,126.0,4.034,0.000308,569,905,159,181,6,...,2.74,28.767,57.534,101.37,172.603,3.846,169.231,296.154,380.769,450.0
1,MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDM...,0,489.0,-14.152,-0.000254,2438,3878,654,785,15,...,1.333,91.333,160.0,234.667,325.333,0.694,68.056,160.417,247.222,337.5
2,MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQK...,0,240.0,-3.983,-0.000155,1109,1754,312,360,9,...,1.852,51.852,117.593,173.148,222.222,1.538,84.615,163.077,261.538,355.385
3,MIHFTKMHGLGNDFMVVDGVTQNVFFSPEQIRRLADRNFGIGFDQL...,0,275.0,-0.697,-2.3e-05,1330,2079,365,413,14,...,3.191,81.915,158.511,228.723,288.298,1.099,53.846,141.758,216.484,302.198
4,MGSSTTEPDVGTTSNIETTTTLQNKNVNEVDQNKKSEQSNPSFKEV...,0,426.0,5.166,0.000107,2025,3145,565,746,12,...,1.227,87.117,142.331,199.387,261.35,1.064,95.745,207.447,325.532,443.617
5,MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...,0,357.0,-3.524,-9.5e-05,1626,2632,446,510,14,...,1.299,61.039,109.74,174.026,231.169,0.909,80.909,169.091,234.545,318.182
6,MPQRFIVVTGGVLSGIGKGIFSASLARILKDSGVNVNILKIDPYLN...,0,526.0,-7.741,-0.000131,2650,4156,694,800,12,...,1.124,66.292,145.506,224.719,293.258,0.585,76.023,145.029,218.713,306.433
7,MTSLADLPVDVSPRHEGERIRSGDMYVELAGPKSFGAELFKVVDPD...,0,467.0,-51.492,-0.000971,2346,3596,600,752,21,...,1.274,82.803,158.599,212.102,297.452,0.725,77.536,150.0,240.58,331.884
8,MGSADDRRFEVLRAIVADFVATKEPIGSKTLVERHNLGVSSATVRN...,0,343.0,-8.274,-0.000225,1586,2605,457,519,7,...,1.46,63.504,141.606,204.38,249.635,0.943,89.623,153.774,238.679,320.755
9,MDNIRNFSIIAHIDHGKSTLADRIIQLCGGLSDREMEAQVLDSMDI...,0,599.0,-10.627,-0.000161,2893,4656,768,909,23,...,3.941,67.98,134.483,206.404,291.626,0.51,77.041,153.061,228.061,304.082


In [14]:
descriptors_list.get_adaptable([17, 19, 22, 24, 30])

Unnamed: 0,seqs,length,charge,chargedensity,formulaC,formulaH,formulaN,formulaO,formulaS,tot,...,_HydrophobicityD2001,_HydrophobicityD2025,_HydrophobicityD2050,_HydrophobicityD2075,_HydrophobicityD2100,_HydrophobicityD3001,_HydrophobicityD3025,_HydrophobicityD3050,_HydrophobicityD3075,_HydrophobicityD3100
0,MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDM...,489.0,-14.152,-0.000254,2438,3878,654,785,15,8946,...,1.333,91.333,160.0,234.667,325.333,0.694,68.056,160.417,247.222,337.5
1,MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQK...,240.0,-3.983,-0.000155,1109,1754,312,360,9,4154,...,1.852,51.852,117.593,173.148,222.222,1.538,84.615,163.077,261.538,355.385
2,MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...,357.0,-3.524,-9.5e-05,1626,2632,446,510,14,6087,...,1.299,61.039,109.74,174.026,231.169,0.909,80.909,169.091,234.545,318.182


# Encodings
The module Encoding performs the encoding of the protein sequences.
Contrary to the descriptors, encodings, give a numerical representation to each character of the sequence (in this case aminoacids). Then, Deep Learning models can be used to find hidden patterns in the protein sequence of aminoacids. One hot encodings dont give any prior information about the aminoacids with each aminoacid being represented as a vector of 20 '0's and 1 '1'. Other encodings give prior information on the aminoacids, such as BLOSUM encodings ( evolutionary onformation on aminoacids) or z-scales (physicochemical properties of aminoacid). In encodings each protein sequence gets a representation of length N :  

    N = number of aminoacids * length of AA representation vector. 
    
Encodings mantain the order of AA sequence. Therefore, they are suitable for RNN and CNN models. They however require all the protein sequences to have the same length. 

As the ProteinDescriptors it accepts as dataset a pandas dataframe, list of sequences and a sequence (string) as dataset. The parameter col is the name of the column to store the sequences, or the column where the sequences are present (pandas dataframe).

In [15]:
enconde_df= Encoding(dataset= dataframe_data ,  col= 'sequence')
enconde_df.result

Unnamed: 0,sequence,Is_transporter
0,MSYKPIAPAPSSTPGSSTPGPGTPVPTGSVPSPSGSVPGAGAPFRP...,0
1,MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDM...,0
2,MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQK...,0
3,MIHFTKMHGLGNDFMVVDGVTQNVFFSPEQIRRLADRNFGIGFDQL...,0
4,MGSSTTEPDVGTTSNIETTTTLQNKNVNEVDQNKKSEQSNPSFKEV...,0
5,MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...,0
6,MPQRFIVVTGGVLSGIGKGIFSASLARILKDSGVNVNILKIDPYLN...,0
7,MTSLADLPVDVSPRHEGERIRSGDMYVELAGPKSFGAELFKVVDPD...,0
8,MGSADDRRFEVLRAIVADFVATKEPIGSKTLVERHNLGVSSATVRN...,0
9,MDNIRNFSIIAHIDHGKSTLADRIIQLCGGLSDREMEAQVLDSMDI...,0


In [16]:
enconde_list= Encoding(dataset= list_data ,  col= 'seqs')
enconde_list.result

Unnamed: 0,seqs
0,MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDM...
1,MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQK...
2,MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...


In [17]:
enconde_str =Encoding(dataset= sequence_data ,  col= 'seq')
enconde_str.result

Unnamed: 0,seq
0,MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...


To perform the encoding operations the intended function must be called. It can be one-hot-encoded, NLF, Blosum, z_scale. It also can be performed a padding to all sequences in the dataframe.

In [18]:
hot_encoded = enconde_df.get_hot_encoded()
print(hot_encoded['One_hot_encoding'][0].shape)
hot_encoded

2023-08-06 20:45:44.510372: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-08-06 20:45:44.510372: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-08-06 20:45:44.510374: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-08-06 20:45:44.510373: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
  warn(
  warn(
  warn(
  warn(
2023-08-06 20:45:46.947696: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-08-06 20:45:47.353155: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-08-06 20:45:47.756968: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] 

(126, 21)


Unnamed: 0,sequence,Is_transporter,One_hot_encoding
0,MSYKPIAPAPSSTPGSSTPGPGTPVPTGSVPSPSGSVPGAGAPFRP...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDM...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQK...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,MIHFTKMHGLGNDFMVVDGVTQNVFFSPEQIRRLADRNFGIGFDQL...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,MGSSTTEPDVGTTSNIETTTTLQNKNVNEVDQNKKSEQSNPSFKEV...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
5,MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
6,MPQRFIVVTGGVLSGIGKGIFSASLARILKDSGVNVNILKIDPYLN...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
7,MTSLADLPVDVSPRHEGERIRSGDMYVELAGPKSFGAELFKVVDPD...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
8,MGSADDRRFEVLRAIVADFVATKEPIGSKTLVERHNLGVSSATVRN...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
9,MDNIRNFSIIAHIDHGKSTLADRIIQLCGGLSDREMEAQVLDSMDI...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [25]:
nlf = enconde_df.get_nlf()
nlf

Unnamed: 0,sequence,Is_transporter,One_hot_encoding,nlf,blosum,pad_seques
0,MSYKPIAPAPSSTPGSSTPGPGTPVPTGSVPSPSGSVPGAGAPFRP...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MSYKPIAPAPSSTPGSSTPGPGTPVPTGSVPSPSGSVPGAGAPFRP...
1,MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDM...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDM...
2,MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQK...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQK...
3,MIHFTKMHGLGNDFMVVDGVTQNVFFSPEQIRRLADRNFGIGFDQL...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MIHFTKMHGLGNDFMVVDGVTQNVFFSPEQIRRLADRNFGIGFDQL...
4,MGSSTTEPDVGTTSNIETTTTLQNKNVNEVDQNKKSEQSNPSFKEV...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MGSSTTEPDVGTTSNIETTTTLQNKNVNEVDQNKKSEQSNPSFKEV...
5,MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...
6,MPQRFIVVTGGVLSGIGKGIFSASLARILKDSGVNVNILKIDPYLN...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MPQRFIVVTGGVLSGIGKGIFSASLARILKDSGVNVNILKIDPYLN...
7,MTSLADLPVDVSPRHEGERIRSGDMYVELAGPKSFGAELFKVVDPD...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MTSLADLPVDVSPRHEGERIRSGDMYVELAGPKSFGAELFKVVDPD...
8,MGSADDRRFEVLRAIVADFVATKEPIGSKTLVERHNLGVSSATVRN...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MGSADDRRFEVLRAIVADFVATKEPIGSKTLVERHNLGVSSATVRN...
9,MDNIRNFSIIAHIDHGKSTLADRIIQLCGGLSDREMEAQVLDSMDI...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MDNIRNFSIIAHIDHGKSTLADRIIQLCGGLSDREMEAQVLDSMDI...


In [29]:
blosum = enconde_df.get_blosum()
print(np.array(blosum['blosum'][0]).shape)
blosum

(126, 23)


Unnamed: 0,sequence,Is_transporter,One_hot_encoding,nlf,blosum,pad_seques
0,MSYKPIAPAPSSTPGSSTPGPGTPVPTGSVPSPSGSVPGAGAPFRP...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MSYKPIAPAPSSTPGSSTPGPGTPVPTGSVPSPSGSVPGAGAPFRP...
1,MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDM...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDM...
2,MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQK...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQK...
3,MIHFTKMHGLGNDFMVVDGVTQNVFFSPEQIRRLADRNFGIGFDQL...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MIHFTKMHGLGNDFMVVDGVTQNVFFSPEQIRRLADRNFGIGFDQL...
4,MGSSTTEPDVGTTSNIETTTTLQNKNVNEVDQNKKSEQSNPSFKEV...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MGSSTTEPDVGTTSNIETTTTLQNKNVNEVDQNKKSEQSNPSFKEV...
5,MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...
6,MPQRFIVVTGGVLSGIGKGIFSASLARILKDSGVNVNILKIDPYLN...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MPQRFIVVTGGVLSGIGKGIFSASLARILKDSGVNVNILKIDPYLN...
7,MTSLADLPVDVSPRHEGERIRSGDMYVELAGPKSFGAELFKVVDPD...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MTSLADLPVDVSPRHEGERIRSGDMYVELAGPKSFGAELFKVVDPD...
8,MGSADDRRFEVLRAIVADFVATKEPIGSKTLVERHNLGVSSATVRN...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MGSADDRRFEVLRAIVADFVATKEPIGSKTLVERHNLGVSSATVRN...
9,MDNIRNFSIIAHIDHGKSTLADRIIQLCGGLSDREMEAQVLDSMDI...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MDNIRNFSIIAHIDHGKSTLADRIIQLCGGLSDREMEAQVLDSMDI...


The function get_pad_and_hot_encoding allows to perform the padding and the one hot encoding of the sequence at the same time. The one-hot-encoded sequence will have the shape of (length of the sequences, number of amino acids in the alphabet).

In [30]:
res = enconde_df.get_pad_and_hot_encoding(seq_len=300)
print(len(res.iloc[0,1])) #padded sequence
print(np.array(res.iloc[0,2]).shape) #one hot encoded sequence
res

1
(300, 21)


Unnamed: 0,sequence,Is_transporter,One_hot_encoding,nlf,blosum,pad_seques
0,MSYKPIAPAPSSTPGSSTPGPGTPVPTGSVPSPSGSVPGAGAPFRP...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MSYKPIAPAPSSTPGSSTPGPGTPVPTGSVPSPSGSVPGAGAPFRP...
1,MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDM...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MSDDLPIDIHSSKLLDWLVSRRHCNKDWQKSVVAIREKIKHAILDM...
2,MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQK...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MPFDPAASPLSPSQARVLATLMEKARTVPDSYPMSLNGLLTGCNQK...
3,MIHFTKMHGLGNDFMVVDGVTQNVFFSPEQIRRLADRNFGIGFDQL...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MIHFTKMHGLGNDFMVVDGVTQNVFFSPEQIRRLADRNFGIGFDQL...
4,MGSSTTEPDVGTTSNIETTTTLQNKNVNEVDQNKKSEQSNPSFKEV...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MGSSTTEPDVGTTSNIETTTTLQNKNVNEVDQNKKSEQSNPSFKEV...
5,MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MSTNPIQPLLDVLYQGKSLNREQTAELFGALIRGEMSEAAMAGMLV...
6,MPQRFIVVTGGVLSGIGKGIFSASLARILKDSGVNVNILKIDPYLN...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MPQRFIVVTGGVLSGIGKGIFSASLARILKDSGVNVNILKIDPYLN...
7,MTSLADLPVDVSPRHEGERIRSGDMYVELAGPKSFGAELFKVVDPD...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MTSLADLPVDVSPRHEGERIRSGDMYVELAGPKSFGAELFKVVDPD...
8,MGSADDRRFEVLRAIVADFVATKEPIGSKTLVERHNLGVSSATVRN...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MGSADDRRFEVLRAIVADFVATKEPIGSKTLVERHNLGVSSATVRN...
9,MDNIRNFSIIAHIDHGKSTLADRIIQLCGGLSDREMEAQVLDSMDI...,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[1.72, 0.85, 0.34, 0.44, 0.01, 0.8, 0.16, 0.0...","[[-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1,...",MDNIRNFSIIAHIDHGKSTLADRIIQLCGGLSDREMEAQVLDSMDI...


Now, with either physico chemical descriptors or encodings one can move on to ML and DL models. 
With physicochemical encodings one may need feature selection or dimensionality reduction.
Please check the correspondent notebooks. 