In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [4]:
bcell_df = pd.read_csv(os.path.join('data','input_bcell.csv'))
bcell_df.head()

Unnamed: 0,parent_protein_id,protein_seq,start_position,end_position,peptide_seq,chou_fasman,emini,kolaskar_tongaonkar,parker,isoelectric_point,aromaticity,hydrophobicity,stability,target
0,A2T3T0,MDVLYSLSKTLKDARDKIVEGTLYSNVSDLIQQFNQMIITMNGNEF...,161,165,SASFT,1.016,0.703,1.018,2.22,5.810364,0.103275,-0.143829,40.2733,1
1,F0V2I4,MTIHKVAINGFGRIGRLLFRNLLSSQGVQVVAVNDVVDIKVLTHLL...,251,255,LCLKI,0.77,0.179,1.199,-3.86,6.210876,0.065476,-0.036905,24.998512,1
2,O75508,MVATCLQVVGFVTSFVGWIGVIVTTSTNDWVVTCGYTIPTCRKLDE...,145,149,AHRET,0.852,3.427,0.96,4.28,8.223938,0.091787,0.879227,27.863333,1
3,O84462,MTNSISGYQPTVTTSTSSTTSASGASGSLGASSVSTTANATVTQTA...,152,156,SNYDD,1.41,2.548,0.936,6.32,4.237976,0.044776,-0.521393,30.765373,1
4,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,85,89,DGTYR,1.214,1.908,0.937,4.64,6.867493,0.103846,-0.578846,21.684615,1


In [5]:
sars_df = pd.read_csv(os.path.join('data','input_sars.csv'))
sars_df.head()

Unnamed: 0,parent_protein_id,protein_seq,start_position,end_position,peptide_seq,chou_fasman,emini,kolaskar_tongaonkar,parker,isoelectric_point,aromaticity,hydrophobicity,stability,target
0,AAU93319,MFIFLLFLTLTSGSDLDRCTTFDDVQAPNYTQHTSSMRGVYYPDEI...,1,17,MFIFLLFLTLTSGSDLD,0.887,0.04,1.056,-2.159,5.569763,0.116335,-0.061116,33.205116,0
1,AAU93319,MFIFLLFLTLTSGSDLDRCTTFDDVQAPNYTQHTSSMRGVYYPDEI...,1,15,MFIFLLFLTLTSGSD,0.869,0.047,1.056,-2.5,5.569763,0.116335,-0.061116,33.205116,0
2,AAU93319,MFIFLLFLTLTSGSDLDRCTTFDDVQAPNYTQHTSSMRGVYYPDEI...,2,10,FIFLLFLTL,0.621,0.042,1.148,-7.467,5.569763,0.116335,-0.061116,33.205116,0
3,AAU93319,MFIFLLFLTLTSGSDLDRCTTFDDVQAPNYTQHTSSMRGVYYPDEI...,6,20,LFLTLTSGSDLDRCT,1.021,0.23,1.049,0.927,5.569763,0.116335,-0.061116,33.205116,0
4,AAU93319,MFIFLLFLTLTSGSDLDRCTTFDDVQAPNYTQHTSSMRGVYYPDEI...,9,25,TLTSGSDLDRCTTFDDV,1.089,0.627,1.015,3.165,5.569763,0.116335,-0.061116,33.205116,0


## Protein feature Engineering and experimentation

In [10]:
bcell_df.protein_seq.values.reshape(-1,1)[0]

array(['MDVLYSLSKTLKDARDKIVEGTLYSNVSDLIQQFNQMIITMNGNEFQTGGIGNLPIRNWNFNFGLLGTTLLNLDANYVETARNTIDYFVDFVDNVCMDEMVRESQRNGIAPQSDSLRKLSAIKFKRINFDNSSEYIENWNLQNRRQRTGFTFHKPNIFPYSASFTLNRSQPAHDNLMGTMWLNAGSEIQVAGFDYSCAINAPANIQQFEHIVPLRRVLTTATITLLPDAERFSFPRVINSADGATTWFFNPVILRPNNVEVEFLLNGQIINTYQARFGTIVARNFDTIRLSFQLMRPPNMTPAVAVLFPNAQPFEHHATVGLTLRIESAVCESVLADASETLLANVTSVRQEYAIPVGPVFPPGMNWTDLITNYSPSREDNLQRVFTVASIRSMLIK'],
      dtype=object)

## Mapping for BCell and SARS

In [4]:
from sklearn.preprocessing import LabelEncoder
gle = LabelEncoder()

In [5]:
bcell_parent_protein_id_labels = gle.fit_transform(bcell_df.parent_protein_id)
bcell_parent_protein_id_mappings = {index: label for index, label in 
                  enumerate(gle.classes_)}
# bcell_parent_protein_id_mappings

In [6]:
bcell_protein_seq_labels = gle.fit_transform(bcell_df.protein_seq)
bcell_protein_seq_mappings = {index: label for index, label in 
                  enumerate(gle.classes_)}

# bcell_protein_seq_mappings

In [7]:
bcell_peptide_seq_labels = gle.fit_transform(bcell_df.peptide_seq)
bcell_peptide_seq_mappings = {index: label for index, label in 
                  enumerate(gle.classes_)}
# bcell_peptide_seq_mappings

In [8]:
bcell_df['parent_protein_id_num'] = bcell_parent_protein_id_labels
bcell_df['protein_seq_num'] = bcell_protein_seq_labels
bcell_df['peptide_seq_num'] = bcell_peptide_seq_labels
bcell_df.head()

Unnamed: 0,parent_protein_id,protein_seq,start_position,end_position,peptide_seq,chou_fasman,emini,kolaskar_tongaonkar,parker,isoelectric_point,aromaticity,hydrophobicity,stability,target,parent_protein_id_num,protein_seq_num,peptide_seq_num
0,A2T3T0,MDVLYSLSKTLKDARDKIVEGTLYSNVSDLIQQFNQMIITMNGNEF...,161,165,SASFT,1.016,0.703,1.018,2.22,5.810364,0.103275,-0.143829,40.2733,1,3,200,11098
1,F0V2I4,MTIHKVAINGFGRIGRLLFRNLLSSQGVQVVAVNDVVDIKVLTHLL...,251,255,LCLKI,0.77,0.179,1.199,-3.86,6.210876,0.065476,-0.036905,24.998512,1,43,684,6651
2,O75508,MVATCLQVVGFVTSFVGWIGVIVTTSTNDWVVTCGYTIPTCRKLDE...,145,149,AHRET,0.852,3.427,0.96,4.28,8.223938,0.091787,0.879227,27.863333,1,89,709,397
3,O84462,MTNSISGYQPTVTTSTSSTTSASGASGSLGASSVSTTANATVTQTA...,152,156,SNYDD,1.41,2.548,0.936,6.32,4.237976,0.044776,-0.521393,30.765373,1,94,693,11615
4,P00918,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,85,89,DGTYR,1.214,1.908,0.937,4.64,6.867493,0.103846,-0.578846,21.684615,1,108,614,1545


In [None]:
bcell_parent_protein_id_labels = gle.fit_transform(bcell_df.parent_protein_id)
bcell_parent_protein_id_mappings = {index: label for index, label in 
                  enumerate(gle.classes_)}
# bcell_parent_protein_id_mappings

In [9]:
sars_df['parent_protein_id_num'] = parent_protein_id_labels
sars_df

ValueError: Length of values (14387) does not match length of index (520)