## Protein Property calculations and data cleaning

In [1]:
import pandas as pd
import numpy as np
import os
from ProteinParam import SequenceAnalysis, ProteinProperties, ProteinMotifs

# Show every column when a DataFrame is displayed (None removes the column cap).
pd.set_option('display.max_columns', None)

# One shared instance of each ProteinParam helper, reused by the cells below:
#   sa -> SequenceAnalysis, pp -> ProteinProperties, pm -> ProteinMotifs
sa, pp, pm = SequenceAnalysis(), ProteinProperties(), ProteinMotifs()

### Load initially collated and filtered data set

In [2]:
# Read the collated XRD table, discard the 'Unnamed: 0' column
# (presumably an index saved by an earlier export -- confirm), and keep only
# the first row for each distinct sequence.
df1 = (
    pd.read_csv('raw_data/XRD_combined.csv', encoding='utf8')
    .drop(columns=['Unnamed: 0'])
    .drop_duplicates(subset=['Sequence'])
)

# Columns that must be numeric for the later analysis.
float_columns = [
    'Matthews Coefficient',
    'Percent Solvent Content',
    'pH',
    'Temp (K)',
    'Resolution (Å)',
    'High Resolution Limit',
    'Entity ID',
]

# Coerce each of those columns to a numeric dtype; values that cannot be
# parsed become NaN (errors='coerce') instead of raising.
df1 = df1.assign(
    **{column: pd.to_numeric(df1[column], errors='coerce') for column in float_columns}
)

### Calculate additional properties

In [3]:
# Every derived feature below is computed from the 'Sequence' column.
sequences = df1['Sequence']

# Basic sequence analysis parameters: output column -> SequenceAnalysis method.
basic_parameters = {
    'Sequence Length': sa.seq_length,
    'Hydrophobic Residues': sa.hydrophobic_res,
    'Hydrophilic Residues': sa.hydrophilic_res,
    'Aromatic Residues': sa.aromatic_res,
}
for column_name, calculator in basic_parameters.items():
    df1[column_name] = sequences.apply(calculator)

# More advanced protein parameters.
# The molecular weight is divided by 1000 to match the kDa column label
# (presumably pp.molecular_weight returns Da -- confirm).
df1['Molecular Weight (kDa)'] = sequences.apply(pp.molecular_weight) / 1000
# Hydrophobic-to-hydrophilic ratio; a zero denominator produces +/-inf or NaN,
# which the cleaning step below removes.
df1['Hydro Ratio'] = df1['Hydrophobic Residues'].div(df1['Hydrophilic Residues'])
df1['Isoelectric Point'] = sequences.apply(pp.isoelectric_point)
# The 'Alipathic' spelling is kept on purpose: it matches the helper method
# name and is the column name written to the output file.
df1['Alipathic Index'] = sequences.apply(pp.alipathic_index)
df1['Charge at pH 7'] = sequences.apply(pp.charge_ph, ph=7.0)

# Check for specific ligand binding motifs (no-0, yes-1).
df1['Cardin-Weintraub Motif'] = sequences.apply(pm.cardin_weintraub)

# Cleaning: turn infinities into NaN, then drop every row that has a NaN in
# any column.
df1 = df1.replace([np.inf, -np.inf], np.nan).dropna()
# Discard rows with a negative molecular weight, then renumber the index 0..n-1.
negative_weight = df1['Molecular Weight (kDa)'] < 0
df1 = df1.loc[~negative_weight].reset_index(drop=True)

In [4]:
# Preview the first rows of the cleaned table, including the derived property columns.
df1.head()

Unnamed: 0,Entry ID,Experimental Method,Matthews Coefficient,Percent Solvent Content,Crystallization Method,pH,Crystal Growth Procedure,Temp (K),DOI,PDB ID,Resolution (Å),High Resolution Limit,Structure Title,Sequence,Source Organism,Gene Name,Macromolecule Name,Entity ID,Sequence Length,Hydrophobic Residues,Hydrophilic Residues,Aromatic Residues,Molecular Weight (kDa),Hydro Ratio,Isoelectric Point,Alipathic Index,Charge at pH 7,Cardin-Weintraub Motif
0,5VYC,X-RAY DIFFRACTION,4.05,69.61,"VAPOR DIFFUSION, SITTING DROP",8.1,"50 mM HEPES-NaOH, 50 mM NH4 acetate, 50 mM NH4...",298.15,10.1016/j.celrep.2017.06.025,5VYC,6.0,6.0,Crystal structure of the human 40S ribosomal s...,MPGVTVKDVNQQEFVRALAAFLKKSGKLKVPEWVDTVKLAKHKELA...,Homo sapiens,RPS19,40S ribosomal protein S19,1.0,145,72,73,10,16.07588,0.986301,10.310703,74.0,4,1
1,5VYC,X-RAY DIFFRACTION,4.05,69.61,"VAPOR DIFFUSION, SITTING DROP",8.1,"50 mM HEPES-NaOH, 50 mM NH4 acetate, 50 mM NH4...",298.15,10.1016/j.celrep.2017.06.025,5VYC,6.0,6.0,Crystal structure of the human 40S ribosomal s...,MAFKDTGKTPVEPEVAIHRIRITLTSRNVKSLEKVCADLIRGAKEK...,Homo sapiens,RPS20,40S ribosomal protein S20,2.0,119,54,65,3,13.38824,0.830769,9.948584,91.680672,3,1
2,5VYC,X-RAY DIFFRACTION,4.05,69.61,"VAPOR DIFFUSION, SITTING DROP",8.1,"50 mM HEPES-NaOH, 50 mM NH4 acetate, 50 mM NH4...",298.15,10.1016/j.celrep.2017.06.025,5VYC,6.0,6.0,Crystal structure of the human 40S ribosomal s...,MQNDAGEFVDLYVPRKCSASNRIIGAKDHASIQMNVAEVDKVTGRF...,Homo sapiens,RPS21,40S ribosomal protein S21,3.0,83,40,43,6,9.12702,0.930233,8.683779,78.795181,1,0
3,5VYC,X-RAY DIFFRACTION,4.05,69.61,"VAPOR DIFFUSION, SITTING DROP",8.1,"50 mM HEPES-NaOH, 50 mM NH4 acetate, 50 mM NH4...",298.15,10.1016/j.celrep.2017.06.025,5VYC,6.0,6.0,Crystal structure of the human 40S ribosomal s...,MGKCRGLRTARKLRSHRRDQKWHDKQYKKAHLGTALKANPFGGASH...,Homo sapiens,RPS23,40S ribosomal protein S23,4.0,143,71,72,8,15.82299,0.986111,10.501013,83.846154,5,1
4,5VYC,X-RAY DIFFRACTION,4.05,69.61,"VAPOR DIFFUSION, SITTING DROP",8.1,"50 mM HEPES-NaOH, 50 mM NH4 acetate, 50 mM NH4...",298.15,10.1016/j.celrep.2017.06.025,5VYC,6.0,6.0,Crystal structure of the human 40S ribosomal s...,MTKKRRNNGRAKKGRGHVQPIRCTNCARCVPKDKAIKKFVIRNIVE...,Homo sapiens,RPS26,40S ribosomal protein S26,5.0,115,54,61,6,13.03088,0.885246,11.007993,69.565217,3,1


In [5]:
#save dataframe for data exploration
# A single write in mode 'w' truncates and replaces any existing file, so the
# previous exists-check / os.remove / append sequence (whose two branches ran
# the same to_csv call) is unnecessary. The row index is still written (pandas
# default), so the file layout is the same as before.
output_path = 'raw_data/XRD_protein_properties.csv'
df1.to_csv(output_path, mode='w')

### General data set information

In [6]:
# Print the column names, non-null counts and dtypes of the cleaned table.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36287 entries, 0 to 36286
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Entry ID                  36287 non-null  object 
 1   Experimental Method       36287 non-null  object 
 2   Matthews Coefficient      36287 non-null  float64
 3   Percent Solvent Content   36287 non-null  float64
 4   Crystallization Method    36287 non-null  object 
 5   pH                        36287 non-null  float64
 6   Crystal Growth Procedure  36287 non-null  object 
 7   Temp (K)                  36287 non-null  float64
 8   DOI                       36287 non-null  object 
 9   PDB ID                    36287 non-null  object 
 10  Resolution (Å)            36287 non-null  float64
 11  High Resolution Limit     36287 non-null  float64
 12  Structure Title           36287 non-null  object 
 13  Sequence                  36287 non-null  object 
 14  Source

In [7]:
# Preview the first rows of df1 again.
# NOTE(review): no cell between the earlier df1.head() and this one modifies
# df1, so this output repeats that preview -- consider removing one of them.
df1.head()

Unnamed: 0,Entry ID,Experimental Method,Matthews Coefficient,Percent Solvent Content,Crystallization Method,pH,Crystal Growth Procedure,Temp (K),DOI,PDB ID,Resolution (Å),High Resolution Limit,Structure Title,Sequence,Source Organism,Gene Name,Macromolecule Name,Entity ID,Sequence Length,Hydrophobic Residues,Hydrophilic Residues,Aromatic Residues,Molecular Weight (kDa),Hydro Ratio,Isoelectric Point,Alipathic Index,Charge at pH 7,Cardin-Weintraub Motif
0,5VYC,X-RAY DIFFRACTION,4.05,69.61,"VAPOR DIFFUSION, SITTING DROP",8.1,"50 mM HEPES-NaOH, 50 mM NH4 acetate, 50 mM NH4...",298.15,10.1016/j.celrep.2017.06.025,5VYC,6.0,6.0,Crystal structure of the human 40S ribosomal s...,MPGVTVKDVNQQEFVRALAAFLKKSGKLKVPEWVDTVKLAKHKELA...,Homo sapiens,RPS19,40S ribosomal protein S19,1.0,145,72,73,10,16.07588,0.986301,10.310703,74.0,4,1
1,5VYC,X-RAY DIFFRACTION,4.05,69.61,"VAPOR DIFFUSION, SITTING DROP",8.1,"50 mM HEPES-NaOH, 50 mM NH4 acetate, 50 mM NH4...",298.15,10.1016/j.celrep.2017.06.025,5VYC,6.0,6.0,Crystal structure of the human 40S ribosomal s...,MAFKDTGKTPVEPEVAIHRIRITLTSRNVKSLEKVCADLIRGAKEK...,Homo sapiens,RPS20,40S ribosomal protein S20,2.0,119,54,65,3,13.38824,0.830769,9.948584,91.680672,3,1
2,5VYC,X-RAY DIFFRACTION,4.05,69.61,"VAPOR DIFFUSION, SITTING DROP",8.1,"50 mM HEPES-NaOH, 50 mM NH4 acetate, 50 mM NH4...",298.15,10.1016/j.celrep.2017.06.025,5VYC,6.0,6.0,Crystal structure of the human 40S ribosomal s...,MQNDAGEFVDLYVPRKCSASNRIIGAKDHASIQMNVAEVDKVTGRF...,Homo sapiens,RPS21,40S ribosomal protein S21,3.0,83,40,43,6,9.12702,0.930233,8.683779,78.795181,1,0
3,5VYC,X-RAY DIFFRACTION,4.05,69.61,"VAPOR DIFFUSION, SITTING DROP",8.1,"50 mM HEPES-NaOH, 50 mM NH4 acetate, 50 mM NH4...",298.15,10.1016/j.celrep.2017.06.025,5VYC,6.0,6.0,Crystal structure of the human 40S ribosomal s...,MGKCRGLRTARKLRSHRRDQKWHDKQYKKAHLGTALKANPFGGASH...,Homo sapiens,RPS23,40S ribosomal protein S23,4.0,143,71,72,8,15.82299,0.986111,10.501013,83.846154,5,1
4,5VYC,X-RAY DIFFRACTION,4.05,69.61,"VAPOR DIFFUSION, SITTING DROP",8.1,"50 mM HEPES-NaOH, 50 mM NH4 acetate, 50 mM NH4...",298.15,10.1016/j.celrep.2017.06.025,5VYC,6.0,6.0,Crystal structure of the human 40S ribosomal s...,MTKKRRNNGRAKKGRGHVQPIRCTNCARCVPKDKAIKKFVIRNIVE...,Homo sapiens,RPS26,40S ribosomal protein S26,5.0,115,54,61,6,13.03088,0.885246,11.007993,69.565217,3,1


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df1.describe()

Unnamed: 0,Matthews Coefficient,Percent Solvent Content,pH,Temp (K),Resolution (Å),High Resolution Limit,Entity ID,Sequence Length,Hydrophobic Residues,Hydrophilic Residues,Aromatic Residues,Molecular Weight (kDa),Hydro Ratio,Isoelectric Point,Alipathic Index,Charge at pH 7,Cardin-Weintraub Motif
count,36287.0,36287.0,36287.0,36287.0,36287.0,36287.0,36287.0,36287.0,36287.0,36287.0,36287.0,36287.0,36287.0,36287.0,36287.0,36287.0,36287.0
mean,2.789026,53.302971,6.81265,290.673621,2.363104,2.363104,1.741119,216.887287,101.686913,114.9561,19.736435,24.302671,0.945482,6.899303,75.356588,6.154876,0.422052
std,0.7595,10.212654,1.21271,6.923847,0.676512,0.676512,2.80848,178.518286,86.831546,92.343492,17.255249,19.967591,0.658701,1.766878,24.26312,6.115764,0.493894
min,0.41,0.36,1.0,100.0,0.75,0.75,1.0,2.0,0.0,1.0,0.0,0.02906,0.0,4.050028,0.0,0.0,0.0
25%,2.3,46.42,6.0,291.0,1.9,1.9,1.0,100.0,44.0,54.0,7.0,11.33106,0.747368,5.504936,66.666667,1.0,0.0
50%,2.61,52.95,7.0,293.0,2.3,2.3,1.0,207.0,90.0,112.0,18.0,22.87227,0.865285,6.41436,77.876448,4.0,0.0
75%,3.06,59.87,7.5,294.0,2.75,2.75,2.0,291.0,138.0,152.5,27.0,32.972955,1.0,8.469872,88.474548,9.0,1.0
max,13.89,90.01,11.1,323.0,9.7,9.7,89.0,4128.0,1911.0,2036.0,352.0,447.94464,26.0,11.999968,260.0,86.0,1.0
