## Protein Property calculations and data cleaning

In [1]:
import pandas as pd
import numpy as np
import os
from ProteinParam import SequenceAnalysis, ProteinProperties, ProteinMotifs

# Helper objects from the local ProteinParam module, created once and reused by the cells below.
sa = SequenceAnalysis()   # used below for sequence length and residue-class columns
pp = ProteinProperties()  # used below for isoelectric point, alipathic index and charge at a given pH
pm = ProteinMotifs()      # used below for the Cardin-Weintraub motif flag

### Load initially collated and filtered data set

In [2]:
# Read the combined XRD table, drop the 'Unnamed: 0' column
# (presumably an index written by an earlier to_csv -- confirm) and keep
# only the first row seen for each distinct sequence.
df1 = (
    pd.read_csv('raw_data/XRD_combined.csv', encoding='utf8')
    .drop(columns=['Unnamed: 0'])
    .drop_duplicates(subset=['Sequence'])
)

# Columns that must be numeric; values that cannot be parsed become NaN
# (errors='coerce') instead of raising.
float_columns = [
    'Matthews Coefficient',
    'Percent Solvent Content',
    'pH',
    'Temp (K)',
    'Molecular Weight per Deposited Model',
    'Resolution (Å)',
    'High Resolution Limit',
    'Entity ID',
]

for column_name in float_columns:
    df1[column_name] = pd.to_numeric(df1[column_name], errors='coerce')

### Calculate additional properties

In [3]:
# Every derived column below is computed from the amino-acid sequence string.
sequences = df1['Sequence']

# basic sequence analysis parameters (column name -> helper applied per sequence)
basic_features = {
    'Sequence Length': sa.seq_length,
    'Hydrophobic Residues': sa.hydrophobic_res,
    'Hydrophilic Residues': sa.hydrophilic_res,
    'Aromatic Residues': sa.aromatic_res,
}
for feature_name, feature_func in basic_features.items():
    df1[feature_name] = sequences.apply(feature_func)

# more advanced protein parameters
# The ratio is +/-inf (or NaN for 0/0) when a sequence has no hydrophilic residues;
# those rows are removed by the clean-up step at the end of this cell.
df1['Hydro Ratio'] = df1['Hydrophobic Residues'].div(df1['Hydrophilic Residues'])
df1['Isoelectric Point'] = sequences.apply(pp.isoelectric_point)
df1['Alipathic Index'] = sequences.apply(pp.alipathic_index)
df1['Charge at pH 7'] = sequences.apply(pp.charge_ph, ph=7.0)

# check for specific ligand binding motifs (no-0, yes-1)
df1['Cardin-Weintraub Motif'] = sequences.apply(pm.cardin_weintraub)

# Clean-up: turn infinities into NaN, drop every row that has a missing value
# in ANY column, then renumber the rows from 0.
df1 = (
    df1
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    .reset_index(drop=True)
)

In [4]:
# save dataframe for data exploration
# to_csv opens the target in write mode ('w') by default, which replaces any
# existing file. The previous exists-check / os.remove / append-mode sequence
# (with two identical branches) produced the same file, so it is collapsed
# into a single call.
# The row index is still written as the first, unnamed column (index=True is
# the default), exactly as before.
output_path = 'raw_data/XRD_protein_properties.csv'
df1.to_csv(output_path)

### General data set information

In [5]:
# Print the column list with non-null counts and dtypes for the cleaned frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36288 entries, 0 to 36287
Data columns (total 28 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Entry ID                              36288 non-null  object 
 1   Experimental Method                   36288 non-null  object 
 2   Matthews Coefficient                  36288 non-null  float64
 3   Percent Solvent Content               36288 non-null  float64
 4   Crystallization Method                36288 non-null  object 
 5   pH                                    36288 non-null  float64
 6   Crystal Growth Procedure              36288 non-null  object 
 7   Temp (K)                              36288 non-null  float64
 8   DOI                                   36288 non-null  object 
 9   PDB ID                                36288 non-null  object 
 10  Molecular Weight per Deposited Model  36288 non-null  float64
 11  Resolution (Å) 

In [6]:
# Show the first rows of the cleaned frame, including the newly derived columns.
df1.head()

Unnamed: 0,Entry ID,Experimental Method,Matthews Coefficient,Percent Solvent Content,Crystallization Method,pH,Crystal Growth Procedure,Temp (K),DOI,PDB ID,...,Entity ID,Sequence Length,Hydrophobic Residues,Hydrophilic Residues,Aromatic Residues,Hydro Ratio,Isoelectric Point,Alipathic Index,Charge at pH 7,Cardin-Weintraub Motif
0,5VYC,X-RAY DIFFRACTION,4.05,69.61,"VAPOR DIFFUSION, SITTING DROP",8.1,"50 mM HEPES-NaOH, 50 mM NH4 acetate, 50 mM NH4...",298.15,10.1016/j.celrep.2017.06.025,5VYC,...,1.0,145,72,73,10,0.986301,10.310703,74.0,4,1
1,5VYC,X-RAY DIFFRACTION,4.05,69.61,"VAPOR DIFFUSION, SITTING DROP",8.1,"50 mM HEPES-NaOH, 50 mM NH4 acetate, 50 mM NH4...",298.15,10.1016/j.celrep.2017.06.025,5VYC,...,2.0,119,54,65,3,0.830769,9.948584,91.680672,3,1
2,5VYC,X-RAY DIFFRACTION,4.05,69.61,"VAPOR DIFFUSION, SITTING DROP",8.1,"50 mM HEPES-NaOH, 50 mM NH4 acetate, 50 mM NH4...",298.15,10.1016/j.celrep.2017.06.025,5VYC,...,3.0,83,40,43,6,0.930233,8.683779,78.795181,1,0
3,5VYC,X-RAY DIFFRACTION,4.05,69.61,"VAPOR DIFFUSION, SITTING DROP",8.1,"50 mM HEPES-NaOH, 50 mM NH4 acetate, 50 mM NH4...",298.15,10.1016/j.celrep.2017.06.025,5VYC,...,4.0,143,71,72,8,0.986111,10.501013,83.846154,5,1
4,5VYC,X-RAY DIFFRACTION,4.05,69.61,"VAPOR DIFFUSION, SITTING DROP",8.1,"50 mM HEPES-NaOH, 50 mM NH4 acetate, 50 mM NH4...",298.15,10.1016/j.celrep.2017.06.025,5VYC,...,5.0,115,54,61,6,0.885246,11.007993,69.565217,3,1


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df1.describe()

Unnamed: 0,Matthews Coefficient,Percent Solvent Content,pH,Temp (K),Molecular Weight per Deposited Model,Resolution (Å),High Resolution Limit,Entity ID,Sequence Length,Hydrophobic Residues,Hydrophilic Residues,Aromatic Residues,Hydro Ratio,Isoelectric Point,Alipathic Index,Charge at pH 7,Cardin-Weintraub Motif
count,36288.0,36288.0,36288.0,36288.0,36288.0,36288.0,36288.0,36288.0,36288.0,36288.0,36288.0,36288.0,36288.0,36288.0,36288.0,36288.0,36288.0
mean,2.789012,53.302758,6.812614,290.673244,114.743761,2.363082,2.363082,1.741127,216.881559,101.68411,114.95296,19.735891,0.945456,6.899265,75.354511,6.154707,0.42204
std,0.759495,10.212594,1.212713,6.924123,409.10205,0.676515,0.676515,2.808442,178.519162,86.831991,92.344157,17.255323,0.658711,1.766868,24.266011,6.115765,0.493892
min,0.41,0.36,1.0,100.0,0.42,0.75,0.75,1.0,2.0,0.0,1.0,0.0,0.0,4.050028,0.0,0.0,0.0
25%,2.3,46.42,6.0,291.0,35.43,1.9,1.9,1.0,100.0,44.0,54.0,7.0,0.747368,5.504936,66.666667,1.0,0.0
50%,2.61,52.945,7.0,293.0,60.185,2.3,2.3,1.0,207.0,90.0,112.0,18.0,0.865285,6.41399,77.876448,4.0,0.0
75%,3.06,59.87,7.5,294.0,106.18,2.75,2.75,2.0,291.0,138.0,152.25,27.0,1.0,8.469744,88.473727,9.0,1.0
max,13.89,90.01,11.1,323.0,7655.38,9.7,9.7,89.0,4128.0,1911.0,2036.0,352.0,26.0,11.999968,260.0,86.0,1.0
