# SKEMPI 2.0 benchmark: data filtraton and processing

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import re
import os
import shutil

In [2]:
# Load dataset

skempi2 = pd.read_csv('/groups/sbinlab/panf/SKEMPIv2/skempi_v2.csv', sep=';') # 7085 rows

In [3]:
origianl_skempi_len = len(skempi2)

In [4]:
# Get a list of uniqie PDB IDs in the dataset and write it to a file to use with PDB batch_download.sh

pdb_list = skempi2['#Pdb'].str.split("_").str[0].unique().tolist()
print(f"{len(pdb_list)} unique PDB IDs") # 345
# with open('pdb_list.txt', 'w') as f:
#     for pdb in pdb_list:
#         f.write(f"{pdb},")

345 unique PDB IDs


In [5]:
# Get a separate PDB ID column

skempi2['PDB_ID'] = skempi2['#Pdb'].str.split('_').str[0]

## Step 1: Dropping and counting duplicates

In [6]:
# Possibly just a mistake? I checked row by row, field by field, and didn't see anything.

print(f"{len(skempi2.loc[skempi2.duplicated()])} entries are duplicated. {len(skempi2.loc[skempi2.duplicated()].loc[skempi2['SKEMPI version']==2])} of them are new for SKEMPI 2.0")
print("Dropping the duplicates.")

skempi2.drop_duplicates(inplace=True, ignore_index=True) # 7074 rows (11 duplicated, 10 from SKEMPI 2.0)

print(f"New dataset size: {len(skempi2)}")

11 entries are duplicated. 10 of them are new for SKEMPI 2.0
Dropping the duplicates.
New dataset size: 7074


## Step 2: Counting cases with unprecise binding affinity values

In [7]:
# Counts for cases with binding affinity values defined as smaller/larger then a certain detection threshold

out_of_range_total = skempi2.loc[(skempi2['Affinity_mut (M)'].str.startswith(('<',
            '>')))|(skempi2['Affinity_wt (M)'].str.startswith(('<', '>')))]

out_of_range_wt_mut = skempi2.loc[(skempi2['Affinity_mut (M)'].str.startswith(('<',
            '>')))&(skempi2['Affinity_wt (M)'].str.startswith(('<', '>')))]

out_of_range_wt = skempi2.loc[(skempi2['Affinity_wt (M)'].str.startswith(('<', '>')))&(~skempi2['Affinity_mut (M)'].str.startswith(('<', '>')))]

out_of_range_mut = skempi2.loc[(skempi2['Affinity_mut (M)'].str.startswith(('<', '>')))&(~skempi2['Affinity_wt (M)'].str.startswith(('<', '>')))]

print(f"Total cases with wild-type and/or mutant complex binding affinity out of detection range: ")
print(f"{len(out_of_range_total)} ({round(len(out_of_range_total)/origianl_skempi_len*100, 2)}% of the dataset), {len(out_of_range_total['PDB_ID'].unique())} unique PDB structures \n") # 186, 2.63%, 47 

print(f"Total cases with binding affinities of BOTH WILD TYPE AND MUTANT complexes out of detection range: ")
print(f"{len(out_of_range_wt_mut)}, {len(out_of_range_wt_mut['PDB_ID'].unique())} unique PDB structures ({', '.join(out_of_range_wt_mut['PDB_ID'].unique().tolist())}), {len(out_of_range_wt_mut['Method'].unique())} uniqie methods ({', '.join(out_of_range_wt_mut['Method'].unique())}) \n") # 14, 2 (4I77, 1CZ8), SPR

print(f"Total cases with binding affinities of WILD TYPE complexes out of detection range: ")
print(f"{len(out_of_range_wt)}, {len(out_of_range_wt['PDB_ID'].unique())} unique PDB structures ({', '.join(out_of_range_wt['PDB_ID'].unique().tolist())}), {len(out_of_range_wt['Method'].unique())} uniqie methods ({', '.join(out_of_range_wt['Method'].unique())}) \n") # 27, 2 (4I77, 1CZ8), SPR

print(f"Total cases with binding affinities of MUTANT complexes out of detection range: ")
print(f"{len(out_of_range_mut)}, {len(out_of_range_mut['PDB_ID'].unique())} unique PDB structures ({', '.join(out_of_range_mut['PDB_ID'].unique().tolist())}), {len(out_of_range_mut['Method'].unique())} uniqie methods ({', '.join(out_of_range_mut['Method'].unique())})") # 145, 45

print("\nThose values are parsed by SEKMPI itself as a threshold value, e. g. '>5E-05' becomes 0.00005")

Total cases with wild-type and/or mutant complex binding affinity out of detection range: 
186 (2.63% of the dataset), 47 unique PDB structures 

Total cases with binding affinities of BOTH WILD TYPE AND MUTANT complexes out of detection range: 
14, 2 unique PDB structures (4I77, 1CZ8), 1 uniqie methods (SPR) 

Total cases with binding affinities of WILD TYPE complexes out of detection range: 
27, 2 unique PDB structures (4I77, 1CZ8), 1 uniqie methods (SPR) 

Total cases with binding affinities of MUTANT complexes out of detection range: 
145, 45 unique PDB structures (1SBB, 1JCK, 1AK4, 1AHW, 1VFB, 2WPT, 2JEL, 2OOB, 2A9K, 1REW, 1HE8, 3BN9, 2FTL, 1A22, 1FCC, 3HFM, 1SMF, 1OGA, 2P5E, 2PYE, 3QDJ, 1MI5, 2AK4, 1AO7, 3G6D, 3Q8D, 4B0M, 1BJ1, 1MHP, 1MLC, 3SE3, 3SE4, 4KRL, 4KRO, 4KRP, 4OZG, 4PWX, 2C5D, 4RS1, 1C1Y, 4G0N, 1LFD, 5XCO, 2KSO, 3QIB), 12 uniqie methods (SE, SPR, IASP, FL, SFFL, CSPRIA, ITC, IARA, IAFL, ELISA, SP, KinExA)

Those values are parsed by SEKMPI itself as a threshold value, e

In [8]:
skempi2['no_proper_value'] = False
skempi2.loc[(skempi2['Affinity_mut (M)'].str.startswith(('<',
            '>')))|(skempi2['Affinity_wt (M)'].str.startswith(('<', '>'))), 'no_proper_value'] = True

#### **To Do:** separate "smaller than" from "bigger than" in a meaning full way (e.g. "very strong interaction" vs "very weak")

## Step 2: Calculating binding dG of the wild type and mutant complexes from binding affinities, calculating binding ddG, filtering out and solving ddG_bind is NaN cases

In [9]:
# Using ΔG = R*T*ln(Kd)

skempi2['dG_bind_WT'] =  0.001987204 * skempi2['Temperature'].str.extract('(\d+)'
                        ).values.astype(float).reshape(-1) * np.log(skempi2['Affinity_wt_parsed'].values)
skempi2['dG_bind_MUT'] = 0.001987204 * skempi2['Temperature'].str.extract('(\d+)'
                        ).values.astype(float).reshape(-1) * np.log(skempi2['Affinity_mut_parsed'].values)
skempi2['ddG_bind'] = skempi2['dG_bind_MUT'].values - skempi2['dG_bind_WT'].values

In [10]:
ddG_bind_na = skempi2.loc[skempi2['ddG_bind'].isna()]

print(f"Total cases with binding ddG being NaN: {len(ddG_bind_na)} ({round(len(ddG_bind_na)/origianl_skempi_len*100, 2)}% of the dataset), {len(ddG_bind_na['#Pdb'].str[0:4].unique())} unique PDB structures)")
print(f"From them:")
print(f"{len(skempi2.loc[skempi2['Temperature'].isna()])} do not have a temperature value")
print(f"{len(skempi2.loc[skempi2['Affinity_mut (M)'].isin(['n.b', 'n.b.'])])} are classified as 'no interaction' for mutant")
print(f"{len(skempi2.loc[skempi2['Affinity_wt (M)'].isin(['n.b', 'n.b.'])])} are classified as 'no interaction' for wild type")
print(f"{len(skempi2.loc[skempi2['Affinity_mut (M)']=='unf'])} are classified as 'unfolded mutant'")

print("\nSanity check: ")
print(f"{len(skempi2.loc[skempi2['Temperature'].isna()])} + {len(skempi2.loc[skempi2['Affinity_mut (M)'].isin(['n.b', 'n.b.'])])} + {len(skempi2.loc[skempi2['Affinity_wt (M)'].isin(['n.b', 'n.b.'])])} + {len(skempi2.loc[skempi2['Affinity_mut (M)']=='unf'])} = {len(skempi2.loc[skempi2['Temperature'].isna()]) + len(skempi2.loc[skempi2['Affinity_mut (M)'].isin(['n.b', 'n.b.'])]) + len(skempi2.loc[skempi2['Affinity_wt (M)'].isin(['n.b', 'n.b.'])]) + len(skempi2.loc[skempi2['Affinity_mut (M)']=='unf'])}")

Total cases with binding ddG being NaN: 291 (4.11% of the dataset), 76 unique PDB structures)
From them:
4 do not have a temperature value
278 are classified as 'no interaction' for mutant
2 are classified as 'no interaction' for wild type
7 are classified as 'unfolded mutant'

Sanity check: 
4 + 278 + 2 + 7 = 291


Keeping the cases with no dG values, dropping the cases without temperature values.

In [11]:
# Drop cases without temperature values -- we cannot calculate binding ddGs for that

skempi2 = skempi2.loc[~skempi2['Temperature'].isna()].reset_index() # 7070

In [12]:
# To keep track of cases without a value for binding affinities -- we might assign them new values for comparison with predictors

skempi2.loc[skempi2['Affinity_mut (M)'].isin(['n.b', 'n.b.']), 'no_proper_value'] = True
skempi2.loc[skempi2['Affinity_wt (M)'].isin(['n.b', 'n.b.']), 'no_proper_value'] = True
skempi2.loc[skempi2['Affinity_mut (M)']=='unf', 'no_proper_value'] = True

## Step 3: Filtering out values with disagreeing measurements by multiple methods, calculating the average

#### **To Do:** analysis of errors in dG_mut, dG_wt and ddG

In [13]:
# groups = skempi2[['#Pdb', 'Mutation(s)_PDB', 'dG_bind_MUT', 'dG_bind_WT', 'ddG_bind']].groupby(['#Pdb', 'Mutation(s)_PDB'])
# group_mean = groups.mean().reset_index()
# group_std = groups.std().reset_index()
# group_sizes = groups.size().reset_index()

# group_std.loc[(group_std['dG_bind_MUT']>1)|(group_std['dG_bind_WT']>1)]
# group_std.loc[((group_std['dG_bind_MUT']>1)|(group_std['dG_bind_WT']>1))&(group_std['ddG_bind']<1)]
# group_mean[498:499]
# group_sizes.loc[group_sizes[0]>1]

In [14]:
# Group cases by pdbID_Chains and mutations to get the mean and standard deviation of the same mutations (cases) measured with multiple methods

groups = skempi2[['#Pdb', 'Mutation(s)_PDB','ddG_bind']].groupby(['#Pdb', 'Mutation(s)_PDB'])
group_mean = groups.mean().reset_index()
group_std = groups.std().reset_index()
means_stds = pd.merge(group_mean, group_std, on=['#Pdb', 'Mutation(s)_PDB'], suffixes=('_mean', '_std'))
skempi2_avg = pd.merge(skempi2, means_stds, on=['#Pdb', 'Mutation(s)_PDB'])

# To keep track of cases measured with multiple methods
skempi2_avg['multiple_measurements'] = False
skempi2_avg.loc[~skempi2_avg['ddG_bind_std'].isna(), 'multiple_measurements'] = True

print(f"Total measures with multiple methods: {len(skempi2_avg.loc[~skempi2_avg['ddG_bind_std'].isna()])} ({round(len(skempi2_avg.loc[~skempi2_avg['ddG_bind_std'].isna()])/origianl_skempi_len*100, 2)}% of the dataset), {len(skempi2_avg.loc[~skempi2_avg['ddG_bind_std'].isna()]['PDB_ID'].unique())} unique PDB structures")

print(f"Total cases measured with multiple methods: {len(group_std.loc[~group_std['ddG_bind'].isna()])} ({round(len(group_std.loc[~group_std['ddG_bind'].isna()])/origianl_skempi_len*100, 2)}% of the dataset), {len(group_std.loc[~group_std['ddG_bind'].isna()]['#Pdb'].str[0:4].unique())} unique PDB structures")

# If there is only one measurement, std is 0
skempi2_avg['ddG_bind_std'].fillna(0, inplace=True)

print(f"Number of cases measured with multiple methods with stds > 1: {len(skempi2_avg.loc[skempi2_avg['ddG_bind_std']>1])} ({round(len(skempi2_avg.loc[skempi2_avg['ddG_bind_std']>1])/origianl_skempi_len*100, 2)}% of the dataset), {len(skempi2_avg.loc[skempi2_avg['ddG_bind_std']>1]['PDB_ID'].unique())} unique PDB structures")

# If std is bigger than 1, drop the case
skempi2_avg = skempi2_avg.loc[skempi2_avg['ddG_bind_std']<1]
skempi2_avg.drop_duplicates(subset=['#Pdb', 'Mutation(s)_PDB', 'ddG_bind_mean'], inplace=True, ignore_index=True) # 6171

print(f"Final dataset size: {len(skempi2_avg)} cases")

# Add chain identifiers as a separate column -- needed for mut- and resfiles naming

skempi2_avg['LChains'] = skempi2_avg['#Pdb'].str.split("_").str[1]
skempi2_avg['RChains'] = skempi2_avg['#Pdb'].str.split("_").str[2]
skempi2_avg["Chains"] = skempi2_avg["LChains"].astype(str) + skempi2_avg["RChains"].astype(str)

# Add index as a column and form a case ID from the number and the PDB
skempi2_avg['index'] = skempi2_avg.index.tolist()
skempi2_avg['case_ID'] = skempi2_avg['index'].astype('str') + '_' + skempi2_avg['PDB_ID']

Total measures with multiple methods: 1469 (20.73% of the dataset), 67 unique PDB structures
Total cases measured with multiple methods: 596 (8.41% of the dataset), 67 unique PDB structures
Number of cases measured with multiple methods with stds > 1: 53 (0.75% of the dataset), 13 unique PDB structures
Final dataset size: 6171 cases


## Step 4: Restructuring the dataframe and saving the final version

In [15]:
# Reorder and rename the columns

skempi2_avg = skempi2_avg[['index', 'case_ID', 'PDB_ID', '#Pdb', 'Mutation(s)_PDB', 'Mutation(s)_cleaned',
       'iMutation_Location(s)', 'Hold_out_type', 'Hold_out_proteins',
       'Affinity_mut (M)', 'Affinity_mut_parsed', 'Affinity_wt (M)',
       'Affinity_wt_parsed', 'Reference', 'Protein 1', 'Protein 2',
       'Temperature', 'kon_mut (M^(-1)s^(-1))', 'kon_mut_parsed',
       'kon_wt (M^(-1)s^(-1))', 'kon_wt_parsed', 'koff_mut (s^(-1))',
       'koff_mut_parsed', 'koff_wt (s^(-1))', 'koff_wt_parsed',
       'dH_mut (kcal mol^(-1))', 'dH_wt (kcal mol^(-1))',
       'dS_mut (cal mol^(-1) K^(-1))', 'dS_wt (cal mol^(-1) K^(-1))', 'Notes',
       'Method', 'SKEMPI version', 'no_proper_value', 'multiple_measurements', 'dG_bind_WT',
       'dG_bind_MUT', 'ddG_bind_mean', 'LChains', 'RChains', 'Chains']]

skempi2_avg.rename(columns={"ddG_bind_mean": "ddG_bind"}, inplace=True)

In [16]:
# Save the new filtered dataframe

skempi2_avg.to_csv('/groups/sbinlab/panf/SKEMPIv2_benchmark/skempi2_final.csv', index=False)

## Bits and pieces (ignore)

In [17]:
dif_chains = skempi2_avg[['PDB_ID', 'Chains']].groupby(['PDB_ID', 'Chains']).size().reset_index()
s = dif_chains.groupby(['PDB_ID']).size().reset_index()
s.loc[s[0]>1]
dif_chains.loc[dif_chains['PDB_ID'].isin(['2C5D', '3SE3', '3SE4'])]

Unnamed: 0,PDB_ID,Chains,0
145,2C5D,ABCD,19
146,2C5D,AC,16
262,3SE3,BA,47
263,3SE3,BC,2
264,3SE4,BA,18
265,3SE4,BC,17


In [18]:
# skempi2 = pd.read_csv('/groups/sbinlab/panf/SKEMPIv2_benchmark/skempi_v2.csv', sep=';')
# skempi2['PDB_ID'] = skempi2['#Pdb'].str.split('_').str[0]
# skempi2['index'] = skempi2.index.tolist()
# skempi2['case_ID'] = skempi2['index'].astype('str') + '_' + skempi2['PDB_ID']
#skempi2.to_csv('/groups/sbinlab/panf/SKEMPIv2/skempi2_mesure_mean.csv', index=False)

In [19]:
nick_pdbs = ['1JTG', '5E9D', '3SZK', '3BN9', '2J0T', '1KTZ', '3SGB', '3MZG', '1DAN', '1VFB', '1OGA']

In [20]:
skempi2.loc[(skempi2['#Pdb'].str.split("_").str[0].isin(nick_pdbs))&(skempi2['ddG_bind'].isna())]

Unnamed: 0,index,#Pdb,Mutation(s)_PDB,Mutation(s)_cleaned,iMutation_Location(s),Hold_out_type,Hold_out_proteins,Affinity_mut (M),Affinity_mut_parsed,Affinity_wt (M),...,dS_mut (cal mol^(-1) K^(-1)),dS_wt (cal mol^(-1) K^(-1)),Notes,Method,SKEMPI version,PDB_ID,no_proper_value,dG_bind_WT,dG_bind_MUT,ddG_bind
362,362,1VFB_AB_C,YB101A,YB101A,COR,AB/AG,AB/AG,n.b,,2e-08,...,,,,SPR,2,1VFB,True,-10.498011,,
631,631,2J0T_A_D,"TD2R,VD4I","TD2R,VD4I","COR,RIM",,2J0T_A_D,n.b,,4e-10,...,,,,IAFL,2,2J0T,True,-12.81466,,
2129,2129,1VFB_AB_C,YB101A,YB101A,COR,AB/AG,AB/AG,n.b,,1.25e-08,...,,,,SPR,2,1VFB,True,-10.776341,,


In [24]:
nick_pdbs_df = skempi2_avg.loc[skempi2_avg['PDB_ID'].isin(nick_pdbs)]

In [30]:
nick_pdbs_df['#Pdb'].unique()

array(['1VFB_AB_C', '2J0T_A_D', '1KTZ_A_B', '1JTG_A_B', '3BN9_B_CD',
       '3MZG_A_B', '1DAN_HL_UT', '3SGB_E_I', '1OGA_ABC_DE', '5E9D_AB_CDE',
       '3SZK_AB_C'], dtype=object)

In [21]:
# FOR FUTUTRE ANALYSIS: No binding detected, I would guess predictions should be >15 kcal/mol
# skempi2_filt.loc[skempi2_filt['Affinity_mut (M)']=='n.b']
# skempi2.loc[skempi2['Affinity_mut (M)'] == 'n.b', 'dG_bind_MUT'] = 100