<a href="https://colab.research.google.com/github/tcardlab/optimus_bind_sample/blob/develop/3_1_TJC_Confirm_MutDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Due to an error in the original Skempi_import, empty temperatures did not default to 298 K. After I fixed it in 3.0, the output's ddG median calculation changed. I wish to confirm that this change is correct by manual reproduction. This issue has two facets: values changing (confirmed correct by *Manual Reproduction*) and values affecting groups and duplicates (explained in *Temp in drop, not in Groupby*).

#Import Dataset

In [45]:
import pandas as pd
import numpy as np
import re

# Download location of the SKEMPI v2 database (a semicolon-separated CSV).
csv_link = 'https://life.bsc.es/pid/skempi2/database/download/skempi_v2.csv'
# IPython shell escape: fetch the file and store it under a fixed local name.
!wget $csv_link -O skempi_v2.0.csv #-O to rename
# Raw, unprocessed table bound to the module-level name `skempi`.
skempi = pd.read_csv('skempi_v2.0.csv', sep=';')

--2019-09-11 04:10:12--  https://life.bsc.es/pid/skempi2/database/download/skempi_v2.csv
Resolving life.bsc.es (life.bsc.es)... 84.88.52.107
Connecting to life.bsc.es (life.bsc.es)|84.88.52.107|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1602208 (1.5M) [text/csv]
Saving to: ‘skempi_v2.0.csv’


2019-09-11 04:10:16 (445 KB/s) - ‘skempi_v2.0.csv’ saved [1602208/1602208]



#Original 

In [90]:
def SKEMPItoPandas(SKEMPI_loc):
    '''Original SKEMPI import, kept unchanged as the comparison baseline.

    Reads the semicolon-separated CSV at ``SKEMPI_loc``, computes ``dgWT``,
    ``dgMut`` and ``ddG``, stores the per-``MutKey`` median of ``ddG`` in
    ``ddgMedian``, drops rows duplicated on (``MutKey``, ``Temperature``),
    flags cross-chain mutations and returns only the rows whose
    ``CrossChain`` flag is False.

    Known limitation (the reason this notebook exists): blank temperatures
    are NOT replaced by 298 here, so ``ddG`` is NaN for those rows.

    Side effects: disables pandas' chained-assignment warning globally and
    prints a debug slice plus a summary line.
    '''
    #fix this
    # Silences SettingWithCopyWarning for the whole session, not just here.
    pd.options.mode.chained_assignment = None  # default='warn'
    
    #Constants
    R=1.9872036e-3 #Ideal Gas Constant in kcal
    
    SKEMPI_df = pd.read_csv(SKEMPI_loc,sep=';')
   
    #Convert non numeric temperature comments to numeric values. Default is 298 K 
    # NOTE: the `or 298` fallback can never apply. A successful `\d+` match
    # always yields a non-empty (truthy) string, and when nothing matches
    # `re.search` returns None, so `.group(0)` raises AttributeError.
    ConvertTemp = lambda x: int(re.search(r'\d+', x).group(0) or 298)
    # NOTE: blank temperatures are not selected by this mask (the notebook's
    # comparison output shows them still NaN afterwards), presumably because
    # `.str.isnumeric()` yields NaN for missing entries and `NaN == 0` is False.
    BadTemps=SKEMPI_df.Temperature.str.isnumeric()==0
    # Chained assignment; depends on pandas writing through to SKEMPI_df.
    SKEMPI_df['Temperature'].loc[BadTemps]=SKEMPI_df['Temperature'].loc[BadTemps].map(ConvertTemp)
    SKEMPI_df['Temperature']=pd.to_numeric(SKEMPI_df['Temperature'], errors='coerce')
    # NOTE: this prints the module-level `skempi` frame (the raw CSV), not
    # the `SKEMPI_df` being processed in this function.
    print(skempi[6665:6669][['#Pdb','Mutation(s)_cleaned', 'Temperature']])
    
    #Drop missing values
    SKEMPI_df.dropna(subset=['Affinity_wt_parsed'], inplace=True)
    SKEMPI_df.dropna(subset=['Affinity_mut_parsed'], inplace=True)
    
    #Calculate free energies
    # A NaN temperature propagates to NaN dgWT/dgMut/ddG for that row.
    SKEMPI_df['dgWT']=-R*SKEMPI_df['Temperature']*np.log(SKEMPI_df['Affinity_wt_parsed'])
    SKEMPI_df['dgMut']=-R*SKEMPI_df['Temperature']*np.log(SKEMPI_df['Affinity_mut_parsed'])
    SKEMPI_df['ddG']=SKEMPI_df['dgWT']-SKEMPI_df['dgMut']
    
    #Create a key for unique mutations based on PDB id and 'Mutation(s)_PDB'
    SKEMPI_df['MutKey']=SKEMPI_df['#Pdb']+'_'+SKEMPI_df['Mutation(s)_PDB']
    #Store the group median of ddG for repeated measurements of the same mutation
    #May consider grouping by experimental method as well            
    # The median is grouped by MutKey only, while duplicates are dropped on
    # (MutKey, Temperature): rows of one mutation measured at different
    # temperatures survive and share the same ddgMedian.
    SKEMPI_df['ddgMedian'] = SKEMPI_df.groupby('MutKey')['ddG'].transform('median')        
    SKEMPI_df=SKEMPI_df.drop_duplicates(subset=['MutKey','Temperature'], keep='first', inplace=False)
    
    #Flag multiple mutations in the same protein
    SKEMPI_df['NumMutations']=SKEMPI_df['Mutation(s)_PDB'].str.count(',')+1 
    
    #Extract Chains and remove cross chain mutations. Chain is the second position in the mutation code
    SKEMPI_df['Prot1Chain']=SKEMPI_df['#Pdb'].str.split('_').str[1]
    SKEMPI_df['Prot2Chain']=SKEMPI_df['#Pdb'].str.split('_').str[2]
    SKEMPI_df['MutSplit']=SKEMPI_df['Mutation(s)_PDB'].str.split(',')
    def ChainCheck(df):
        # `df` is a single row (apply with axis=1). Returns True when the
        # mutated chains are not all contained in one partner's chain string.
        if df['NumMutations'] == 1:
            CrossChain= False
            return CrossChain
        else:
            # The partner that contains the first mutated chain is the reference.
            Chain= df['MutSplit'][0][1]
            if Chain in df['Prot1Chain']:
                ChainSet=df['Prot1Chain']
            elif Chain in df['Prot2Chain']:
                ChainSet=df['Prot2Chain']
            # NOTE: if the first chain is in neither partner, ChainSet is
            # never bound and the loop below raises UnboundLocalError.
            for i in range(len(df['MutSplit'])):
                Chain= df['MutSplit'][i][1]
                if Chain in ChainSet:
                    CrossChain= False
                else:
                    CrossChain= True
                    break
        return CrossChain
    SKEMPI_df['CrossChain']=SKEMPI_df.apply(ChainCheck, axis=1)        
    SKEMPI_SingleSided=SKEMPI_df[SKEMPI_df.CrossChain == False]        
    
    # `count()` on '#Pdb' counts the remaining rows, not distinct mutations.
    NumProteins=SKEMPI_SingleSided['#Pdb'].nunique()
    NumMutations=SKEMPI_SingleSided['#Pdb'].count()
    print("There are %s unique single sided mutations in %s proteins" % (NumMutations, NumProteins))             
    return SKEMPI_SingleSided
  
# Baseline result that the later cells compare against the new pipeline.
skempi_og = SKEMPItoPandas('skempi_v2.0.csv')

          #Pdb Mutation(s)_cleaned Temperature
6665  4G0N_A_B               DA38A         NaN
6666  1C1Y_A_B               DA38A         NaN
6667  1LFD_A_B               DB38A         NaN
6668  1LFD_A_B               YB32W         NaN
There are 5454 unique single sided mutations in 343 proteins


#New

In [0]:
class MutantDataset(pd.DataFrame):
    '''<Subclassed pandas DataFrame>
        Given the potential of multiple sources for mutant datasets,
        this class serves to improve clarity, debugging, and reusability.

        Needs repr
    '''
    def __init__(self, data, sep=',', index=None, columns=None, dtype=None,
                 copy=True,):
        '''Initialize subclass from DataFrame-like data or a csv path.

        data : anything ``pd.DataFrame`` accepts, or a ``str`` path that is
               first loaded with ``pd.read_csv(data, sep=sep)``.
        sep  : column separator, only used when ``data`` is a path.
        The remaining arguments are forwarded to ``pd.DataFrame.__init__``.
        '''
        # isinstance (rather than an exact type comparison) also accepts
        # str subclasses as paths.
        if isinstance(data, str):
            data = pd.read_csv(data, sep=sep)
        super().__init__(data=data,
                         index=index,
                         columns=columns,
                         dtype=dtype,
                         copy=copy)

    def Mutations(self, row):
        '''Returns dictionary of mutation identifiers for index label `row`.

        Each comma separated code in 'Mutation(s)_cleaned' is tokenised into
        digit runs and single characters, e.g. 'DA38A' -> D, A, 38, A.
        Every dictionary value is a tuple with one entry per mutation.
        '''
        keys = ['initAA', 'chain', 'loc', 'mutAA']  # code key
        mut_codes = self.loc[row, 'Mutation(s)_cleaned'].split(',')
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        # NOTE(review): codes that tokenise into more than four pieces would
        # shift the fields, since only the first four tokens are kept.
        tokens = [re.findall(r'(\d+|.)', mut) for mut in mut_codes]
        # Transpose: one tuple per field instead of one list per mutation.
        unzip_code = zip(*tokens)
        mut_dct = dict(zip(keys, unzip_code))
        return mut_dct

    def to_numeric(self, keys):
        '''Converts the column(s) `keys` (single key or list) to numeric.

        Values that cannot be parsed become NaN. The conversion is written
        back to the frame and the converted column(s) are returned.
        '''
        self[keys] = self[keys].apply(pd.to_numeric, errors='coerce')
        return self[keys]

    def gibbsEq(self, Kd_key, tmp_key='Temperature'):
        '''Returns -R * Temp * ln(K) for the columns `Kd_key` and `tmp_key`.

        NOTE(review): if `Kd_key` holds dissociation constants (as the
        parameter name suggests) this is the negative of the binding free
        energy; solve_ddG subtracts in the matching order, so the sign
        cancels in 'ddG'.
        '''
        R = 1.9872036e-3  # Ideal Gas Constant in kcal
        ΔG = -R * self[tmp_key] * np.log(self[Kd_key])  # log is ln in np
        return ΔG

    def solve_ddG(self, wild, mutant, tmp_key='Temperature'):
        '''Adds the columns 'dgWT', 'dgMut' and 'ddG' and returns self.

        ddG is the change in affinity upon mutation. With gibbsEq returning
        -R*T*ln(K), the stored value is
              ddG = dgWT - dgMut = R * T * ln(K_mutant / K_wild)
        '''
        self['dgWT'] = self.gibbsEq(wild, tmp_key)
        self['dgMut'] = self.gibbsEq(mutant, tmp_key)
        self['ddG'] = self['dgWT']-self['dgMut']
        return self

    def _ChainCheck(self, df):
        '''Row-wise helper for find_cross_chains; `df` is a single row.

        Uses set differences to decide whether the mutated chains all
        belong to one protein. Returns True for a cross chain mutation.
        '''
        # The chain id is the second character of every mutation code.
        mutated_chains = set(i[1] for i in df['MutSplit'])
        prot1, prot2 = set(df['Prot1Chain']), set(df['Prot2Chain'])
        if df['NumMutations'] == 1 or len(mutated_chains) == 1:
            # A single mutated chain lies on one protein, not cross chain
            return False
        elif bool(mutated_chains-prot1) != bool(mutated_chains-prot2):
            # given bool normalization this functions as xor:
            # exactly one difference is empty, so every mutated chain
            # belongs to that one protein -> single sided, not cross chain
            return False
        else:
            # Chains remain after subtracting either protein (or after
            # neither): mutations are not confined to one protein
            return True

    def find_cross_chains(self):
        '''Checks if mutations occur on more than one protein.

        Requires the columns '#Pdb', 'MutSplit' and 'NumMutations'.
        Adds 'Prot1Chain' and 'Prot2Chain' (second and third '_' separated
        fields of '#Pdb') and returns a boolean Series, True = cross chain.
        '''
        pdb_fields = self['#Pdb'].str.split('_')
        self['Prot1Chain'] = pdb_fields.str[1]
        self['Prot2Chain'] = pdb_fields.str[2]
        crossChain = self.apply(self._ChainCheck, axis=1)
        return crossChain

    @property
    def _constructor(self):
        # Makes pandas operations return MutantDataset instead of DataFrame.
        return MutantDataset  # Class Name


def Clean_Skempi(path, verbose=True):
    '''
    Purpose:
        1. Loads SKEMPI CSV file.
        2. Calculates ddG
        3. For multiple measurements, stores the median ddG in 'ddgMedian'
           and drops rows duplicated on (pdb, mutation, temperature)
        4. Eliminates entries with mutations on both sides of the interface
    Input:
        path    : Location of SKEMPI CSV file (semicolon separated)
        verbose : When True (default) print rows 6665-6668 after each major
                  step; these are the rows with blank temperatures.
    Output:
        SKEMPI_SingleSided : MutantDataset(pd.DataFrame)
    Note:
        Content and order subject to change with additional datasets.
        It is foreseeable that some steps may occur post combination.

        The median is grouped by (pdb, mutation) only, whereas duplicates
        are dropped on (pdb, mutation, temperature). Rows of one mutation
        measured at different temperatures therefore all survive and share
        the same 'ddgMedian'.
    '''
    preview_cols = ['#Pdb', 'Mutation(s)_cleaned', 'Temperature']

    def preview(frame):
        # Debug aid: positional slice of the blank-temperature rows.
        if verbose:
            print(frame[6665:6669][preview_cols])

    # Initialize class
    skempi = MutantDataset(path, sep=';')

    # Convert 'Temperature' comments/str's to numeric values. Default is 298
    # astype(str) keeps the .str accessor usable if the column was parsed as
    # numeric; expand=False returns a Series rather than a 1-column frame.
    temps = skempi['Temperature'].astype(str).str.extract(r'(\d+)',
                                                          expand=False)
    # Assign the filled result back instead of calling fillna(inplace=True)
    # on a column selection, which does not update the frame under pandas
    # Copy-on-Write.
    skempi['Temperature'] = pd.to_numeric(temps, errors='coerce').fillna(298)
    preview(skempi)  # 6665-6668 blank

    # Calculate free energies
    dropna_lst = ['Affinity_wt_parsed', 'Affinity_mut_parsed']
    skempi.dropna(subset=dropna_lst, inplace=True)
    skempi = skempi.solve_ddG('Affinity_wt_parsed', 'Affinity_mut_parsed')
    preview(skempi)

    # Median and duplicate ddG/tmp values
    group_keys = ['#Pdb', 'Mutation(s)_cleaned']
    skempi['ddgMedian'] = skempi.groupby(group_keys)['ddG'].transform('median')
    skempi.drop_duplicates(subset=[*group_keys, 'Temperature'],
                           keep='first', inplace=True)
    preview(skempi)

    # Flag multiple mutations in the same protein
    skempi['MutSplit'] = skempi['Mutation(s)_cleaned'].str.split(',')
    skempi['NumMutations'] = skempi['MutSplit'].apply(len)

    # Extract Chains and remove cross chain mutations.
    skempi['CrossChain'] = skempi.find_cross_chains()
    SKEMPI_SingleSided = skempi[~skempi['CrossChain'].astype(bool)]
    return SKEMPI_SingleSided

In [92]:
# Run the new pipeline and report how many rows / distinct complexes remain.
skempi_final = Clean_Skempi('skempi_v2.0.csv')
NumProteins = skempi_final['#Pdb'].nunique()
NumMutations = skempi_final['#Pdb'].count()
# Trailing space on the first literal: adjacent f-strings are concatenated
# verbatim, which previously printed "single sidedmutations".
print(f'There are {NumMutations} unique single sided '
      f'mutations in {NumProteins} proteins')

          #Pdb Mutation(s)_cleaned  Temperature
6665  4G0N_A_B               DA38A        298.0
6666  1C1Y_A_B               DA38A        298.0
6667  1LFD_A_B               DB38A        298.0
6668  1LFD_A_B               YB32W        298.0
          #Pdb Mutation(s)_cleaned  Temperature
6947  1KBH_A_B      YB1172W,LA903A        298.0
6948  1KBH_A_B      YB1172W,LA909A        298.0
6949  1KBH_A_B      YB1172W,LA910A        298.0
6950  1KBH_A_B      YB1172W,AA915G        298.0
There are 5454 unique single sidedmutations in 343 proteins


#comparison

**key = 'ddG'**

We see New == OG except for the four blank temperatures that were to be fixed.

This is the expected output and ensures equivalence before the differentiating step.
<details>
  <summary> See supporting data</summary>

6665
- 4G0N_A_B  –  DA38A
- nan | 2.534409601550072 

6666
- 1C1Y_A_B  –  DA38A
- nan | 2.619152014020517

6667
- 1LFD_A_B  –  DB38A
- nan | 2.7271204101180446

6668
- 1LFD_A_B  –  YB32W
- nan | -0.21121814832220487
</details>
<br>

**key = 'ddgMedian'**

We see that the calculations involving those four rows have affected the median calculation of the groups that include the new values.
<details>
  <summary> See supporting data</summary>

6472
- 1LFD_A_B  –  DB38A
- 310.0  |  **310.0** *(same ddG calc, separated by temp)*
- 3.8958503047001924 | 3.3114853574091185
  
6493
- 4G0N_A_B  –  DA38A
- 308.0  |  **308.0** *(same ddG calc, separated by temp)*
- 3.2996884744981614 | 2.9170490380241167
  
6665
- 4G0N_A_B  –  DA38A 
- nan  |  **298.0**
- 3.2996884744981614 | 2.9170490380241167
  
6666
- 1C1Y_A_B  –  DA38A
- nan  |  298.0
- nan | 2.619152014020517
  
6667
- 1LFD_A_B  –  DB38A 
- nan  |  **298.0**
- 3.8958503047001924 | 3.3114853574091185
  
6668
- 1LFD_A_B  –  YB32W
- nan  |  298.0
- nan | -0.21121814832220487
  
False
</details>

In [0]:
# Row-by-row comparison of one column between the original and the new
# pipeline output. Rows are matched by index label; a row missing from
# either result is skipped.
skempi = skempi_final
key = 'ddgMedian' #'ddG' #'ddgMedian'
MISSING = ''  # sentinel for "index label absent from this result"
og_col = skempi_og[key]
new_col = skempi_final[key]
for i in skempi.index.tolist():
    # Only a missing label is tolerated; any other error should surface
    # instead of being swallowed by a bare except.
    try:
        og = og_col[i]
    except KeyError:
        og = MISSING
    try:
        new = new_col[i]
    except KeyError:
        new = MISSING
    if og == MISSING or new == MISSING:
        continue
    # NaN != NaN is always True, so two missing values are not a difference.
    if pd.isna(og) and pd.isna(new):
        continue
    if og != new:
        print(i)
        print('\t', skempi['#Pdb'][i], '  ',skempi['Mutation(s)_cleaned'][i])
        ##Indices match.
        print('\t', skempi_og['Temperature'][i], '  ',skempi['Temperature'][i])
        print('\t', og, new)

skempi_og[key].equals(skempi_final[key])

#Manual Recreation(Success)

I do not understand why there are two duplicates in both outputs... 

It is necessary to gain insight into this calculation to ensure the validity of my change and the dataset as a whole.

NVM, figured it out. See section below, pretty simple.



In [0]:
# Re-run the new pipeline so this section starts from a fresh result.
skempi = Clean_Skempi('skempi_v2.0.csv')

# (pdb, mutation) pairs of the four rows printed as 6665-6668, i.e. the
# rows whose Temperature was blank in the CSV.
mutFix=[('4G0N_A_B','DA38A'),
        ('1C1Y_A_B','DA38A'),
        ('1LFD_A_B','DB38A'),
        ('1LFD_A_B','YB32W')]

In [135]:
from collections import defaultdict
from math import log
from statistics import median

# Independent re-computation of the median ddG for the fixed mutations.
# It starts from the raw CSV rather than from the pipeline output: the
# pipeline rows are already de-duplicated and carry the finished
# 'ddgMedian', so taking their median would match trivially and prove
# nothing.
GAS_CONSTANT = 1.9872036e-3  # same value the pipeline uses
affinity_cols = ['Affinity_wt_parsed', 'Affinity_mut_parsed']

# Distinct pdb ids of the mutations being checked.
pdbs = {pdb for pdb, _ in mutFix}

# Isolate relevant pdbs and, like the pipeline, drop rows without affinities.
raw = pd.read_csv('skempi_v2.0.csv', sep=';')
raw = raw[raw['#Pdb'].isin(pdbs)].dropna(subset=affinity_cols)

# Group by (pdb, mutation): one (ddG, temperature) pair per measurement.
grouped = defaultdict(list)
for _, row in raw.iterrows():
    # First run of digits in the temperature entry; blank -> 298 K.
    digits = re.search(r'\d+', str(row['Temperature']))
    temp = float(digits.group(0)) if digits else 298.0
    # Same arithmetic and order as the pipeline: ddG = dgWT - dgMut.
    dg_wt = -GAS_CONSTANT * temp * log(row['Affinity_wt_parsed'])
    dg_mut = -GAS_CONSTANT * temp * log(row['Affinity_mut_parsed'])
    grouped[(row['#Pdb'], row['Mutation(s)_cleaned'])].append(
        (dg_wt - dg_mut, temp))

# Calc median across all temperatures, collapse temperatures with a set.
output = {
    mut_key: (median(ddg for ddg, _ in pairs), {tmp for _, tmp in pairs})
    for mut_key, pairs in grouped.items()
}

# find
for k in mutFix:
    print(k, output[k])
  

('4G0N_A_B', 'DA38A') (2.9170490380241167, {298.0, 308.0})
('1C1Y_A_B', 'DA38A') (2.619152014020517, {298.0})
('1LFD_A_B', 'DB38A') (3.3114853574091185, {298.0, 310.0})
('1LFD_A_B', 'YB32W') (-0.21121814832220487, {298.0})


**Coolio, outputs match!**

This assumes the ddgMedian is supposed to be calculated across different temperatures, as is the case in both OG and New.

Otherwise, add 'Temperature' to the key in the group-by step and make minor tweaks.

#Temp in drop, not in Groupby (Problem Identified)

'Temperature' is in drop_duplicates but not in the groupby. 
I want to verify that ddgMedian is supposed to be calculated across different temperatures.

Index| Mutkey,                      [Temp,   ddgMedian]

–––––––––––––––––––––––––––––––––––––––––––––

6472| 1LFD_A_B_DB38A,  [310.0 , 3.3114853574091185]

6667| 1LFD_A_B_DB38A,  [298.0 , 3.3114853574091185]

–––––––––––––––––––––––––––––––––––––––––––––

6493| 4G0N_A_B_DA38A,  [308.0, 2.9170490380241167]

6665| 4G0N_A_B_DA38A,  [298.0, 2.9170490380241167]

*Result: shared 'ddgMedian' calculation registered to different 'Temps'*



---


6472 & 6493 appear as OG != New. 

But that doesn't explain their existence in the first place...

The grouping for the median was done on \[pdb, mutations]. 

However, deleting duplicates includes temperature.

The result is duplicate ddgs for different temp registers. 

In [100]:
# Same row-by-row comparison as before, repeated here to inspect the
# temperatures of the rows whose value differs between the two results.
key = 'ddgMedian' #'ddG' #'ddgMedian'
MISSING = ''  # sentinel for "index label absent from this result"
og_col = skempi_og[key]
new_col = skempi_final[key]
for i in skempi.index.tolist():
    # Only a missing label is tolerated; any other error should surface
    # instead of being swallowed by a bare except.
    try:
        og = og_col[i]
    except KeyError:
        og = MISSING
    try:
        new = new_col[i]
    except KeyError:
        new = MISSING
    if og == MISSING or new == MISSING:
        continue
    # NaN != NaN is always True, so two missing values are not a difference.
    if pd.isna(og) and pd.isna(new):
        continue
    if og != new:
        print(i)
        print('\t', skempi['#Pdb'][i], '  ',skempi['Mutation(s)_cleaned'][i])
        ##see temps
        print('\t', skempi_og['Temperature'][i], '  ',skempi['Temperature'][i])
        print('\t', og, new)

skempi_og[key].equals(skempi_final[key])

6472
	 1LFD_A_B    DB38A
	 310.0    310.0
	 3.8958503047001924 3.3114853574091185
6493
	 4G0N_A_B    DA38A
	 308.0    308.0
	 3.2996884744981614 2.9170490380241167
6665
	 4G0N_A_B    DA38A
	 nan    298.0
	 3.2996884744981614 2.9170490380241167
6666
	 1C1Y_A_B    DA38A
	 nan    298.0
	 nan 2.619152014020517
6667
	 1LFD_A_B    DB38A
	 nan    298.0
	 3.8958503047001924 3.3114853574091185
6668
	 1LFD_A_B    YB32W
	 nan    298.0
	 nan -0.21121814832220487


False

In [104]:
def t(x, y):
    '''Return x + y (scratch helper).'''
    return x + y


t(3, 9)

12