<a href="https://colab.research.google.com/github/tcardlab/optimus_bind_sample/blob/develop/notebooks/4_0_TJC_Cross_Chain_Checker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re

# Fetch the SKEMPI v2.0 CSV with wget (IPython shell escape) and load it.
# The file is semicolon-separated, hence sep=';'.
csv_link = 'https://life.bsc.es/pid/skempi2/database/download/skempi_v2.csv'
!wget $csv_link -O skempi_v2.0.csv #-O to rename
skempi = pd.read_csv('skempi_v2.0.csv', sep=';')

--2019-07-23 06:45:21--  https://life.bsc.es/pid/skempi2/database/download/skempi_v2.csv
Resolving life.bsc.es (life.bsc.es)... 84.88.52.107
Connecting to life.bsc.es (life.bsc.es)|84.88.52.107|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1602208 (1.5M) [text/csv]
Saving to: ‘skempi_v2.0.csv’


2019-07-23 06:45:27 (309 KB/s) - ‘skempi_v2.0.csv’ saved [1602208/1602208]



#Original

In [2]:
def SKEMPItoPandas(SKEMPI_loc):
    '''Original SKEMPI cleaning pipeline, kept as the reference implementation.

    Reads the semicolon-separated CSV at ``SKEMPI_loc`` and, in order:
    converts ``Temperature`` to numbers, drops rows without parsed
    affinities, computes ``dgWT``/``dgMut``/``ddG``, adds the per-mutation
    median ddG (``ddgMedian``), drops duplicate (MutKey, Temperature) rows,
    and flags entries whose mutations are not confined to one protein.
    Prints a one-line summary of the result.

    Returns:
        The rows whose ``CrossChain`` flag is False.

    Side effect:
        Sets ``pd.options.mode.chained_assignment = None`` for the whole
        session, not just for this call.
    '''
    # FIXME: silences pandas' chained-assignment warning globally; the
    # Temperature assignment below uses chained indexing (`[...].loc[...] =`).
    pd.options.mode.chained_assignment = None  # default='warn'
    
    #Constants
    R=1.9872036e-3 #Ideal Gas Constant in kcal (presumably kcal/(mol*K) -- confirm)
    
    SKEMPI_df = pd.read_csv(SKEMPI_loc,sep=';')
   
    #Convert non numeric temperature comments to numeric values by taking the
    #first run of digits in the text.
    # NOTE(review): re.search returns None when the text contains no digits, so
    # .group(0) raises AttributeError before `or 298` is evaluated; a successful
    # match is never empty, so the 298 fallback is never actually used here.
    ConvertTemp = lambda x: int(re.search(r'\d+', x).group(0) or 298)
    # Rows whose Temperature string is not purely digits. Missing values
    # presumably give NaN from isnumeric() and are therefore not selected.
    BadTemps=SKEMPI_df.Temperature.str.isnumeric()==0
    SKEMPI_df['Temperature'].loc[BadTemps]=SKEMPI_df['Temperature'].loc[BadTemps].map(ConvertTemp)
    # errors='coerce' turns anything still unparseable into NaN.
    SKEMPI_df['Temperature']=pd.to_numeric(SKEMPI_df['Temperature'], errors='coerce')
    
    #Drop missing values
    SKEMPI_df.dropna(subset=['Affinity_wt_parsed'], inplace=True)
    SKEMPI_df.dropna(subset=['Affinity_mut_parsed'], inplace=True)
    
    #Calculate free energies: dG = -R * T * ln(affinity)
    SKEMPI_df['dgWT']=-R*SKEMPI_df['Temperature']*np.log(SKEMPI_df['Affinity_wt_parsed'])
    SKEMPI_df['dgMut']=-R*SKEMPI_df['Temperature']*np.log(SKEMPI_df['Affinity_mut_parsed'])
    # Note the sign: ddG is wild type minus mutant.
    SKEMPI_df['ddG']=SKEMPI_df['dgWT']-SKEMPI_df['dgMut']
    
    #Create a key for unique mutations based on PDB and the PDB mutation string
    SKEMPI_df['MutKey']=SKEMPI_df['#Pdb']+'_'+SKEMPI_df['Mutation(s)_PDB']
    #Store the group median of repeated measurements of the same mutation in
    #'ddgMedian' ('ddG' itself is left untouched), then keep only the first row
    #per (MutKey, Temperature) pair.
    #May consider grouping by experimental method as well            
    SKEMPI_df['ddgMedian'] = SKEMPI_df.groupby('MutKey')['ddG'].transform('median')        
    SKEMPI_df=SKEMPI_df.drop_duplicates(subset=['MutKey','Temperature'], keep='first', inplace=False)
    
    #Flag multiple mutations in the same protein (comma-separated codes)
    SKEMPI_df['NumMutations']=SKEMPI_df['Mutation(s)_PDB'].str.count(',')+1 
    
    #Extract Chains and remove cross chain mutations. Chain is the second position in the mutation code
    # '#Pdb' is split on '_'; fields 1 and 2 are the chain letters of each protein.
    SKEMPI_df['Prot1Chain']=SKEMPI_df['#Pdb'].str.split('_').str[1]
    SKEMPI_df['Prot2Chain']=SKEMPI_df['#Pdb'].str.split('_').str[2]
    SKEMPI_df['MutSplit']=SKEMPI_df['Mutation(s)_PDB'].str.split(',')
    def ChainCheck(df):
        '''Return True when a row's mutations are not all on one protein.

        ``df`` is a single row (the function is applied with axis=1).
        '''
        if df['NumMutations'] == 1:
            # A single mutation cannot span two proteins.
            CrossChain= False
            return CrossChain
        else:
            # Pick the protein that owns the first mutation's chain ...
            Chain= df['MutSplit'][0][1]
            # NOTE(review): if that chain is in neither protein's chain string,
            # ChainSet is never assigned and the loop below raises
            # UnboundLocalError.
            if Chain in df['Prot1Chain']:
                ChainSet=df['Prot1Chain']
            elif Chain in df['Prot2Chain']:
                ChainSet=df['Prot2Chain']
            # ... then require every mutation's chain to belong to that protein.
            for i in range(len(df['MutSplit'])):
                Chain= df['MutSplit'][i][1]
                if Chain in ChainSet:
                    CrossChain= False
                else:
                    CrossChain= True
                    break
        return CrossChain
    SKEMPI_df['CrossChain']=SKEMPI_df.apply(ChainCheck, axis=1)        
    SKEMPI_SingleSided=SKEMPI_df[SKEMPI_df.CrossChain == False]        
    
    # Summary: number of remaining rows and of distinct '#Pdb' values.
    NumProteins=SKEMPI_SingleSided['#Pdb'].nunique()
    NumMutations=SKEMPI_SingleSided['#Pdb'].count()
    print("There are %s unique single sided mutations in %s proteins" % (NumMutations, NumProteins))             
    return SKEMPI_SingleSided
  
# Run the original pipeline; its result is the reference for the comparison
# against the rewritten checker further down.
skempi_og = SKEMPItoPandas('skempi_v2.0.csv')

There are 5454 unique single sided mutations in 343 proteins


#Rewrite cross chain checker

In [3]:
# Behaviour the rewritten checker relies on: a string turns into the set of
# its characters, and a set is truthy only when it is non-empty.
print(set('ABC'))
for sample in ('', '1'):
    print(bool(set(sample)))

{'C', 'B', 'A'}
False
True


In [4]:
# Worked example: the identifier splits on '_' into an id followed by the
# chain letters of protein 1 ('AB') and protein 2 ('C').
Prot1Chain, Prot2Chain = '1REW_AB_C'.split('_')[-2:]
# Two mutation codes; the chain letter is the second character of each.
MutSplit = [
    'DA19A',
    'DB19A',
]

def tester(MutSplit, Prot1Chain, Prot2Chain):
  mutated_chains = set(i[1] for i in MutSplit)
  prot1, prot2 = set(Prot1Chain), set(Prot2Chain)
  print(mutated_chains-prot1, mutated_chains-prot2)
  if len(MutSplit) == 1:
      # Single mutant cannot be cross-chain
      return False
  elif bool(mutated_chains-prot1) != bool(mutated_chains-prot2):
      # mutations not unique to single protein
      return False
  else:
      # Mutated chains are specific to one protein
      return True
    
# Both example mutations sit on chains of protein 1 ('A' and 'B'), so nothing
# is left after subtracting protein 1's chains and the check returns False.
tester(MutSplit, Prot1Chain, Prot2Chain)

set() {'B', 'A'}


False

In [0]:
class MutantDataset(pd.DataFrame):
    '''Pandas DataFrame subclass for mutant binding-affinity datasets.

    Given the potential for multiple sources of mutant datasets, this class
    collects the shared cleaning helpers in one place to improve clarity,
    debugging, and reusability.

    TODO: needs a custom __repr__.
    '''

    def __init__(self, data, sep=',', index=None, columns=None, dtype=None,
                 copy=True,):
        '''Initialize from DataFrame-compatible data or from a CSV path.

        Args:
            data: anything ``pd.DataFrame`` accepts, or a ``str`` path that
                is read with ``pd.read_csv``.
            sep: column separator, used only when ``data`` is a path.
            index, columns, dtype, copy: forwarded to ``pd.DataFrame``.
        '''
        if isinstance(data, str):
            data = pd.read_csv(data, sep=sep)
        super().__init__(data=data,
                         index=index,
                         columns=columns,
                         dtype=dtype,
                         copy=copy)

    def Mutations(self, row):
        '''Return a dictionary of mutation identifiers for one row.

        The comma-separated codes in ``'Mutation(s)_cleaned'`` are tokenized
        into runs of digits and single characters, then transposed so each
        key maps to a tuple with one entry per mutation.

        NOTE: assumes every code tokenizes to exactly four parts
        (initial residue, chain, number, mutated residue). A code that
        yields extra tokens shifts the later fields.
        '''
        keys = ['initAA', 'chain', 'loc', 'mutAA']  # code key
        mut_codes = self.loc[row, 'Mutation(s)_cleaned'].split(',')
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        tokens = [re.findall(r'(\d+|.)', mut) for mut in mut_codes]
        return dict(zip(keys, zip(*tokens)))

    def to_numeric(self, keys):
        '''Convert one column (or a list of columns) to numeric in place.

        Values that cannot be parsed become NaN. Returns the converted
        column(s).
        '''
        selected = self[keys]
        if isinstance(selected, pd.Series):
            # Single column: convert it in one vectorized call rather than
            # invoking pd.to_numeric once per element.
            converted = pd.to_numeric(selected, errors='coerce')
        else:
            converted = selected.apply(pd.to_numeric, errors='coerce')
        self[keys] = converted
        return self[keys]

    def gibbsEq(self, Kd_key, tmp_key='Temperature'):
        '''Gibbs Free Energy = -R * Temp * ln(kd)'''
        R = 1.9872036e-3  # Ideal Gas Constant in kcal
        return -R * self[tmp_key] * np.log(self[Kd_key])  # np.log is ln

    def solve_ddG(self, wild, mutant, tmp_key='Temperature'):
        '''Add ``dgWT``, ``dgMut`` and ``ddG`` columns and return ``self``.

        As implemented: ``ddG = dG_wild_type - dG_mutant``.

        NOTE(review): the previous docstring stated the opposite sign
        (mutant minus wild type). The computation is left unchanged so the
        results keep matching the original pipeline; confirm which
        convention downstream consumers expect.
        '''
        self['dgWT'] = self.gibbsEq(wild, tmp_key)
        self['dgMut'] = self.gibbsEq(mutant, tmp_key)
        self['ddG'] = self['dgWT'] - self['dgMut']
        return self

    def _ChainCheck(self, df):
        '''Return True when a row's mutations are cross-chain.

        ``df`` is a single row providing ``MutSplit`` (list of mutation
        codes), ``Prot1Chain``/``Prot2Chain`` (chain letters per protein)
        and ``NumMutations``. Uses set differences to test whether the
        mutated chains are confined to one protein.
        '''
        mutated_chains = set(i[1] for i in df['MutSplit'])
        prot1, prot2 = set(df['Prot1Chain']), set(df['Prot2Chain'])
        if df['NumMutations'] == 1 or len(mutated_chains) == 1:
            # One mutation, or one mutated chain: not cross chain.
            return False
        # Chains left over after removing each protein's chains.
        outside_prot1 = bool(mutated_chains - prot1)
        outside_prot2 = bool(mutated_chains - prot2)
        # Exactly one leftover is non-empty (xor) when every mutated chain
        # belongs to a single protein -> not cross chain. Equal truthiness
        # (typically both non-empty) is reported as cross chain.
        return outside_prot1 == outside_prot2

    def find_cross_chains(self):
        '''Flag rows whose mutations occur on more than one protein.

        Adds ``Prot1Chain`` and ``Prot2Chain`` columns (fields 1 and 2 of
        ``'#Pdb'`` split on ``'_'``) and returns one boolean per row.
        Requires ``MutSplit`` and ``NumMutations`` columns to exist.
        '''
        pdb_fields = self['#Pdb'].str.split('_')
        self['Prot1Chain'] = pdb_fields.str[1]
        self['Prot2Chain'] = pdb_fields.str[2]
        return self.apply(self._ChainCheck, axis=1)

    @property
    def _constructor(self):
        # Makes pandas operations return MutantDataset instead of DataFrame.
        return MutantDataset  # Class Name


def Clean_Skempi(path):
    '''
    Purpose:
        1. Loads SKEMPI CSV file.
        2. Calculates ddG
        3. For multiple measurements, adds the median ddG ('ddgMedian') and
           keeps the first row per (PDB, mutation, temperature)
        4. Eliminates entries with mutations on both sides of the interface
    Input:
        path : Location of SKEMPI CSV file (semicolon-separated)
    Output:
        SKEMPI_SingleSided : MutantDataset(pd.DataFrame)
    Note:
        Content and order subject to change with additional datasets.
        It is foreseeable that some steps may occur post combination.
    '''
    # Initialize class
    skempi = MutantDataset(path, sep=';')

    # Convert 'Temperature' comments/str's to numeric values. Default is 298.
    # astype(str) keeps the .str accessor usable even if pandas parsed the
    # column as numeric; expand=False makes extract return a Series.
    # The result is assigned back instead of using a chained in-place fillna,
    # which does not update the frame under pandas Copy-on-Write.
    temperature = (skempi['Temperature']
                   .astype(str)
                   .str.extract(r'(\d+)', expand=False))
    skempi['Temperature'] = (pd.to_numeric(temperature, errors='coerce')
                             .fillna(298))  # 6665-6668 blank

    # Calculate free energies
    dropna_lst = ['Affinity_wt_parsed', 'Affinity_mut_parsed']
    skempi.dropna(subset=dropna_lst, inplace=True)
    skempi = skempi.solve_ddG('Affinity_wt_parsed', 'Affinity_mut_parsed')

    # Median and duplicate ddG/tmp values
    group_keys = ['#Pdb', 'Mutation(s)_PDB']
    skempi['ddgMedian'] = skempi.groupby(group_keys)['ddG'].transform('median')
    skempi = skempi.drop_duplicates(subset=[*group_keys, 'Temperature'],
                                    keep='first')

    # Flag multiple mutations in the same protein
    skempi['MutSplit'] = skempi['Mutation(s)_PDB'].str.split(',')
    skempi['NumMutations'] = skempi['MutSplit'].apply(len)

    # Extract Chains and remove cross chain mutations.
    skempi['CrossChain'] = skempi.find_cross_chains()
    SKEMPI_SingleSided = skempi[skempi['CrossChain'].eq(False)]
    return SKEMPI_SingleSided


In [6]:
# Run the rewritten pipeline and report the same summary as the original.
skempi_final = Clean_Skempi('skempi_v2.0.csv')
NumProteins = skempi_final['#Pdb'].nunique()
NumMutations = skempi_final['#Pdb'].count()
# Trailing space after "sided": adjacent f-strings are concatenated as-is.
print(f'There are {NumMutations} unique single sided '
      f'mutations in {NumProteins} proteins')

There are 5454 unique single sidedmutations in 343 proteins


Everything checks out: the outputs match on a per-index basis.

In [7]:
# Compare the CrossChain flag of the original and rewritten pipelines for
# every row of the raw table. Rows filtered out by either pipeline are
# skipped; Series.get returns None for a missing label, so no exception
# handling is needed and unrelated errors are no longer swallowed.
og_flags = skempi_og['CrossChain']
new_flags = skempi_final['CrossChain']
for i in skempi.index:
    og = og_flags.get(i)
    new = new_flags.get(i)
    if og is None or new is None:
        continue
    if og != new:
        print(i)
        print('\t', skempi['#Pdb'][i], '  ', skempi['Mutation(s)_cleaned'][i])
        print('\t', og, new)

# Final whole-column check (values and index must both agree).
skempi_og['CrossChain'].equals(skempi_final['CrossChain'])

True