<a href="https://colab.research.google.com/github/tcardlab/optimus_bind_sample/blob/master/notebooks/3_0_TJC_Cleaning_Code_While_No_Testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import re

In [2]:
# URL of the SKEMPI v2 CSV file that the rest of the notebook reads.
link = 'https://life.bsc.es/pid/skempi2/database/download/skempi_v2.csv'

# The bare triple-quoted strings in this cell are expression statements used
# as section labels; they are not comments and have no effect on execution.
# Three ways of obtaining the file are sketched; only the last one is active.

'''Proper python retreival'''
# Disabled alternative: download to a local file with the standard library.
#from urllib.request import urlretrieve
#csv_path, _ = urlretrieve(link,f'skempi_v2.0.csv')

'''Direct import to Pandas'''
# Disabled alternative: let pandas read the ';'-separated file from the URL.
#data = pd.read_csv(link, sep=';')
#print(data)

'''OS get'''
# Active path: IPython shell escape; `$link` is interpolated from the Python
# variable above. Requires `wget` to be available in the runtime.
!wget $link -O skempi_v2.0.csv #-O to rename

--2019-07-16 02:25:08--  https://life.bsc.es/pid/skempi2/database/download/skempi_v2.csv
Resolving life.bsc.es (life.bsc.es)... 84.88.52.107
Connecting to life.bsc.es (life.bsc.es)|84.88.52.107|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1602208 (1.5M) [text/csv]
Saving to: ‘skempi_v2.0.csv’


2019-07-16 02:25:14 (293 KB/s) - ‘skempi_v2.0.csv’ saved [1602208/1602208]



#original

In [0]:
def SKEMPItoPandas(SKEMPI_loc):
    '''
    Purpose:
        1. Loads SKEMPI CSV file.
        2. Calculates ddG
        3. For multiple measurements, stores the per-mutation median ddG in
           'ddgMedian' and keeps the first row of every (mutation,
           temperature) pair
        4. Eliminates entries with mutations on both sides of the interface
    Input:
        SKEMPI_loc : Location of SKEMPI CSV file (';' separated)
    Output:
        SKEMPI_SingleSided : Pandas dataframe holding only the single sided
                             entries, with the helper columns added below
    Raises:
        ValueError : a multi-mutation entry whose first mutated chain is not
                     part of either chain group named in '#Pdb'
    Side effects:
        Prints the number of remaining entries and distinct '#Pdb' values.
    '''

    # Constants
    R = 1.9872036e-3  # Ideal gas constant in kcal/(mol*K)

    SKEMPI_df = pd.read_csv(SKEMPI_loc, sep=';')

    # Convert temperature comments (e.g. '298(assumed)') to numeric values by
    # keeping the first run of digits. Default is 298K for entries that are
    # missing or contain no digits. Casting to str first keeps this working
    # when pandas parsed the whole column as numbers.
    TemperatureText = SKEMPI_df['Temperature'].astype(str)
    LeadingDigits = TemperatureText.str.extract(r'(\d+)', expand=False)
    SKEMPI_df['Temperature'] = pd.to_numeric(LeadingDigits, errors='coerce').fillna(298)

    # Drop entries missing either affinity. The copy gives an independent
    # frame, so adding columns below needs no global warning suppression.
    SKEMPI_df = SKEMPI_df.dropna(subset=['Affinity_wt_parsed', 'Affinity_mut_parsed']).copy()

    # Calculate free energies. np.log is the natural logarithm, so
    # ddG works out to R*T*ln(Affinity_mut / Affinity_wt).
    SKEMPI_df['dgWT'] = -R*SKEMPI_df['Temperature']*np.log(SKEMPI_df['Affinity_wt_parsed'])
    SKEMPI_df['dgMut'] = -R*SKEMPI_df['Temperature']*np.log(SKEMPI_df['Affinity_mut_parsed'])
    SKEMPI_df['ddG'] = SKEMPI_df['dgWT']-SKEMPI_df['dgMut']

    # Create a key for unique mutations based on PDB and mutation list
    SKEMPI_df['MutKey'] = SKEMPI_df['#Pdb']+'_'+SKEMPI_df['Mutation(s)_PDB']
    # Record the group median over repeated measurements of the same mutation
    # May consider grouping by experimental method as well
    SKEMPI_df['ddgMedian'] = SKEMPI_df.groupby('MutKey')['ddG'].transform('median')
    SKEMPI_df = SKEMPI_df.drop_duplicates(subset=['MutKey', 'Temperature'], keep='first').copy()

    # Flag multiple mutations in the same protein
    SKEMPI_df['NumMutations'] = SKEMPI_df['Mutation(s)_PDB'].str.count(',')+1

    # Extract Chains and remove cross chain mutations. Chain is the second position in the mutation code
    SKEMPI_df['Prot1Chain'] = SKEMPI_df['#Pdb'].str.split('_').str[1]
    SKEMPI_df['Prot2Chain'] = SKEMPI_df['#Pdb'].str.split('_').str[2]
    SKEMPI_df['MutSplit'] = SKEMPI_df['Mutation(s)_PDB'].str.split(',')

    def ChainCheck(df):
        '''Return True when one row's mutations touch both chain groups.'''
        if df['NumMutations'] == 1:
            return False

        Chains = [Mutation[1] for Mutation in df['MutSplit']]
        # The first mutation decides which chain group all others must share.
        if Chains[0] in df['Prot1Chain']:
            ChainSet = df['Prot1Chain']
        elif Chains[0] in df['Prot2Chain']:
            ChainSet = df['Prot2Chain']
        else:
            # Previously this fell through and failed with an unbound local.
            raise ValueError(
                "Chain %r of mutation %r is in neither chain group %r nor %r"
                % (Chains[0], df['MutSplit'][0], df['Prot1Chain'], df['Prot2Chain']))
        return any(Chain not in ChainSet for Chain in Chains)

    SKEMPI_df['CrossChain'] = SKEMPI_df.apply(ChainCheck, axis=1)
    SKEMPI_SingleSided = SKEMPI_df[SKEMPI_df.CrossChain == False].copy()

    NumProteins = SKEMPI_SingleSided['#Pdb'].nunique()
    NumMutations = SKEMPI_SingleSided['#Pdb'].count()
    print("There are %s unique single sided mutations in %s proteins" % (NumMutations, NumProteins))
    return SKEMPI_SingleSided

In [15]:
# Run the SKEMPItoPandas defined in the cell above on the downloaded CSV and
# keep the result for later comparison.
og_output = SKEMPItoPandas('skempi_v2.0.csv')

There are 5454 unique single sided mutations in 343 proteins


#Changed

##v1.0


In [0]:
def ChainCheck(df):
    '''
    Decide whether one entry mutates both sides of the interface.

    Input:
        df : a single row (anything indexable by column name) providing
             'NumMutations' : number of mutations in the entry
             'MutSplit'     : list of mutation codes; the chain ID is the
                              second character of each code
             'Prot1Chain'   : chain IDs of the first partner, e.g. 'AB'
             'Prot2Chain'   : chain IDs of the second partner
    Output:
        False for single mutations or when every mutated chain belongs to the
        same partner as the first mutation, True otherwise.
    Raises:
        ValueError : the first mutated chain belongs to neither partner
                     (previously this surfaced as an UnboundLocalError).
    '''
    if df['NumMutations'] == 1:
        return False

    chains = [mutation[1] for mutation in df['MutSplit']]
    # The first mutation decides which partner all the others must share.
    if chains[0] in df['Prot1Chain']:
        ChainSet = df['Prot1Chain']
    elif chains[0] in df['Prot2Chain']:
        ChainSet = df['Prot2Chain']
    else:
        raise ValueError(
            "Chain %r of mutation %r is in neither chain group %r nor %r"
            % (chains[0], df['MutSplit'][0], df['Prot1Chain'], df['Prot2Chain']))
    return any(chain not in ChainSet for chain in chains)

def gibbsEq(Kd, tmp):
  '''
  Evaluate -R * tmp * ln(Kd) with R = 1.9872036e-3.

  Input:
      Kd  : value(s) passed to np.log (scalar, array or pandas Series)
      tmp : temperature(s) multiplied into the result
  Output:
      The product, with whatever shape numpy broadcasting yields.
  '''
  gas_constant = 1.9872036e-3  # Ideal Gas Constant in kcal
  natural_log_kd = np.log(Kd)  # np.log is the natural logarithm
  return -gas_constant * tmp * natural_log_kd

def SKEMPItoPandas(SKEMPI_loc):
    '''
    Purpose:
        1. Loads SKEMPI CSV file.
        2. Calculates ddG
        3. For multiple measurements, stores the per-mutation median ddG in
           'ddgMedian' and keeps the first row of every (mutation,
           temperature) pair
        4. Eliminates entries with mutations on both sides of the interface
    Input:
        SKEMPI_loc : Location of SKEMPI CSV file (';' separated)
    Output:
        SKEMPI_SingleSided : Pandas dataframe holding only the single sided
                             entries, with the helper columns added below
    Side effects:
        Prints the number of remaining entries and distinct '#Pdb' values.
    Depends on the module-level helpers gibbsEq and ChainCheck.
    '''

    SKEMPI_df = pd.read_csv(SKEMPI_loc, sep=';')

    # Convert non numeric temperature comments to numeric values by keeping
    # the first run of digits. Default is 298K.
    # astype(str) keeps the .str accessor usable when pandas parsed the whole
    # column as numbers; expand=False makes extract return a Series.
    temperature_digits = (SKEMPI_df['Temperature']
                          .astype(str)
                          .str.extract(r'(\d+)', expand=False))
    # Assign the filled result instead of calling fillna(inplace=True) on the
    # selected column, which does not reliably write back to the frame.
    SKEMPI_df['Temperature'] = pd.to_numeric(temperature_digits,
                                             errors='coerce').fillna(298)

    # Drop entries missing either affinity
    SKEMPI_df = SKEMPI_df.dropna(subset=['Affinity_wt_parsed',
                                         'Affinity_mut_parsed']).copy()

    # Calculate free energies
    SKEMPI_df['dgWT'] = gibbsEq(SKEMPI_df['Affinity_wt_parsed'],
                                SKEMPI_df['Temperature'])
    SKEMPI_df['dgMut'] = gibbsEq(SKEMPI_df['Affinity_mut_parsed'],
                                 SKEMPI_df['Temperature'])
    SKEMPI_df['ddG'] = SKEMPI_df['dgWT']-SKEMPI_df['dgMut']

    # Create a key for unique mutations based on PDB and mutation list
    SKEMPI_df['MutKey'] = SKEMPI_df['#Pdb']+'_'+SKEMPI_df['Mutation(s)_PDB']
    # Record the group median over repeated measurements of the same mutation
    # May consider grouping by experimental method as well
    SKEMPI_df['ddgMedian'] = SKEMPI_df.groupby('MutKey')['ddG'].transform('median')
    SKEMPI_df = SKEMPI_df.drop_duplicates(subset=['MutKey', 'Temperature'],
                                          keep='first').copy()

    # Flag multiple mutations in the same protein
    SKEMPI_df['MutSplit'] = SKEMPI_df['Mutation(s)_PDB'].str.split(',')
    SKEMPI_df['NumMutations'] = SKEMPI_df['MutSplit'].apply(len)

    # Extract Chains and remove cross chain mutations.
    # Chain is the second position in the mutation code
    SKEMPI_df['Prot1Chain'] = SKEMPI_df['#Pdb'].str.split('_').str[1]
    SKEMPI_df['Prot2Chain'] = SKEMPI_df['#Pdb'].str.split('_').str[2]

    SKEMPI_df['CrossChain'] = SKEMPI_df.apply(ChainCheck, axis=1)
    SKEMPI_SingleSided = SKEMPI_df[SKEMPI_df.CrossChain == False].copy()

    NumProteins = SKEMPI_SingleSided['#Pdb'].nunique()
    NumMutations = SKEMPI_SingleSided['#Pdb'].count()
    print("There are %s unique single sided mutations in %s proteins" % (NumMutations, NumProteins))
    return SKEMPI_SingleSided

In [321]:
# Run the v1.0 SKEMPItoPandas on the same CSV; the printed summary can be
# compared with the one printed for og_output.
new_output = SKEMPItoPandas('skempi_v2.0.csv')

There are 5454 unique single sided mutations in 343 proteins


In [0]:
# Bare last expression: the notebook renders the whole frame as cell output.
new_output

##v1.1 as class?

In [0]:
class test:
  '''Empty placeholder class; it defines no attributes or methods.

  NOTE(review): a later cell rebinds the name `test` to a DataFrame copy,
  which shadows this class.
  '''
  pass

#Work

##Temp formatting

In [269]:
# Initialize dataframes: SKEMPI_df is processed with the original method and
# `test` (an independent copy) with the new method, so the two can be compared.
# NOTE(review): this rebinds `test`, shadowing the placeholder class above.
SKEMPI_df = pd.read_csv('skempi_v2.0.csv', sep=';')
test = SKEMPI_df.copy()

# Show one row (position 6665) whose Temperature is missing.
print('following entry has nan tmp')
print(np.array(SKEMPI_df.iloc[[6665]]), '\n')


'''Origional method'''
# Convert non numeric temperature comments to numeric values. Default is 298K 
# NOTE(review): the `or 298` fallback cannot trigger. When the text has no
# digits re.search returns None and `None[0]` raises TypeError; when it
# matches, `\d+` always yields a non-empty string.
ConvertTemp = lambda x: int(re.search(r'\d+', x)[0] or 298)
# For a missing value .str.isnumeric() yields NaN, and `NaN == False` is
# False, so missing temperatures are NOT flagged as bad here.
BadTemps = SKEMPI_df.Temperature.str.isnumeric() == False


print("nan val tests 'False'? map not applied, thus unaltered")
print(BadTemps.iloc[[6665]], '\n')

# Chained assignment (column selection followed by .loc write); this is the
# statement that triggers the SettingWithCopyWarning shown in the output.
SKEMPI_df['Temperature'].loc[BadTemps] = SKEMPI_df['Temperature'].loc[BadTemps].map(ConvertTemp)
#SKEMPI_df['Temperature'] = SKEMPI_df['Temperature'].apply(ConvertTemp)
# Unflagged missing values stay NaN after this conversion.
SKEMPI_df['Temperature'] = pd.to_numeric(SKEMPI_df['Temperature'], errors='coerce')

'''
New Method:
  -likely a tad slower as regex is applied to all rather than binry mapping
  -no error 
  -handled nan issue
'''
# Take the first run of digits from every entry, convert to numbers, then
# fill whatever is still missing with 298.
test['Temperature'] = test['Temperature'].str.extract(r'(\d+)')
test['Temperature'] = pd.to_numeric(test['Temperature'], errors='coerce')
test['Temperature'].fillna(298, inplace=True)


# NOTE(review): `og` is not defined in any visible cell, so this line relies
# on leftover kernel state and fails on a fresh kernel. Confirm which frame
# was intended (possibly `test` or an earlier copy of SKEMPI_df).
SKEMPI_df.equals(og)

following entry has nan tmp
[['4G0N_A_B' 'DA38A' 'DA38A' 'COR' nan
  '4G0N_A_B,3KUD_A_B,1LFD_A_B,1GUA_A_B,1C1Y_A_B,1K8R_A_B,1HE8_A_B,1E96_A_B'
  '1.3E-06' 1.3e-06 '1.8E-08' 1.8e-08 '8636102' 'H-Ras1' 'Raf-RBD' nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan 'IAFL' 2]] 

nan val tests 'False'? map not applied, thus unaltered
6665    False
Name: Temperature, dtype: bool 



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


True

In [266]:
'''both versions are equal but include nan'''
#print(SKEMPI_df['Temperature'].isnull().values.any())

#print(SKEMPI_df[SKEMPI_df['Temperature'].isnull()]) #['Temperature'])
#print(np.array(SKEMPI_df.iloc[[6665]]))

# Walk both Temperature columns in parallel and print every position where
# the original-method value differs from the new-method value, with types.
# NaN compares unequal to every value (including NaN), so any position that
# is still NaN in SKEMPI_df is always printed.
for init, new in zip(SKEMPI_df['Temperature'], test['Temperature']):
  if init!=new:
    print(init, type(init),':' ,new, type(new))

nan <class 'float'> : 298.0 <class 'float'>
nan <class 'float'> : 298.0 <class 'float'>
nan <class 'float'> : 298.0 <class 'float'>
nan <class 'float'> : 298.0 <class 'float'>


In [276]:
'''strange, i cant reproduce the issue'''
# Attempt to reproduce the missing-temperature case with a toy frame.
# NOTE(review): the frame is filled with empty strings, not NaN. An empty
# string is flagged non-numeric (True below), whereas a NaN entry is not
# flagged, so this does not exercise the same path as the real data.
df = pd.DataFrame('', index=[0,1,2,3], columns=['A']) #str(np.nan)
print('init empty data\n', df)
baddies=df["A"].str.isnumeric() == False
print('\nfind non-numeric', baddies, sep='\n')
# Uses ConvertTemp from an earlier cell. On '' the regex finds no digits,
# re.search returns None, and indexing None raises the TypeError shown in
# this cell's output; the final print is therefore never reached.
print('convert temps', df['A'].loc[baddies].map(ConvertTemp))
print(pd.to_numeric(df["A"], errors='coerce'))

init empty data
   A
0  
1  
2  
3  

find non-numeric
0    True
1    True
2    True
3    True
Name: A, dtype: bool


TypeError: ignored

In [278]:
# Inspect the new-method temperatures around row 6665, the row shown earlier
# as having a missing temperature.
test['Temperature'][6663:6670]

6663    273.0
6664    273.0
6665    298.0
6666    298.0
6667    298.0
6668    298.0
6669    298.0
Name: Temperature, dtype: float64

##other

In [309]:
# Count mutations per entry as (number of commas + 1). This adds a column to
# the shared SKEMPI_df in place.
SKEMPI_df['NumMutations'] = SKEMPI_df['Mutation(s)_PDB'].str.count(',')+1 

# Cross-check the comma count against the length of the split list, and track
# the longest single mutation code as [length, row label].
largest = [0,0]
for i,str_lst in enumerate(SKEMPI_df['Mutation(s)_PDB']):
  lst=str_lst.split(',')
  split_len = len(lst)
  # Label-based lookup with the enumerate counter: assumes SKEMPI_df still has
  # its default 0..n-1 index -- TODO confirm if rows were dropped beforehand.
  camma_len = SKEMPI_df['NumMutations'][i]
  if split_len != camma_len:
    # Report any entry where the two counting methods disagree.
    print(i,lst, SKEMPI_df['NumMutations'][i], SKEMPI_df['Mutation(s)_PDB'][i], str_lst)
    pass  # no effect; the print above is the only action in this branch
  longest = max(len(el) for el in lst)
  if longest>largest[0]:
    largest=[longest,i]
  
# Show the longest code length with its row label, then that full row.
print(largest)
print(SKEMPI_df.loc[largest[1]])

[7, 329]
#Pdb                                    1DVF_AB_CD
Mutation(s)_PDB                            RD100bA
Mutation(s)_cleaned                         RD106A
iMutation_Location(s)                          COR
Hold_out_type                                AB/AG
Hold_out_proteins                            AB/AG
Affinity_mut (M)                          1.08E-05
Affinity_mut_parsed                       1.08e-05
Affinity_wt (M)                           1.08E-08
Affinity_wt_parsed                        1.08e-08
Reference                                  8993317
Protein 1                       IgG1-kappa D1.3 Fv
Protein 2                                  E5.2 Fv
Temperature                                    298
kon_mut (M^(-1)s^(-1))                         NaN
kon_mut_parsed                                 NaN
kon_wt (M^(-1)s^(-1))                          NaN
kon_wt_parsed                                  NaN
koff_mut (s^(-1))                              NaN
koff_mut_parsed       

0        5
1        5
2        5
3        5
4        5
5        5
6        5
7        5
8        5
9        5
10       5
11       5
12       5
13       5
14       5
15       5
16       5
17       5
18       5
19      11
20       5
21       5
22       5
23      11
24      11
25       5
26       5
27       5
28       5
29       5
        ..
7055     5
7056     5
7057    11
7058     5
7059     5
7060     5
7061     5
7062     5
7063     5
7064     5
7065     5
7066     5
7067     5
7068     5
7069     5
7070     5
7071     5
7072     5
7073     5
7074     5
7075     5
7076    11
7077     4
7078     4
7079     4
7080     4
7081     5
7082     5
7083     5
7084    10
Name: Mutation(s)_PDB, Length: 7085, dtype: int64