In [13]:
# Import libraries
import pandas as pd
import seaborn as sb
import re
import numpy as np

# Load the raw index file into memory, one list entry per physical line
with open('INDEX_demo_PL_data.2021', 'r') as handle:
    lines = handle.readlines()

# Keep only the stripped lines that are neither blank nor '#' comments
data_lines = [
    stripped
    for stripped in (raw.strip() for raw in lines)
    if stripped and not stripped.startswith('#')
]

# Parse each line
# Column order of the parsed records. Passing it to the DataFrame constructor
# keeps the schema stable even when no line could be parsed at all.
RECORD_COLUMNS = ['PDB_Code', 'Binding_Type', 'Binding_Value', 'Unit', 'Ligand_Name']


def _parse_index_line(line):
    """Turn one whitespace-separated data line into a record dict.

    Field 1 is taken as the PDB code and field 4 as the binding data
    (e.g. "Ki=0.068nM" or "Kd=0.53mM"); everything after field 4 is searched
    for a ligand name in parentheses. Fields 2 and 3 are not used downstream,
    so they are deliberately not converted to numbers: a non-numeric entry
    there would otherwise abort the whole parse.
    NOTE(review): index files of this kind may carry a non-numeric resolution
    such as "NMR" -- confirm for this file.

    Parameters
    ----------
    line : str
        A stripped, non-comment line of the index file.

    Returns
    -------
    dict or None
        A record with the keys listed in RECORD_COLUMNS, or None when the
        line has fewer than four fields. Binding_Type, Binding_Value and Unit
        are None when field 4 contains no '='; Binding_Value and Unit are None
        when the text after '=' is not "<number><letters>".
    """
    parts = line.split()
    if len(parts) < 4:
        return None

    binding_type = value = unit = None
    binding_info = parts[3]
    if '=' in binding_info:
        binding_type, value_unit = binding_info.split('=', 1)
        # "Kd<=10uM" splits into "Kd<" / "10uM"; drop the relation character so
        # the type stays a clean "Kd". The relation itself is not recorded.
        binding_type = binding_type.rstrip('<>~')

        # Only accept a well-formed decimal number so float() cannot raise on
        # inputs such as "1.2.3" or "." that a bare [0-9.]+ would capture.
        match = re.match(r'(\d+(?:\.\d*)?|\.\d+)([a-zA-Z]+)', value_unit)
        if match:
            value = float(match.group(1))
            unit = match.group(2)

    # Extract ligand name from the comment (first parenthesised group)
    ligand_match = re.search(r'\(([^)]+)\)', ' '.join(parts[4:]))
    ligand_name = ligand_match.group(1) if ligand_match else None

    return {
        'PDB_Code': parts[0],
        'Binding_Type': binding_type,
        'Binding_Value': value,
        'Unit': unit,
        'Ligand_Name': ligand_name,
    }


parsed_data = [
    record
    for record in (_parse_index_line(line) for line in data_lines)
    if record is not None
]

# Create DataFrame
df = pd.DataFrame(parsed_data, columns=RECORD_COLUMNS)
df.head()

Unnamed: 0,PDB_Code,Binding_Type,Binding_Value,Unit,Ligand_Name
0,4tmn,Ki,0.068,nM,0PK
1,5tmn,Ki,9.1,nM,0PJ
2,1ydr,Ki,3.0,uM,IQP
3,1ydt,Ki,48.0,nM,IQB
4,1bcu,Kd,0.53,mM,PRL


In [16]:
# Convert binding values to a common unit (nM)
def convert_to_nM(value, unit):
    """Convert a binding value to nanomolar (nM).

    Parameters
    ----------
    value : float
        Numeric binding value expressed in `unit`.
    unit : str
        Concentration unit: one of 'fM', 'pM', 'nM', 'uM', 'mM' or 'M'
        (case-sensitive, since 'mM' and 'M' differ only by case).

    Returns
    -------
    float or None
        The value in nM, or None when `value` or `unit` is missing or the
        unit is not recognised. An unrecognised unit is reported as missing
        rather than assumed to be nM, which would be wrong by orders of
        magnitude for e.g. femtomolar data.
    """
    if pd.isna(value) or pd.isna(unit):
        return None

    conversion_factors = {
        'fM': 0.000001,    # femtomolar to nanomolar
        'pM': 0.001,       # picomolar to nanomolar
        'nM': 1,           # nanomolar
        'uM': 1000,        # micromolar to nanomolar
        'mM': 1000000,     # millimolar to nanomolar
        'M': 1000000000,   # molar to nanomolar
    }

    factor = conversion_factors.get(unit)
    if factor is None:
        return None
    return value * factor

# Add a column holding every row's binding value expressed in nM
def _binding_value_in_nM(row):
    """Apply convert_to_nM to one row's Binding_Value / Unit pair."""
    return convert_to_nM(row['Binding_Value'], row['Unit'])


df['Binding_Value_nM'] = df.apply(_binding_value_in_nM, axis=1)

# Map a binding constant in nM to an affinity class
def Kd_to_class(kd_nM, high_max_nM=10, medium_max_nM=1000):
    """Classify a binding constant given in nM as 'high', 'medium' or 'low'.

    The default cut-offs are the nM equivalents of pKd >= 8 and pKd >= 6,
    where pKd = -log10(Kd in mol/L). Comparing in nM directly avoids
    floating-point rounding of the logarithm at the class boundaries.

    Parameters
    ----------
    kd_nM : float
        Binding constant in nanomolar.
    high_max_nM : float, optional
        Largest value still classed as 'high' (default 10 nM, pKd 8).
    medium_max_nM : float, optional
        Largest value still classed as 'medium' (default 1000 nM, pKd 6).

    Returns
    -------
    str or None
        'high', 'medium' or 'low'; None when the value is missing (None/NaN)
        or not strictly positive, because no pKd exists for such input and it
        must not be silently labelled as a real affinity class.
    """
    if pd.isna(kd_nM) or kd_nM <= 0:
        return None
    if kd_nM <= high_max_nM:
        return 'high'    # Kd <= 10 nM
    if kd_nM <= medium_max_nM:
        return 'medium'  # 10 nM < Kd <= 1000 nM
    return 'low'         # Kd > 1000 nM

# Label every row with the affinity class of its nM value, then display the frame
affinity_labels = df['Binding_Value_nM'].apply(Kd_to_class)
df['Affinity_Category'] = affinity_labels
df

Unnamed: 0,PDB_Code,Binding_Type,Binding_Value,Unit,Ligand_Name,Binding_Value_nM,Affinity_Category
0,4tmn,Ki,0.068,nM,0PK,0.0680,high
1,5tmn,Ki,9.100,nM,0PJ,9.1000,high
2,1ydr,Ki,3.000,uM,IQP,3000.0000,low
3,1ydt,Ki,48.000,nM,IQB,48.0000,medium
4,1bcu,Kd,0.530,mM,PRL,530000.0000,low
...,...,...,...,...,...,...,...
280,4x6p,Ki,5.000,nM,3YU,5.0000,high
281,5dwr,Ki,6.000,pM,5H7,0.0060,high
282,5c2h,Ki,8.200,pM,4XU,0.0082,high
283,4cr9,Ki,80.000,uM,OTW,80000.0000,low


In [17]:
# Persist the annotated table to disk (row index omitted), then confirm
df.to_csv(path_or_buf='pdbind_data.csv', index=False)
print("\nDataFrame saved as 'pdbind_data.csv'")


DataFrame saved as 'pdbind_data.csv'
