# PROTAC-DB processing

In [1]:
import pandas as pd
import re

In [2]:
all_protac = pd.read_csv('./protac.csv')
print(all_protac.shape)

(5388, 89)


## Degradation Indicator analysis

- `'DC50 (nM)', 'Dmax (%)'` are severely missing (both missing samples 3601/4648)
- Consider `'DC50 (nM)', 'Dmax (%)', 'Assay (DC50/Dmax)', 'Percent degradation (%)', 'Assay (Percent degradation)'`, and utilize the implicit information

In [3]:
model_df = all_protac.copy()

In [4]:
header = model_df.columns.to_list()
# print(header)
print(f'{header[:15]}')

['Compound ID', 'Uniprot', 'Target', 'E3 ligase', 'PDB', 'Name', 'Smiles', 'DC50 (nM)', 'Dmax (%)', 'Assay (DC50/Dmax)', 'Percent degradation (%)', 'Assay (Percent degradation)', 'IC50 (nM, Protac to Target)', 'Assay (Protac to Target, IC50)', 'EC50 (nM, Protac to Target)']


Statistics on missing values for degradation-related indicators

In [5]:
print(model_df.shape)

(5388, 89)


In [6]:
four_is_nan = model_df[['DC50 (nM)', 'Dmax (%)', 'Assay (DC50/Dmax)', 'Percent degradation (%)', 'Assay (Percent degradation)']].isna().all(axis=1).sum()
print(four_is_nan)

3737


In [7]:
both_is_nan = model_df[['DC50 (nM)', 'Dmax (%)']].isna().all(axis=1).sum()
print(both_is_nan)

4101


In [8]:
two_info = model_df[['Percent degradation (%)', 'Assay (Percent degradation)']].notna().sum()
two_info

Percent degradation (%)        362
Assay (Percent degradation)    362
dtype: int64

In [9]:
four_info = model_df[['DC50 (nM)', 'Dmax (%)', 'Assay (DC50/Dmax)','Percent degradation (%)', 'Assay (Percent degradation)']].notna().sum()
four_info

DC50 (nM)                       905
Dmax (%)                        726
Assay (DC50/Dmax)              1008
Percent degradation (%)         362
Assay (Percent degradation)     362
dtype: int64

## Degradation labeling
- DC50, Dmax
- Percent degradation

Remove missing

In [10]:
model_df = model_df.dropna(subset=['DC50 (nM)', 'Dmax (%)', 'Percent degradation (%)', 'Assay (Percent degradation)'], how='all') # 四项都没有才drop
print("model_df: {:,} x {:,}".format(*model_df.shape))

model_df: 1,631 x 89


In [11]:
model_df[['DC50 (nM)', 'Dmax (%)', 'Percent degradation (%)', 'Assay (Percent degradation)']]

Unnamed: 0,DC50 (nM),Dmax (%),Percent degradation (%),Assay (Percent degradation)
19,560,80,,
40,1.76,95,,
41,4,,,
42,2/8,,,
43,4.5,95,,
...,...,...,...,...
5381,1100,85,,
5382,3600,60,,
5383,1600,80,,
5386,2,,,


### Explicit labeling (DC50, Dmax)
Labeling is performed according to the following principles
- DC50 < 100nM, Dmax >= 80%: 1
- DC50 ≥ 100 nM, Dmax < 80%: 0

In [12]:
import numpy as np

dc50_labels = []
for i in model_df['DC50 (nM)']:
    if type(i) is str:
        if i == 'N.D.': # No Degradation
            dc50_labels.append(False)
            continue
        dc50_list = re.findall(r"\d+\.?\d*", i)
        if len(dc50_list) == 0:
            dc50_labels.append('nan')
            continue 
        dc50 = min([float(e) for e in dc50_list]) 
        if float(dc50) < 100.:
            dc50_labels.append(True)
        else:
            dc50_labels.append(False)
    else:
        # print(i)
        dc50_labels.append('nan')

dmax_labels = []
for i in model_df['Dmax (%)']:
    if type(i) is str:
        if i == 'N.D.': # No Degradation
            dmax_labels.append(False)
        else:
            dmax_list = re.findall(r"\d+\.?\d*", i)
            if len(dmax_list) == 0:
                dmax_labels.append('nan')
                continue
            dmax = max([float(e) for e in dmax_list]) 
            if float(dmax) >= 80.:
                dmax_labels.append(True)
            else:
                dmax_labels.append(False)
    else:
        dmax_labels.append('nan')

total_labels, valid_labels = [], []
for i in range(len(dc50_labels)):
    if dc50_labels[i] == 'nan' and dmax_labels[i] == 'nan':
        total_labels.append(np.nan)
    elif dc50_labels[i] is False or dmax_labels[i] is False:
        total_labels.append(False)
        valid_labels.append(False)
    else:
        total_labels.append(True)
        valid_labels.append(True)
        
print(f'total samples: {len(total_labels)}')
print(f'valid labels: {len(valid_labels)}')

model_df['label'] = total_labels
print('True/False: {}/{}'.format(sum(valid_labels), len(valid_labels) - sum(valid_labels)))

total samples: 1631
valid labels: 1287
True/False: 506/781


In [13]:
nan_labels = model_df[['label']].isna().all(axis=1).sum()
nan_labels

344

### Implicit labeling (Percentage Degradation)

- Extract information and expand tags via `Percentage Degradation`
- fill the nan labels

In [14]:
def extract_percent_values(x):
    if pd.isna(x):
        return np.nan  # na values
    # if x == '0' or 'N.D.' in x:
    #     return [0.0]
    # replce 'N.D.' with '0'
    x = x.replace('N.D.', '0')
    vals = [re.findall('\d+\.?\d*', e)[-1] for e in x.split('/')] # e.g.：20-50，keep 50
    return [float(val) for val in vals]
model_df['percent_values'] = model_df['Percent degradation (%)'].apply(extract_percent_values)
print(f'model_df: {model_df.shape}')
extract_percent_values = model_df['percent_values'].dropna().to_list()
print(len(extract_percent_values))
extract_percent_values

model_df: (1631, 91)
362


[[26.0, 35.0, 28.0],
 [15.0, 23.0, 23.0],
 [16.0, 20.0, 25.0],
 [11.0, 25.0, 29.0],
 [54.0, 84.0, 64.0],
 [8.0, 29.0, 65.0],
 [15.0, 66.0, 87.0],
 [10.0, 48.0, 88.0],
 [11.0, 48.0, 86.0],
 [30.0, 69.0, 75.0],
 [32.0, 35.0, 40.0],
 [30.0, 63.0, 96.0],
 [50.0, 80.0, 80.0],
 [37.0, 61.0, 92.0],
 [28.0, 48.0, 89.0],
 [34.0, 51.0, 78.0],
 [3.0, 31.0, 28.0],
 [0.0, 12.0, 0.0],
 [11.0, 50.0, 87.0],
 [27.0, 75.0, 94.0],
 [22.0, 72.0, 93.0],
 [20.0, 81.0, 97.0],
 [38.0, 80.0, 95.0],
 [18.0, 34.0, 35.0],
 [76.0, 98.0, 99.0],
 [76.0, 95.0, 99.0],
 [65.0, 93.0, 78.0],
 [89.0, 99.0, 100.0],
 [76.0, 99.0, 100.0],
 [39.0, 77.0, 99.0],
 [5.0, 5.0, 5.0],
 [5.0, 5.0, 5.0],
 [43.0, 91.0, 96.0],
 [7.0, 20.0, 5.0],
 [2.0, 25.0, 20.0],
 [28.0, 50.0],
 [58.0, 85.0],
 [39.0, 83.0, 96.0],
 [58.0, 78.0],
 [23.0, 71.0],
 [3.0, 9.0],
 [79.0, 92.0],
 [25.0, 63.0, 89.0],
 [0.0, 15.0, 19.0],
 [0.0, 4.0, 10.0],
 [47.9, 73.5],
 [71.7, 81.0],
 [0.0],
 [68.0, 95.0, 99.0],
 [81.0, 95.0, 99.0],
 [45.0, 76.0, 95.0],
 [85.0

In [15]:
def extrac_dose_values(x):
    if pd.isna(x):
        return np.nan
    vals = re.findall(' at ([\d.]+(?:/[\d.]+)*) (?:nM|μM)', x)
    if len(vals) == 0:
        print(x)
        return np.nan
    
    values = []
    for val in vals:
        if '/' in val:
            values.extend(val.split('/'))
        else:
            values.append(val)
    return [float(val) for val in values]
model_df['dose_values'] = model_df['Assay (Percent degradation)'].apply(lambda x: re.sub(r'with', 'at', x) if isinstance(x, str) else x) # 'cells with 100/10 nM' to 'cells at 100/10 nM'
model_df['dose_values'] = model_df['dose_values'].apply(lambda x: re.sub(r' nM/(\d)', r'/\1', x) if isinstance(x, str) else x) # 'cells at 100 nM/10 nM' to 'cells at 100/10 nM' and 'at 100 nM/on peritoneal macrophages from C57BL/6 mice  at 10 nM'
model_df['dose_values'] = model_df['dose_values'].apply(lambda x: re.sub(r'after', 'at', x) if isinstance(x, str) else x) # 'cells after 100 nM' to 'cells at 100 nM'
model_df['dose_values'] = model_df['dose_values'].apply(lambda x: x.replace('\xa0', ' ') if isinstance(x, str) else x) # '\xa0'
# tmp = model_df['dose_values'].to_list()
# tmp
model_df['dose_values'] = model_df['dose_values'].apply(extrac_dose_values)
print(f'model_df: {model_df.shape}')
extrac_dose_values = model_df['dose_values'].dropna().to_list()
print(len(extrac_dose_values))
extrac_dose_values

model_df: (1631, 92)
362


[[100.0, 1000.0, 10000.0],
 [100.0, 1000.0, 10000.0],
 [100.0, 1000.0, 10000.0],
 [100.0, 1000.0, 10000.0],
 [100.0, 1000.0, 10000.0],
 [10.0, 100.0, 1000.0],
 [10.0, 100.0, 1000.0],
 [10.0, 100.0, 1000.0],
 [10.0, 100.0, 1000.0],
 [100.0, 1000.0, 10000.0],
 [100.0, 1000.0, 10000.0],
 [10.0, 100.0, 1000.0],
 [100.0, 1000.0, 10000.0],
 [10.0, 100.0, 1000.0],
 [10.0, 100.0, 1000.0],
 [10.0, 100.0, 1000.0],
 [100.0, 1000.0, 10000.0],
 [100.0, 1000.0, 10000.0],
 [10.0, 100.0, 1000.0],
 [10.0, 100.0, 1000.0],
 [10.0, 100.0, 1000.0],
 [10.0, 100.0, 1000.0],
 [10.0, 100.0, 1000.0],
 [100.0, 1000.0, 10000.0],
 [10.0, 100.0, 1000.0],
 [10.0, 100.0, 1000.0],
 [100.0, 1000.0, 10000.0],
 [10.0, 100.0, 1000.0],
 [10.0, 100.0, 1000.0],
 [10.0, 100.0, 1000.0],
 [100.0, 1000.0, 10000.0],
 [100.0, 1000.0, 10000.0],
 [10.0, 100.0, 1000.0],
 [100.0, 1000.0, 10000.0],
 [100.0, 1000.0, 10000.0],
 [30.0, 100.0],
 [30.0, 100.0],
 [10.0, 30.0, 100.0],
 [30.0, 100.0],
 [30.0, 100.0],
 [30.0, 100.0],
 [30.0, 10

Use `percent_values` and `dose_values` to augment the label.
- If a label already exists, skip it.
- If not, then:
```python
    A = percent_values
    B = dose_values
    max_B = max(B)
    if max_B < 100:
        label = True
    elif max_B == 100
        if A[B.index(100.)] >= 80:
            label = True
    else label = False
```

In [16]:
def update_row(row):
    label = row['label']
    if pd.notna(label): # skip
        return label
    
    percent_values = row['percent_values']
    dose_values = row['dose_values']
    print(percent_values, dose_values)
    assert len(percent_values) == len(dose_values)
    
    assert len(dose_values) >= 1
    max_dose_value = int(max(dose_values))
    if max_dose_value < 100:
        return True
    elif 100. in dose_values:
        index = dose_values.index(100.)
        return percent_values[index] >= 80.
    else:
        return False

model_df.loc[:, 'label'] = model_df.apply(update_row, axis=1)
model_df

[26.0, 35.0, 28.0] [100.0, 1000.0, 10000.0]
[15.0, 23.0, 23.0] [100.0, 1000.0, 10000.0]
[16.0, 20.0, 25.0] [100.0, 1000.0, 10000.0]
[11.0, 25.0, 29.0] [100.0, 1000.0, 10000.0]
[54.0, 84.0, 64.0] [100.0, 1000.0, 10000.0]
[8.0, 29.0, 65.0] [10.0, 100.0, 1000.0]
[15.0, 66.0, 87.0] [10.0, 100.0, 1000.0]
[10.0, 48.0, 88.0] [10.0, 100.0, 1000.0]
[11.0, 48.0, 86.0] [10.0, 100.0, 1000.0]
[30.0, 69.0, 75.0] [100.0, 1000.0, 10000.0]
[32.0, 35.0, 40.0] [100.0, 1000.0, 10000.0]
[30.0, 63.0, 96.0] [10.0, 100.0, 1000.0]
[50.0, 80.0, 80.0] [100.0, 1000.0, 10000.0]
[37.0, 61.0, 92.0] [10.0, 100.0, 1000.0]
[28.0, 48.0, 89.0] [10.0, 100.0, 1000.0]
[34.0, 51.0, 78.0] [10.0, 100.0, 1000.0]
[3.0, 31.0, 28.0] [100.0, 1000.0, 10000.0]
[0.0, 12.0, 0.0] [100.0, 1000.0, 10000.0]
[11.0, 50.0, 87.0] [10.0, 100.0, 1000.0]
[27.0, 75.0, 94.0] [10.0, 100.0, 1000.0]
[22.0, 72.0, 93.0] [10.0, 100.0, 1000.0]
[20.0, 81.0, 97.0] [10.0, 100.0, 1000.0]
[38.0, 80.0, 95.0] [10.0, 100.0, 1000.0]
[18.0, 34.0, 35.0] [100.0, 1000

Unnamed: 0,Compound ID,Uniprot,Target,E3 ligase,PDB,Name,Smiles,DC50 (nM),Dmax (%),Assay (DC50/Dmax),...,Hydrogen Bond Acceptor Count,Hydrogen Bond Donor Count,Rotatable Bond Count,Topological Polar Surface Area,Molecular Formula,InChI,InChI Key,label,percent_values,dose_values
19,11,Q9H8M2,BRD9,VHL,,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,560,80,Degradation of BRD9 in HeLa cells after 4 h tr...,...,16,3,22,199.15,C54H69FN8O10S,InChI=1S/C54H69FN8O10S/c1-34-47(74-33-58-34)35...,MXAKQOVZPDLCDK-UDVNCTHFSA-N,False,,
40,22,Q9H8M2,BRD9,VHL,,VZ185,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,1.76,95,Degradation of BRD9 in RI-1 cells after 8 h tr...,...,14,3,19,180.69,C53H67FN8O8S,InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...,ZAGCLFXBHOXXEN-JPTLTNPLSA-N,True,,
41,22,Q9H8M2,BRD9,VHL,,VZ185,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,4,,Degradation of HiBiT-BRD9 in HEK293 cells afte...,...,14,3,19,180.69,C53H67FN8O8S,InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...,ZAGCLFXBHOXXEN-JPTLTNPLSA-N,True,,
42,22,Q9H8M2,BRD9,VHL,,VZ185,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,2/8,,Degradation of BRD9 in EOL-1/A-204 cells after...,...,14,3,19,180.69,C53H67FN8O8S,InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...,ZAGCLFXBHOXXEN-JPTLTNPLSA-N,True,,
43,22,Q9NPI1,BRD7,VHL,,VZ185,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,4.5,95,Degradation of BRD7 in RI-1 cells after 8 h tr...,...,14,3,19,180.69,C53H67FN8O8S,InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...,ZAGCLFXBHOXXEN-JPTLTNPLSA-N,True,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5381,3264,O60885,BRD4,FEM1B,,,CC1=C(C)C2=C(S1)N1C(C)=NN=C1[C@H](CC(=O)NCCOCC...,1100,85,Degradation of BRD4 in HEK293T cells after 8 h...,...,13,2,18,176.30,C40H45Cl2N9O6S,InChI=1S/C40H45Cl2N9O6S/c1-25-26(2)58-40-37(25...,CPDVGNBJFIONLX-HKBQPEDESA-N,False,,
5382,3265,O60885,BRD4,FEM1B,,,CC1=C(C)C2=C(S1)N1C(C)=NN=C1[C@H](CC(=O)NCCOCC...,3600,60,Degradation of BRD4 in HEK293T cells after 8 h...,...,14,2,21,185.53,C42H49Cl2N9O7S,InChI=1S/C42H49Cl2N9O7S/c1-27-28(2)61-42-39(27...,QMBOIOPJFSHXPV-XIFFEERXSA-N,False,,
5383,3266,O60885,BRD4,FEM1B,,,CC1=C(C)C2=C(S1)N1C(C)=NN=C1[C@H](CC(=O)NCCOCC...,1600,80,Degradation of BRD4 in HEK293T cells after 8 h...,...,15,2,24,194.76,C44H53Cl2N9O8S,InChI=1S/C44H53Cl2N9O8S/c1-29-30(2)64-44-41(29...,UUCUKSPUFPMKNK-DHUJRADRSA-N,False,,
5386,3269,P03372,ER,CRBN,,ARV-471,O=C1CC[C@H](N2CC3=CC(N4CCN(CC5CCN(C6=CC=C([C@@...,2,,Degradation of ER in ER-positive breast cancer...,...,7,2,7,96.43,C45H49N5O4,InChI=1S/C45H49N5O4/c51-37-12-15-39-33(27-37)8...,TZZDVPMABRWKIZ-XMOGEVODSA-N,True,,


In [17]:
model_df['label'].describe()

count      1631
unique        2
top       False
freq       1011
Name: label, dtype: object

## Keep the necessary items
`'Compound ID', 'Uniprot', 'Target', 'E3 ligase', 'PDB	Name', 'Smiles', 'DC50 (nM)', 'Dmax (%)', 'warhead smiles', 'e3 ligand smiles', 'linker smiles'`

In [18]:
columns = ['Compound ID', 'Uniprot', 'Target', 'E3 ligase', 'PDB', 'Smiles', 
           'DC50 (nM)', 'Dmax (%)', 'Assay (DC50/Dmax)', 'Percent degradation (%)', 'Assay (Percent degradation)', 
            'Molecular Weight', 'Exact Mass', 'XLogP3', 
            'Heavy Atom Count', 'Ring Count', 'Hydrogen Bond Acceptor Count', 
            'Hydrogen Bond Donor Count', 'Rotatable Bond Count', 'Topological Polar Surface Area',
            # 'warhead smiles', 'e3 ligand smiles', 'linker smiles', 
           'label', 'percent_values', 'dose_values']
model_df = model_df[columns]
# model_df.to_csv('data/protacdb/label_protac.csv', index=False)
model_df

Unnamed: 0,Compound ID,Uniprot,Target,E3 ligase,PDB,Smiles,DC50 (nM),Dmax (%),Assay (DC50/Dmax),Percent degradation (%),...,XLogP3,Heavy Atom Count,Ring Count,Hydrogen Bond Acceptor Count,Hydrogen Bond Donor Count,Rotatable Bond Count,Topological Polar Surface Area,label,percent_values,dose_values
19,11,Q9H8M2,BRD9,VHL,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,560,80,Degradation of BRD9 in HeLa cells after 4 h tr...,,...,3.69,74,8,16,3,22,199.15,False,,
40,22,Q9H8M2,BRD9,VHL,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,1.76,95,Degradation of BRD9 in RI-1 cells after 8 h tr...,,...,5.06,71,8,14,3,19,180.69,True,,
41,22,Q9H8M2,BRD9,VHL,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,4,,Degradation of HiBiT-BRD9 in HEK293 cells afte...,,...,5.06,71,8,14,3,19,180.69,True,,
42,22,Q9H8M2,BRD9,VHL,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,2/8,,Degradation of BRD9 in EOL-1/A-204 cells after...,,...,5.06,71,8,14,3,19,180.69,True,,
43,22,Q9NPI1,BRD7,VHL,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,4.5,95,Degradation of BRD7 in RI-1 cells after 8 h tr...,,...,5.06,71,8,14,3,19,180.69,True,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5381,3264,O60885,BRD4,FEM1B,,CC1=C(C)C2=C(S1)N1C(C)=NN=C1[C@H](CC(=O)NCCOCC...,1100,85,Degradation of BRD4 in HEK293T cells after 8 h...,,...,3.58,58,6,13,2,18,176.30,False,,
5382,3265,O60885,BRD4,FEM1B,,CC1=C(C)C2=C(S1)N1C(C)=NN=C1[C@H](CC(=O)NCCOCC...,3600,60,Degradation of BRD4 in HEK293T cells after 8 h...,,...,3.43,61,6,14,2,21,185.53,False,,
5383,3266,O60885,BRD4,FEM1B,,CC1=C(C)C2=C(S1)N1C(C)=NN=C1[C@H](CC(=O)NCCOCC...,1600,80,Degradation of BRD4 in HEK293T cells after 8 h...,,...,3.28,64,6,15,2,24,194.76,False,,
5386,3269,P03372,ER,CRBN,,O=C1CC[C@H](N2CC3=CC(N4CCN(CC5CCN(C6=CC=C([C@@...,2,,Degradation of ER in ER-positive breast cancer...,,...,6.36,54,9,7,2,7,96.43,True,,
