# PROTAC-DB processing

In [50]:
import pandas as pd
import re

In [51]:
all_protac = pd.read_csv('./protac.csv')
print(all_protac.shape)

(9380, 89)


  all_protac = pd.read_csv('./protac.csv')


## Degradation Indicator analysis

- `'DC50 (nM)', 'Dmax (%)'` are severely missing (both missing samples 3601/4648)
- Consider `'DC50 (nM)', 'Dmax (%)', 'Assay (DC50/Dmax)', 'Percent degradation (%)', 'Assay (Percent degradation)'`, and utilize the implicit information

In [52]:
model_df = all_protac.copy()

In [None]:
header = model_df.columns.to_list()
# print(header)
print(f'{ header[:]}')

False


Statistics on missing values for degradation-related indicators

In [6]:
print(model_df.shape)

(9380, 89)


In [8]:
four_is_nan = model_df[['DC50 (nM)', 'Dmax (%)', 'Assay (DC50/Dmax)', 'Percent degradation (%)', 'Assay (Percent degradation)']].isna().all(axis=1).sum()
print(four_is_nan)
print("有", model_df.shape[0] - four_is_nan, "条数据有降解相关信息，我们通过这四条信息来推断是否降解")

5867
有 3513 条数据有降解相关信息，我们通过这四条信息来推断是否降解


In [11]:
both_is_nan = model_df[['DC50 (nM)', 'Dmax (%)']].isna().all(axis=1).sum()
print(both_is_nan)
print("有", model_df.shape[0] - both_is_nan, "条数据只有DC50 (nM)和Dmax (%)这两项信息，这两个直接决定了是否降解")

7210
有 2170 条数据只有DC50 (nM)和Dmax (%)这两项信息，这两个直接决定了是否降解


In [18]:
# 统计 "Percent degradation (%)" 和 "Assay (Percent degradation)" 这两列中非缺失（非NaN）值的数据量，
# 用于判断有多少条数据至少有一项关于降解百分比的信息。
# 这有助于了解降解相关信息在数据集中的覆盖情况。
# notna() 函数用于检查数据框中是否存在非缺失值。
# all(axis=1) 表示对每一行进行检查，确保该行中所有列的值都为非缺失值。
# sum() 函数用于计算满足条件的行数。
# 因此，two_info 的值表示数据集中至少有一项关于降解百分比的信息的行数。


two_info = model_df[['Percent degradation (%)', 'Assay (Percent degradation)']].notna().all(axis=1).sum()
print("有", two_info, "条数据有降解相关信息，我们通过这两条信息来推断是否降解")
two_info


有 1422 条数据有降解相关信息，我们通过这两条信息来推断是否降解


1422

In [None]:
# 统计 "DC50 (nM)", "Dmax (%)", "Assay (DC50/Dmax)", "Percent degradation (%)", "Assay (Percent degradation)" 这五列中非缺失（非NaN）值的数据量，
# 用于判断有多少条数据至少有一项关于降解百分比的信息。
# notna() 函数用于检查数据框中是否存在非缺失值。
# sum() 函数用于计算满足条件的行数。
# 因此，four_info 的值表示数据集中至少有一项关于降解百分比的信息的行数。

four_info = model_df[['DC50 (nM)', 'Dmax (%)', 'Assay (DC50/Dmax)','Percent degradation (%)', 'Assay (Percent degradation)']].notna().sum()
four_info

DC50 (nM)                      1762
Dmax (%)                       1317
Assay (DC50/Dmax)              1892
Percent degradation (%)        1422
Assay (Percent degradation)    1422
dtype: int64

## Degradation labeling
- DC50, Dmax
- Percent degradation

Remove missing

这一格用于去除在DC50 (nM)、Dmax (%)、Percent degradation (%)和Assay (Percent degradation)四个与降解表型相关的指标中全部缺失的数据行。只有当这四项都为缺失（NaN）时，才会删除对应的样本。这样可以保证保留下的数据至少包含一项降解相关的信息，提高后续分析的有效数据量。


In [19]:
model_df = model_df.dropna(subset=['DC50 (nM)', 'Dmax (%)', 'Percent degradation (%)', 'Assay (Percent degradation)'], how='all') # 四项都没有才drop
print("model_df: {:,} x {:,}".format(*model_df.shape))

model_df: 3,492 x 89


In [20]:
model_df[['DC50 (nM)', 'Dmax (%)', 'Percent degradation (%)', 'Assay (Percent degradation)']]

Unnamed: 0,DC50 (nM),Dmax (%),Percent degradation (%),Assay (Percent degradation)
5,39.2/736.2,97.6/68.8,,
10,374,49,,
12,136,88,,
13,220,75,,
18,,,23.3/24.4,% SOS1 degradation in NCI-H358 cells after 24 ...
...,...,...,...,...
9373,,,45,% CDK13 degradation in DA-MB-231 cells after 1...
9374,252.5,,,
9377,,,4.50/7.35/6.11,% PARP1 degradation in MDA-MB-231 cells after ...
9378,,,0,% FLT3 degradation in MV4-11 cells at 10 nM fo...


### Explicit labeling (DC50, Dmax)
Labeling is performed according to the following principles
- DC50 < 100nM, Dmax >= 80%: 1
- DC50 ≥ 100 nM or Dmax < 80%: 0

In [32]:
import numpy as np

dc50_labels = []
for i in model_df['DC50 (nM)']:
    # 检查当前行的DC50 (nM)值是否为字符串类型
    if type(i) is str:
        # 如果DC50 (nM)值为字符串类型，检查它是否为'N.D.'
        if i == 'N.D.': # No Degradation
            # 如果DC50 (nM)值为'N.D.'，则将对应的标签设置为False
            dc50_labels.append(False)
            continue
        dc50_list = re.findall(r"\d+\.?\d*", i)
        if len(dc50_list) == 0:
            dc50_labels.append('nan')
            continue 
        dc50 = min([float(e) for e in dc50_list]) # 取最小值, 因为DC50越小，降解越严重
        if float(dc50) < 100.:
            dc50_labels.append(True)
        else:
            dc50_labels.append(False)
    else:
        # print(i)
        dc50_labels.append('nan')

dmax_labels = []
for i in model_df['Dmax (%)']:
    if type(i) is str:
        if i == 'N.D.': # No Degradation
            dmax_labels.append(False)
        else:
            dmax_list = re.findall(r"\d+\.?\d*", i)
            if len(dmax_list) == 0:
                dmax_labels.append('nan')
                continue
            dmax = max([float(e) for e in dmax_list]) 
            if float(dmax) >= 80.:
                dmax_labels.append(True)
            else:
                dmax_labels.append(False)
    else:
        dmax_labels.append('nan')

total_labels, valid_labels = [], []
for i in range(len(dc50_labels)):
    if dc50_labels[i] == 'nan' and dmax_labels[i] == 'nan':
        total_labels.append(np.nan)
    elif dc50_labels[i] is False or dmax_labels[i] is False:
        total_labels.append(False)
        valid_labels.append(False)
    else:
        total_labels.append(True)
        valid_labels.append(True)
        
print(f'total samples: {len(total_labels)}')
print(f'valid labels: {len(valid_labels)}')

model_df['label'] = total_labels
print('True/False: {}/{}'.format(sum(valid_labels), len(valid_labels) - sum(valid_labels)))
model_df

total samples: 3492
valid labels: 2170
True/False: 909/1261


Unnamed: 0,Compound ID,Uniprot,Target,E3 ligase,PDB,Name,Smiles,DC50 (nM),Dmax (%),Assay (DC50/Dmax),...,Heavy Atom Count,Ring Count,Hydrogen Bond Acceptor Count,Hydrogen Bond Donor Count,Rotatable Bond Count,Topological Polar Surface Area,Molecular Formula,InChI,InChI Key,label
5,275,P00533,EGFR,VHL,,,CC1=C(C2=CC=C(CNC(=O)[C@@H]3C[C@@H](O)CN3C(=O)...,39.2/736.2,97.6/68.8,Degradation of WT/Exon 20 Ins EGFR in OVCAR8/H...,...,73,8,13,4,21,186.36,C55H57ClFN7O8S,InChI=1S/C55H57ClFN7O8S/c1-34-50(73-33-61-34)3...,ZSCOIFSUFMYZEZ-YSWDPXALSA-N,True
10,750,Q06187,BTK,VHL,,SJF638,CC1=C(C2=CC=C(CNC(=O)[C@@H]3C[C@@H](O)CN3C(=O)...,374,49,Degradation of BTK in NAMALWA cells after 24 h...,...,68,8,15,4,18,212.18,C50H60N10O7S,InChI=1S/C50H60N10O7S/c1-32-44(68-31-55-32)35-...,RIOHYDUGYNZWPD-DIKPJKDTSA-N,False
12,1373,Q06187,BTK,VHL,,,C=CC(=O)N1C[C@@H](N2N=C(C3=CC=C(OC4=CC=CC=C4)C...,136,88,Degradation of BTK in K562 cells after 18 h tr...,...,74,8,17,4,20,255.55,C53H60N10O10S,InChI=1S/C53H60N10O10S/c1-6-43(66)61-27-36(63-...,JQIURFOHEWHROK-WNKYWPOYSA-N,False
13,1373,P51451,BLK,VHL,,,C=CC(=O)N1C[C@@H](N2N=C(C3=CC=C(OC4=CC=CC=C4)C...,220,75,Degradation of BLK in Ramos cells after 18 h t...,...,74,8,17,4,20,255.55,C53H60N10O10S,InChI=1S/C53H60N10O10S/c1-6-43(66)61-27-36(63-...,JQIURFOHEWHROK-WNKYWPOYSA-N,False
18,2634,Q07889,SOS1,VHL,,,CC1=CC(CN2C(N3CC4(CNC4)C3)=NC3=C(N4CCN(CCOCCOC...,,,,...,73,9,14,4,18,169.66,C54H70ClFN10O6S,InChI=1S/C54H70ClFN10O6S/c1-33-20-37(21-34(2)4...,OEOJRBFVJBZNNH-LGMUQQJESA-N,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9373,5247,Q14004,CDK13,CRBN,,,N#CC1=CC=C(N[C@H]2CC[C@H](N(C(=O)NCC3=CC=CC=C3...,,,,...,57,8,10,3,9,171.08,C43H43N9O5,InChI=1S/C43H43N9O5/c44-25-29-6-18-38(45-27-29...,YBVUZBDXHYYTNI-GUJMKDNBSA-N,
9374,5309,P09874,PARP1,CRBN,,,NC(=O)C1=CC=CC2=CN(C3=CC=C(C4CCCN(C5=CC=CC6=C5...,252.5,,Degradation of PARP1 in MDA-MB-231 cells after...,...,43,7,8,2,5,147.70,C32H28N6O5,InChI=1S/C32H28N6O5/c33-29(40)23-7-1-4-20-17-3...,DNTANRUFOCYNLP-UHFFFAOYSA-N,False
9377,5775,P09874,PARP1,CRBN,,,NC(=O)C1=CC=CC2=CN(C3=CC=C([C@@H]4CCCN(C5=CC=C...,,,,...,43,7,8,2,5,147.70,C32H28N6O5,InChI=1S/C32H28N6O5/c33-29(40)24-5-1-3-20-17-3...,CHNTWWHIAMEPSW-ICCFGIFFSA-N,
9378,5838,P36888,FLT3,CRBN,,,CCC1=NC(C(N)=O)=C(NC2=CC=C(N3CCC(N4CCN(C5=CC=C...,,,,...,58,8,14,4,11,204.66,C41H50N10O7,InChI=1S/C41H50N10O7/c1-3-28-37(43-24-13-21-58...,VBURYPGHAHUOKN-UHFFFAOYSA-N,


In [None]:
# 统计label列中为NaN的行数, 这意味着DC50 (nM)和Dmax (%)都为NaN
nan_labels = model_df[['label']].isna().all(axis=1).sum()
nan_labels

344

### Implicit labeling (Percentage Degradation)

- Extract information and expand tags via `Percentage Degradation`
- fill the nan labels

In [41]:
def extract_percent_values(x):
    if pd.isna(x):
        return np.nan  # na values
    # if x == '0' or 'N.D.' in x:
    #     return [0.0]
    # replce 'N.D.' with '0'
    x = x.replace('N.D.', '0')
    vals = [re.findall('\d+\.?\d*', e)[-1] for e in x.split('/')] # e.g.：20-50，keep 50, means we only keep the max value
    return [float(val) for val in vals]
model_df['percent_values'] = model_df['Percent degradation (%)'].apply(extract_percent_values)
print(f'model_df: {model_df.shape}')
extract_percent_values = model_df['percent_values'].dropna().to_list() # dropna() 意味着我们只保留有降解百分比信息的行
print(len(extract_percent_values))
extract_percent_values

model_df: (3492, 91)
1422


[[23.3, 24.4],
 [96.0],
 [60.0],
 [93.0],
 [31.3, 75.3],
 [28.6, 72.9],
 [98.0],
 [63.0],
 [98.0],
 [26.7, 50.2],
 [10.7],
 [32.0, 35.0, 40.0],
 [0.0, 15.0, 19.0],
 [4.4, 14.0],
 [97.0],
 [68.0],
 [97.0],
 [49.0],
 [96.0],
 [89.0],
 [97.0],
 [15.0],
 [71.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [0.0],
 [7.7],
 [25.0, 63.0, 89.0],
 [66.0],
 [0.0],
 [23.0],
 [0.0, 4.0, 10.0],
 [15.4, 36.6],
 [97.0],
 [69.0],
 [98.0],
 [59.0],
 [96.0],
 [92.0],
 [98.0],
 [14.0],
 [0.0],
 [0.0],
 [0.0],
 [7.4],
 [24.7, 8.52],
 [11.0, 25.0, 29.0],
 [58.0, 78.0],
 [5.8, 30.4],
 [96.0],
 [71.0],
 [95.0],
 [45.0],
 [94.0],
 [66.0],
 [92.0],
 [5.0],
 [3.04, 2.02],
 [27.3, 51.4],
 [1.6],
 [5.0],
 [20.0, 69.0, 93.0],
 [10.0],
 [16.4, 37.0],
 [25.3, 28.5, 43.9],
 [10.0],
 [87.0],
 [72.0],
 [58.0],
 [0.0],
 [20.32, 1.83, 30.99],
 [24.76, 55.35, 42.57],
 [16.0, 20.0, 25.0],
 [39.0, 83.0, 96.0],
 [0.0, 0.0],
 [0.0],
 [93.0],
 [67.0],
 [95.0],
 [64.0],
 [95.0],
 [79.0],
 [93.0],
 [91.0],
 [94.0],
 [63.0],
 [92.0],

In [None]:
# 本 cell 的功能是从 "Assay (Percent degradation)" 列中提取剂量（dose）信息。
# 通过正则表达式解析，每条记录中的 "at xxx nM/..." 形式的数字，并将这些剂量值（转换为 float）整理为列表，
# 保存到 "dose_values" 新列中。
# 这样可以用于后续基于剂量和对应百分比降解进一步判断分子标签（label）的有效性和增强数据分析。
def extrac_dose_values(x):
    if pd.isna(x):
        return np.nan
    vals = re.findall(' at ([\d.]+(?:/[\d.]+)*) (?:nM|μM)', x)
    if len(vals) == 0:
        print(x)
        return np.nan
    
    values = []
    for val in vals:
        if '/' in val:
            values.extend(val.split('/'))
        else:
            values.append(val)
    return [float(val) for val in values]
model_df['dose_values'] = model_df['Assay (Percent degradation)'].apply(lambda x: re.sub(r'with', 'at', x) if isinstance(x, str) else x) # 'cells with 100/10 nM' to 'cells at 100/10 nM'
model_df['dose_values'] = model_df['dose_values'].apply(lambda x: re.sub(r' nM/(\d)', r'/\1', x) if isinstance(x, str) else x) # 'cells at 100 nM/10 nM' to 'cells at 100/10 nM' and 'at 100 nM/on peritoneal macrophages from C57BL/6 mice  at 10 nM'
model_df['dose_values'] = model_df['dose_values'].apply(lambda x: re.sub(r'after', 'at', x) if isinstance(x, str) else x) # 'cells after 100 nM' to 'cells at 100 nM'
model_df['dose_values'] = model_df['dose_values'].apply(lambda x: x.replace('\xa0', ' ') if isinstance(x, str) else x) # '\xa0'
# tmp = model_df['dose_values'].to_list()
# tmp
model_df['dose_values'] = model_df['dose_values'].apply(extrac_dose_values)
print(f'model_df: {model_df.shape}')
extrac_dose_values = model_df['dose_values'].dropna().to_list()
print(len(extrac_dose_values))
extrac_dose_values

100/1000
100
100
100
200/2000
200/2000
100
100
100
200/2000
5000
100/1000/10000
30/100/300
100/1000
100
100
100
100
100
100
100
100
10000
10000
10000
5000
10000
10000
10000
5000
10/30/100
10000
10000
10000
30/100/300
100/1000
100
100
100
100
100
100
100
100
10000
10000
10000
5000
100/10
100/1000/10000
30/100
100/1000
100
100
100
100
100
100
100
100
100/1000
200/2000
5000
1000
0.1/1/10
10000
100/1000
100/1000/10000
10000
1000
1000
10
10
10000/1000/100
10000/1000/100
100/1000/10000
10/30/100
500/5000
1000
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
100
200/2000
100/1000/10000
5000
100/1000/10000
30/100
0/100/500/1000/5000/10000
100/1000/10000
100/1000/10000
100/1000/10000
100/1000/10000
30/100
200/2000
100/1000/10000
10000
10000
10000
100/1000
100/1000/10000
30/100
100/1000
100/1000
10/100
3/30
10/100
3/30
100/1000
500/5000
500/5000
100/1000
200/2000
10000
10000
10000
100/1000
5000
10/100/1000
30/100
100/1000
100/1000
100/1000
10/100
3/30
1

[[100.0, 1000.0],
 [100.0],
 [100.0],
 [100.0],
 [200.0, 2000.0],
 [200.0, 2000.0],
 [100.0],
 [100.0],
 [100.0],
 [200.0, 2000.0],
 [5000.0],
 [100.0, 1000.0, 10000.0],
 [30.0, 100.0, 300.0],
 [100.0, 1000.0],
 [100.0],
 [100.0],
 [100.0],
 [100.0],
 [100.0],
 [100.0],
 [100.0],
 [100.0],
 [10000.0],
 [10000.0],
 [10000.0],
 [5000.0],
 [10000.0],
 [10000.0],
 [10000.0],
 [5000.0],
 [10.0, 30.0, 100.0],
 [10000.0],
 [10000.0],
 [10000.0],
 [30.0, 100.0, 300.0],
 [100.0, 1000.0],
 [100.0],
 [100.0],
 [100.0],
 [100.0],
 [100.0],
 [100.0],
 [100.0],
 [100.0],
 [10000.0],
 [10000.0],
 [10000.0],
 [5000.0],
 [100.0, 10.0],
 [100.0, 1000.0, 10000.0],
 [30.0, 100.0],
 [100.0, 1000.0],
 [100.0],
 [100.0],
 [100.0],
 [100.0],
 [100.0],
 [100.0],
 [100.0],
 [100.0],
 [100.0, 1000.0],
 [200.0, 2000.0],
 [5000.0],
 [1000.0],
 [0.1, 1.0, 10.0],
 [10000.0],
 [100.0, 1000.0],
 [100.0, 1000.0, 10000.0],
 [10000.0],
 [1000.0],
 [1000.0],
 [10.0],
 [10.0],
 [10000.0, 1000.0, 100.0],
 [10000.0, 1000.0, 

Use `percent_values` and `dose_values` to augment the label.
- If a label already exists, skip it.
- If not, then:
```python
    A = percent_values
    B = dose_values
    max_B = max(B)
    if max_B < 100:
        label = True
    elif max_B == 100
        if A[B.index(100.)] >= 80:
            label = True
    else label = False
```

In [None]:
# 这一部分用于使用 percent_values 和 dose_values 两列数据来补全或增强 label（True/False）标签。
# 其中规则为：
# - 如果 label 已经存在（非空），则不修改，直接跳过。
# - 如果没有 label，那么按照如下逻辑判断：
#     1. 若所有 dose_values 中的最大值 max_B 小于 100，则设置 label 为 True（即较低剂量下观察到降解, 代表有效）。
#     2. 如果存在剂量=100（即 100 在 dose_values），那么取与 100 剂量对应的降解百分比：
#           若 percent_values 对应位置的值大于等于 80，则 label 为 True（即 100 nM 下大于等于 80% 的降解视为有效）。
#     3. 否则，label 为 False（不满足上述条件视为无效）。
# 这个流程用于尽量自动、客观地基于剂量-响应关系推断分子的标签，以增强数据质量。

def update_row(row):
    label = row['label']
    if pd.notna(label): # skip
        return label
    
    percent_values = row['percent_values']
    dose_values = row['dose_values']
    print(percent_values, dose_values)
    assert len(percent_values) == len(dose_values)
    
    assert len(dose_values) >= 1
    max_dose_value = int(max(dose_values))
    if max_dose_value < 100:
        return True
    elif 100. in dose_values:
        index = dose_values.index(100.)
        return percent_values[index] >= 80.
    else:
        return False

model_df.loc[:, 'label'] = model_df.apply(update_row, axis=1)
model_df

[26.0, 35.0, 28.0] [100.0, 1000.0, 10000.0]
[15.0, 23.0, 23.0] [100.0, 1000.0, 10000.0]
[16.0, 20.0, 25.0] [100.0, 1000.0, 10000.0]
[11.0, 25.0, 29.0] [100.0, 1000.0, 10000.0]
[54.0, 84.0, 64.0] [100.0, 1000.0, 10000.0]
[8.0, 29.0, 65.0] [10.0, 100.0, 1000.0]
[15.0, 66.0, 87.0] [10.0, 100.0, 1000.0]
[10.0, 48.0, 88.0] [10.0, 100.0, 1000.0]
[11.0, 48.0, 86.0] [10.0, 100.0, 1000.0]
[30.0, 69.0, 75.0] [100.0, 1000.0, 10000.0]
[32.0, 35.0, 40.0] [100.0, 1000.0, 10000.0]
[30.0, 63.0, 96.0] [10.0, 100.0, 1000.0]
[50.0, 80.0, 80.0] [100.0, 1000.0, 10000.0]
[37.0, 61.0, 92.0] [10.0, 100.0, 1000.0]
[28.0, 48.0, 89.0] [10.0, 100.0, 1000.0]
[34.0, 51.0, 78.0] [10.0, 100.0, 1000.0]
[3.0, 31.0, 28.0] [100.0, 1000.0, 10000.0]
[0.0, 12.0, 0.0] [100.0, 1000.0, 10000.0]
[11.0, 50.0, 87.0] [10.0, 100.0, 1000.0]
[27.0, 75.0, 94.0] [10.0, 100.0, 1000.0]
[22.0, 72.0, 93.0] [10.0, 100.0, 1000.0]
[20.0, 81.0, 97.0] [10.0, 100.0, 1000.0]
[38.0, 80.0, 95.0] [10.0, 100.0, 1000.0]
[18.0, 34.0, 35.0] [100.0, 1000

Unnamed: 0,Compound ID,Uniprot,Target,E3 ligase,PDB,Name,Smiles,DC50 (nM),Dmax (%),Assay (DC50/Dmax),...,Hydrogen Bond Acceptor Count,Hydrogen Bond Donor Count,Rotatable Bond Count,Topological Polar Surface Area,Molecular Formula,InChI,InChI Key,label,percent_values,dose_values
19,11,Q9H8M2,BRD9,VHL,,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,560,80,Degradation of BRD9 in HeLa cells after 4 h tr...,...,16,3,22,199.15,C54H69FN8O10S,InChI=1S/C54H69FN8O10S/c1-34-47(74-33-58-34)35...,MXAKQOVZPDLCDK-UDVNCTHFSA-N,False,,
40,22,Q9H8M2,BRD9,VHL,,VZ185,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,1.76,95,Degradation of BRD9 in RI-1 cells after 8 h tr...,...,14,3,19,180.69,C53H67FN8O8S,InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...,ZAGCLFXBHOXXEN-JPTLTNPLSA-N,True,,
41,22,Q9H8M2,BRD9,VHL,,VZ185,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,4,,Degradation of HiBiT-BRD9 in HEK293 cells afte...,...,14,3,19,180.69,C53H67FN8O8S,InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...,ZAGCLFXBHOXXEN-JPTLTNPLSA-N,True,,
42,22,Q9H8M2,BRD9,VHL,,VZ185,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,2/8,,Degradation of BRD9 in EOL-1/A-204 cells after...,...,14,3,19,180.69,C53H67FN8O8S,InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...,ZAGCLFXBHOXXEN-JPTLTNPLSA-N,True,,
43,22,Q9NPI1,BRD7,VHL,,VZ185,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,4.5,95,Degradation of BRD7 in RI-1 cells after 8 h tr...,...,14,3,19,180.69,C53H67FN8O8S,InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...,ZAGCLFXBHOXXEN-JPTLTNPLSA-N,True,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5381,3264,O60885,BRD4,FEM1B,,,CC1=C(C)C2=C(S1)N1C(C)=NN=C1[C@H](CC(=O)NCCOCC...,1100,85,Degradation of BRD4 in HEK293T cells after 8 h...,...,13,2,18,176.30,C40H45Cl2N9O6S,InChI=1S/C40H45Cl2N9O6S/c1-25-26(2)58-40-37(25...,CPDVGNBJFIONLX-HKBQPEDESA-N,False,,
5382,3265,O60885,BRD4,FEM1B,,,CC1=C(C)C2=C(S1)N1C(C)=NN=C1[C@H](CC(=O)NCCOCC...,3600,60,Degradation of BRD4 in HEK293T cells after 8 h...,...,14,2,21,185.53,C42H49Cl2N9O7S,InChI=1S/C42H49Cl2N9O7S/c1-27-28(2)61-42-39(27...,QMBOIOPJFSHXPV-XIFFEERXSA-N,False,,
5383,3266,O60885,BRD4,FEM1B,,,CC1=C(C)C2=C(S1)N1C(C)=NN=C1[C@H](CC(=O)NCCOCC...,1600,80,Degradation of BRD4 in HEK293T cells after 8 h...,...,15,2,24,194.76,C44H53Cl2N9O8S,InChI=1S/C44H53Cl2N9O8S/c1-29-30(2)64-44-41(29...,UUCUKSPUFPMKNK-DHUJRADRSA-N,False,,
5386,3269,P03372,ER,CRBN,,ARV-471,O=C1CC[C@H](N2CC3=CC(N4CCN(CC5CCN(C6=CC=C([C@@...,2,,Degradation of ER in ER-positive breast cancer...,...,7,2,7,96.43,C45H49N5O4,InChI=1S/C45H49N5O4/c51-37-12-15-39-33(27-37)8...,TZZDVPMABRWKIZ-XMOGEVODSA-N,True,,


In [17]:
model_df['label'].describe()

count      1631
unique        2
top       False
freq       1011
Name: label, dtype: object

## Keep the necessary items
`'Compound ID', 'Uniprot', 'Target', 'E3 ligase', 'PDB	Name', 'Smiles', 'DC50 (nM)', 'Dmax (%)', 'warhead smiles', 'e3 ligand smiles', 'linker smiles'`

 这些列的含义如下：
 - 'Compound ID'：化合物编号
 - 'Uniprot'：靶点蛋白对应的UniProt编号
 - 'Target'：被降解的靶标蛋白
 - 'E3 ligase'：PROTAC中招募的E3泛素连接酶
 - 'PDB'：蛋白结构数据库（Protein Data Bank）编号
 - 'Name'：化合物名称（如有）
 - 'Smiles'：化合物的SMILES结构字符串
 - 'DC50 (nM)'：降解50%所需的PROTAC浓度（纳摩尔）
 - 'Dmax (%)'：最大降解百分比
 - 'Assay (DC50/Dmax)'：DC50/Dmax对应的实验描述
 - 'Percent degradation (%)'：特定条件下的降解百分比
 - 'Assay (Percent degradation)'：降解百分比对应的实验描述
 - 'Molecular Weight'：分子量
 - 'Exact Mass'：精确分子质量
 - 'XLogP3'：预测的辛醇/水分配系数
 - 'Heavy Atom Count'：重原子数
 - 'Ring Count'：环数量
 - 'Hydrogen Bond Acceptor Count'：氢键受体数量
 - 'Hydrogen Bond Donor Count'：氢键供体数量
 - 'Rotatable Bond Count'：可旋转键的数量
 - 'Topological Polar Surface Area'：拓扑极性表面积
 - 'label'：该分子是否被视为“有效降解剂”，True为有效，False为无效
 - 'percent_values'：各剂量下的降解百分比（列表）
 - 'dose_values'：对应的剂量值（列表）


In [18]:
columns = ['Compound ID', 'Uniprot', 'Target', 'E3 ligase', 'PDB', 'Smiles', 
           'DC50 (nM)', 'Dmax (%)', 'Assay (DC50/Dmax)', 'Percent degradation (%)', 'Assay (Percent degradation)', 
            'Molecular Weight', 'Exact Mass', 'XLogP3', 
            'Heavy Atom Count', 'Ring Count', 'Hydrogen Bond Acceptor Count', 
            'Hydrogen Bond Donor Count', 'Rotatable Bond Count', 'Topological Polar Surface Area',
            # 'warhead smiles', 'e3 ligand smiles', 'linker smiles', 
           'label', 'percent_values', 'dose_values']
model_df = model_df[columns]
# model_df.to_csv('data/protacdb/label_protac.csv', index=False)
model_df

Unnamed: 0,Compound ID,Uniprot,Target,E3 ligase,PDB,Smiles,DC50 (nM),Dmax (%),Assay (DC50/Dmax),Percent degradation (%),...,XLogP3,Heavy Atom Count,Ring Count,Hydrogen Bond Acceptor Count,Hydrogen Bond Donor Count,Rotatable Bond Count,Topological Polar Surface Area,label,percent_values,dose_values
19,11,Q9H8M2,BRD9,VHL,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,560,80,Degradation of BRD9 in HeLa cells after 4 h tr...,,...,3.69,74,8,16,3,22,199.15,False,,
40,22,Q9H8M2,BRD9,VHL,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,1.76,95,Degradation of BRD9 in RI-1 cells after 8 h tr...,,...,5.06,71,8,14,3,19,180.69,True,,
41,22,Q9H8M2,BRD9,VHL,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,4,,Degradation of HiBiT-BRD9 in HEK293 cells afte...,,...,5.06,71,8,14,3,19,180.69,True,,
42,22,Q9H8M2,BRD9,VHL,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,2/8,,Degradation of BRD9 in EOL-1/A-204 cells after...,,...,5.06,71,8,14,3,19,180.69,True,,
43,22,Q9NPI1,BRD7,VHL,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,4.5,95,Degradation of BRD7 in RI-1 cells after 8 h tr...,,...,5.06,71,8,14,3,19,180.69,True,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5381,3264,O60885,BRD4,FEM1B,,CC1=C(C)C2=C(S1)N1C(C)=NN=C1[C@H](CC(=O)NCCOCC...,1100,85,Degradation of BRD4 in HEK293T cells after 8 h...,,...,3.58,58,6,13,2,18,176.30,False,,
5382,3265,O60885,BRD4,FEM1B,,CC1=C(C)C2=C(S1)N1C(C)=NN=C1[C@H](CC(=O)NCCOCC...,3600,60,Degradation of BRD4 in HEK293T cells after 8 h...,,...,3.43,61,6,14,2,21,185.53,False,,
5383,3266,O60885,BRD4,FEM1B,,CC1=C(C)C2=C(S1)N1C(C)=NN=C1[C@H](CC(=O)NCCOCC...,1600,80,Degradation of BRD4 in HEK293T cells after 8 h...,,...,3.28,64,6,15,2,24,194.76,False,,
5386,3269,P03372,ER,CRBN,,O=C1CC[C@H](N2CC3=CC(N4CCN(CC5CCN(C6=CC=C([C@@...,2,,Degradation of ER in ER-positive breast cancer...,,...,6.36,54,9,7,2,7,96.43,True,,


In [48]:
model_df.to_csv('./protacdb3/label_protac.csv', index=False)

In [58]:
import pandas as pd

# 读取protac-fine.csv，只加载需要的3列（E3 ligase、E3 ligase Uniprot、Uniprot用于辅助）
protac_fine_df = pd.read_csv('./PROTAC-fine/protac-fine.csv', header=None, low_memory=False)

# 由于不能用负索引（即-1），要用实际的列数来索引最后一列
e3_ligase_col = 3
e3_ligase_uniprot_col = protac_fine_df.shape[1] - 1  # 最后一列

e3_map_df = protac_fine_df[[e3_ligase_col, e3_ligase_uniprot_col]]
e3_map_df.columns = ["E3 ligase", "E3 ligase Uniprot"]
# 去掉重复，去掉缺失（空字符串）
e3_map_df = e3_map_df.drop_duplicates()
e3_map_df = e3_map_df[(e3_map_df["E3 ligase"].notna()) & (e3_map_df["E3 ligase"] != "") & 
                      (e3_map_df["E3 ligase Uniprot"].notna()) & (e3_map_df["E3 ligase Uniprot"] != "")]
e3_map_df = e3_map_df.reset_index(drop=True)

# 显示映射表
e3_map_df

# 可以保存到csv
# e3_map_df.to_csv('./protacdb3/e3_ligase_to_uniprot.csv', index=False)


Unnamed: 0,E3 ligase,E3 ligase Uniprot
0,E3 ligase,E3 ligase Uniprot
1,VHL,P40337
2,CRBN,Q96SW2
3,MDM2,Q00987
4,cIAP1,Q13490
5,XIAP,P98170
6,FEM1B,Q9UK73


In [72]:
protac_fine_df = pd.read_csv('./protacdb3/label_protac.csv', low_memory=False)
# 查看"E3 ligase"的取值分布
print(protac_fine_df["E3 ligase"].value_counts())

E3 ligase
CRBN       2131
VHL        1206
cIAP1        61
MDM2         31
IAP          22
XIAP         17
Keap1         7
FEM1B         7
KEAP1         6
DCAF1         2
UBR box       1
DCAF16        1
Name: count, dtype: int64


In [73]:
# 通过E3 ligase名称获取其UniProt ID，可选用mygene.info API
import requests

def get_uniprot_id_for_e3(e3_name):
    """
    调用mygene.info接口，通过E3蛋白名称查询Uniprot ID。
    """
    url = "https://mygene.info/v3/query"
    params = {
        "q": e3_name,
        "species": "human",
        "fields": "uniprot",
        "size": 1
    }
    try:
        r = requests.get(url, params=params, timeout=10)
        r.raise_for_status()
        hits = r.json().get('hits', [])
        if not hits:
            return None
        hit = hits[0]
        uniprot = hit.get("uniprot")
        if not uniprot:
            return None
        # uniprot 可能是dict类型
        # 可能有 'Swiss-Prot', 'TrEMBL'
        if isinstance(uniprot, dict):
            if "Swiss-Prot" in uniprot:
                if isinstance(uniprot["Swiss-Prot"], list):
                    return uniprot["Swiss-Prot"][0]
                else:
                    return uniprot["Swiss-Prot"]
            elif "TrEMBL" in uniprot:
                if isinstance(uniprot["TrEMBL"], list):
                    return uniprot["TrEMBL"][0]
                else:
                    return uniprot["TrEMBL"]
        elif isinstance(uniprot, str):
            return uniprot
        elif isinstance(uniprot, list):
            return uniprot[0] if uniprot else None
    except Exception as e:
        print(f"Error querying UniProt for '{e3_name}': {e}")
        return None

# 查询所有E3名称，获取其UniProt ID
e3_names = protac_fine_df["E3 ligase"].dropna().unique()
e3_uniprot_dict = {}
for name in e3_names:
    uid = get_uniprot_id_for_e3(name)
    e3_uniprot_dict[name] = uid
e3_uniprot_dict

{'VHL': 'P40337',
 'CRBN': 'Q96SW2',
 'IAP': 'Q13490',
 'cIAP1': 'Q13490',
 'Keap1': 'Q14145',
 'XIAP': 'P98170',
 'KEAP1': 'Q14145',
 'MDM2': 'Q00987',
 'UBR box': 'Q8N806',
 'FEM1B': 'Q9UK73',
 'DCAF1': 'Q9Y4B6',
 'DCAF16': 'Q9NXF7'}

In [74]:
# 根据e3_uniprot_dict，将"E3 ligase"映射到UniProt ID并加入新列
protac_fine_df["E3 ligase Uniprot"] = protac_fine_df["E3 ligase"].map(e3_uniprot_dict)
# 输出为新的csv
protac_fine_df.to_csv("protac_fine_with_e3uniprot.csv", index=False)

