# Prepare your custom data

## Feature Preparation
- Initial data preparation for the E3 ligase, the protein of interest (POI), and the PROTAC
- The raw data format for E3 ligase and target proteins is amino acid sequences
- The raw data format for PROTAC is SMILES

In [1]:
import os
import os.path as osp
import pandas as pd

# Directory that holds this dataset's input and output files; it is created
# if it does not exist yet (exist_ok=True makes re-running the cell safe).
root = 'data/protacdb3'
os.makedirs(root, exist_ok=True)

In [2]:
# Load the raw PROTAC table from the dataset directory and display it.
raw_df = pd.read_csv(osp.join(root, 'protac_fine_with_e3uniprot.csv'))
raw_df

Unnamed: 0,Compound ID,Uniprot,Target,E3 ligase,PDB,Name,Smiles,DC50 (nM),Dmax (%),Assay (DC50/Dmax),...,Hydrogen Bond Donor Count,Rotatable Bond Count,Topological Polar Surface Area,Molecular Formula,InChI,InChI Key,label,percent_values,dose_values,E3 ligase Uniprot
0,275,P00533,EGFR,VHL,,,CC1=C(C2=CC=C(CNC(=O)[C@@H]3C[C@@H](O)CN3C(=O)...,39.2/736.2,97.6/68.8,Degradation of WT/Exon 20 Ins EGFR in OVCAR8/H...,...,4,21,186.36,C55H57ClFN7O8S,InChI=1S/C55H57ClFN7O8S/c1-34-50(73-33-61-34)3...,ZSCOIFSUFMYZEZ-YSWDPXALSA-N,True,,,P40337
1,750,Q06187,BTK,VHL,,SJF638,CC1=C(C2=CC=C(CNC(=O)[C@@H]3C[C@@H](O)CN3C(=O)...,374,49,Degradation of BTK in NAMALWA cells after 24 h...,...,4,18,212.18,C50H60N10O7S,InChI=1S/C50H60N10O7S/c1-32-44(68-31-55-32)35-...,RIOHYDUGYNZWPD-DIKPJKDTSA-N,False,,,P40337
2,1373,Q06187,BTK,VHL,,,C=CC(=O)N1C[C@@H](N2N=C(C3=CC=C(OC4=CC=CC=C4)C...,136,88,Degradation of BTK in K562 cells after 18 h tr...,...,4,20,255.55,C53H60N10O10S,InChI=1S/C53H60N10O10S/c1-6-43(66)61-27-36(63-...,JQIURFOHEWHROK-WNKYWPOYSA-N,False,,,P40337
3,1373,P51451,BLK,VHL,,,C=CC(=O)N1C[C@@H](N2N=C(C3=CC=C(OC4=CC=CC=C4)C...,220,75,Degradation of BLK in Ramos cells after 18 h t...,...,4,20,255.55,C53H60N10O10S,InChI=1S/C53H60N10O10S/c1-6-43(66)61-27-36(63-...,JQIURFOHEWHROK-WNKYWPOYSA-N,False,,,P40337
4,2634,Q07889,SOS1,VHL,,,CC1=CC(CN2C(N3CC4(CNC4)C3)=NC3=C(N4CCN(CCOCCOC...,,,,...,4,18,169.66,C54H70ClFN10O6S,InChI=1S/C54H70ClFN10O6S/c1-33-20-37(21-34(2)4...,OEOJRBFVJBZNNH-LGMUQQJESA-N,,"[23.3, 24.4]","[100.0, 1000.0]",P40337
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3487,5247,Q14004,CDK13,CRBN,,,N#CC1=CC=C(N[C@H]2CC[C@H](N(C(=O)NCC3=CC=CC=C3...,,,,...,3,9,171.08,C43H43N9O5,InChI=1S/C43H43N9O5/c44-25-29-6-18-38(45-27-29...,YBVUZBDXHYYTNI-GUJMKDNBSA-N,,[45.0],[1000.0],Q96SW2
3488,5309,P09874,PARP1,CRBN,,,NC(=O)C1=CC=CC2=CN(C3=CC=C(C4CCCN(C5=CC=CC6=C5...,252.5,,Degradation of PARP1 in MDA-MB-231 cells after...,...,2,5,147.70,C32H28N6O5,InChI=1S/C32H28N6O5/c33-29(40)23-7-1-4-20-17-3...,DNTANRUFOCYNLP-UHFFFAOYSA-N,False,,,Q96SW2
3489,5775,P09874,PARP1,CRBN,,,NC(=O)C1=CC=CC2=CN(C3=CC=C([C@@H]4CCCN(C5=CC=C...,,,,...,2,5,147.70,C32H28N6O5,InChI=1S/C32H28N6O5/c33-29(40)24-5-1-3-20-17-3...,CHNTWWHIAMEPSW-ICCFGIFFSA-N,,"[4.5, 7.35, 6.11]","[10000.0, 1000.0, 100.0]",Q96SW2
3490,5838,P36888,FLT3,CRBN,,,CCC1=NC(C(N)=O)=C(NC2=CC=C(N3CCC(N4CCN(C5=CC=C...,,,,...,4,11,204.66,C41H50N10O7,InChI=1S/C41H50N10O7/c1-3-28-37(43-24-13-21-58...,VBURYPGHAHUOKN-UHFFFAOYSA-N,,[0.0],[10.0],Q96SW2


### Prepare PROTAC

In [3]:
# Names for the computed descriptor columns. They are assigned positionally to
# the list returned by `calculate_molecular_descriptors`, so the order here
# must stay in sync with that function's return value.
columns = [
    # 'Smiles',
    'Molecular Weight',
    'Exact Mass',
    'XLogP3',  # filled with RDKit's MolLogP in this notebook
    'Heavy Atom Count',
    'Ring Count',
    'Hydrogen Bond Acceptor Count',
    'Hydrogen Bond Donor Count',
    'Rotatable Bond Count',
    'Topological Polar Surface Area'
]

Calculate molecular properties for PROTAC

> [!NOTE]
> `XLogP3` was originally calculated with the [XLogP3](http://www.sioc-ccbg.ac.cn/skins/ccbgwebsite/software/xlogp3/) software; here it is computed with `Descriptors.MolLogP()` from [RDKit](https://www.rdkit.org/) instead.

In [4]:

from rdkit import Chem
from rdkit.Chem import Descriptors


def calculate_molecular_descriptors(smiles):
    """
    Compute nine RDKit molecular descriptors for one molecule.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        list: [molecular weight, exact mass, logP, heavy atom count,
        ring count, H-bond acceptors, H-bond donors, rotatable bonds, TPSA],
        in that order. If the SMILES is missing, empty, or cannot be parsed,
        a list of nine NaN values is returned so that one bad row does not
        abort a whole-column `.apply`.
    """
    n_descriptors = 9
    # Missing / empty input: nothing to parse.
    if pd.isna(smiles) or smiles == '':
        return [float('nan')] * n_descriptors

    mol = Chem.MolFromSmiles(str(smiles))
    # MolFromSmiles signals a parse failure by returning None; passing that
    # on to the Descriptors functions would raise.
    if mol is None:
        return [float('nan')] * n_descriptors

    return [
        Descriptors.MolWt(mol),
        Descriptors.ExactMolWt(mol),
        Descriptors.MolLogP(mol),  # stands in for XLogP3
        Descriptors.HeavyAtomCount(mol),
        Descriptors.RingCount(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumRotatableBonds(mol),
        Descriptors.TPSA(mol)
    ]

In [5]:
# Compute the descriptors once per row and build the frame in one step
# (avoids the slow per-row `.apply(pd.Series)` expansion). dtype=float keeps
# every descriptor column as float64.
property_df = pd.DataFrame(
    raw_df['Smiles'].apply(calculate_molecular_descriptors).tolist(),
    columns=columns,
    index=raw_df.index,
    dtype=float,
)

# raw_df already contains columns with the same names as some of the computed
# descriptors (e.g. 'Hydrogen Bond Donor Count'). Concatenating as-is would
# leave duplicate column labels, which makes `custom_df[name]` return a
# DataFrame and gets mangled to `name.1` when the CSV is read back. Drop the
# raw copies so each label is unique and refers to the RDKit-computed value.
custom_df = pd.concat(
    [raw_df.drop(columns=columns, errors='ignore'), property_df],
    axis=1,
)
custom_df

Unnamed: 0,Compound ID,Uniprot,Target,E3 ligase,PDB,Name,Smiles,DC50 (nM),Dmax (%),Assay (DC50/Dmax),...,E3 ligase Uniprot,Molecular Weight,Exact Mass,XLogP3,Heavy Atom Count,Ring Count,Hydrogen Bond Acceptor Count,Hydrogen Bond Donor Count,Rotatable Bond Count,Topological Polar Surface Area
0,275,P00533,EGFR,VHL,,,CC1=C(C2=CC=C(CNC(=O)[C@@H]3C[C@@H](O)CN3C(=O)...,39.2/736.2,97.6/68.8,Degradation of WT/Exon 20 Ins EGFR in OVCAR8/H...,...,P40337,1030.620,1029.366189,9.06492,73.0,8.0,13.0,4.0,21.0,186.36
1,750,Q06187,BTK,VHL,,SJF638,CC1=C(C2=CC=C(CNC(=O)[C@@H]3C[C@@H](O)CN3C(=O)...,374,49,Degradation of BTK in NAMALWA cells after 24 h...,...,P40337,945.160,944.436715,5.77892,68.0,8.0,15.0,4.0,18.0,212.18
2,1373,Q06187,BTK,VHL,,,C=CC(=O)N1C[C@@H](N2N=C(C3=CC=C(OC4=CC=CC=C4)C...,136,88,Degradation of BTK in K562 cells after 18 h tr...,...,P40337,1029.190,1028.421459,5.01322,74.0,8.0,17.0,4.0,20.0,255.55
3,1373,P51451,BLK,VHL,,,C=CC(=O)N1C[C@@H](N2N=C(C3=CC=C(OC4=CC=CC=C4)C...,220,75,Degradation of BLK in Ramos cells after 18 h t...,...,P40337,1029.190,1028.421459,5.01322,74.0,8.0,17.0,4.0,20.0,255.55
4,2634,Q07889,SOS1,VHL,,,CC1=CC(CN2C(N3CC4(CNC4)C3)=NC3=C(N4CCN(CCOCCOC...,,,,...,P40337,1041.736,1040.487307,5.86076,73.0,9.0,14.0,4.0,18.0,169.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3487,5247,Q14004,CDK13,CRBN,,,N#CC1=CC=C(N[C@H]2CC[C@H](N(C(=O)NCC3=CC=CC=C3...,,,,...,Q96SW2,765.875,765.338715,4.82058,57.0,8.0,10.0,3.0,9.0,171.08
3488,5309,P09874,PARP1,CRBN,,,NC(=O)C1=CC=CC2=CN(C3=CC=C(C4CCCN(C5=CC=CC6=C5...,252.5,,Degradation of PARP1 in MDA-MB-231 cells after...,...,Q96SW2,576.613,576.212118,2.90960,43.0,7.0,8.0,2.0,5.0,147.70
3489,5775,P09874,PARP1,CRBN,,,NC(=O)C1=CC=CC2=CN(C3=CC=C([C@@H]4CCCN(C5=CC=C...,,,,...,Q96SW2,576.613,576.212118,2.90960,43.0,7.0,8.0,2.0,5.0,147.70
3490,5838,P36888,FLT3,CRBN,,,CCC1=NC(C(N)=O)=C(NC2=CC=C(N3CCC(N4CCN(C5=CC=C...,,,,...,Q96SW2,794.914,794.386394,2.67320,58.0,8.0,14.0,4.0,11.0,204.66


### Add MACCS and Morgan Fingerprints

In [6]:
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys
from tqdm import tqdm
tqdm.pandas()

def calculate_maccs_fingerprint(smiles):
    """
    Compute the MACCS keys fingerprint (166 bits) of a molecule.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        Comma-separated string of 166 "0"/"1" flags, or None when the input
        is missing/empty, the SMILES cannot be parsed, or any error occurs.
    """
    is_missing = pd.isna(smiles) or smiles == ''
    if is_missing:
        return None
    try:
        molecule = Chem.MolFromSmiles(str(smiles))
        if molecule is not None:
            # GenMACCSKeys yields a 167-bit vector; position 0 is skipped and
            # positions 1..166 are kept.
            keys = MACCSkeys.GenMACCSKeys(molecule)
            return ','.join(str(int(keys.GetBit(pos))) for pos in range(1, 167))
        return None
    except Exception:
        # Best effort: any failure is reported to the caller as "no fingerprint".
        return None



In [7]:
# Compute the MACCS fingerprint for every SMILES (with a tqdm progress bar)
print("计算MACCS Fingerprint (166位)...")
maccs_fingerprints = custom_df['Smiles'].progress_apply(calculate_maccs_fingerprint)

# Store the fingerprints as a new column
custom_df['MACCS_Fingerprint'] = maccs_fingerprints

# Summary: rows with a fingerprint vs. rows where the computation returned None
has_fingerprint = custom_df['MACCS_Fingerprint'].notna()
success_count = has_fingerprint.sum()
failed_count = (~has_fingerprint).sum()
print(f"\n处理完成!")
print(f"成功: {success_count} / {len(custom_df)}")
print(f"失败: {failed_count}")

# Show a few example rows
print("\n示例数据:")
custom_df.loc[:, ['Smiles', 'MACCS_Fingerprint']].head(3)

计算MACCS Fingerprint (166位)...


100%|██████████| 3492/3492 [00:06<00:00, 554.97it/s]


处理完成!
成功: 3492 / 3492
失败: 0

示例数据:





Unnamed: 0,Smiles,MACCS_Fingerprint
0,CC1=C(C2=CC=C(CNC(=O)[C@@H]3C[C@@H](O)CN3C(=O)...,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
1,CC1=C(C2=CC=C(CNC(=O)[C@@H]3C[C@@H](O)CN3C(=O)...,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
2,C=CC(=O)N1C[C@@H](N2N=C(C3=CC=C(OC4=CC=CC=C4)C...,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."


In [25]:
# 添加 Morgan 指纹
from rdkit.Chem import AllChem

def calculate_morgan_fingerprint(smiles, radius=2, n_bits=2048):
    """
    Compute the Morgan (ECFP-style) fingerprint as a fixed-length bit vector.

    Args:
        smiles: SMILES string of the molecule.
        radius: Morgan radius, 2 by default.
        n_bits: Length of the fingerprint, 2048 by default.

    Returns:
        Comma-separated string of 0/1 values, or None when the input is
        missing/empty, the SMILES cannot be parsed, or any error occurs.
    """
    if pd.isna(smiles) or smiles == '':
        return None
    try:
        molecule = Chem.MolFromSmiles(str(smiles))
        if molecule is None:
            return None
        # NOTE(review): newer RDKit releases appear to deprecate this call in
        # favour of rdFingerprintGenerator -- confirm against the installed version.
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=n_bits)
        return ','.join(str(bit) for bit in list(bit_vect))
    except Exception:
        # Best effort: any failure is reported to the caller as "no fingerprint".
        return None

# Compute the Morgan fingerprint for every SMILES (default radius and length)
# and store it as a new column.
print("计算Morgan Fingerprint (2048位)...")
morgan_fingerprints = custom_df['Smiles'].progress_apply(calculate_morgan_fingerprint)
custom_df['Morgan_Fingerprint'] = morgan_fingerprints

# Show a few example rows
print("\nMorgan指纹示例数据:")
print(custom_df.loc[:, ['Smiles', 'Morgan_Fingerprint']].head(3))


计算Morgan Fingerprint (2048位)...


100%|██████████| 3492/3492 [00:05<00:00, 666.72it/s]


Morgan指纹示例数据:
                                              Smiles  \
0  CC1=C(C2=CC=C(CNC(=O)[C@@H]3C[C@@H](O)CN3C(=O)...   
1  CC1=C(C2=CC=C(CNC(=O)[C@@H]3C[C@@H](O)CN3C(=O)...   
2  C=CC(=O)N1C[C@@H](N2N=C(C3=CC=C(OC4=CC=CC=C4)C...   

                                  Morgan_Fingerprint  
0  0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...  
1  0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...  
2  0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...  





In [26]:
# Persist the table, including the descriptor and fingerprint columns added
# above, to the dataset directory; the row index is not written.
custom_df.to_csv(osp.join(root, 'protac_maccs_morgan.csv'), index=False)

In [24]:
# 查看MACCS_Fingerprint各位的数值分布（0/1分布），以及整体分布情况
import numpy as np

# Parse the comma-separated fingerprint strings back into a 0/1 integer matrix
# (one row per molecule that has a fingerprint).
maccs_matrix = np.array([
    [int(bit) for bit in fp_str.split(',')]
    for fp_str in custom_df['MACCS_Fingerprint'].dropna()
])
print("MACCS_Fingerprint 形状:", maccs_matrix.shape)

# Per-bit distribution across the 166 positions
bit_sums = maccs_matrix.sum(axis=0)    # number of samples with the bit set
bit_means = maccs_matrix.mean(axis=0)  # fraction of samples with the bit set

print("每个位为1的样本数（前20位示例）:\n", bit_sums[:20])
print("每个位为1的比例（前20位示例）:\n", np.round(bit_means[:20], 3))

# Overall counts of ones and zeros
total_bits = maccs_matrix.size
total_ones = maccs_matrix.sum()
total_zeros = total_bits - total_ones
print(f"总位数: {total_bits}, 1的总数: {total_ones}, 0的总数: {total_zeros}")
print(f"整体 '1' 比例: {total_ones / total_bits:.4f}, '0' 比例: {total_zeros / total_bits:.4f}")

# Fraction of ones for all 166 positions, ten positions per printed line;
# the printed labels are 1-based bit numbers.
print("每个位出现1的比例（全部166位，每10位一行）:")
for start in range(0, 166, 10):
    bits_range = list(range(start + 1, min(start + 11, 167)))
    means_slice = np.round(bit_means[start:start + 10], 3)
    print(f"{bits_range}: {means_slice}")

MACCS_Fingerprint 形状: (3492, 166)
每个位为1的样本数（前20位示例）:
 [  0   0   0   0   0   0   0 214   0   0 312   0   2   0   0   0 252   0
 300   0]
每个位为1的比例（前20位示例）:
 [0.    0.    0.    0.    0.    0.    0.    0.061 0.    0.    0.089 0.
 0.001 0.    0.    0.    0.072 0.    0.086 0.   ]
总位数: 579672, 1的总数: 270783, 0的总数: 308889
整体 '1' 比例: 0.4671, '0' 比例: 0.5329
每个位出现1的比例（全部166位，每10位一行）:
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]: [0.    0.    0.    0.    0.    0.    0.    0.061 0.    0.   ]
[11, 12, 13, 14, 15, 16, 17, 18, 19, 20]: [0.089 0.    0.001 0.    0.    0.    0.072 0.    0.086 0.   ]
[21, 22, 23, 24, 25, 26, 27, 28, 29, 30]: [0.    0.034 0.011 0.038 0.158 0.003 0.011 0.001 0.046 0.006]
[31, 32, 33, 34, 35, 36, 37, 38, 39, 40]: [0.004 0.067 0.074 0.027 0.    0.435 0.067 0.441 0.    0.   ]
[41, 42, 43, 44, 45, 46, 47, 48, 49, 50]: [0.115 0.339 0.054 0.    0.002 0.024 0.41  0.04  0.007 0.037]
[51, 52, 53, 54, 55, 56, 57, 58, 59, 60]: [0.094 0.347 0.205 0.198 0.1   0.007 0.134 0.1   0.09  0.1  ]
[61, 62, 

### Prepare E3 ligase and target

In [9]:
# collect all uniprot ids（去除nan）
# Collect every UniProt accession that appears as a target or as an E3 ligase,
# skipping missing values.
uniprot_ids = set(
    pd.concat([custom_df['Uniprot'], custom_df['E3 ligase Uniprot']]).dropna()
)
print(uniprot_ids)

{'P07900', 'Q16186', 'O60674', 'Q07889', 'P62937', 'P11362', 'Q86X55', 'P53350', 'Q96C86', 'Q5S007', 'Q13547', 'Q09472', 'O75533', 'O75608', 'Q14004', 'O00418', 'P40763', 'O60885', 'Q92830', 'P50750', 'O75530', 'O14965', 'P11474', 'Q07820', 'Q9Y616', 'Q9UBN7', 'P62942', 'P49841', 'Q9NXF7', 'P14902', 'P07195', 'Q00987', 'Q9UGN5', 'P35348', 'P52333', 'P78356', 'P51692', 'P08238', 'P29597', 'Q9NPI1', 'Q13155', 'P24941', 'P51532', 'Q96RR4', 'Q16539', 'Q00534', 'Q9H8M2', 'Q86WV6', 'Q12834', 'P18031', 'P10636', 'Q9BZ95', 'P27361', 'Q13490', 'P35613', 'P51617', 'O15264', 'Q13153', 'Q03111', 'P06493', 'P40337', 'O00444', 'P10275', 'Q06187', 'P15056', 'P35968', 'P04629', 'Q92769', 'Q15059', 'Q14145', 'P00338', 'P03372', 'P25440', 'P11802', 'P50395', 'P0DTD1', 'P00533', 'P22736', 'Q93009', 'P36507', 'Q07817', 'Q9NWZ3', 'Q9UM73', 'Q8TBX8', 'Q96SW2', 'Q92831', 'Q15022', 'P07437', 'P10415', 'P06730', 'P30530', 'Q92918', 'P08581', 'P17706', 'Q16288', 'P48426', 'P14625', 'P33981', 'P51451', 'Q15910',

In [10]:
import pickle
import time
from io import StringIO

import requests
from Bio import SeqIO
from requests.exceptions import HTTPError, RequestException

# Load the on-disk sequence cache if there is one; otherwise start empty.
# NOTE: pickle.load can execute arbitrary code from the file -- only load a
# cache file that this notebook wrote itself.
try:
    # `with` closes the file handle even if unpickling fails.
    with open(osp.join(root, 'seq_cache.pkl'), 'rb') as cache_file:
        seq_cache = pickle.load(cache_file)
except Exception:
    # Missing or unreadable/corrupt cache: fall back to an empty one. Unlike a
    # bare `except:`, this does not swallow KeyboardInterrupt / SystemExit.
    seq_cache = {}

def get_aa_seq(uniprot_id, timeout=30):
    """
    Download fasta file from Uniprot and extract amino acid sequence

    Results are memoised in the module-level `seq_cache` dict, which is
    re-written to `seq_cache.pkl` under `root` after every new download.

    Parameters:
        uniprot_id (str): UniProt ID of the protein.
        timeout (float): Seconds to wait for the HTTP request before giving
            up, so that a stalled connection cannot hang the notebook.

    Returns:
        str | None: Amino acid sequence of the protein, or None if the
        download, parsing, or cache write failed (the error is printed).

    Example:
        get_aa_seq('P04637')
        >>> 'MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD'
    """
    try:
        # Serve repeated lookups from the cache without touching the network.
        if uniprot_id in seq_cache:
            return seq_cache[uniprot_id]

        # NOTE(review): this is the legacy www.uniprot.org URL form; confirm it
        # still resolves for your environment.
        url = f'https://www.uniprot.org/uniprot/{uniprot_id}.fasta'
        # A timeout raises requests.Timeout (a RequestException) instead of
        # blocking forever on an unresponsive server.
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        seq_record = SeqIO.read(StringIO(r.text), 'fasta')
        seq = str(seq_record.seq)

        seq_cache[uniprot_id] = seq
        # `with` guarantees the cache file is flushed and closed.
        with open(osp.join(root, 'seq_cache.pkl'), 'wb') as cache_file:
            pickle.dump(seq_cache, cache_file)
        return seq
    except HTTPError as http_err:
        print(f'发生HTTP错误: {http_err}')
    except RequestException as req_err:
        print(f'发生请求错误: {req_err}')
    except Exception as e:
        print(f'发生意外错误: {e}')
    # Every failure path ends here; make the "no sequence" result explicit.
    return None

In [11]:
# Map each UniProt accession to its amino-acid sequence.
p_map = {uniprot_id: get_aa_seq(uniprot_id) for uniprot_id in uniprot_ids}
for k, v in p_map.items():
    print(k, v)

# get_aa_seq returns None when a lookup fails, and those None values would
# otherwise be pickled silently. List the affected accessions so they can be
# retried or removed before the map is saved.
missing_ids = sorted(k for k, v in p_map.items() if v is None)
if missing_ids:
    print(f'WARNING: no sequence retrieved for {len(missing_ids)} UniProt ID(s): {missing_ids}')

P07900 MPEETQTQDQPMEEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISNSSDALDKIRYESLTDPSKLDSGKELHINLIPNKQDRTLTIVDTGIGMTKADLINNLGTIAKSGTKAFMEALQAGADISMIGQFGVGFYSAYLVAEKVTVITKHNDDEQYAWESSAGGSFTVRTDTGEPMGRGTKVILHLKEDQTEYLEERRIKEIVKKHSQFIGYPITLFVEKERDKEVSDDEAEEKEDKEEEKEKEEKESEDKPEIEDVGSDEEEEKKDGDKKKKKKIKEKYIDQEELNKTKPIWTRNPDDITNEEYGEFYKSLTNDWEDHLAVKHFSVEGQLEFRALLFVPRRAPFDLFENRKKKNNIKLYVRRVFIMDNCEELIPEYLNFIRGVVDSEDLPLNISREMLQQSKILKVIRKNLVKKCLELFTELAEDKENYKKFYEQFSKNIKLGIHEDSQNRKKLSELLRYYTSASGDEMVSLKDYCTRMKENQKHIYYITGETKDQVANSAFVERLRKHGLEVIYMIEPIDEYCVQQLKEFEGKTLVSVTKEGLELPEDEEEKKKQEEKKTKFENLCKIMKDILEKKVEKVVVSNRLVTSPCCIVTSTYGWTANMERIMKAQALRDNSTMGYMAAKKHLEINPDHSIIETLRQKAEADKNDKSVKDLVILLYETALLSSGFSLEDPQTHANRIYRMIKLGLGIDEDDPTADDTSAAVTEEMPPLEGDDDTSRMEEVD
Q16186 MTTSGALFPSLVPGSRGASNKYLVEFRAGKMSLKGTTVTPDKRKGLVYIQQTDDSLIHFCWKDRTSGNVEDDLIIFPDDCEFKRVPQCPSGRVYVLKFKAGSKRLFFWMQEPKTDQDEEHCRKVNEYLNNPPMPGALGASGSSGHELSALGGEGGLQSLLGNMSHSQLMQLIGPAGLGGLGGLGALTGPGLASLLGSSGPPGSSSSSSSRSQSAAVTPSSTTSSTRATPAPSAPAAASATSPSPAPSSGNGAS

In [12]:
# Show the dataset directory that the pickle below is written to.
print(root)

data/protacdb3


In [13]:
# Persist the accession -> sequence map in the dataset directory.
p_map_path = osp.join(root, 'p_map.pkl')
with open(p_map_path, 'wb') as p_map_file:
    pickle.dump(p_map, p_map_file)

Processing protein sequence features into implicit structural representations
- Protein sequence hash table: `p_map`
- Protein structural representation hash table: `esm_s_map`

> [!NOTE]
> This part needs to be prepared separately, taking into account the large GPU resources required for the Protein Language Model as well as persistent data to improve training efficiency. 
```shell
$ conda activate esm+
$ python get_embed_s.py
```

## Ready for PROTAC-STAN inference

- Your prepared custom data consists of `customed.csv` and `esm_s_map.pkl`; keep them in the same directory, e.g. `./data/custom`
- Using `inference.py`, usage: `python inference.py [-h] [--root ROOT] [--name NAME] [--save_att]`
  - for example:
    ```shell
    $ conda activate protac-stan
    $ python inference.py --root 'data/custom' --name 'custom'
    ```