# Prepare your custom data

## Feature Preparation
- Initial data preparation for the E3 ligase, the protein of interest (POI), and the PROTAC
- The raw data format for E3 ligase and target proteins is amino acid sequences
- The raw data format for PROTAC is SMILES

In [24]:
import os
import os.path as osp
import pandas as pd

# Dataset directory: the raw CSV is read from here and the generated files are written here.
root = 'data/protacdb3'
# exist_ok=True makes re-running this cell harmless when the directory already exists.
os.makedirs(root, exist_ok=True)

In [25]:
# Read the raw table from the dataset directory.
raw_csv_path = osp.join(root, 'protac_fine_with_e3uniprot.csv')
raw_df = pd.read_csv(raw_csv_path)
# Bare last expression so the frame is rendered as the cell output.
raw_df

Unnamed: 0,Compound ID,Uniprot,Target,E3 ligase,PDB,Name,Smiles,DC50 (nM),Dmax (%),Assay (DC50/Dmax),...,Hydrogen Bond Donor Count,Rotatable Bond Count,Topological Polar Surface Area,Molecular Formula,InChI,InChI Key,label,percent_values,dose_values,E3 ligase Uniprot
0,275,P00533,EGFR,VHL,,,CC1=C(C2=CC=C(CNC(=O)[C@@H]3C[C@@H](O)CN3C(=O)...,39.2/736.2,97.6/68.8,Degradation of WT/Exon 20 Ins EGFR in OVCAR8/H...,...,4,21,186.36,C55H57ClFN7O8S,InChI=1S/C55H57ClFN7O8S/c1-34-50(73-33-61-34)3...,ZSCOIFSUFMYZEZ-YSWDPXALSA-N,True,,,P40337
1,750,Q06187,BTK,VHL,,SJF638,CC1=C(C2=CC=C(CNC(=O)[C@@H]3C[C@@H](O)CN3C(=O)...,374,49,Degradation of BTK in NAMALWA cells after 24 h...,...,4,18,212.18,C50H60N10O7S,InChI=1S/C50H60N10O7S/c1-32-44(68-31-55-32)35-...,RIOHYDUGYNZWPD-DIKPJKDTSA-N,False,,,P40337
2,1373,Q06187,BTK,VHL,,,C=CC(=O)N1C[C@@H](N2N=C(C3=CC=C(OC4=CC=CC=C4)C...,136,88,Degradation of BTK in K562 cells after 18 h tr...,...,4,20,255.55,C53H60N10O10S,InChI=1S/C53H60N10O10S/c1-6-43(66)61-27-36(63-...,JQIURFOHEWHROK-WNKYWPOYSA-N,False,,,P40337
3,1373,P51451,BLK,VHL,,,C=CC(=O)N1C[C@@H](N2N=C(C3=CC=C(OC4=CC=CC=C4)C...,220,75,Degradation of BLK in Ramos cells after 18 h t...,...,4,20,255.55,C53H60N10O10S,InChI=1S/C53H60N10O10S/c1-6-43(66)61-27-36(63-...,JQIURFOHEWHROK-WNKYWPOYSA-N,False,,,P40337
4,2634,Q07889,SOS1,VHL,,,CC1=CC(CN2C(N3CC4(CNC4)C3)=NC3=C(N4CCN(CCOCCOC...,,,,...,4,18,169.66,C54H70ClFN10O6S,InChI=1S/C54H70ClFN10O6S/c1-33-20-37(21-34(2)4...,OEOJRBFVJBZNNH-LGMUQQJESA-N,,"[23.3, 24.4]","[100.0, 1000.0]",P40337
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3487,5247,Q14004,CDK13,CRBN,,,N#CC1=CC=C(N[C@H]2CC[C@H](N(C(=O)NCC3=CC=CC=C3...,,,,...,3,9,171.08,C43H43N9O5,InChI=1S/C43H43N9O5/c44-25-29-6-18-38(45-27-29...,YBVUZBDXHYYTNI-GUJMKDNBSA-N,,[45.0],[1000.0],Q96SW2
3488,5309,P09874,PARP1,CRBN,,,NC(=O)C1=CC=CC2=CN(C3=CC=C(C4CCCN(C5=CC=CC6=C5...,252.5,,Degradation of PARP1 in MDA-MB-231 cells after...,...,2,5,147.70,C32H28N6O5,InChI=1S/C32H28N6O5/c33-29(40)23-7-1-4-20-17-3...,DNTANRUFOCYNLP-UHFFFAOYSA-N,False,,,Q96SW2
3489,5775,P09874,PARP1,CRBN,,,NC(=O)C1=CC=CC2=CN(C3=CC=C([C@@H]4CCCN(C5=CC=C...,,,,...,2,5,147.70,C32H28N6O5,InChI=1S/C32H28N6O5/c33-29(40)24-5-1-3-20-17-3...,CHNTWWHIAMEPSW-ICCFGIFFSA-N,,"[4.5, 7.35, 6.11]","[10000.0, 1000.0, 100.0]",Q96SW2
3490,5838,P36888,FLT3,CRBN,,,CCC1=NC(C(N)=O)=C(NC2=CC=C(N3CCC(N4CCN(C5=CC=C...,,,,...,4,11,204.66,C41H50N10O7,InChI=1S/C41H50N10O7/c1-3-28-37(43-24-13-21-58...,VBURYPGHAHUOKN-UHFFFAOYSA-N,,[0.0],[10.0],Q96SW2


### Prepare PROTAC

In [26]:
# Labels for the molecular descriptors computed from each SMILES string.
# They are assigned positionally to the output of calculate_molecular_descriptors,
# so the order here must stay in sync with the order of that function's return list.
columns = [
    # 'Smiles',
    'Molecular Weight',
    'Exact Mass',
    'XLogP3',  # filled with RDKit's MolLogP, not the XLogP3 software
    'Heavy Atom Count',
    'Ring Count',
    'Hydrogen Bond Acceptor Count',
    'Hydrogen Bond Donor Count',
    'Rotatable Bond Count',
    'Topological Polar Surface Area'
]

Calculate molecular properties for PROTAC

> [!NOTE]
> `XLogP3` was originally calculated with the [XLogP3](http://www.sioc-ccbg.ac.cn/skins/ccbgwebsite/software/xlogp3/) software; here it is approximated with `Descriptors.MolLogP()` from [RDKit](https://www.rdkit.org/), so the values may differ from the original ones.

In [27]:

from rdkit import Chem
from rdkit.Chem import Descriptors


def calculate_molecular_descriptors(smiles):
    """
    Compute the molecular descriptors of one molecule with RDKit.

    Parameters:
        smiles (str): SMILES string of the molecule.

    Returns:
        list: Nine values, in this order: molecular weight, exact mass,
        LogP (RDKit ``MolLogP``, used in place of XLogP3), heavy atom count,
        ring count, H-bond acceptor count, H-bond donor count,
        rotatable bond count, topological polar surface area.

    Raises:
        TypeError: If ``smiles`` is not a string (e.g. a NaN from an empty cell).
        ValueError: If RDKit cannot parse ``smiles``.
    """
    # A missing cell in the input table arrives as float NaN; fail with a clear
    # message instead of an opaque RDKit argument error.
    if not isinstance(smiles, str):
        raise TypeError(
            f'SMILES must be a string, got {type(smiles).__name__}: {smiles!r}'
        )

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None rather than raising.
    if mol is None:
        raise ValueError(f'RDKit could not parse SMILES: {smiles!r}')

    return [
        Descriptors.MolWt(mol),
        Descriptors.ExactMolWt(mol),
        Descriptors.MolLogP(mol),  # stands in for XLogP3
        Descriptors.HeavyAtomCount(mol),
        Descriptors.RingCount(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumRotatableBonds(mol),
        Descriptors.TPSA(mol)
    ]

In [28]:
# Compute the descriptors row by row and build the frame in one step.
# astype(float) keeps every descriptor column as float64.
property_df = pd.DataFrame(
    raw_df['Smiles'].map(calculate_molecular_descriptors).tolist(),
    columns=columns,
    index=raw_df.index,
).astype(float)

# raw_df may already contain columns with these labels. Drop them first so the
# result has no duplicate column labels, which would be ambiguous on lookup and
# would be renamed when the CSV is read back. The freshly computed values are kept.
custom_df = pd.concat(
    [raw_df.drop(columns=columns, errors='ignore'), property_df],
    axis=1,
)
custom_df

Unnamed: 0,Compound ID,Uniprot,Target,E3 ligase,PDB,Name,Smiles,DC50 (nM),Dmax (%),Assay (DC50/Dmax),...,E3 ligase Uniprot,Molecular Weight,Exact Mass,XLogP3,Heavy Atom Count,Ring Count,Hydrogen Bond Acceptor Count,Hydrogen Bond Donor Count,Rotatable Bond Count,Topological Polar Surface Area
0,275,P00533,EGFR,VHL,,,CC1=C(C2=CC=C(CNC(=O)[C@@H]3C[C@@H](O)CN3C(=O)...,39.2/736.2,97.6/68.8,Degradation of WT/Exon 20 Ins EGFR in OVCAR8/H...,...,P40337,1030.620,1029.366189,9.06492,73.0,8.0,13.0,4.0,21.0,186.36
1,750,Q06187,BTK,VHL,,SJF638,CC1=C(C2=CC=C(CNC(=O)[C@@H]3C[C@@H](O)CN3C(=O)...,374,49,Degradation of BTK in NAMALWA cells after 24 h...,...,P40337,945.160,944.436715,5.77892,68.0,8.0,15.0,4.0,18.0,212.18
2,1373,Q06187,BTK,VHL,,,C=CC(=O)N1C[C@@H](N2N=C(C3=CC=C(OC4=CC=CC=C4)C...,136,88,Degradation of BTK in K562 cells after 18 h tr...,...,P40337,1029.190,1028.421459,5.01322,74.0,8.0,17.0,4.0,20.0,255.55
3,1373,P51451,BLK,VHL,,,C=CC(=O)N1C[C@@H](N2N=C(C3=CC=C(OC4=CC=CC=C4)C...,220,75,Degradation of BLK in Ramos cells after 18 h t...,...,P40337,1029.190,1028.421459,5.01322,74.0,8.0,17.0,4.0,20.0,255.55
4,2634,Q07889,SOS1,VHL,,,CC1=CC(CN2C(N3CC4(CNC4)C3)=NC3=C(N4CCN(CCOCCOC...,,,,...,P40337,1041.736,1040.487307,5.86076,73.0,9.0,14.0,4.0,18.0,169.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3487,5247,Q14004,CDK13,CRBN,,,N#CC1=CC=C(N[C@H]2CC[C@H](N(C(=O)NCC3=CC=CC=C3...,,,,...,Q96SW2,765.875,765.338715,4.82058,57.0,8.0,10.0,3.0,9.0,171.08
3488,5309,P09874,PARP1,CRBN,,,NC(=O)C1=CC=CC2=CN(C3=CC=C(C4CCCN(C5=CC=CC6=C5...,252.5,,Degradation of PARP1 in MDA-MB-231 cells after...,...,Q96SW2,576.613,576.212118,2.90960,43.0,7.0,8.0,2.0,5.0,147.70
3489,5775,P09874,PARP1,CRBN,,,NC(=O)C1=CC=CC2=CN(C3=CC=C([C@@H]4CCCN(C5=CC=C...,,,,...,Q96SW2,576.613,576.212118,2.90960,43.0,7.0,8.0,2.0,5.0,147.70
3490,5838,P36888,FLT3,CRBN,,,CCC1=NC(C(N)=O)=C(NC2=CC=C(N3CCC(N4CCN(C5=CC=C...,,,,...,Q96SW2,794.914,794.386394,2.67320,58.0,8.0,14.0,4.0,11.0,204.66


In [29]:
# Write the prepared table into the dataset directory; index=False leaves the row index out.
custom_csv_path = osp.join(root, 'custom.csv')
custom_df.to_csv(custom_csv_path, index=False)

### Prepare E3 ligase and target

In [31]:
# Collect every distinct UniProt accession from the target and E3 ligase columns,
# skipping missing (NaN) entries.
uniprot_ids = {
    accession
    for column_name in ('Uniprot', 'E3 ligase Uniprot')
    for accession in custom_df[column_name].tolist()
    if pd.notnull(accession)
}
print(uniprot_ids)

{'Q9BWU1', 'P48426', 'O60885', 'P42226', 'P42229', 'P07195', 'P04629', 'P23458', 'Q92769', 'Q9NYV4', 'P56524', 'P16591', 'Q14004', 'Q05397', 'Q13155', 'P03436', 'Q13164', 'P0DTD1', 'Q15004', 'Q06187', 'P36969', 'Q9NWZ3', 'P51451', 'Q02750', 'Q16288', 'P01116', 'P06493', 'P25440', 'Q13882', 'P14679', 'O96028', 'P00918', 'Q13489', 'P35613', 'P21802', 'Q06124', 'O14965', 'P78356', 'Q00534', 'P36507', 'Q9C5S2', 'Q15059', 'P18031', 'Q13153', 'P08238', 'P53350', 'O60760', 'P10721', 'Q07817', 'Q86WV6', 'Q9NPI1', 'Q16539', 'P10636', 'Q14289', 'Q12866', 'P50395', 'P62937', 'Q16186', 'P40337', 'P06730', 'P11362', 'Q13490', 'P08581', 'Q96C86', 'Q5S007', 'P11474', 'Q9UBN7', 'Q07820', 'P98170', 'O15379', 'Q15910', 'O75530', 'O43924', 'P33981', 'P52333', 'Q9Y4B6', 'Q86U86', 'Q9H8M2', 'Q86X55', 'P15056', 'Q8TBX8', 'P37840', 'P09874', 'P11802', 'Q00987', 'P03372', 'P49336', 'Q92831', 'Q16342', 'P31749', 'Q9UGN5', 'P31751', 'P51531', 'P35348', 'P14174', 'P07437', 'O60674', 'Q92793', 'P49840', 'Q13547',

In [32]:
import pickle
import time
from io import StringIO

import requests
from Bio import SeqIO
from requests.exceptions import HTTPError, RequestException

# Best-effort load of the on-disk sequence cache; any failure falls back to an empty cache.
# NOTE: pickle.load can execute arbitrary code. Only load a cache file you created yourself.
try:
    with open(osp.join(root, 'seq_cache.pkl'), 'rb') as cache_file:
        seq_cache = pickle.load(cache_file)
except FileNotFoundError:
    # No cache has been written yet.
    seq_cache = {}
except Exception as cache_err:
    # Unreadable cache: start empty but say so. Unlike a bare `except:`, this
    # does not swallow KeyboardInterrupt or SystemExit.
    print(f'Ignoring unreadable sequence cache: {cache_err}')
    seq_cache = {}

def get_aa_seq(uniprot_id, timeout=30):
    """
    Return the amino acid sequence for a UniProt accession.

    The module-level ``seq_cache`` dict is consulted first. On a miss the FASTA
    record is downloaded from UniProt, parsed, stored in ``seq_cache``, and the
    whole cache is re-pickled to ``seq_cache.pkl`` under ``root``.

    Parameters:
        uniprot_id (str): UniProt ID of the protein.
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        str or None: Amino acid sequence of the protein, or None if the download
        or parsing failed (the error is printed, not raised).

    Example:
        >>> get_aa_seq('P04637')[:10]
        'MEEPQSDPSV'
    """
    try:
        if uniprot_id in seq_cache:
            return seq_cache[uniprot_id]

        url = f'https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta'
        # Without a timeout a stalled connection would block the notebook indefinitely.
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        seq_record = SeqIO.read(StringIO(r.text), 'fasta')
        seq = str(seq_record.seq)
        seq_cache[uniprot_id] = seq
        # Persist after every new entry so an interrupted run keeps what it fetched.
        with open(osp.join(root, 'seq_cache.pkl'), 'wb') as cache_file:
            pickle.dump(seq_cache, cache_file)
        return seq
    except HTTPError as http_err:
        # Message reads "HTTP error occurred".
        print(f'发生HTTP错误: {http_err}')
    except RequestException as req_err:
        # Message reads "Request error occurred".
        print(f'发生请求错误: {req_err}')
    except Exception as e:
        # Message reads "Unexpected error occurred".
        print(f'发生意外错误: {e}')
    # Every failure path ends here: report "no sequence" explicitly.
    return None

In [33]:
# Map every UniProt accession to its amino acid sequence.
p_map = {uniprot_id: get_aa_seq(uniprot_id) for uniprot_id in uniprot_ids}

# get_aa_seq returns None when a download fails; report those IDs together so
# missing sequences are not silently written into p_map.pkl.
missing_ids = sorted(uniprot_id for uniprot_id, seq in p_map.items() if not seq)
if missing_ids:
    print(f'WARNING: no sequence retrieved for {len(missing_ids)} UniProt ID(s): {missing_ids}')

for k, v in p_map.items():
    print(k, v)

Q9BWU1 MDYDFKAKLAAERERVEDLFEYEGCKVGRGTYGHVYKARRKDGKDEKEYALKQIEGTGISMSACREIALLRELKHPNVIALQKVFLSHSDRKVWLLFDYAEHDLWHIIKFHRASKANKKPMQLPRSMVKSLLYQILDGIHYLHANWVLHRDLKPANILVMGEGPERGRVKIADMGFARLFNSPLKPLADLDPVVVTFWYRAPELLLGARHYTKAIDIWAIGCIFAELLTSEPIFHCRQEDIKTSNPFHHDQLDRIFSVMGFPADKDWEDIRKMPEYPTLQKDFRRTTYANSSLIKYMEKHKVKPDSKVFLLLQKLLTMDPTKRITSEQALQDPYFQEDPLPTLDVFAGCQIPYPKREFLNEDDPEEKGDKNQQQQQNQHQQPTAPPQQAAAPPQAPPPQQNSTQTNGTAGGAGAGVGGTGAGLQHSQDSSLNQVPPNKKPRLGPSGANSGGPVMPSDYQHSSSRLNYQSSVQGSSQSQSTLGYSSSSQQSSQYHPSHQAHRY
P48426 MATPGNLGSSVLASKTKTKKKHFVAQKVKLFRASDPLLSVLMWGVNHSINELSHVQIPVMLMPDDFKAYSKIKVDNHLFNKENMPSHFKFKEYCPMVFRNLRERFGIDDQDFQNSLTRSAPLPNDSQARSGARFHTSYDKRYIIKTITSEDVAEMHNILKKYHQYIVECHGITLLPQFLGMYRLNVDGVEIYVIVTRNVFSHRLSVYRKYDLKGSTVAREASDKEKAKELPTLKDNDFINEGQKIYIDDNNKKVFLEKLKKDVEFLAQLKLMDYSLLVGIHDVERAEQEEVECEENDGEEEGESDGTHPVGTPPDSPGNTLNSSPPLAPGEFDPNIDVYGIKCHENSPRKEVYFMAIIDILTHYDAKKKAAHAAKTVKHGAGAEISTVNPEQYSKRFLDFIGHILT
O60885 MSAESGPGTRLRNLPVMGDGLETSQMSTTQAQAQPQPANAASTNPPPPETSNPNKPKRQTNQLQYLLRV

In [34]:
# Show the dataset directory that p_map.pkl is written to in the next cell.
print(root)

data/protacdb3


In [35]:
# Pickle the accession -> sequence mapping into the dataset directory.
p_map_path = osp.join(root, 'p_map.pkl')
with open(p_map_path, 'wb') as p_map_file:
    pickle.dump(p_map, p_map_file)

Processing protein sequence features into implicit structural representations
- Protein sequence hash table: `p_map`
- Protein structural representation hash table: `esm_s_map`

> [!NOTE]
> This part needs to be prepared separately, taking into account the large GPU resources required for the Protein Language Model as well as persistent data to improve training efficiency. 
```shell
$ conda activate ems+
$ python get_embed_s.py
```

## Ready for PROTAC-STAN inference

- Your prepared custom data: `custom.csv` and `esm_s_map.pkl`; keep them in the same directory, e.g. `./data/custom`
- Using `inference.py`, usage: `python inference.py [-h] [--root ROOT] [--name NAME] [--save_att]`
  - for example:
    ```shell
    $ conda activate protac-stan
    $ python inference.py --root 'data/custom' --name 'custom'
    ```