In [1]:
import os
import glob
import pandas as pd
import pickle
import numpy as np
from pymol import cmd

In [2]:
name_list = os.listdir('protacs')
name_list = [x.split('.')[0] for x in name_list]

In [3]:
import random
random.seed(1)
random.shuffle(name_list)

In [4]:
with open('name.pkl','wb') as f:
    pickle.dump(name_list, f)

In [5]:
name_list[:5]

['1578_TRKA_CRBN',
 '1409_cKIT_CRBN',
 '1487_SHP2_VHL',
 '1742_MEK1_CRBN',
 '1705_IRAK3_CRBN']

In [6]:
# for i in name_list:
#     cmd.load("protacs/"+i+".pdb")
#     cmd.remove('h.')
#     cmd.select("target","byres chain C around 5 and chain A")
#     cmd.save("ligase_pocket/"+i+".mol2","target")
#     cmd.select("ligase","byres chain D around 5 and Chain B")
#     cmd.save("target_pocket/"+i+".mol2","ligase")
#     cmd.delete("all")

atom, dist

In [7]:
ligase_atom = {}
ligase_bond = {}
for name in name_list:
    with open('ligase_pocket/'+name+'.mol2') as f:
        lines = f.readlines()
    atoms = lines[lines.index('@<TRIPOS>ATOM\n')+1:lines.index('@<TRIPOS>BOND\n')]
    bonds = lines[lines.index('@<TRIPOS>BOND\n')+1:lines.index('@<TRIPOS>SUBSTRUCTURE\n')]
    atom = [i.split('\t')[1].strip()[0] for i in atoms]
    CODE = ['C','N','O','S']
    atom_code = [CODE.index(x) if x in atom else 4 for x in atom]
    bond_1 = [int(i.split(' ')[1])-1 for i in bonds]
    bond_2 = [int(i.split(' ')[2])-1 for i in bonds]
    bond = np.eye(len(atom))
    for i in range(len(bond_1)):
        bond[bond_1[i]][bond_2[i]] = 1
        bond[bond_2[i]][bond_1[i]] = 1
    ligase_atom[name] = atom_code
    ligase_bond[name] = bond
with open('ligase_atom.pkl','wb') as f:
    pickle.dump(ligase_atom, f)
with open('ligase_bond.pkl','wb') as f:
    pickle.dump(ligase_bond, f)

In [8]:
target_atom = {}
target_bond = {}
for name in name_list:
    with open('target_pocket/'+name+'.mol2') as f:
        lines = f.readlines()
    atoms = lines[lines.index('@<TRIPOS>ATOM\n')+1:lines.index('@<TRIPOS>BOND\n')]
    bonds = lines[lines.index('@<TRIPOS>BOND\n')+1:lines.index('@<TRIPOS>SUBSTRUCTURE\n')]
    atom = [i.split('\t')[1].strip()[0] for i in atoms]
    CODE = ['C','N','O','S']
    atom_code = [CODE.index(x) if x in atom else 4 for x in atom]
    bond_1 = [int(i.split(' ')[1])-1 for i in bonds]
    bond_2 = [int(i.split(' ')[2])-1 for i in bonds]
    bond = np.eye(len(atom))
    for i in range(len(bond_1)):
        bond[bond_1[i]][bond_2[i]] = 1
        bond[bond_2[i]][bond_1[i]] = 1
    target_atom[name] = atom_code
    target_bond[name] = bond
with open('target_atom.pkl','wb') as f:
    pickle.dump(target_atom, f)
with open('target_bond.pkl','wb') as f:
    pickle.dump(target_bond, f)

In [9]:
ligase_ligand_atom = {}
ligase_ligand_bond = {}
for name in name_list:
    with open('E3_ligand_sdf/E3_ligand_'+name.split('_')[0]+'.sdf') as f:
        lines = f.readlines()
    atoms = []
    bond_1 = []
    bond_2 = []
    for line in lines:
        if len(line.strip().split()) == 16:
            atoms.append(line.strip().split()[3])
        if len(line.strip().split()) == 4:
            bond_1.append(int(line.strip().split()[0])-1)
            bond_2.append(int(line.strip().split()[1])-1)

    CODE = ['C','N','O','S','F','Cl','Br','I','P']
    atom_code = [CODE.index(x) if x in atoms else 9 for x in atoms]
    bond = np.eye(len(atom_code))
    for i in range(len(bond_1)):
        bond[bond_1[i]][bond_2[i]] = 1
        bond[bond_2[i]][bond_1[i]] = 1

    ligase_ligand_atom[name] = atom_code
    ligase_ligand_bond[name] = bond
with open('ligase_ligand_atom.pkl','wb') as f:
    pickle.dump(ligase_ligand_atom, f)
with open('ligase_ligand_bond.pkl','wb') as f:
    pickle.dump(ligase_ligand_bond, f)

In [10]:
target_ligand_atom = {}
target_ligand_bond = {}
for name in name_list:
    with open('target_ligand_sdf/target_ligand_'+name.split('_')[0]+'.sdf') as f:
        lines = f.readlines()
    atoms = []
    bond_1 = []
    bond_2 = []
    for line in lines:
        if len(line.strip().split()) == 16:
            atoms.append(line.strip().split()[3])
        if len(line.strip().split()) == 4:
            bond_1.append(int(line.strip().split()[0])-1)
            bond_2.append(int(line.strip().split()[1])-1)

    CODE = ['C','N','O','S','F','Cl','Br','I','P']
    atom_code = [CODE.index(x) if x in atoms else 9 for x in atoms]
    bond = np.eye(len(atom_code))
    for i in range(len(bond_1)):
        bond[bond_1[i]][bond_2[i]] = 1
        bond[bond_2[i]][bond_1[i]] = 1

    target_ligand_atom[name] = atom_code
    target_ligand_bond[name] = bond
with open('target_ligand_atom.pkl','wb') as f:
    pickle.dump(target_ligand_atom, f)
with open('target_ligand_bond.pkl','wb') as f:
    pickle.dump(target_ligand_bond, f)

In [11]:
smiles = {}
for i in name_list:
    smi_num = i.split("_")[0]
    with open("linker_smi/linker_"+smi_num+".smi") as f:
        smi = f.read()
    smiles[i] = smi.strip()
with open("smiles.pkl","wb") as f:
    pickle.dump(smiles,f)

In [12]:
label_csv = pd.read_csv("protacs.csv")

In [13]:
id = list(label_csv["Compound ID"])
tar = list(label_csv["Target"])
e3  = list(label_csv["E3 Ligase"])
lab = list(label_csv["Degradation Identification"])

In [14]:
labels = {}
for i in range(len(id)):
    a = str(id[i])+"_"+tar[i].replace(" ","").replace("-","").replace("/","")+"_"+e3[i].replace(" ","")
    labels[a] = lab[i]

In [15]:
for i in name_list:
    if i not in labels:
        print(i) 

In [16]:
labels_code = {}
for i in labels:
    if labels[i]=='Good' or labels[i]=='Excellent':
        labels_code[i] = 1
    elif  labels[i]=='Moderate' or labels[i]=="Not Good" or labels[i]=='Poor':
        labels_code[i] = 0

In [17]:
for i in name_list:
    if i not in labels_code:
        print(i)

In [18]:
with open("label.pkl","wb") as f:
    pickle.dump(labels_code, f)