# Import modules

In [26]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.manifold import TSNE

import seaborn as sns
import matplotlib.pyplot as plt

from rdkit.Chem import MolFromSmiles
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
tqdm.pandas()

In [2]:
# mpl.rcParams['figure.dpi'] = 400
# sns.set(context='paper', style='dark')

In [3]:
DATA_DIR = '../data'

In [4]:
surechem_df = pd.read_parquet(f'{DATA_DIR}/surechembl_dump.pq')
surechem_df.drop_duplicates(subset=['InChIKey'], inplace=True)
surechem_df.head(2)

Unnamed: 0,SureChEMBL_ID,SMILES,InChIKey,PATENT_ID,PUBLICATION_DATE,Field
0,SCHEMBL4,C[C@H](CS)C(=O)N1CCC[C@H]1C(O)=O,FAKRSMQSSFJEIM-RQJHMYQMSA-N,EP-2842582-A2,2015-03-04,Description
1,SCHEMBL4,C[C@H](CS)C(=O)N1CCC[C@H]1C(O)=O,FAKRSMQSSFJEIM-RQJHMYQMSA-N,EP-2838373-A2,2015-02-25,Description


# Load surechembl data

In [6]:
# Subset the data to only approved patents
surechem_df = surechem_df[surechem_df['PATENT_ID'].str.contains('B')]
smiles_df = pd.DataFrame(surechem_df['SMILES'].unique(), columns=['SMILES'])
smiles_df['label'] = 'Patented drugs'
smiles_df.head(2)

Unnamed: 0,SMILES,label
0,C[C@H](CS)C(=O)N1CCC[C@H]1C(O)=O,Patented drugs
1,O=C(O)\C=C/C(=O)O.CCOC(=O)[C@H](CCC1=CC=CC=C1)...,Patented drugs


# Load drugbank data

In [7]:
approved_drugs_df = pd.read_csv(
    f'{DATA_DIR}/drugbank_approved_structure_links.csv.zip', compression='zip',
    usecols=['SMILES']
)
approved_drugs_df['label'] = 'Approved drugs'
approved_drugs_df.drop_duplicates(inplace=True)
approved_drugs_df.head(2)

Unnamed: 0,SMILES,label
0,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,Approved drugs
1,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,Approved drugs


# Merge the two data sources

In [8]:
combined_df = pd.concat(
    [smiles_df, approved_drugs_df], 
    ignore_index=True
)
combined_df.drop_duplicates(subset=['SMILES'], inplace=True)
combined_df.head(2)

Unnamed: 0,SMILES,label
0,C[C@H](CS)C(=O)N1CCC[C@H]1C(O)=O,Patented drugs
1,O=C(O)\C=C/C(=O)O.CCOC(=O)[C@H](CCC1=CC=CC=C1)...,Patented drugs


In [9]:
combined_df['label'].value_counts()

Patented drugs    6520901
Approved drugs       2586
Name: label, dtype: int64

# Get ECFP4 fingerprints for the compounds

In [10]:
skipped_smiles = 0

fingerprint_list = []
fingerprint_labels = []

for smiles, label in tqdm(combined_df.values):
    try:
        molecule = MolFromSmiles(smiles)
        if molecule is None:
            skipped_smiles += 1
            continue
        
        fingerprint_list.append(GetMorganFingerprintAsBitVect(molecule, 4))
        fingerprint_labels.append(label)
        
    except:
        skipped_smiles += 1
        continue

  0%|          | 0/6523487 [00:00<?, ?it/s][19:21:33] Explicit valence for atom # 3 N, 4, is greater than permitted
[19:21:33] Explicit valence for atom # 14 N, 4, is greater than permitted
  0%|          | 475/6523487 [00:00<47:49, 2273.58it/s][19:21:33] Explicit valence for atom # 6 N, 5, is greater than permitted
  0%|          | 944/6523487 [00:00<50:19, 2160.07it/s][19:21:33] Explicit valence for atom # 1 N, 5, is greater than permitted
  0%|          | 1162/6523487 [00:00<52:10, 2083.25it/s][19:21:33] Explicit valence for atom # 16 N, 4, is greater than permitted
  0%|          | 1372/6523487 [00:00<54:15, 2003.66it/s][19:21:33] Explicit valence for atom # 32 N, 4, is greater than permitted
  0%|          | 1574/6523487 [00:00<55:36, 1954.77it/s][19:21:33] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
  0%|          | 2276/6523487 [00:01<1:03:51, 1702.02it/s][19:21:34] Explicit valence for atom # 8 N, 5, is greater than permitted
[19:21:34] Explicit valence for atom #

In [11]:
skipped_smiles, len(fingerprint_list)

(14969, 6508518)

In [None]:
fingerprint_df = pd.DataFrame()
fingerprint_df['fingerprint'] = fingerprint_list
fingerprint_df['label'] = fingerprint_labels
fingerprint_df.head(2)

# Visualize using tSNE plot

In [None]:
fingerprints = df['Fingerprints'].values.tolist()

In [33]:
# Sklearn TSNE model
model = TSNE(n_components=2, random_state=0, perplexity=30, n_iter=50)
tsne_result = tsne.fit_transform(fingerprints)

In [None]:
tsne_df = pd.DataFrame()
tsne_df['TSNE_C1'] = tsne_result.T[0]
tsne_df['TSNE_C2'] = tsne_result.T[1]
tsne_df['label'] = fingerprint_labels

In [None]:
sns.scatterplot(
    data=tsne_df,
    x='TSNE_C1'
    y='TSNE_C2',
    s=22, 
    hue='label',
    linewidth=0.2,
    alpha=1
)
plt.set_xlabel('TSNE Component 1')
plt.set_ylabel('TSNE Component 2')

lim = (tsne_df.min()-5, tsne_result.max()+5)

plt.set_xlim(lim)
plt.set_ylim(lim)
plt.set_aspect('equal')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)
plt.show()