# Import modules

In [None]:
import pandas as pd
from tqdm import tqdm

from sklearn.manifold import TSNE

import seaborn as sns
import matplotlib.pyplot as plt

from rdkit.Chem import MolFromSmiles
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
# Hook tqdm into pandas so the progress_* variants (e.g. progress_apply)
# are available on pandas objects.
tqdm.pandas()

In [None]:
# Read the gzipped, tab-separated EDA table and keep a single row per
# InChIKey (the first occurrence is the one retained).
surechem_df = (
    pd.read_csv('data/EDA_df.txt.gz', sep='\t', compression='gzip')
    .drop_duplicates(subset=['InChIKey'])
)
surechem_df.head(2)

# Load SureChEMBL data

In [None]:
# Restrict the table to approved patents, identified here by a 'B' in
# PATENT_ID, then list each distinct SMILES once under a common label.
# NOTE(review): the match is not anchored, so a 'B' anywhere in the ID
# qualifies the row - confirm this is the intended filter.
surechem_df = surechem_df.loc[
    lambda frame: frame['PATENT_ID'].str.contains('B')
]
smiles_df = pd.DataFrame(
    surechem_df['SMILES'].unique(), columns=['SMILES']
).assign(label='Patented drugs')
smiles_df.head(2)

# Load approved DrugBank data

In [None]:
# Only the SMILES column is read from the zipped CSV; every row is tagged
# as an approved drug and exact duplicate rows are then removed.
approved_drugs_df = (
    pd.read_csv(
        'data/drugbank_approved_structure_links.csv.zip',
        compression='zip',
        usecols=['SMILES'],
    )
    .assign(label='Approved drugs')
    .drop_duplicates()
)
approved_drugs_df.head(2)

# Load withdrawn DrugBank data

In [None]:
# Same recipe as for the approved set: read just the SMILES column, tag the
# rows as withdrawn drugs, and drop exact duplicate rows.
withdrawn_drugs_df = (
    pd.read_csv(
        'data/drugbank_withdrawn_structure_links.csv.zip',
        compression='zip',
        usecols=['SMILES'],
    )
    .assign(label='Withdrawn drugs')
    .drop_duplicates()
)
withdrawn_drugs_df.head(2)

# Merge the two data sources

In [None]:
# Stack the patented, approved and withdrawn tables (in that order) and keep
# one row per SMILES. Because the first occurrence is retained, a structure
# present in several tables keeps the label of the earliest table.
combined_df = (
    pd.concat(
        [smiles_df, approved_drugs_df, withdrawn_drugs_df],
        ignore_index=True,
    )
    .drop_duplicates(subset=['SMILES'])
)
combined_df.head(2)

In [None]:
# Count how many compounds carry each label.
combined_df.loc[:, 'label'].value_counts()

# Get ECFP4 fingerprints for the compounds

In [None]:
# Build one Morgan bit-vector fingerprint per compound, keeping the labels
# aligned with the fingerprints and counting every SMILES that is skipped.

# ECFP4 is named after the *diameter* of the atom environment, whereas RDKit's
# Morgan fingerprint takes a *radius*: ECFP4 therefore corresponds to radius 2
# (a radius of 4 would give an ECFP8-like fingerprint).
MORGAN_RADIUS = 2

skipped_smiles = 0

fingerprint_list = []
fingerprint_labels = []

# Select the columns by name instead of relying on the column order of
# `combined_df.values`; `total` keeps the progress bar informative because
# `zip` has no length.
smiles_and_labels = zip(combined_df['SMILES'], combined_df['label'])

for smiles, label in tqdm(smiles_and_labels, total=len(combined_df)):
    try:
        molecule = MolFromSmiles(smiles)
        if molecule is None:
            # RDKit could not parse this SMILES string.
            skipped_smiles += 1
            continue

        fingerprint = GetMorganFingerprintAsBitVect(molecule, MORGAN_RADIUS)
    except Exception:
        # Best-effort: non-string entries (e.g. missing values) or RDKit
        # failures are counted and skipped. `Exception` rather than a bare
        # `except` so Ctrl-C / kernel interrupts still stop the loop.
        skipped_smiles += 1
        continue

    fingerprint_list.append(fingerprint)
    fingerprint_labels.append(label)

In [None]:
# Show the number of skipped SMILES next to the number of fingerprints built.
(skipped_smiles, len(fingerprint_list))

In [None]:
# Put the fingerprints and their labels side by side in one table; both
# lists are filled in lockstep, so row i pairs fingerprint i with label i.
fingerprint_df = pd.DataFrame().assign(
    fingerprint=fingerprint_list,
    label=fingerprint_labels,
)
fingerprint_df.head(2)

# Visualize using a t-SNE plot

In [None]:
# numpy is not among the notebook's imports, so bring it into scope here.
import numpy

# Convert the collected fingerprints to a numpy array. The fingerprints are
# stored in `fingerprint_list`; the previously referenced name `fingerprints`
# is never defined in the notebook.
fingerprints_array = numpy.array(fingerprint_list)