# PubChem repurposed molecules

## Imports

In [None]:
import pandas as pd
from pandas import DataFrame
import os

from ensemble import Ensemble

from deepsweet_models import DeepSweetTextCNN
from deepsweet_utils import PipelineUtils
from deepchem.data import NumpyDataset
from standardizer.CustomStandardizer import CustomStandardizer
import numpy as np

from deepsweet_models import DeepSweetRF, DeepSweetDNN, DeepSweetGCN, DeepSweetSVM, DeepSweetBiLSTM

## Apply TextCNN to 60M PubChem compounds - Filter 1

In [None]:


def textcnn_predictor(molecules, ids):
    """Score a batch of molecules with the pre-built DeepSweet TextCNN model.

    Parameters
    ----------
    molecules
        Batch handed to ``NumpyDataset`` and to ``DeepSweetTextCNN.predict``
        (presumably SMILES strings -- confirm against the caller).
    ids
        Identifiers for ``molecules``; wrapped in ``np.array`` and attached
        to the locally built dataset.

    Returns
    -------
    tuple of three lists
        ``(ids, mols, scores)`` taken from the dataset returned by the model,
        one entry per prediction; ``scores`` holds element ``[1]`` of each
        prediction. No score threshold is applied here.
    """
    models_folder_path = "../resources/models"
    standardisation_params = dict(
        REMOVE_ISOTOPE=True,
        NEUTRALISE_CHARGE=True,
        REMOVE_STEREO=False,
        KEEP_BIGGEST=True,
        ADD_HYDROGEN=False,
        KEKULIZE=True,
        NEUTRALISE_CHARGE_LATE=True,
    )

    dataset = NumpyDataset(molecules, ids=np.array(ids))
    standardizer = CustomStandardizer(params=standardisation_params)
    standardizer.standardize(dataset)
    dataset, _ = PipelineUtils.filter_valid_sequences(models_folder_path, dataset)

    # NOTE(review): the dataset prepared above is replaced by the one returned
    # from predict(), and `ids` is not forwarded to predict() -- confirm that
    # the returned dataset still carries the caller's identifiers.
    model = DeepSweetTextCNN(models_folder_path)
    predictions, dataset = model.predict(molecules)

    indexed_predictions = list(enumerate(predictions))
    filtered_ids = [dataset.ids[idx] for idx, _ in indexed_predictions]
    filtered_mols = [dataset.mols[idx] for idx, _ in indexed_predictions]
    # Element 1 of each prediction is kept (presumably the positive-class
    # probability -- TODO confirm).
    filtered_predictions = [pred[1] for _, pred in indexed_predictions]

    return filtered_ids, filtered_mols, filtered_predictions

In [None]:
# Empty frame with the output schema used by the prediction cells.
df = DataFrame(columns=["cid", "smiles", "predictions"])

# Loading the whole PubChem CID-SMILES dump allocates around 7 GB of memory.
# The file was obtained from
# ftp://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Extras/CID-SMILES.gz
pubchem = pd.read_csv("CID-SMILES", sep="\t", header=None)
pubchem.columns = ["cid", "smiles"]
# Only the first 60,000,000 rows are screened.
pubchem = pubchem.head(60000000)

In [None]:
import warnings

# Silence library warnings for the duration of the batch run.
warnings.filterwarnings('ignore')

# Number of `pubchem` rows scored per call to textcnn_predictor. Clamped to at
# least 1 so a small frame cannot yield a zero step for range() below.
division = max(1, pubchem.shape[0] // 10000)
output_file_path = "../resources/data/predicted_molecules_TextCNN_80.csv"

# Resume support: if results already exist, continue after the row holding the
# largest CID written so far (assumes `pubchem` is sorted by ascending CID and
# keeps its default integer index -- verify if the input changes).
last_id = 0
df_all = None
if os.path.exists(output_file_path):
    df_all = pd.read_csv(output_file_path)
    if not df_all.empty:
        last_cid = max(df_all["cid"])
        # +1: the row at this position is already in the output; restarting on
        # it would score and store it a second time.
        last_id = int(pubchem.loc[pubchem["cid"] == last_cid].iloc[0].name) + 1

print(last_id)
# Iterate over chunk *starts* so the final (possibly shorter) chunk is scored
# too; iloc clamps the upper bound at the end of the frame.
for start in range(last_id, pubchem.shape[0], division):
    split_pubchem = pubchem.iloc[start:start + division, :]
    filtered_ids, filtered_mols, filtered_predictions = textcnn_predictor(
        list(split_pubchem["smiles"].values),
        list(split_pubchem["cid"].values),
    )
    df = DataFrame(
        {
            "cid": filtered_ids,
            "smiles": filtered_mols,
            "predictions": filtered_predictions,
        },
        columns=["cid", "smiles", "predictions"],
    )

    # Append this chunk to the same file the resume logic reads. The header is
    # written only when the file is first created. This replaces re-reading a
    # differently named CSV and rewriting the whole output on every chunk.
    df.to_csv(
        output_file_path,
        mode="a",
        header=not os.path.exists(output_file_path),
        index=False,
    )
    last_id = start + division


## Run Filter 2 - ensemble

In [None]:


def predict_with_ensemble(molecules, ids):
    """Score molecules with the six-model ensemble and keep those above 0.5.

    Parameters
    ----------
    molecules
        Batch forwarded to ``Ensemble.predict`` (presumably SMILES strings --
        confirm against the caller).
    ids
        Identifiers forwarded to ``Ensemble.predict`` alongside ``molecules``.

    Returns
    -------
    tuple of three lists
        ``(ids, mols, predictions)`` for every entry whose ensemble
        prediction is strictly greater than 0.5, read from the dataset
        returned by the ensemble.
    """
    models_folder_path = "../resources/models"

    # The models are instantiated on every call, in this fixed order.
    list_of_models = [
        DeepSweetRF(models_folder_path, "2d", "SelectFromModelFS"),
        DeepSweetDNN(models_folder_path, "rdk", "all"),
        DeepSweetGCN(models_folder_path),
        DeepSweetSVM(models_folder_path, "ecfp4", "all"),
        DeepSweetDNN(models_folder_path, "atompair_fp", "SelectFromModelFS"),
        DeepSweetBiLSTM(models_folder_path),
    ]
    ensemble = Ensemble(list_of_models, models_folder_path)

    predictions, dataset, _ = ensemble.predict(molecules, ids)

    # (position, score) pairs for predictions strictly above the 0.5 cut-off.
    accepted = [
        (position, score)
        for position, score in enumerate(predictions)
        if score > 0.5
    ]
    filtered_ids = [dataset.ids[position] for position, _ in accepted]
    filtered_mols = [dataset.mols[position] for position, _ in accepted]
    filtered_predictions = [score for _, score in accepted]

    return filtered_ids, filtered_mols, filtered_predictions

In [112]:
# Load the TextCNN (Filter 1) results that feed the ensemble filter.
# NOTE(review): this reads "predicted_molecules_TextCNN.csv", whereas the
# Filter 1 cell writes "predicted_molecules_TextCNN_80.csv" -- confirm which
# file is the intended input.
text_cnn_predictions = pd.read_csv("../resources/data/predicted_molecules_TextCNN.csv")

In [116]:
from pandas import DataFrame

# Number of TextCNN-scored rows sent to the ensemble per call. Clamped to at
# least 1 so a small frame cannot yield a zero step for range() below.
division = max(1, text_cnn_predictions.shape[0] // 100)
print(division)
ensemble_output_path = "../resources/data/predicted_molecules_ensemble.csv"

# Resume support: if results already exist, continue after the row holding the
# largest CID written so far (assumes `text_cnn_predictions` is sorted by
# ascending CID and keeps its default integer index -- verify).
last_id = 0
df_all = None
if os.path.exists(ensemble_output_path):
    df_all = pd.read_csv(ensemble_output_path)
    if not df_all.empty:
        last_cid = max(df_all["cid"])
        # +1: that row was already accepted and written; restarting on it
        # would duplicate it in the output.
        last_id = int(
            text_cnn_predictions[text_cnn_predictions["cid"] == last_cid].iloc[0].name
        ) + 1

# Iterate over chunk *starts* so the trailing partial chunk is processed too;
# iloc clamps the upper bound at the end of the frame.
for start in range(last_id, text_cnn_predictions.shape[0], division):
    split_text_cnn_predictions = text_cnn_predictions.iloc[start:start + division, :]
    filtered_ids, filtered_mols, filtered_predictions = predict_with_ensemble(
        list(split_text_cnn_predictions["smiles"].values),
        list(split_text_cnn_predictions["cid"].values),
    )

    df = DataFrame(
        {
            "cid": filtered_ids,
            "smiles": filtered_mols,
            "predictions": filtered_predictions,
        },
        columns=["cid", "smiles", "predictions"],
    )

    # Append the chunk instead of re-reading and rewriting the whole CSV
    # (DataFrame.append no longer exists in pandas >= 2.0). The header is
    # written only when the file is first created.
    df.to_csv(
        ensemble_output_path,
        mode="a",
        header=not os.path.exists(ensemble_output_path),
        index=False,
    )
    last_id = start + division

15460
29337844
1546071
