Explore Biofacquim against the world.
Perform a tSNE from Tanimoto Distance Matrix and save results.

In [1]:
"""Import Libraries"""
import pandas as pd
import numpy as np

import rdkit    
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys
from rdkit.DataManip.Metric.rdMetricMatrixCalc import GetTanimotoSimMat, GetTanimotoDistMat

import sklearn
from sklearn import datasets, decomposition
from sklearn.manifold import TSNE

In [2]:
"""Open Database"""
Data = pd.read_csv("Databases_CABANA.csv", sep = ",")
Data.head()

Unnamed: 0.1,Unnamed: 0,ID Database,Name,SMILES,HBA,HBD,RB,LogP,TPSA,MW,Heavy Atom,Ring Count,Fraction CSP3,subLibrary,Library
0,0,AfroDb.564,NPR_00036,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC(OC...,7.0,3.0,51.0,22.4139,105.45,1194.003,85.0,85.0,85.0,AFRODB,AFRODB
1,1,AfroDb.71,ABD_UD_004,C[C@H](CCC(O[C@H](C[C@@H]([C@@H]1CC2)[C@H]3[C@...,27.0,14.0,18.0,-3.5356,418.89,1195.309,83.0,83.0,83.0,AFRODB,AFRODB
2,2,AfroDb.70,ABD_UD_003,C[C@H](CC[C@@]([C@H]1C)(OC)O[C@H](C2)[C@H]1[C@...,26.0,14.0,17.0,-2.6588,393.98,1195.353,83.0,83.0,83.0,AFRODB,AFRODB
3,3,AfroDb.937,WA_0086,COc(cc(/C=C/C(OCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,11.0,3.0,61.0,16.9936,201.42,1117.597,79.0,79.0,79.0,AFRODB,AFRODB
4,4,AfroDb.936,WA_0085,COc(cc(/C=C/C(OCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,8.0,3.0,52.0,17.018,122.52,943.489,67.0,67.0,67.0,AFRODB,AFRODB


In [3]:
#count elements
Data.shape

(21084, 15)

In [4]:
#Take a random sample (Modify frac)
Data = Data.sample(frac=0.1, replace=True, random_state=1992)
Data.shape

(2108, 15)

In [5]:
#compute distance matrix 
def compute_distance_matrix(Data):
    """MACCS Keys fp"""
    smiles = list(Data["SMILES"])
    smi=[Chem.MolFromSmiles(x) for x in smiles]
    fps =  [MACCSkeys.GenMACCSKeys(x) for x in smi]
    tanimoto_sim_mat_lower_triangle=GetTanimotoSimMat(fps)
    n_mol = len(fps)
    similarity_matrix = np.ones([n_mol,n_mol])
    i_lower= np.tril_indices(n=n_mol,m=n_mol,k=-1)
    i_upper= np.triu_indices(n=n_mol,m=n_mol,k=1)
    similarity_matrix[i_lower] = tanimoto_sim_mat_lower_triangle
    similarity_matrix[i_upper] = similarity_matrix.T[i_upper] 
    distance_matrix = np.subtract(1,similarity_matrix)
    return distance_matrix

In [6]:
#execute function
distance_matrix = compute_distance_matrix(Data)

In [7]:
"""Perform tSNE"""
#write a funtion to perform tSNE and merge id information
def compute_tsne(matrix, Data):
        TSNE_sim = TSNE(n_components=2,
                        init='pca',
                        random_state=1992, 
                        angle = 0.3,
                        perplexity=50
                        ).fit_transform(matrix)
        tsne_result = pd.DataFrame(data = TSNE_sim , 
                                columns=["PC 1","PC 2"]
                                )
        tsne_result["Library"] = list(Data.Library)
        tsne_result["SMILES"] = list(Data.SMILES)
        tsne_result["Name"] = list(Data.Name)
        print("tSNE is done!")
        return tsne_result


In [8]:
tsne_result = compute_tsne(distance_matrix, Data)

tSNE is done!


In [9]:
tsne_result.head()

Unnamed: 0,PC 1,PC 2,Library,SMILES,Name
0,-47.938419,40.677586,Epidatabase,ONC(c(cc1)ccc1N[C@@H](CCN1c(cc2)cc(Cl)c2Cl)C1=...,SBSM341754
1,-19.640524,24.254223,Epidatabase,ONC(CCCCCNC(C=C1c(cccc2)c2-c2c1cccc2)=O)=O,SBSM462973
2,-15.006295,51.33025,Epidatabase,ONC(/C=C/c1cccc2ccccc12)=O,SBSM267041
3,-54.15976,32.496826,Epidatabase,ONC(c1cn(CCN(C2)C(c(cc3)ccc3-c3ccccc3)=O)c2n1)=O,SBSM330733
4,26.841022,-9.17039,Epidatabase,Clc(cc1)cnc1N/N=C(\c1ccccc1)/c1ncccc1,SBSM195263


In [10]:
#Save results as .csv File
tsne_result.to_csv("Results_tSNE.csv", sep = ",")