In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [11]:
# Load the PFAS descriptor table from the data directory and preview
# the first five rows.
pfas_data = pd.read_csv(filepath_or_buffer="../data/pfas_data.csv")
pfas_data.head(n=5)

Unnamed: 0,RDKIT_SMILES,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,FC(F)Cl,0,1.2238,1.497686,11.6946,5.720793,0,0,5,4,...,0,0,0,0,0,0,0,0,0,0
1,FC(F)=C(F)F,0,1.2816,1.642499,12.4054,5.748,0,0,6,6,...,0,0,0,0,0,0,0,0,0,0
2,FC(F)(Cl)C(F)(Cl)Cl,0,2.7863,7.763468,27.3736,11.731,0,0,8,8,...,0,0,0,0,0,0,0,0,0,0
3,C=C(F)F,0,1.1144,1.241887,11.1906,5.967586,0,0,6,4,...,0,0,0,0,0,0,0,0,0,0
4,OC(C(F)(F)F)C(F)(F)F,0,1.6855,2.84091,18.8317,10.757586,0,0,12,10,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Print a structural summary of the loaded frame (index range, column
# count, dtype tally and memory usage) before working with it.
pfas_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6071 entries, 0 to 6070
Columns: 2091 entries, RDKIT_SMILES to PubchemFP880
dtypes: float64(1176), int64(914), object(1)
memory usage: 96.9+ MB


In [13]:
# Fraction of rows whose SMILES entry is missing; 0.0 means every row
# has a value in RDKIT_SMILES.
pfas_data["RDKIT_SMILES"].isnull().mean()

np.float64(0.0)

In [14]:
# Extract the SMILES column as a plain Python list (row order preserved)
# and show the first five entries.
molecules = pfas_data["RDKIT_SMILES"].to_list()
molecules[0:5]

['FC(F)Cl',
 'FC(F)=C(F)F',
 'FC(F)(Cl)C(F)(Cl)Cl',
 'C=C(F)F',
 'OC(C(F)(F)F)C(F)(F)F']

In [16]:
from rdkit import Chem
from rdkit.Chem import AllChem

In [17]:
# Parse every SMILES string into an RDKit molecule, keeping the same order
# as `molecules`.
mols = [Chem.MolFromSmiles(smiles) for smiles in molecules]

# Chem.MolFromSmiles reports a parse failure by returning None instead of
# raising, so a bad string would otherwise only surface later as an opaque
# error inside the fingerprint call. Fail here, naming the offending input.
# Rows are deliberately NOT dropped: later cells index the embedding with the
# full RDKIT_SMILES column, so `mols` must stay aligned with `pfas_data`.
unparsed_smiles = [smiles for smiles, mol in zip(molecules, mols) if mol is None]
if unparsed_smiles:
    raise ValueError(
        f"RDKit could not parse {len(unparsed_smiles)} SMILES string(s); "
        f"first few: {unparsed_smiles[:5]}"
    )

In [18]:
# Peek at the first five parsed molecule objects.
mols[:5]

[<rdkit.Chem.rdchem.Mol at 0xffff4fb8f060>,
 <rdkit.Chem.rdchem.Mol at 0xffff4fb8f300>,
 <rdkit.Chem.rdchem.Mol at 0xffff4fb8f0d0>,
 <rdkit.Chem.rdchem.Mol at 0xffff4fb8f140>,
 <rdkit.Chem.rdchem.Mol at 0xffff4fb8f290>]

In [19]:
# Compute one MACCS-keys fingerprint per molecule, in the same order as `mols`.
fps = list(map(AllChem.GetMACCSKeysFingerprint, mols))

In [20]:
# Peek at the first five fingerprint objects (RDKit bit vectors).
fps[:5]

[<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0xffff8d354510>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0xffff8d354660>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0xffff8d3546d0>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0xffff8d354740>,
 <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0xffff8d3547b0>]

In [21]:
# Stack the fingerprint bit vectors into a single 2-D NumPy array
# (one row per molecule, one column per fingerprint bit) and display it.
fps_array = np.asarray(fps)
fps_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], shape=(6071, 167))

In [22]:
from sklearn.manifold import TSNE

### “Curse of dimensionality” o “maldición de la dimensionalidad”

Cuando aumenta el número de dimensiones (variables o características), el espacio de los datos crece muy rápido, y una cantidad de datos que parecía suficiente en baja dimensión deja de serlo en alta dimensión. Problemas principales: los datos se vuelven muy dispersos, las distancias pierden significado y se necesitan más datos o modelos más complejos. Por eso, para poder visualizar los fingerprints MACCS (167 bits por molécula), a continuación se reducen a 2 o 3 dimensiones con t-SNE.

In [24]:
# Configure a 2-component t-SNE. The seed is fixed so repeated runs of the
# notebook produce the same embedding.
tsne = TSNE(
    random_state=42,
    perplexity=50,
    n_components=2,
)
tsne

0,1,2
,n_components,2
,perplexity,50
,early_exaggeration,12.0
,learning_rate,'auto'
,max_iter,1000
,n_iter_without_progress,300
,min_grad_norm,1e-07
,metric,'euclidean'
,metric_params,
,init,'pca'


In [27]:
# Fit t-SNE on the fingerprint matrix and keep the resulting low-dimensional
# coordinates (one row per molecule).
fps_tsne = tsne.fit_transform(X=fps_array)

In [35]:
# Wrap the 2-D embedding in a DataFrame indexed by SMILES so it can be
# matched against other SMILES-indexed tables.
tsne_df = pd.DataFrame(
    fps_tsne,
    index=pfas_data["RDKIT_SMILES"],
    columns=["Component_1", "Component_2"],
)

In [42]:
# Load the per-molecule class labels, using the SMILES column as the index.
pfas_classes = pd.read_csv(
    "../data/pfas_classes.csv",
    index_col="RDKIT_SMILES",
)


In [44]:
# Attach the class labels to the embedding by matching on the shared
# RDKIT_SMILES index; only SMILES present in both tables are kept.
# NOTE(review): if RDKIT_SMILES is not unique in either table, this merge
# will emit one row per matching pair -- confirm the keys are unique.
tsne_df_joined = tsne_df.merge(pfas_classes, how="inner", on="RDKIT_SMILES")
tsne_df_joined

Unnamed: 0_level_0,Component_1,Component_2,First_Class,Second_Class
RDKIT_SMILES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FC(F)Cl,-56.922298,60.720055,PFAS derivatives,PFAS halogen derivatives
FC(F)=C(F)F,-69.412216,26.313509,PFAS derivatives,With fluorinated C=C or C=O carbon
FC(F)(Cl)C(F)(Cl)Cl,-32.573349,29.395290,PFAS derivatives,PFAS halogen derivatives
C=C(F)F,-69.444672,26.623600,PFAS derivatives,With fluorinated C=C or C=O carbon
OC(C(F)(F)F)C(F)(F)F,-22.384430,-27.337769,Other aliphatics,Others
...,...,...,...,...
O=S(=O)(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,55.338821,32.783382,PFAAs,PFSAs
O=S(=O)(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,54.854397,32.900291,PFAAs,PFSAs
O=S(=O)(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,54.804661,32.875336,PFAAs,PFSAs
O=S(=O)(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,55.374973,32.738735,PFAAs,PFSAs


In [45]:
# Interactive 2-D scatter of the embedding, coloured by top-level class.
# Hovering shows the SMILES (the frame's index) plus both coordinates
# rounded to two decimals.
fig = px.scatter(
    data_frame=tsne_df_joined,
    x="Component_1",
    y="Component_2",
    color="First_Class",
    hover_name=tsne_df_joined.index,
    hover_data={
        "Component_1": ":.2f",
        "Component_2": ":.2f",
        "First_Class": True,
    },
).update_layout(
    # update_layout returns the same figure, so the call can be chained.
    title="Moléculas de PFAS mediante t-SNE",
    xaxis_title="Componente t-SNE 1",
    yaxis_title="Componente t-SNE 2",
    height=600,
    width=900,
)
fig.show()

In [46]:
# Configure a 3-component t-SNE with the same perplexity and seed as the
# 2-component run. NOTE: this rebinds `tsne`, replacing the earlier
# 2-component estimator.
tsne = TSNE(
    random_state=42,
    perplexity=50,
    n_components=3,
)

In [47]:
# Fit the current `tsne` estimator on the fingerprint matrix and keep the
# resulting coordinates. NOTE: this overwrites the earlier `fps_tsne`.
fps_tsne = tsne.fit_transform(X=fps_array)

In [49]:
# Wrap the 3-D embedding in a SMILES-indexed DataFrame.
# NOTE: this overwrites the earlier two-column `tsne_df`.
tsne_df = pd.DataFrame(
    fps_tsne,
    index=pfas_data["RDKIT_SMILES"],
    columns=["Component_1", "Component_2", "Component_3"],
)

In [50]:
# Attach the class labels to the 3-D embedding by matching on the shared
# RDKIT_SMILES index; only SMILES present in both tables are kept.
# NOTE: this overwrites the earlier `tsne_df_joined`.
tsne_df_joined = tsne_df.merge(pfas_classes, how="inner", on="RDKIT_SMILES")


In [51]:
# Interactive 3-D scatter of the embedding, coloured by top-level class.
# Hovering shows the SMILES (the frame's index) plus the three coordinates
# rounded to two decimals.
fig = px.scatter_3d(tsne_df_joined,
                    x = "Component_1",
                    y = "Component_2",
                    z = "Component_3",
                    color = "First_Class",
                    hover_name = tsne_df_joined.index,
                    hover_data = {"Component_1": ":.2f",
                                  "Component_2": ":.2f",
                                  "Component_3": ":.2f",
                                  "First_Class": True})

# Give the figure a title and an explicit size so it stands on its own when
# the notebook is skimmed, matching the layout used for the 2-D scatter.
# In 3-D plots the axis titles live under `scene`, not at the layout root.
fig.update_layout(title = "Moléculas de PFAS mediante t-SNE (3D)",
                  scene = dict(xaxis_title = "Componente t-SNE 1",
                               yaxis_title = "Componente t-SNE 2",
                               zaxis_title = "Componente t-SNE 3"),
                  height = 600, width = 900)
fig.show();