# Cargar datos
<hr>

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Datos_con_ECFP.csv")

In [3]:
df.head()

Unnamed: 0,SMILES,BBB+/BBB-,ECFP,0,1,2,3,4,5,6,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,BBB-,[0 0 0 ... 0 0 0],0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,BBB-,[0 1 0 ... 0 0 0],0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Oc1c(I)cc(Cl)c2cccnc12,BBB-,[0 0 0 ... 0 0 0],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CCNC(=NCCSCc1ncccc1Br)NC#N,BBB-,[0 0 0 ... 0 0 0],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,BBB-,[0 0 0 ... 0 0 0],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Modelo Red Neuronal con PCA
<hr>

In [4]:
X = df.drop(columns=["SMILES", "BBB+/BBB-", "ECFP"])
y = df["BBB+/BBB-"]

In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
pipe = Pipeline([
    ('pca', PCA()),
    ('NB', MLPClassifier())
])

In [8]:
param_grid = {
    'pca__n_components': [150],  # Ejemplo de opciones para el número de componentes en PCA
    'NB__activation': ['tanh', 'relu'],
    'NB__solver': ['sgd', 'adam'],
    'NB__alpha': [0.0001, 0.001, 0.01, 0.1],
    'NB__learning_rate': ['constant', 'invscaling', 'adaptive'],
    'NB__max_iter': [500]
}

In [9]:
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2)

In [10]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END NB__activation=tanh, NB__alpha=0.0001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=   8.9s
[CV] END NB__activation=tanh, NB__alpha=0.0001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=   9.9s
[CV] END NB__activation=tanh, NB__alpha=0.0001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=   9.7s
[CV] END NB__activation=tanh, NB__alpha=0.0001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=   7.9s
[CV] END NB__activation=tanh, NB__alpha=0.0001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=   7.6s
[CV] END NB__activation=tanh, NB__alpha=0.0001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=  10.8s
[CV] END NB__activation=t



[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.5s




[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.1s




[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.7s




[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.6s




[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.0s
[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.1s
[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.4s
[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   8.2s
[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.7s
[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   7.4s
[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=invscaling, NB__m



[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.3s




[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.5s




[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.9s




[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.3s




[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.9s
[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.4s
[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   7.0s
[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   7.7s
[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.5s
[CV] END NB__activation=relu, NB__alpha=0.0001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.9s




[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.0s




[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.5s




[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.4s




[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.4s




[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.4s
[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.4s
[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   7.9s
[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.7s
[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.6s
[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   7.3s
[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=invscaling, NB__max_iter



[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.9s




[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.2s




[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.4s




[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.0s




[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.2s
[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.8s
[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   7.6s
[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   7.0s
[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.6s
[CV] END NB__activation=relu, NB__alpha=0.001, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.1s




[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  17.1s




[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  22.8s




[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  28.7s




[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  28.6s




[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  28.7s
[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=  11.0s
[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   9.9s
[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.1s
[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.4s
[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   5.1s
[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=invscaling, NB__max_iter=500, N



[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  17.4s




[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  17.1s




[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  17.3s




[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  17.4s




[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  17.6s
[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   7.2s
[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   7.1s
[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   7.1s
[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.2s
[CV] END NB__activation=relu, NB__alpha=0.01, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.6s




[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.9s




[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.8s




[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.7s




[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.9s




[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=constant, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  17.4s
[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.6s
[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   7.6s
[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.8s
[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.7s
[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=constant, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.9s
[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=invscaling, NB__max_iter=500, NB__solv



[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.8s




[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  17.2s




[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.8s




[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  17.4s




[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=sgd, pca__n_components=150; total time=  16.8s
[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.5s
[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   6.1s
[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   7.1s
[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   7.9s
[CV] END NB__activation=relu, NB__alpha=0.1, NB__learning_rate=adaptive, NB__max_iter=500, NB__solver=adam, pca__n_components=150; total time=   8.4s


In [11]:
grid_search.best_params_

{'NB__activation': 'relu',
 'NB__alpha': 0.01,
 'NB__learning_rate': 'invscaling',
 'NB__max_iter': 500,
 'NB__solver': 'adam',
 'pca__n_components': 150}

In [12]:
grid_search.best_score_

0.8710968775020016

In [13]:
pipe = Pipeline([
    ('pca', PCA(n_components=150)),
    ('NB', MLPClassifier(activation="relu", alpha=0.01, learning_rate="invscaling", max_iter=500, solver="adam"))
])

In [14]:
pipe.fit(X_train, y_train)

In [15]:
y_predicha = pipe.predict(X_test)

In [16]:
#Calcular estadísticos
accuracy = accuracy_score(y_test, y_predicha)
recall = recall_score(y_test, y_predicha, average='weighted')
precision = precision_score(y_test, y_predicha, average='weighted')
f1 = f1_score(y_test, y_predicha, average='weighted')
print("Accuracy", round(accuracy, 3))
print("Recall", round(recall, 3))
print("precision", round(precision,3))
print("f1", round(f1,3))

Accuracy 0.874
Recall 0.874
precision 0.874
f1 0.874


In [17]:
y_entrenamiento = pipe.predict(X_train)

In [18]:
accuracy = accuracy_score(y_train, y_entrenamiento)
recall = recall_score(y_train, y_entrenamiento, average='weighted')
precision = precision_score(y_entrenamiento, y_entrenamiento, average='weighted')
f1 = f1_score(y_train, y_entrenamiento, average='weighted')
print("Accuracy", round(accuracy, 3))
print("Recall", round(recall, 3))
print("precision", round(precision,3))
print("f1", round(f1,3))

Accuracy 0.983
Recall 0.983
precision 1.0
f1 0.983


In [19]:
import joblib

In [20]:
joblib.dump(pipe, "RN_entrenado.joblib")

['RN_entrenado.joblib']