# Cargar datos
<hr>

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("Datos_con_ECFP.csv")

In [4]:
df.head()

Unnamed: 0,SMILES,BBB+/BBB-,ECFP,0,1,2,3,4,5,6,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,BBB-,[0 0 0 ... 0 0 0],0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,BBB-,[0 1 0 ... 0 0 0],0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Oc1c(I)cc(Cl)c2cccnc12,BBB-,[0 0 0 ... 0 0 0],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CCNC(=NCCSCc1ncccc1Br)NC#N,BBB-,[0 0 0 ... 0 0 0],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,BBB-,[0 0 0 ... 0 0 0],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
X = df.drop(columns=["SMILES", "BBB+/BBB-", "ECFP"])
y = df["BBB+/BBB-"]

# Modelo Naive Bayes con PCA
<hr>

In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
pipe = Pipeline([
    ('pca', PCA()),
    ('NB', GaussianNB())
])

In [20]:
param_grid = {
    'pca__n_components': [150],  # Ejemplo de opciones para el número de componentes en PCA
    'NB__var_smoothing': [1e-4, 1e-3, 1e-2, 0.1, 1, 10],       # Número de árboles en el bosque
}


In [21]:
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2)

In [22]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ....NB__var_smoothing=0.0001, pca__n_components=150; total time=   1.6s
[CV] END ....NB__var_smoothing=0.0001, pca__n_components=150; total time=   1.5s
[CV] END ....NB__var_smoothing=0.0001, pca__n_components=150; total time=   1.1s
[CV] END ....NB__var_smoothing=0.0001, pca__n_components=150; total time=   1.0s
[CV] END ....NB__var_smoothing=0.0001, pca__n_components=150; total time=   1.0s
[CV] END .....NB__var_smoothing=0.001, pca__n_components=150; total time=   0.9s
[CV] END .....NB__var_smoothing=0.001, pca__n_components=150; total time=   1.0s
[CV] END .....NB__var_smoothing=0.001, pca__n_components=150; total time=   1.0s
[CV] END .....NB__var_smoothing=0.001, pca__n_components=150; total time=   1.0s
[CV] END .....NB__var_smoothing=0.001, pca__n_components=150; total time=   1.0s
[CV] END ......NB__var_smoothing=0.01, pca__n_components=150; total time=   0.9s
[CV] END ......NB__var_smoothing=0.01, pca__n_com

In [23]:
grid_search.best_params_

{'NB__var_smoothing': 0.01, 'pca__n_components': 150}

In [24]:
grid_search.best_score_

0.7964771817453963

In [26]:
pipe = Pipeline([
    ('pca', PCA(n_components=150)),
    ('NB', GaussianNB(var_smoothing=0.01))
])

In [27]:
pipe.fit(X_train, y_train)

In [28]:
y_predicha = pipe.predict(X_test)

In [29]:
#Calcular estadísticos
accuracy = accuracy_score(y_test, y_predicha)
recall = recall_score(y_test, y_predicha, average='weighted')
precision = precision_score(y_test, y_predicha, average='weighted')
f1 = f1_score(y_test, y_predicha, average='weighted')
print("Accuracy", round(accuracy, 3))
print("Recall", round(recall, 3))
print("precision", round(precision,3))
print("f1", round(f1,3))

Accuracy 0.785
Recall 0.785
precision 0.791
f1 0.77


In [30]:
y_entrenamiento = pipe.predict(X_train)

In [31]:
accuracy = accuracy_score(y_train, y_entrenamiento)
recall = recall_score(y_train, y_entrenamiento, average='weighted')
precision = precision_score(y_entrenamiento, y_entrenamiento, average='weighted')
f1 = f1_score(y_train, y_entrenamiento, average='weighted')
print("Accuracy", round(accuracy, 3))
print("Recall", round(recall, 3))
print("precision", round(precision,3))
print("f1", round(f1,3))

Accuracy 0.796
Recall 0.796
precision 1.0
f1 0.783


In [32]:
import joblib

In [33]:
joblib.dump(pipe, "NB_entrenado.joblib")

['NB_entrenado.joblib']