# Cargar datos
<hr>

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Datos_con_ECFP.csv")

In [3]:
df.head()

Unnamed: 0,SMILES,BBB+/BBB-,ECFP,0,1,2,3,4,5,6,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,BBB-,[0 0 0 ... 0 0 0],0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,BBB-,[0 1 0 ... 0 0 0],0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Oc1c(I)cc(Cl)c2cccnc12,BBB-,[0 0 0 ... 0 0 0],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CCNC(=NCCSCc1ncccc1Br)NC#N,BBB-,[0 0 0 ... 0 0 0],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,BBB-,[0 0 0 ... 0 0 0],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
X = df.drop(columns=["SMILES", "BBB+/BBB-", "ECFP"])
y = df["BBB+/BBB-"]

# Modelo Random Forest con PCA
<hr>

In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
pipe = Pipeline([
    ('pca', PCA()),
    ('RF', RandomForestClassifier())
])

In [8]:
param_grid = {
    'pca__n_components': [100, 150],  # Ejemplo de opciones para el número de componentes en PCA
    'RF__n_estimators': [250, 500, 750, 1000],       # Número de árboles en el bosque
    'RF__max_features': ['sqrt'],    # Número de características a considerar para dividir el nodo
    'RF__max_depth': [40, 50, 60, 70, 80],     # Profundidad máxima de los árboles
}


In [9]:
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2)

In [10]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


In [None]:
grid_search.best_estimator_

In [None]:
grid_search.best_params_

{'RF__max_depth': 70,
 'RF__max_features': 'sqrt',
 'RF__n_estimators': 250,
 'pca__n_components': 150}

In [None]:
grid_search.best_score_

0.8590872698158526

In [None]:
pipe = Pipeline([
    ('pca', PCA(n_components=150)),
    ('RF', RandomForestClassifier(max_features="sqrt", n_estimators=250, max_depth=70, bootstrap=True))
])

In [None]:
pipe.fit(X_train, y_train)

In [None]:
y_predicha = pipe.predict(X_test)

In [None]:
#Calcular estadísticos
accuracy = accuracy_score(y_test, y_predicha)
recall = recall_score(y_test, y_predicha, average='weighted')
precision = precision_score(y_test, y_predicha, average='weighted')
f1 = f1_score(y_test, y_predicha, average='weighted')
print("Accuracy", round(accuracy, 3))
print("Recall", round(recall, 3))
print("precision", round(precision,3))
print("f1", round(f1,3))

Accuracy 0.853
Recall 0.853
precision 0.856
f1 0.849


In [None]:
y_entrenamiento = pipe.predict(X_train)

In [None]:
accuracy = accuracy_score(y_train, y_entrenamiento)
recall = recall_score(y_train, y_entrenamiento, average='weighted')
precision = precision_score(y_entrenamiento, y_entrenamiento, average='weighted')
f1 = f1_score(y_train, y_entrenamiento, average='weighted')
print("Accuracy", round(accuracy, 3))
print("Recall", round(recall, 3))
print("precision", round(precision,3))
print("f1", round(f1,3))

Accuracy 0.983
Recall 0.983
precision 1.0
f1 0.983


In [None]:
import joblib

In [None]:
joblib.dump(pipe, "RF_entrenado.joblib")

['RF_entrenado.joblib']