In [1]:
import pandas as pd

url = 'https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv'

df = pd.read_csv(url)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
procesed_data = df.drop(['Insulin','SkinThickness'], axis=1)
procesed_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,33.6,0.627,50,1
1,1,85,66,26.6,0.351,31,0
2,8,183,64,23.3,0.672,32,1
3,1,89,66,28.1,0.167,21,0
4,0,137,40,43.1,2.288,33,1


# Machine Learning

In [3]:
from sklearn.model_selection import train_test_split

num_variables = ['Pregnancies', 'Glucose', 'BloodPressure','BMI', 'DiabetesPedigreeFunction', 'Age']

X = procesed_data.drop("Outcome", axis = 1)[num_variables]
y = procesed_data["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train) #Usar la escala del training tanto para el training como para el test

X_train_norm = scaler.transform(X_train) 
X_train_norm = pd.DataFrame(X_train_norm, index = X_train.index, columns = num_variables) #Convertir los datos de nuevo en dataframe

X_test_norm = scaler.transform(X_test)
X_test_norm = pd.DataFrame(X_test_norm, index = X_test.index, columns = num_variables)

In [5]:
from xgboost import XGBClassifier

model = XGBClassifier(random_state = 42)
model.fit(X_train_norm, y_train)

In [6]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test_norm)
accuracy_score(y_test, y_pred)

0.7337662337662337

De entrada el resultado es muy similar al obtenido con Decision Tree y algo inferior al de Random Forest, voy a tratar de optimizar hiperparámetros. Aunque voy a probar un enfoque distinto, primero voy a utilizar los hyperparámetros que optimicé para el modelo de Random Forest, y después voy a tratar de optimizar el parámetro de Learning rate que es particular de este modelo para tratar de hacer el código más eficiente. Igualmente al final probaré una optimización normal para ver si este modo es eficaz

# Hyperparameters optimization

In [11]:
print(model.get_params())

{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}


In [12]:
new_model = XGBClassifier(max_depth= 10, min_child_weight= 2, gamma= 10, n_estimators= 60, random_state = 42)
new_model.fit(X_train_norm, y_train)

y_pred = new_model.predict(X_test_norm)
accuracy_score(y_test, y_pred)

0.7662337662337663

El modelo ha mejorado un poco, lo que es buena señal, voy a intentar optimizar ahora el learning_rate

In [13]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [60],
    'learning_rate' : [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
    'max_depth': [10],
    'gamma': [10],
    'min_child_weight': [2]
} #Estos son los parametros que exploraré

rf = XGBClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5,
                           scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train_norm, y_train)

print("Bes hyperparametes:", grid_search.best_params_)
print("Validation Score:", grid_search.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END gamma=10, learning_rate=0.0001, max_depth=10, min_child_weight=2, n_estimators=60; total time=   0.0s
[CV] END gamma=10, learning_rate=0.0001, max_depth=10, min_child_weight=2, n_estimators=60; total time=   0.0s
[CV] END gamma=10, learning_rate=0.0001, max_depth=10, min_child_weight=2, n_estimators=60; total time=   0.0s
[CV] END gamma=10, learning_rate=0.0001, max_depth=10, min_child_weight=2, n_estimators=60; total time=   0.0s
[CV] END gamma=10, learning_rate=0.0001, max_depth=10, min_child_weight=2, n_estimators=60; total time=   0.0s
[CV] END gamma=10, learning_rate=0.001, max_depth=10, min_child_weight=2, n_estimators=60; total time=   0.0s
[CV] END gamma=10, learning_rate=0.001, max_depth=10, min_child_weight=2, n_estimators=60; total time=   0.0s
[CV] END gamma=10, learning_rate=0.001, max_depth=10, min_child_weight=2, n_estimators=60; total time=   0.0s
[CV] END gamma=10, learning_rate=0.001, max_depth=10, m

In [14]:
new_model = XGBClassifier(max_depth= 10, learning_rate=0.1, min_child_weight= 2, gamma= 10, n_estimators= 60, random_state = 42)
new_model.fit(X_train_norm, y_train)

y_pred = new_model.predict(X_test_norm)
accuracy_score(y_test, y_pred)

0.7792207792207793

Este método ha sido muy rápido (1.8s) aunque no se ha mejorado tanto como esperaba el modelo, voy a probar una optimización clásica

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate' : [0.01, 0.1, 0.2],
    'max_depth': [None, 10, 20],
    'gamma': [2, 5, 10],
    'min_child_weight': [1, 2, 4]
} #Estos son los parametros que exploraré

rf = XGBClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5,
                           scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train_norm, y_train)

print("Bes hyperparametes:", grid_search.best_params_)
print("Validation Score:", grid_search.best_score_)

Fitting 10 folds for each of 243 candidates, totalling 2430 fits
[CV] END gamma=2, learning_rate=0.01, max_depth=None, min_child_weight=1, n_estimators=50; total time=   0.0s
[CV] END gamma=2, learning_rate=0.01, max_depth=None, min_child_weight=1, n_estimators=50; total time=   0.0s
[CV] END gamma=2, learning_rate=0.01, max_depth=None, min_child_weight=1, n_estimators=50; total time=   0.1s
[CV] END gamma=2, learning_rate=0.01, max_depth=None, min_child_weight=1, n_estimators=50; total time=   0.0s
[CV] END gamma=2, learning_rate=0.01, max_depth=None, min_child_weight=1, n_estimators=50; total time=   0.0s
[CV] END gamma=2, learning_rate=0.01, max_depth=None, min_child_weight=1, n_estimators=50; total time=   0.0s
[CV] END gamma=2, learning_rate=0.01, max_depth=None, min_child_weight=1, n_estimators=50; total time=   0.0s
[CV] END gamma=2, learning_rate=0.01, max_depth=None, min_child_weight=1, n_estimators=50; total time=   0.0s
[CV] END gamma=2, learning_rate=0.01, max_depth=None, m

In [56]:
new_model = XGBClassifier(max_depth= None, learning_rate=0.1, min_child_weight= 2, gamma= 5, n_estimators= 50, random_state = 42)
new_model.fit(X_train_norm, y_train)

y_pred = new_model.predict(X_test_norm)
accuracy_score(y_test, y_pred)

0.7532467532467533

Curiosamente con los 'mejores' hiperparametros se obtiene un rendimimento peor a lo que he hecho antes, incluso probando números se pueden encontrar aún mejores resultados. Voy a probar otros métodos para optimizar los hiperparámetros

In [57]:
new_model = XGBClassifier(max_depth= 10, learning_rate=0.1, min_child_weight= 4, gamma= 5, n_estimators= 50, random_state = 42)
new_model.fit(X_train_norm, y_train)

y_pred = new_model.predict(X_test_norm)
accuracy_score(y_test, y_pred)

0.7922077922077922

Esta accuracy es un 4% superior a la anterior, pero por alguna razón no se consideró. Los hyperparámetros que más parecen influir son 'min_child_weight' y 'gamma'. Voy a probar otras funciones de optimización 

# Optimización con Bayes

In [None]:
from skopt import BayesSearchCV

# Definir el espacio de búsqueda
param_spaces = {
    'n_estimators': (50, 200),                          
    'learning_rate': (0.01, 0.2, 'uniform'),           
    'max_depth': [None, 10, 20],                          
    'gamma': (2, 8, 'uniform'),                          
    'min_child_weight': (1, 4)                            
}

rf = XGBClassifier(random_state=42)

bayes_search = BayesSearchCV(estimator=rf,
                             search_spaces=param_spaces,
                             n_iter=50,         # Número de iteraciones
                             cv=5,              # Validación cruzada 5-fold
                             scoring='accuracy',
                             n_jobs=-1,
                             random_state=42)

# Ajustar el modelo
bayes_search.fit(X_train_norm, y_train)

print("Mejores hiperparámetros:", bayes_search.best_params_)
print("Mejor score:", bayes_search.best_score_)

Mejores hiperparámetros: OrderedDict([('gamma', 4), ('learning_rate', 0.14826789120369177), ('max_depth', 20), ('min_child_weight', 2), ('n_estimators', 151)])
Mejor score: 0.7800105764145954


In [63]:
new_model = XGBClassifier(max_depth= 20, learning_rate=0.14826789120369177, min_child_weight= 2, gamma= 4, n_estimators= 151, random_state = 42)
new_model.fit(X_train_norm, y_train)

y_pred = new_model.predict(X_test_norm)
accuracy_score(y_test, y_pred)

0.7597402597402597

Igual de mal

# Optimización con Randomized

In [65]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Definir distribuciones para cada hiperparámetro
param_dist = {
    'n_estimators': randint(50, 201),               
    'learning_rate': uniform(0.01, 0.19),             
    'max_depth': list(range(5, 21)),         
    'gamma': uniform(0, 8),                          
    'min_child_weight': randint(1, 5)                 
}
# Inicializar el clasificador
rf = XGBClassifier(random_state=42)

# Configurar RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf,
                                   param_distributions=param_dist,
                                   n_iter=50,           
                                   cv=5,                
                                   scoring='accuracy',
                                   random_state=42,
                                   n_jobs=-1)

# Ajustar el modelo a los datos
random_search.fit(X_train_norm, y_train)

print("Mejores hiperparámetros:", random_search.best_params_)
print("Mejor score:", random_search.best_score_)


Mejores hiperparámetros: {'gamma': np.float64(4.331583790620527), 'learning_rate': np.float64(0.14219903587556562), 'max_depth': 11, 'min_child_weight': 4, 'n_estimators': 193}
Mejor score: 0.7882846861255497


In [66]:
new_model = XGBClassifier(max_depth= 11, learning_rate=0.14219903587556562, min_child_weight= 4, gamma= 4.331583790620527, n_estimators= 193, random_state = 42)
new_model.fit(X_train_norm, y_train)

y_pred = new_model.predict(X_test_norm)
accuracy_score(y_test, y_pred)

0.7597402597402597

Nada, probar números al azar parece más efectivo, o al menos probarlos tras la optimización. También puede ser que existan otros valores que está considerando esta optimización y no solo la accuracy, o tal vez al intentar optimizar el modelo estoy provocando el sobreajuste del mismo, aunque precisamente la optimización debería ser para evitarlo.