In [6]:
# Manejo de datos
import pandas as pd
import numpy as np

# Data splitting and hyperparameter optimization
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Modelos
from xgboost import XGBClassifier

# Metricas para calificar el modelo
from sklearn.metrics import *





In [2]:
# Load the train/test splits that come from the DecisionTreeClassifier project.
# The four CSVs are read in the same order as the names on the left-hand side.
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/workspaces/decission_tree/src/xtrain.csv',
        '/workspaces/decission_tree/src/xtest.csv',
        '/workspaces/decission_tree/src/ytrain.csv',
        '/workspaces/decission_tree/src/ytest.csv',
    )
)

In [11]:
# Baseline XGBClassifier: only the random seed is fixed, everything else is
# left at the library defaults.
model = XGBClassifier(
    random_state=42,
)
# Train on the training split; fit() stays the last expression so the cell
# still displays the fitted estimator.
model.fit(X=X_train, y=y_train)

MODELO BASE DE XGBCLASSIFIER|
--

In [16]:
# Score the baseline model on the test split
y_pred = model.predict(X_test)
# Column index 1 of predict_proba — presumably the positive-class probability
y_proba = model.predict_proba(X_test)[:, 1]

# Accuracy takes no averaging option; the other three share average='binary'
accuracy_base = accuracy_score(y_test, y_pred)
precision_base, recall_base, f1_base = (
    metric(y_test, y_pred, average='binary')
    for metric in (precision_score, recall_score, f1_score)
)
conf_matrix_base = confusion_matrix(y_test, y_pred)

# Report the scalar metrics one per line, then the confusion matrix
print(
    *(
        f"{label} {value!s}"
        for label, value in (
            ("Accuracy:", accuracy_base),
            ("Precision:", precision_base),
            ("Recall:", recall_base),
            ("F1 Score:", f1_base),
        )
    ),
    sep="\n",
)
print("Confusion Matrix:\n", conf_matrix_base)

Accuracy: 0.7931034482758621
Precision: 0.6326530612244898
Recall: 0.7209302325581395
F1 Score: 0.6739130434782609
Confusion Matrix:
 [[84 18]
 [12 31]]


---

MODELO XGBCLASSIFIER OPTIMIZADO CON RANDOMIZEDSEARCH|
--

In [None]:


# Estimator to tune; the evaluation metric is pinned to log-loss.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases —
# confirm the installed version before removing it.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Hyperparameter search space: each entry is a finite array of candidate values.
param_dist = {
    'n_estimators': np.arange(50, 400, 50),
    'learning_rate': np.logspace(-3, 0, 10),
    # linspace instead of a float-step arange: arange(0.5, 1.0, 0.1) accumulates
    # rounding error (e.g. 0.7999999999999999); same five candidates, 0.5 .. 0.9.
    'subsample': np.linspace(0.5, 0.9, 5),
    'max_depth': np.arange(3, 10, 1),
    'colsample_bytree': np.linspace(0.5, 0.9, 5),
    'min_child_weight': np.arange(1, 10, 1),
    'gamma': np.linspace(0, 0.5, 6),
    'reg_alpha': np.logspace(-3, 1, 5),
    'reg_lambda': np.logspace(-3, 1, 5),
}

# Randomized search: 1000 sampled candidates x 5-fold CV, scored by accuracy.
# This is an expensive cell (5000 fits); n_jobs=-1 uses every available core.
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=1000,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=42,
)

# Run the search on the training split
random_search.fit(X_train, y_train)

# Report the winning configuration. best_score_ is the mean cross-validated
# value of the `scoring` metric, i.e. accuracy — the label says so explicitly
# to avoid confusion with the precision metric.
print("Mejores hiperparámetros encontrados:")
print(random_search.best_params_)
print("Mejor accuracy (validación cruzada) obtenida:", random_search.best_score_)

In [15]:
# Best hyperparameters found by the randomized search (kept for inspection)
best_params = random_search.best_params_
# Reuse the estimator the search already refit on the whole training split
# with best_params (RandomizedSearchCV refits by default). Rebuilding
# XGBClassifier(**best_params) would drop the fixed settings of the searched
# estimator (eval_metric, use_label_encoder) and train the same model twice.
xgb_opt = random_search.best_estimator_

# Predictions on the test split
y_pred = xgb_opt.predict(X_test)
# Column index 1 of predict_proba — presumably the positive-class probability
y_proba = xgb_opt.predict_proba(X_test)[:, 1]

# Compute the metrics
accuracy_opt = accuracy_score(y_test, y_pred)
precision_opt = precision_score(y_test, y_pred, average='binary')
recall_opt = recall_score(y_test, y_pred, average='binary')
f1_opt = f1_score(y_test, y_pred, average='binary')
conf_matrix_opt = confusion_matrix(y_test, y_pred)

# Show the metrics
print("Accuracy:", accuracy_opt)
print("Precision:", precision_opt)
print("Recall:", recall_opt)
print("F1 Score:", f1_opt)
print("Confusion Matrix:\n", conf_matrix_opt)

Accuracy: 0.8206896551724138
Precision: 0.6888888888888889
Recall: 0.7209302325581395
F1 Score: 0.7045454545454546
Confusion Matrix:
 [[88 14]
 [12 31]]


---

CREAMOS DATAFRAME PARA COMPARAR LAS METRICAS|
--

In [24]:
# Empty frame that declares the schema of the model-comparison table.
# Column names match the keys of the per-model metric dictionaries
# ('Accuracy', 'Precision', 'Recall', 'F1 Score') so the declared schema and
# the populated table agree.
columns = ['model', 'Accuracy', 'Precision', 'Recall', 'F1 Score']
metrics_df = pd.DataFrame(columns=columns)

In [29]:
# Per-model metric records, defined in the order they appear in the final table.

# Values typed in by hand (not computed in this notebook)
metrics_random_forest = {
    'model': 'random forest',
    'Accuracy': 0.8,
    'Precision': 0.6590909090909091,
    'Recall': 0.6744186046511628,
    'F1 Score': 0.6666666666666666,
}

# Baseline XGBoost metrics computed earlier in this notebook
metrics_xgboost = {
    'model': 'xgb_base',
    'Accuracy': accuracy_base,
    'Precision': precision_base,
    'Recall': recall_base,
    'F1 Score': f1_base,
}

# Tuned XGBoost metrics computed earlier in this notebook
metrics_xgboost_opt = {
    'model': 'xgb_optimized',
    'Accuracy': accuracy_opt,
    'Precision': precision_opt,
    'Recall': recall_opt,
    'F1 Score': f1_opt,
}

# Values typed in by hand (not computed in this notebook)
metrics_decissionTree_opt = {
    'model': 'decissionTree_opt',
    'Accuracy': 0.8,
    'Precision': 0.6842105263157895,
    'Recall': 0.6046511627906976,
    'F1 Score': 0.6419753086419753,
}

# One single-row DataFrame per model
df_random_forest, df_xgboost, df_xgboost_opt, df_decission_tree = (
    pd.DataFrame([record])
    for record in (
        metrics_random_forest,
        metrics_xgboost,
        metrics_xgboost_opt,
        metrics_decissionTree_opt,
    )
)

# Stack the rows into the comparison table with a fresh 0..n-1 index
metrics_df = pd.concat(
    [df_random_forest, df_xgboost, df_xgboost_opt, df_decission_tree],
    ignore_index=True,
)


In [30]:
# Display the model-comparison table as the cell output
metrics_df

Unnamed: 0,model,Accuracy,Precision,Recall,F1 Score
0,random forest,0.8,0.659091,0.674419,0.666667
1,xgb_base,0.793103,0.632653,0.72093,0.673913
2,xgb_optimized,0.82069,0.688889,0.72093,0.704545
3,decissionTree_opt,0.8,0.684211,0.604651,0.641975


CONCLUSION|
--

El modelo XGBClassifier optimizado supera al resto de modelos en accuracy, precision y F1 score; en recall empata con el XGBClassifier base (0.7209), que es el valor más alto de la tabla.