In [14]:
# Notebook banner: prints a fixed one-line description of this notebook
# (the message text is Spanish and is emitted exactly as written).
print('Prueba rapida para ver sintaxis y uso de Xgboost')

# data analysis
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# model evaluations
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import RocCurveDisplay


Prueba rapida para ver sintaxis y uso de Xgboost


In [3]:
# Read the heart-disease CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="../data/heart-disease.csv")

In [4]:
# Separate the label column from the features:
# y is the "target" column, X is every other column of df.
y = df["target"]
X = df.drop(columns=["target"])

In [5]:
# Split the data into train and test sets (test_size=0.2).

# Seed NumPy's global RNG right before splitting; no random_state is passed
# to train_test_split below.
np.random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Sanity checks on the split. These replace a bare tuple expression that was
# never displayed (only a cell's last expression is shown) and that compared
# against hard-coded row counts.
assert len(X_train) + len(X_test) == len(X)
assert len(y_train) + len(y_test) == len(y)
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

# Last expression of the cell: the sizes displayed as the cell output.
len(y_train), len(y_test), len(y)


(242, 61, 303)

In [6]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Candidate classifiers keyed by the display name used in the score report.
# Each estimator is created with its constructor defaults.
models = {}
models["XGBoost Classifier"] = xgb.XGBClassifier()
models["XGBoost RF Classifier"] = xgb.XGBRFClassifier()
models["Random Forest Classifier"] = RandomForestClassifier()

# create a function to fit and score the model:
def fit_and_score(model, X_train, X_test, y_train, y_test):
    """
    Fit every estimator in ``model`` and collect its test-set score.

    model : dict mapping a display name to an estimator exposing
        ``fit(X, y)`` and ``score(X, y)``; the estimators are fitted in place
    X_train : training data (no labels)
    X_test : testing data (no labels)
    y_train : training labels
    y_test : test labels

    Returns a dict mapping each name in ``model`` to the value returned by
    that estimator's ``score(X_test, y_test)``, in the same order as ``model``.
    """
    # Reseed NumPy's global RNG on every call (note: this is a global side effect).
    np.random.seed(42)

    results = {}
    for label in model:
        estimator = model[label]
        # Train on the training split, then record the score on the test split.
        estimator.fit(X_train, y_train)
        results[label] = estimator.score(X_test, y_test)

    return results

# Example usage: fit and score every candidate model on the train/test split.
scores = fit_and_score(
    model=models,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

print(scores)


{'XGBoost Classifier': 0.819672131147541, 'XGBoost RF Classifier': 0.8360655737704918, 'Random Forest Classifier': 0.8360655737704918}


In [7]:
# Fit and score all candidate models again, keeping the result under its own name.
model_scores = fit_and_score(models, X_train, X_test, y_train, y_test)

# Bare last expression so the notebook displays the scores dict.
model_scores

{'XGBoost Classifier': 0.819672131147541,
 'XGBoost RF Classifier': 0.8360655737704918,
 'Random Forest Classifier': 0.8360655737704918}

In [8]:
# Hyperparameter grid: 3 * 3 * 2 * 2 = 36 candidate combinations.
# NOTE(review): learning_rate is tuned here for the random-forest variant of
# XGBoost -- confirm against the XGBoost docs that varying it is meaningful.
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
    subsample=[0.8, 1.0],
    colsample_bynode=[0.8, 1.0],
)

# Grid search over param_grid for xgb.XGBRFClassifier(), using 6-fold
# cross-validation with progress messages enabled.
gs_xgb_rf = GridSearchCV(
    estimator=xgb.XGBRFClassifier(),
    param_grid=param_grid,
    cv=6,
    verbose=True,
)

# Run the search on the training split (last expression of the cell).
gs_xgb_rf.fit(X_train, y_train)

Fitting 6 folds for each of 36 candidates, totalling 216 fits


In [10]:
# Display the hyperparameter combination selected by the fitted grid search.
gs_xgb_rf.best_params_

{'colsample_bynode': 1.0,
 'learning_rate': 0.1,
 'max_depth': 7,
 'subsample': 0.8}

In [12]:
# Build a new XGBRFClassifier from the best hyperparameters found by the grid
# search. The values are unpacked straight from best_params_ rather than
# copied by hand, so this cell cannot drift out of sync if the grid, the data
# or the search result changes.
best_model = xgb.XGBRFClassifier(**gs_xgb_rf.best_params_)

# Fit the model on the training data.
best_model.fit(X_train, y_train)

# Evaluate the tuned model on the test set and report accuracy as a percentage.
accuracy = best_model.score(X_test, y_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


Accuracy: 81.97%


In [13]:
# Score the fitted grid-search object on the held-out test split.
gs_xgb_rf.score(X_test, y_test)

0.819672131147541

In [15]:
# Hard-coded date/time note; the string is a literal, not computed at run time.
print('11/jul/23 | 23:55')

11/jul/23 | 23:55
