# Diabetes prediction with XGBoost: hyperparameter search and evaluation

In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import *
from imblearn.metrics import specificity_score
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV

In [3]:
# Location of the diabetes CSV used throughout this notebook.
DIABETES_CSV_URL = "https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv"
datos_diabetes = pd.read_csv(DIABETES_CSV_URL)

# Preview the first rows so the loaded columns can be checked at a glance.
datos_diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Separate the predictors from the 'Outcome' target, then hold out 20% of the
# rows for testing (fixed seed so the split is repeatable).
features = datos_diabetes.drop(['Outcome'], axis=1)
target = datos_diabetes['Outcome']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Candidate values sampled by the randomized hyperparameter search.
param_dist = {
    'n_estimators': [5, 10, 20],
    'max_depth': [1, 3, 4, 5, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'min_child_weight': [1, 3, 5, 10, 15, 20],
}
# Base estimator whose hyperparameters are tuned.
xgb = XGBClassifier()
# Try 100 sampled combinations, scoring each by 5-fold cross-validated accuracy.
random_search = RandomizedSearchCV(estimator=xgb,
                                   param_distributions=param_dist,
                                   n_iter=100,
                                   cv=5,
                                   scoring='accuracy',
                                   random_state=42)
# Run the search on the training data only; the test split stays untouched.
random_search.fit(X_train, y_train)
# Winning hyperparameter combination, kept for inspection.
best_params = random_search.best_params_
# With the default refit=True the search has already retrained the best
# configuration on the full training set, so reuse that fitted estimator
# instead of building and fitting a second, identical model.
optimal_xgb = random_search.best_estimator_
# Predictions of the tuned model on both splits, used for the metrics report.
predict_train = optimal_xgb.predict(X_train)
predict_test = optimal_xgb.predict(X_test)

In [6]:
# Manually configured XGBoost classifier (no hyperparameter search),
# trained on the same training split.
model = XGBClassifier(
    n_estimators=200,
    max_depth=10,
    learning_rate=0.03,
)
model.fit(X_train, y_train)

In [7]:
# Predict class labels for the held-out test rows with the manually configured model.
y_pred = model.predict(X_test)
# Bare last expression so the notebook displays the predicted labels.
y_pred

array([1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0])

In [8]:
# Test-set accuracy of the manually configured model.
# accuracy_score is already in scope from the sklearn.metrics import in the
# notebook's import cell, so it is not re-imported here.
accuracy_score(y_test, y_pred)

0.7337662337662337

In [9]:
def _score_split(y_true, y_pred):
    """Return the six classification scores for one split, in report column order."""
    scorers = (
        accuracy_score,
        f1_score,
        roc_auc_score,
        precision_score,
        recall_score,
        specificity_score,
    )
    return [scorer(y_true, y_pred) for scorer in scorers]


def get_metrics(y_train, y_test, y_pred_train, y_pred_test):
    """Build a train / test / gap report of classification metrics.

    Parameters
    ----------
    y_train, y_test
        True labels of the training and test splits.
    y_pred_train, y_pred_test
        Predictions for the training and test splits, passed unchanged to
        every metric function.

    Returns
    -------
    pandas.DataFrame
        Rows 'Train', 'Test' and 'Diferencia' (train minus test); columns
        'Accuracy', 'F1', 'AUC', 'Precision', 'Recall' and 'Specificity'.

    Notes
    -----
    The AUC column is computed by calling roc_auc_score on the same
    predictions as the other metrics, not on separate probability scores.
    """
    train_scores = _score_split(y_train, y_pred_train)
    test_scores = _score_split(y_test, y_pred_test)
    # A large positive gap means the model scores much better on the data it
    # was trained on than on the held-out data.
    gaps = [train - test for train, test in zip(train_scores, test_scores)]

    return pd.DataFrame(
        [train_scores, test_scores, gaps],
        columns=['Accuracy', 'F1', 'AUC', 'Precision', 'Recall', 'Specificity'],
        index=['Train', 'Test', 'Diferencia'],
    )

In [10]:
# Predict with the manually configured model on both splits.
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

# Report train, test and gap metrics for the manually configured model.
get_metrics(y_train, y_test, train_pred, test_pred)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,0.998371,0.997647,0.997653,1.0,0.995305,1.0
Test,0.733766,0.655462,0.728283,0.609375,0.709091,0.747475
Diferencia,0.264605,0.342185,0.26937,0.390625,0.286214,0.252525


In [11]:
# Same report for the model tuned by the randomized search.
get_metrics(
    y_train=y_train,
    y_test=y_test,
    y_pred_train=predict_train,
    y_pred_test=predict_test,
)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,0.845277,0.764268,0.816615,0.810526,0.723005,0.910224
Test,0.74026,0.636364,0.717172,0.636364,0.636364,0.79798
Diferencia,0.105017,0.127904,0.099443,0.174163,0.086641,0.112245
