In [2]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

### Clasificación binaria desbalanceada con XGBoost

In [12]:
# Build a synthetic binary dataset with a roughly 80/20 class split
X, y = make_classification(
    n_samples=100000,
    n_classes=2,
    weights=[0.80, 0.20],
    random_state=42,
)

# Hold out 20% of the samples as the test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline classifier: no correction for the class imbalance
model = XGBClassifier(
    n_estimators=100,
    objective='binary:logistic',
    random_state=42
)

# Fit on the training split
model.fit(X_train, y_train)

# Predict hard labels for the held-out test split
predictions = model.predict(X_test)

# Report test-set performance
print('Confusion Matrix:')
print(confusion_matrix(y_test, predictions))
print('\nClassification Report:')
print(classification_report(y_test, predictions))

Confusion Matrix:
[[15221   640]
 [ 1388  2751]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94     15861
           1       0.81      0.66      0.73      4139

    accuracy                           0.90     20000
   macro avg       0.86      0.81      0.83     20000
weighted avg       0.89      0.90      0.89     20000



In [16]:
# Ratio of negative to positive training samples; passed to XGBoost below
# so that the minority (positive) class is up-weighted
scale_pos_weight = np.sum(y_train == 0) / np.sum(y_train == 1)

# Same settings as the previous model, plus scale_pos_weight
model = XGBClassifier(
    random_state=42,
    objective="binary:logistic",
    n_estimators=100,
    scale_pos_weight=scale_pos_weight,
)

# fit() returns the estimator itself, so training and test-set prediction are chained
predictions = model.fit(X_train, y_train).predict(X_test)

# Report test-set performance
print("Confusion Matrix:", confusion_matrix(y_test, predictions), sep="\n")
print("\nClassification Report:", classification_report(y_test, predictions), sep="\n")



Confusion Matrix:
[[13977  1884]
 [  638  3501]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.88      0.92     15861
           1       0.65      0.85      0.74      4139

    accuracy                           0.87     20000
   macro avg       0.80      0.86      0.83     20000
weighted avg       0.89      0.87      0.88     20000



### XGBoost con validación cruzada (CV) y early stopping (ES)

In [19]:
import xgboost as xgb
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Generate a 10,000-sample, 20-feature synthetic classification problem
# (2 informative features, 10 redundant ones, fixed seed)
X, y = make_classification(
    n_samples=10_000,
    n_features=20,
    n_informative=2,
    n_redundant=10,
    random_state=42,
)
print('Synthetic data has been randomly generated')
print('First 20 labels of y:', y[:20])


Synthetic data has been randomly generated
First 20 labels of y: [1 0 0 1 1 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0]


In [21]:
# Per-fold metric accumulators
accuracy_scores, f1_scores, roc_auc_scores = [], [], []

# 5-fold stratified cross-validation. Inside every fold, part of the training
# data is held out as the validation set that drives early stopping, so the
# test fold is used for scoring only.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in cv_splitter.split(X, y):
    # Outer split: rows reserved for scoring this fold
    X_test_fold, y_test_fold = X[test_index], y[test_index]

    # Inner split: 80% of the remaining rows for fitting, 20% for validation
    X_train_fold, X_val, y_train_fold, y_val = train_test_split(
        X[train_index], y[train_index], test_size=0.2, random_state=42
    )

    # Fresh model per fold, configured with early_stopping_rounds=10
    model = xgb.XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        objective='binary:logistic',
        early_stopping_rounds=10,
        random_state=42,
    )

    # The validation set is supplied via eval_set so early stopping can monitor it
    model.fit(X_train_fold, y_train_fold, eval_set=[(X_val, y_val)], verbose=False)

    # Hard labels for accuracy/F1; positive-class probabilities for ROC AUC
    y_pred_test = model.predict(X_test_fold)
    y_pred_proba_test = model.predict_proba(X_test_fold)[:, 1]

    # Score the fold and store the results
    accuracy_scores.append(accuracy_score(y_test_fold, y_pred_test))
    f1_scores.append(f1_score(y_test_fold, y_pred_test))
    roc_auc_scores.append(roc_auc_score(y_test_fold, y_pred_proba_test))

# Mean of each metric across the folds
print(f"CV avg. accuracy: {np.mean(accuracy_scores):.3f}")
print(f"CV avg. f1-score: {np.mean(f1_scores):.3f}")
print(f"CV avg. roc auc: {np.mean(roc_auc_scores):.3f}")


CV avg. accuracy: 0.937
CV avg. f1-score: 0.938
CV avg. roc auc: 0.974
