# XGBoost 

XGBoost (eXtreme Gradient Boosting) es una implementación optimizada de gradient boosting. Aprende de forma secuencial: cada árbol nuevo corrige los errores de los anteriores minimizando una función de pérdida con regularización L1/L2 para prevenir el overfitting. Usa árboles de decisión como base learners.


In [None]:
from src.utils import get_sample_data, get_sample_data_kaggle
from src.feature_engineerings import create_pattern_features
from src.feature_engineerings import encode_categorical_features
from src.feature_engineerings import scale_features
from src.feature_engineerings import select_important_features

import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# ============================
# 1. Load the sample data
# ============================
# NOTE(review): the output recorded for this cell ends in
# "ValueError: Error reading file: Error tokenizing data. C error: Expected 2
# fields in line 6, saw 3". That looks like a CSV parsing failure, presumably
# raised while the sample file is read; confirm the file path and delimiter.
df_sample = get_sample_data()

# ============================
# 2. Build the pattern features
# ============================
df = create_pattern_features(df_sample)

# (Optional) Pattern analysis, currently disabled
#pattern_analysis = df.groupby(['off_hours', 'suspicious_frequency'])['isFraud'].mean()

# ============================
# 3. Encode the categorical variables
# ============================
# The helper returns the encoded frame together with the encoder it used.
df_encode, label_encoder = encode_categorical_features(df)

# ============================
# 4. Scale the data
# ============================
# The helper returns the train/test split and the scaler in a single call.
# NOTE(review): split ratio and scaler type are decided inside scale_features
# and are not visible here.
X_train, X_test, y_train, y_test, scaler = scale_features(df_encode)

# ============================
# 5. Select the important features
# ============================
feature_importance, selected_features, selector = select_important_features(X_train, y_train)
print("Características seleccionadas:", selected_features)

# Final datasets: keep only the selected columns, indexing by
# `selected_features`. The model below is fitted on exactly this view.
X_train_final = X_train[selected_features]
X_test_final  = X_test[selected_features]

# ============================
# 6. Compute the class-balancing weight
# ============================
# pos_weight is the ratio of class-0 (normal) rows to class-1 (fraud) rows in
# the training split; it is passed to XGBoost as scale_pos_weight.
normal_count = len(y_train[y_train == 0])
fraud_count = len(y_train[y_train == 1])
if fraud_count == 0:
    # A small sample can end up with no fraud rows at all. Fail with an
    # actionable message instead of a bare ZeroDivisionError on the next line.
    raise ValueError(
        "y_train no contiene casos de fraude (clase 1); "
        "no se puede calcular scale_pos_weight. "
        "Use una muestra más grande o estratificada."
    )
pos_weight = normal_count / fraud_count

# ----------------------------
# 7. Configure the XGBoost classifier
# ----------------------------
# Keyword arguments are grouped by purpose; the values are unchanged.
xgb_model = xgb.XGBClassifier(
    # Boosting schedule: number of trees and shrinkage per tree.
    n_estimators=100,
    learning_rate=0.1,
    # Tree complexity: depth limit and minimum loss reduction to split.
    max_depth=10,
    gamma=0,
    # Row / column subsampling applied when growing each tree.
    subsample=0.8,
    colsample_bytree=1.0,
    # L1 (alpha) and L2 (lambda) regularisation terms.
    reg_alpha=0.5,
    reg_lambda=2.0,
    # Up-weight the positive class by the normal/fraud ratio computed above.
    scale_pos_weight=pos_weight,
    # Fixed seed so the run is reproducible.
    random_state=42,
)

# ----------------------------
# 8. Train the model
# ----------------------------
print("Entrenando XGBoost...")
xgb_model.fit(X_train_final, y_train)

# ----------------------------
# 9. Evaluate on the test split
# ----------------------------
# Column 1 of predict_proba is taken as the fraud score.
y_prob = xgb_model.predict_proba(X_test_final)[:, 1]
# Labels at the model's default cut-off. They are not used in the report
# below, but the name stays bound in case another cell reads it.
y_pred = xgb_model.predict(X_test_final)

# Custom decision threshold: scores strictly above it are flagged as fraud.
threshold = 0.3
y_pred_custom = (y_prob > threshold).astype(int)

# AUC is computed from the raw scores, so it does not depend on the threshold;
# the confusion matrix and the report use the thresholded labels.
auc_score = roc_auc_score(y_test, y_prob)
print(f"Resultados XGBoost con threshold {threshold}:")
print("AUC Score:", auc_score)
print(confusion_matrix(y_test, y_pred_custom))
print(classification_report(y_test, y_pred_custom))

print("Modelo XGBoost entrenado y evaluado.")
# Feature names as recorded by the fitted estimator.
print("Características seleccionadas:", list(xgb_model.feature_names_in_))


  from .autonotebook import tqdm as notebook_tqdm


ValueError: Error reading file: Error tokenizing data. C error: Expected 2 fields in line 6, saw 3


In [None]:
"""import joblib
import json

# === 1. Guardar modelo entrenado ===
joblib.dump(xgb_model, 'modelo_xgboost_fraude.pkl')

# === 2. Guardar scaler usado para normalización ===
joblib.dump(scaler, 'scaler.pkl')

# === 3. Guardar selector de features importantes ===
joblib.dump(selector, 'selector_kbest.pkl')

# === 4. Guardar nombres de las features seleccionadas ===
joblib.dump(selected_features, 'features_seleccionadas.pkl')

# 🔹 Mejor guardar solo los nombres de columnas, no todo X_train
columnas_entrenamiento = list(X_train.columns)
joblib.dump(columnas_entrenamiento, 'columnas_entrenamiento.pkl')

# === 5. Guardar threshold personalizado en un archivo JSON (más flexible) ===
config = {"threshold": 0.3}
with open('config_modelo.json', 'w') as f:
    json.dump(config, f)

print("✅ Todo fue guardado exitosamente para producción.")"""


## DESPLIEGUE DEL MODELO ANTIFRAUDE XGBOOST

In [None]:
import pandas as pd
import joblib
import json

# === 1. Load the trained components ===
# SECURITY NOTE: joblib.load unpickles arbitrary Python objects. Only load
# artifacts produced by this project from a trusted location.
model = joblib.load('modelo_xgboost_fraude.pkl')
scaler = joblib.load('scaler.pkl')
# The selector is no longer used for inference (see step 6). It is still
# loaded so the `selector` name stays bound for any cell that references it.
selector = joblib.load('selector_kbest.pkl')
columnas_entrenamiento = joblib.load('columnas_entrenamiento.pkl')  # list of column names
# Feature names the model was actually fitted on, saved by the training
# notebook next to the other artifacts.
features_seleccionadas = joblib.load('features_seleccionadas.pkl')

# === 2. Load the model configuration (threshold, etc.) ===
with open('config_modelo.json', 'r', encoding='utf-8') as f:
    config = json.load(f)
threshold = config["threshold"]

# === 3. Sample transaction used as a smoke test ===
# NOTE(review): the original comment labelled this an "example of fraud", but
# the values describe a PAYMENT with consistent balances. Confirm which
# outcome this record is expected to produce.
data_prueba_dict = {
    'step': 25,
    'amount': 120.0,
    'oldbalanceOrg': 1500.0,
    'newbalanceOrig': 1380.0,
    'oldbalanceDest': 500.0,
    'newbalanceDest': 620.0,
    'hour_of_day': 11,
    'off_hours': 0,
    'day_of_week': 3,
    'weekend_activity': 0,
    'amount_frequency': 1,
    'suspicious_frequency': 0,
    'structured_amount': 0,
    'dest_diversity': 1,
    'high_dest_diversity': 0,
    'type_encoded': 3,
    'type_CASH_IN': 0,
    'type_CASH_OUT': 0,
    'type_DEBIT': 0,
    'type_PAYMENT': 1,
    'type_TRANSFER': 0,
    'type_fraud_rate': 0.01,
    'hour_sin': 0.91,
    'hour_cos': -0.41,
}

# === 4. Convert to a DataFrame and align the columns ===
data_prueba = pd.DataFrame([data_prueba_dict])

# Reorder the columns exactly as in training. Pandas raises a KeyError naming
# any training column that is absent from the record.
data_prueba = data_prueba[columnas_entrenamiento]

# === 5. Scale the data ===
# Wrap the scaler output back into a DataFrame so the column names survive.
# NOTE(review): assumes the scaler emits one column per training column, in
# the same order, as the original code already assumed; confirm against
# scale_features.
data_scaled = pd.DataFrame(
    scaler.transform(data_prueba),
    columns=columnas_entrenamiento,
    index=data_prueba.index,
)

# === 6. Select the important features ===
# Training fitted the model on X_train[selected_features], i.e. a selection by
# name. Doing the same here guarantees the model sees the same features in the
# same order. The previous selector.transform() call on an unnamed array gave
# no such guarantee.
data_selected = data_scaled[features_seleccionadas]

# === 7. Get the prediction ===
probabilidad = model.predict_proba(data_selected)[0][1]
prediccion = int(probabilidad > threshold)  # custom decision threshold

# === 8. Show the results ===
print("¿Es fraude?:", "✅ SÍ" if prediccion == 1 else "❌ NO")
print(f"Probabilidad de fraude: {probabilidad:.2%}")
