In [1]:
# Librerías para visualización de datos
import matplotlib.pyplot as plt
import seaborn as sns

# Librerías para manipulación y análisis de datos
import numpy as np
import pandas as pd


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFECV
from sklearn.inspection import permutation_importance
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import make_scorer, recall_score, auc

from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import roc_curve


from toolbox_DS import *
from toolbox_ML import *


import warnings
warnings.filterwarnings(action="ignore", message=r'.*Use subset.*of np.ndarray is not recommended')




## Carga de datos

Train

In [2]:
train_set = pd.read_csv('./data/train_set.csv')
train_set

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,Response,income_missing,age,customes_seniority,Household_members,Total_amount,Total_purchase,Median_amount_purchase,Total_cmp,Total_%_cmp
0,5675,1960,PhD,Divorced,50611.0,0,1,2012-10-04,98,459,...,0,0,55,3,2.0,493,22,22.409091,1,0.2
1,5543,1966,Graduation,Together,57811.0,0,1,2013-06-24,49,545,...,0,0,49,2,3.0,802,25,32.080000,1,0.2
2,3011,1965,Graduation,Married,69139.0,0,1,2014-01-27,23,86,...,0,0,50,1,3.0,227,11,20.636364,0,0.0
3,535,1987,Graduation,Divorced,81361.0,0,0,2014-02-25,18,163,...,0,0,28,1,1.0,778,27,28.814815,0,0.0
4,10755,1976,2n Cycle,Married,23718.0,1,0,2013-09-02,76,6,...,0,0,39,2,3.0,81,9,9.000000,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1785,5320,1973,Master,Divorced,44051.0,1,1,2013-01-29,20,79,...,1,0,42,2,3.0,171,12,14.250000,0,0.0
1786,2894,1985,Graduation,Single,72903.0,0,0,2013-10-29,74,1067,...,1,0,30,2,1.0,2013,21,95.857143,3,0.6
1787,1726,1970,Graduation,Single,22585.0,0,0,2013-03-18,23,3,...,1,0,45,2,1.0,81,5,16.200000,1,0.2
1788,6905,1994,Graduation,Together,80685.0,0,0,2012-08-22,55,241,...,0,0,21,3,2.0,1004,21,47.809524,0,0.0


Test

In [3]:
test_set = pd.read_csv('./data/test_set.csv')
test_set.head(5)

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,Complain,Response,age,customes_seniority,Household_members,Total_amount,Total_purchase,Median_amount_purchase,Total_cmp,Total_%_cmp
0,2853,1980,Graduation,Single,51766.0,1,0,2014-03-11,74,60,...,0,0,35,1,2.0,275,12,22.916667,0,0.0
1,10492,1959,Graduation,Together,38285.0,2,1,2014-06-24,96,2,...,0,0,56,1,5.0,10,4,2.5,0,0.0
2,8939,1959,Graduation,Divorced,61250.0,0,1,2012-12-16,49,382,...,0,0,56,3,2.0,730,25,29.2,0,0.0
3,6274,1948,Master,Married,83790.0,0,0,2013-11-15,81,1076,...,0,0,67,2,2.0,1615,25,64.6,2,0.4
4,10232,1963,PhD,Divorced,48799.0,0,1,2013-11-05,9,174,...,0,0,52,2,2.0,331,15,22.066667,0,0.0


Imputo los cambios que vengo aplicando al dataset.    
Train

In [4]:
# Cambio el índice
train_set.set_index('ID', inplace=True)

# Cambio tipo a datetime
train_set['Dt_Customer'] = pd.to_datetime(train_set['Dt_Customer'])

# Cambiar tipo a categóricas
cols_to_category = ['Education', 'Marital_Status']
train_set[cols_to_category] = train_set[cols_to_category].astype('category')

# Elimino columna 'income_missing'
train_set = train_set.drop(columns=['income_missing', 'Year_Birth','Total_%_cmp','Dt_Customer','Median_amount_purchase'])

# Elimino el outlier de Income
train_set = train_set.loc[train_set['Income'] !=666666]

Y también al test

In [5]:
# Cambio el índice
test_set.set_index('ID', inplace=True)

# Cambio tipo a datetime
test_set['Dt_Customer'] = pd.to_datetime(test_set['Dt_Customer'])

# Cambiar tipo a categóricas
cols_to_category = ['Education', 'Marital_Status']
test_set[cols_to_category] = test_set[cols_to_category].astype('category')

# Elimino columna 'income_missing'
test_set = test_set.drop(columns=['Year_Birth','Total_%_cmp','Dt_Customer','Median_amount_purchase'])

# Elimino el outlier de Income
test_set = test_set.loc[test_set['Income'] !=666666]

## Transformación de variables

In [6]:
X_train = train_set.drop(columns='Response')
y_train = train_set['Response']

X_test = test_set.drop(columns='Response')
y_test = test_set['Response']

In [7]:
ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)

In [8]:
features_num = X_train.select_dtypes(['int','float']).columns
features_cat = X_train.select_dtypes(['object', 'category']).columns
print('features_num', features_num)
print('features_cat', features_cat)

features_num Index(['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'age', 'customes_seniority',
       'Household_members', 'Total_amount', 'Total_purchase', 'Total_cmp'],
      dtype='object')
features_cat Index(['Education', 'Marital_Status'], dtype='object')


In [9]:
# Definimos el OrdinalEncoder con el mapeo de 'education_ode'
ordinal_encoder = OrdinalEncoder(categories=[['Basic', '2n Cycle', 'Graduation', 'Master', 'PhD']])

categorical_features_onehot = ['Marital_Status']
categorical_features_ordinal = ['Education']
numerical_features = features_num

# Definimos el SimpleImputer
imputer = SimpleImputer(strategy='median') # Me da problemas el gridsearch.... Cuerioso porque antes no...

preprocessor = ColumnTransformer(
    transformers=[
        ('num', imputer, numerical_features),
        ('cat_onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features_onehot),
        ('cat_ordinal', Pipeline([
            ('ordinal', ordinal_encoder),
            ('scaler', MinMaxScaler())
        ]), categorical_features_ordinal)
    ]
)

pipeline = Pipeline(steps=[
    ('preprocesor',preprocessor),
    ('algoritmo', RandomForestClassifier())
])

pipeline.fit(X_train,y_train)

X_train_transform = pipeline.named_steps['preprocesor'].transform(X_train)


In [10]:
X_train_transform

array([[5.0611e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [5.7811e+04, 0.0000e+00, 1.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        5.0000e-01],
       [6.9139e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        5.0000e-01],
       ...,
       [2.2585e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        5.0000e-01],
       [8.0685e+04, 0.0000e+00, 0.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        5.0000e-01],
       [3.3462e+04, 1.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        7.5000e-01]])

In [11]:
features_transformed = pipeline.named_steps['preprocesor'].get_feature_names_out()
features_transformed

array(['num__Income', 'num__Kidhome', 'num__Teenhome', 'num__Recency',
       'num__MntWines', 'num__MntFruits', 'num__MntMeatProducts',
       'num__MntFishProducts', 'num__MntSweetProducts',
       'num__MntGoldProds', 'num__NumDealsPurchases',
       'num__NumWebPurchases', 'num__NumCatalogPurchases',
       'num__NumStorePurchases', 'num__NumWebVisitsMonth',
       'num__AcceptedCmp3', 'num__AcceptedCmp4', 'num__AcceptedCmp5',
       'num__AcceptedCmp1', 'num__AcceptedCmp2', 'num__Complain',
       'num__age', 'num__customes_seniority', 'num__Household_members',
       'num__Total_amount', 'num__Total_purchase', 'num__Total_cmp',
       'cat_onehot__Marital_Status_Alone',
       'cat_onehot__Marital_Status_Divorced',
       'cat_onehot__Marital_Status_Married',
       'cat_onehot__Marital_Status_Others',
       'cat_onehot__Marital_Status_Single',
       'cat_onehot__Marital_Status_Together',
       'cat_onehot__Marital_Status_Widow', 'cat_ordinal__Education'],
      dtype=object)

In [12]:
X_train_transform_df = pd.DataFrame(X_train_transform, columns=(features_transformed))
X_train_transform_df

Unnamed: 0,num__Income,num__Kidhome,num__Teenhome,num__Recency,num__MntWines,num__MntFruits,num__MntMeatProducts,num__MntFishProducts,num__MntSweetProducts,num__MntGoldProds,...,num__Total_purchase,num__Total_cmp,cat_onehot__Marital_Status_Alone,cat_onehot__Marital_Status_Divorced,cat_onehot__Marital_Status_Married,cat_onehot__Marital_Status_Others,cat_onehot__Marital_Status_Single,cat_onehot__Marital_Status_Together,cat_onehot__Marital_Status_Widow,cat_ordinal__Education
0,50611.0,0.0,1.0,98.0,459.0,0.0,24.0,6.0,0.0,4.0,...,22.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.00
1,57811.0,0.0,1.0,49.0,545.0,7.0,114.0,37.0,21.0,78.0,...,25.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.50
2,69139.0,0.0,1.0,23.0,86.0,12.0,75.0,33.0,15.0,6.0,...,11.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.50
3,81361.0,0.0,0.0,18.0,163.0,23.0,424.0,27.0,65.0,76.0,...,27.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.50
4,23718.0,1.0,0.0,76.0,6.0,3.0,14.0,15.0,7.0,36.0,...,9.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1784,44051.0,1.0,1.0,20.0,79.0,7.0,58.0,6.0,3.0,18.0,...,12.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.75
1785,72903.0,0.0,0.0,74.0,1067.0,138.0,750.0,0.0,19.0,39.0,...,21.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.50
1786,22585.0,0.0,0.0,23.0,3.0,9.0,15.0,13.0,2.0,39.0,...,5.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.50
1787,80685.0,0.0,0.0,55.0,241.0,45.0,604.0,34.0,26.0,54.0,...,21.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.50


In [13]:
X_train_transform_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1789 entries, 0 to 1788
Data columns (total 35 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   num__Income                          1789 non-null   float64
 1   num__Kidhome                         1789 non-null   float64
 2   num__Teenhome                        1789 non-null   float64
 3   num__Recency                         1789 non-null   float64
 4   num__MntWines                        1789 non-null   float64
 5   num__MntFruits                       1789 non-null   float64
 6   num__MntMeatProducts                 1789 non-null   float64
 7   num__MntFishProducts                 1789 non-null   float64
 8   num__MntSweetProducts                1789 non-null   float64
 9   num__MntGoldProds                    1789 non-null   float64
 10  num__NumDealsPurchases               1789 non-null   float64
 11  num__NumWebPurchases          

In [14]:
features_trans_num = X_train_transform_df.select_dtypes(['int','float'])
features_trans_num

Unnamed: 0,num__Income,num__Kidhome,num__Teenhome,num__Recency,num__MntWines,num__MntFruits,num__MntMeatProducts,num__MntFishProducts,num__MntSweetProducts,num__MntGoldProds,...,num__Total_purchase,num__Total_cmp,cat_onehot__Marital_Status_Alone,cat_onehot__Marital_Status_Divorced,cat_onehot__Marital_Status_Married,cat_onehot__Marital_Status_Others,cat_onehot__Marital_Status_Single,cat_onehot__Marital_Status_Together,cat_onehot__Marital_Status_Widow,cat_ordinal__Education
0,50611.0,0.0,1.0,98.0,459.0,0.0,24.0,6.0,0.0,4.0,...,22.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.00
1,57811.0,0.0,1.0,49.0,545.0,7.0,114.0,37.0,21.0,78.0,...,25.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.50
2,69139.0,0.0,1.0,23.0,86.0,12.0,75.0,33.0,15.0,6.0,...,11.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.50
3,81361.0,0.0,0.0,18.0,163.0,23.0,424.0,27.0,65.0,76.0,...,27.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.50
4,23718.0,1.0,0.0,76.0,6.0,3.0,14.0,15.0,7.0,36.0,...,9.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1784,44051.0,1.0,1.0,20.0,79.0,7.0,58.0,6.0,3.0,18.0,...,12.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.75
1785,72903.0,0.0,0.0,74.0,1067.0,138.0,750.0,0.0,19.0,39.0,...,21.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.50
1786,22585.0,0.0,0.0,23.0,3.0,9.0,15.0,13.0,2.0,39.0,...,5.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.50
1787,80685.0,0.0,0.0,55.0,241.0,45.0,604.0,34.0,26.0,54.0,...,21.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.50


In [15]:
X_train[features_trans_num].isnull().any().any()

True

In [16]:
X_train[features_trans_num].isnull().sum()

Education              1789
Marital_Status         1789
Income                 1789
Kidhome                1789
Teenhome               1789
Recency                1789
MntWines               1789
MntFruits              1789
MntMeatProducts        1789
MntFishProducts        1789
MntSweetProducts       1789
MntGoldProds           1789
NumDealsPurchases      1789
NumWebPurchases        1789
NumCatalogPurchases    1789
NumStorePurchases      1789
NumWebVisitsMonth      1789
AcceptedCmp3           1789
AcceptedCmp4           1789
AcceptedCmp5           1789
AcceptedCmp1           1789
AcceptedCmp2           1789
Complain               1789
age                    1789
customes_seniority     1789
Household_members      1789
Total_amount           1789
Total_purchase         1789
Total_cmp              1789
dtype: int64

In [17]:
X_train_transform_df.head(60)

Unnamed: 0,num__Income,num__Kidhome,num__Teenhome,num__Recency,num__MntWines,num__MntFruits,num__MntMeatProducts,num__MntFishProducts,num__MntSweetProducts,num__MntGoldProds,...,num__Total_purchase,num__Total_cmp,cat_onehot__Marital_Status_Alone,cat_onehot__Marital_Status_Divorced,cat_onehot__Marital_Status_Married,cat_onehot__Marital_Status_Others,cat_onehot__Marital_Status_Single,cat_onehot__Marital_Status_Together,cat_onehot__Marital_Status_Widow,cat_ordinal__Education
0,50611.0,0.0,1.0,98.0,459.0,0.0,24.0,6.0,0.0,4.0,...,22.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,57811.0,0.0,1.0,49.0,545.0,7.0,114.0,37.0,21.0,78.0,...,25.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.5
2,69139.0,0.0,1.0,23.0,86.0,12.0,75.0,33.0,15.0,6.0,...,11.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.5
3,81361.0,0.0,0.0,18.0,163.0,23.0,424.0,27.0,65.0,76.0,...,27.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.5
4,23718.0,1.0,0.0,76.0,6.0,3.0,14.0,15.0,7.0,36.0,...,9.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.25
5,35388.0,1.0,0.0,20.0,6.0,4.0,7.0,4.0,3.0,8.0,...,5.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.25
6,87771.0,0.0,1.0,61.0,1492.0,38.0,287.0,50.0,57.0,33.0,...,20.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.5
7,59354.0,0.0,2.0,59.0,295.0,21.0,78.0,39.0,13.0,13.0,...,16.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.75
8,27733.0,1.0,0.0,16.0,0.0,7.0,5.0,26.0,2.0,17.0,...,7.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.25
9,75345.0,0.0,0.0,16.0,918.0,57.0,842.0,99.0,38.0,133.0,...,20.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.75


## Modelización

### Prueba de modelos con todas las variables

In [18]:
import traceback
from sklearn.exceptions import NotFittedError
pipe = Pipeline(steps=[
    ('preprocesor', preprocessor),
    ('algoritmo', RandomForestClassifier())
])


# Definir el grid de hiperparámetros
grid = [
    {'algoritmo': [RandomForestClassifier()],
     'algoritmo__max_depth': [5, 10, 15],
     'algoritmo__n_estimators': [50, 100, 200],
     'algoritmo__class_weight':['balanced']},
   
    {'algoritmo': [XGBClassifier()],
     'algoritmo__learning_rate': [0.1, 0.3, 0.5],
     'algoritmo__n_estimators': [100, 500, 1000],
     'algoritmo__scale_pos_weight':[ratio]},
    
    {'algoritmo': [LGBMClassifier()],
     'algoritmo__learning_rate': [0.1, 0.3, 0.5],
     'algoritmo__n_estimators': [100, 500, 1000],
     'algoritmo__is_unbalance' : [True],
     'algoritmo__objective': ['binary']}
]


# Realizar GridSearchCV con el scorer personalizado
grid_search = GridSearchCV(pipe, 
                           grid, 
                           cv=3, 
                           scoring=make_scorer(recall_score, pos_label=1), 
                           n_jobs=-1,
                           verbose= 3)


# Realizar GridSearchCV con el scorer personalizado
try:
    # Ajustar el GridSearchCV
    grid_search.fit(X_train[features_trans_num], y_train)
except ValueError as ve:
    print(f"ValueError: {ve}")
    traceback.print_exc()
    raise
except NotFittedError as nfe:
    print(f"NotFittedError: {nfe}")
    traceback.print_exc()
    raise
except Exception as e:
    print(f"Unexpected error: {e}")
    traceback.print_exc()
    raise
# Verificar si el ajuste fue exitoso
if grid_search.best_estimator_ is not None:
    print("Mejor modelo y parámetros:", grid_search.best_params_)
else:
    print("GridSearchCV no pudo encontrar el mejor modelo.")

Fitting 3 folds for each of 27 candidates, totalling 81 fits
ValueError: 
All the 81 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
81 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-p

Traceback (most recent call last):
  File "C:\Users\Alfonso\AppData\Local\Temp\ipykernel_2940\3262487452.py", line 41, in <module>
    grid_search.fit(X_train[features_trans_num], y_train)
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\model_selection\_search.py", line 970, in fit
    self._run_search(evaluate_candidates)
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\model_selection\_search.py", line 1527, in _run_search
    evaluate_candidates(ParameterGrid(self.param_grid))
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\model_selection\_search.py", line 947, in evaluate_candidates
    _warn_or_raise_about_fit_failures(out, self.error_score)
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _

ValueError: 
All the 81 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
81 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\pipeline.py", line 408, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\pipeline.py", line 1303, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\utils\_set_output.py", line 273, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\compose\_column_transformer.py", line 914, in fit_transform
    result = self._call_func_on_transformers(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\compose\_column_transformer.py", line 823, in _call_func_on_transformers
    return Parallel(n_jobs=self.n_jobs)(jobs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\utils\parallel.py", line 67, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\joblib\parallel.py", line 1088, in __call__
    while self.dispatch_one_batch(iterator):
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\joblib\parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\joblib\parallel.py", line 819, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\joblib\_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
             ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\joblib\_parallel_backends.py", line 597, in __init__
    self.results = batch()
                   ^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\joblib\parallel.py", line 288, in __call__
    return [func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\joblib\parallel.py", line 288, in <listcomp>
    return [func(*args, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\utils\parallel.py", line 129, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\pipeline.py", line 1303, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\pipeline.py", line 535, in fit_transform
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\pipeline.py", line 408, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\pipeline.py", line 1303, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\utils\_set_output.py", line 273, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\base.py", line 1064, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\preprocessing\_encoders.py", line 1495, in fit
    fit_results = self._fit(
                  ^^^^^^^^^^
  File "c:\Users\Alfonso\miniconda3\Lib\site-packages\sklearn\preprocessing\_encoders.py", line 159, in _fit
    raise ValueError(msg)
ValueError: Found unknown categories [nan] in column 0 during fit


In [None]:
import traceback
from sklearn.exceptions import NotFittedError
pipe = Pipeline(steps=[
    ('preprocesor', preprocessor),
    ('algoritmo', RandomForestClassifier())
])


# Definir el grid de hiperparámetros
grid = [
    {'algoritmo': [RandomForestClassifier()],
     'algoritmo__max_depth': [5, 10, 15],
     'algoritmo__n_estimators': [50, 100, 200],
     'algoritmo__class_weight':['balanced']},
   
    {'algoritmo': [XGBClassifier()],
     'algoritmo__learning_rate': [0.1, 0.3, 0.5],
     'algoritmo__n_estimators': [100, 500, 1000],
     'algoritmo__scale_pos_weight':[ratio]},
    
    {'algoritmo': [LGBMClassifier()],
     'algoritmo__learning_rate': [0.1, 0.3, 0.5],
     'algoritmo__n_estimators': [100, 500, 1000],
     'algoritmo__is_unbalance' : [True],
     'algoritmo__objective': ['binary']}
]


# Verificar la presencia de valores NaN
if X_train[features_trans_num].isnull().any().any():
    print("Se encontraron valores NaN en los datos de entrada.")

# Configurar GridSearchCV para elevar errores
grid_search = GridSearchCV(pipe, 
                           grid, 
                           cv=3, 
                           scoring=make_scorer(recall_score, pos_label=1), 
                           n_jobs=-1,
                           error_score='raise')

# Intentar ajustar el GridSearchCV
try:
    grid_search.fit(X_train[features_trans_num], y_train)
except ValueError as e:
    print(f"Error durante el ajuste: {e}")
# Verificar si el ajuste fue exitoso
if grid_search.best_estimator_ is not None:
    print("Mejor modelo y parámetros:", grid_search.best_params_)
else:
    print("GridSearchCV no pudo encontrar el mejor modelo.")

Se encontraron valores NaN en los datos de entrada.
Error durante el ajuste: Found unknown categories [nan] in column 0 during fit


AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [None]:
grid_search.best_score_

nan