In [1]:
import os

import pandas as pd
import polars as pl
import polars.selectors as cs
from matplotlib import pyplot as plt
import numpy as np

from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.experimental import enable_halving_search_cv

from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split, HalvingRandomSearchCV, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from scipy.stats import randint, uniform
from sklearn.metrics import make_scorer, average_precision_score


from sklearn.impute import SimpleImputer
from optuna import Trial


In [2]:
TARGET = ['target']
TRAIN_DATA_PATH = os.path.join('data', 'first_100k.parquet')
TEST_DATA_PATH = os.path.join('data', 'second_100k_variables.parquet')

In [3]:
df = pl.scan_parquet(TRAIN_DATA_PATH).collect()

In [4]:
cb_ = CatBoostClassifier(verbose=0)

In [5]:
def optimize_numeric_column(df : pl.DataFrame, column_name : str) -> pl.DataFrame:
    
    # Extract the column from the DataFrame
    column = df[column_name]

    # Get the column data type
    dtype = column.dtype()

    int_types = {
        pl.UInt8: {0, 255},
        pl.UInt16: {0, 65535},
        pl.UInt32: {0, 4294967295},
        pl.UInt64: {0, 18446744073709551615},
        pl.Int8: {-128, 128},
        pl.Int16: {-32768, 32768},
        pl.Int32: {-2147483648, 2147483648},
        pl.Int64: {-9223372036854775808, 9223372036854775808}
    }

    if dtype in int_types:
        # Convert integer column to smaller integer types if possible
        column_min = column.min()
        column_max = column.max()
        
        if column_min is not None and column_max is not None:
            for int_type in int_types:
                int_min = min(int_types.get(int_type))
                int_max = max(int_types.get(int_type))

                if column_min >= int_min and column_max <= int_max:
                    column = column.cast(int_type)
                    break

    # Update the column in the DataFrame
    return df.drop(column_name).with_columns(column).select(df.columns)

In [7]:
categorical_columns = df.select(~cs.by_dtype(pl.NUMERIC_DTYPES)).columns
numerical_columns = [
    column for column in df.select(cs.by_dtype(pl.NUMERIC_DTYPES)).columns if column!='target' and column!='index'
]

In [8]:
X = df.select(numerical_columns+categorical_columns).to_pandas()
y = df.select(TARGET).to_pandas().values.ravel()

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=0)

del df

## Optuna preprocessing

Ignorar el código del `ExtraTreesClassifier`, lo dejé como referencia para fijarme de los trial.suggest_

In [9]:
# from sklearn.ensemble import ExtraTreesClassifier

# def instantiate_extra_trees(trial : Trial) -> ExtraTreesClassifier:
#   params = {
#     'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
#     'max_depth': trial.suggest_int('max_depth', 1, 20),
#     'max_features': trial.suggest_float('max_features', 0, 1),
#     'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
#     'n_jobs': -1,
#     'random_state': 42
#   }
#   return ExtraTreesClassifier(**params)


def instantiate_catboost(trial: Trial) -> CatBoostClassifier:
  params = {
    'iterations': 1_000,  #Better for batch learning 
    'depth': trial.suggest_int('depth', 1, 10),
    "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
    "subsample": trial.suggest_float("subsample", 0.05, 1.0),
    "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
    "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    'random_state': 0,
  }
  return CatBoostClassifier(**params)

In [10]:
from sklearn.impute import SimpleImputer
from optuna import Trial

def instantiate_numerical_simple_imputer(trial : Trial, fill_value : int=-1) -> SimpleImputer:
  strategy = trial.suggest_categorical(
    'numerical_strategy', ['mean', 'median', 'most_frequent', 'constant']
  )
  return SimpleImputer(strategy=strategy, fill_value=fill_value)

def instantiate_categorical_simple_imputer(trial : Trial, fill_value : str='missing') -> SimpleImputer:
  strategy = trial.suggest_categorical(
    'categorical_strategy', ['most_frequent', 'constant']
  )
  return SimpleImputer(strategy=strategy, fill_value=fill_value)

In [11]:
from category_encoders import WOEEncoder

def instantiate_woe_encoder(trial : Trial) -> WOEEncoder:
  params = {
    'sigma': trial.suggest_float('sigma', 0.001, 5),
    'regularization': trial.suggest_float('regularization', 0, 5),
    'randomized': trial.suggest_categorical('randomized', [True, False])
  }
  return WOEEncoder(**params)

In [12]:
from sklearn.preprocessing import RobustScaler

def instantiate_robust_scaler(trial : Trial) -> RobustScaler:
  params = {
    'with_centering': trial.suggest_categorical(
      'with_centering', [True, False]
    ),
    'with_scaling': trial.suggest_categorical(
      'with_scaling', [True, False]
    )
  }
  return RobustScaler(**params)

## Pipeline

In [13]:
from sklearn.pipeline import Pipeline

def instantiate_numerical_pipeline(trial : Trial) -> Pipeline:
  pipeline = Pipeline([
    ('imputer', instantiate_numerical_simple_imputer(trial)),
    ('scaler', instantiate_robust_scaler(trial))
  ])
  return pipeline

def instantiate_categorical_pipeline(trial : Trial) -> Pipeline:
  pipeline = Pipeline([
    ('imputer', instantiate_categorical_simple_imputer(trial)),
    ('encoder', instantiate_woe_encoder(trial)),
  ])
  return pipeline



In [14]:
def instantiate_processor(trial : Trial, numerical_columns : list[str], categorical_columns : list[str]) -> ColumnTransformer:
  
  numerical_pipeline = instantiate_numerical_pipeline(trial)
  categorical_pipeline = instantiate_categorical_pipeline(trial)
  
  processor = ColumnTransformer([
    ('numerical_pipeline', numerical_pipeline, numerical_columns),
    ('categorical_pipeline', categorical_pipeline, categorical_columns)
  ])
  
  return processor

def instantiate_model(trial : Trial, numerical_columns : list[str], categorical_columns : list[str]) -> Pipeline:
  
  processor = instantiate_processor(
    trial, numerical_columns, categorical_columns
  )
  catboost = instantiate_catboost(trial)
  
  model = Pipeline([
    ('processor', processor),
    ('catboost', catboost)
  ])
  
  return model

## Objective Function

In [15]:
from typing import Optional
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import roc_auc_score, make_scorer
from pandas import DataFrame, Series
import numpy as np

def objective(trial : Trial, X : DataFrame, y : np.ndarray | Series, numerical_columns : Optional[list[str]]=None, categorical_columns : Optional[list[str]]=None, random_state : int=42) -> float:
  if numerical_columns is None:
    numerical_columns = [
      *X.select_dtypes(exclude=['object', 'category']).columns
    ]
  
  if categorical_columns is None:
    categorical_columns = [
      *X.select_dtypes(include=['object', 'category']).columns
    ]
  
  model = instantiate_model(trial, numerical_columns, categorical_columns)
  
  kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
  roc_auc_scorer = make_scorer(roc_auc_score, needs_proba=True)
  scores = cross_val_score(model, X, y, scoring=roc_auc_scorer, cv=kf)
  
  return np.min([np.mean(scores), np.median([scores])])



# CORRER ESTO

Esto es lo que debería optimizar el study y nos daría los mejores parámetros para el preprocessing y el modelo

In [16]:
import optuna

study = optuna.create_study(study_name='optimization', direction='maximize')

study.optimize(
    lambda trial: objective(trial, X_train, y_train),
    n_trials=30
)



[I 2024-07-05 06:12:39,195] A new study created in memory with name: optimization


0:	learn: 0.6589452	total: 123ms	remaining: 2m 3s
1:	learn: 0.6290643	total: 165ms	remaining: 1m 22s
2:	learn: 0.6010249	total: 211ms	remaining: 1m 10s
3:	learn: 0.5768464	total: 241ms	remaining: 60s
4:	learn: 0.5556150	total: 282ms	remaining: 56.1s
5:	learn: 0.5386149	total: 311ms	remaining: 51.6s
6:	learn: 0.5222052	total: 358ms	remaining: 50.8s
7:	learn: 0.5083598	total: 399ms	remaining: 49.5s
8:	learn: 0.4956767	total: 450ms	remaining: 49.5s
9:	learn: 0.4861272	total: 484ms	remaining: 47.9s
10:	learn: 0.4762673	total: 524ms	remaining: 47.1s
11:	learn: 0.4675651	total: 555ms	remaining: 45.7s
12:	learn: 0.4602027	total: 596ms	remaining: 45.3s
13:	learn: 0.4538124	total: 627ms	remaining: 44.2s
14:	learn: 0.4490282	total: 676ms	remaining: 44.4s
15:	learn: 0.4440668	total: 709ms	remaining: 43.6s
16:	learn: 0.4392297	total: 752ms	remaining: 43.5s
17:	learn: 0.4355876	total: 786ms	remaining: 42.9s
18:	learn: 0.4318839	total: 826ms	remaining: 42.7s
19:	learn: 0.4292479	total: 856ms	remaini

[W 2024-07-05 06:14:07,251] Trial 0 failed with parameters: {'numerical_strategy': 'mean', 'with_centering': False, 'with_scaling': True, 'categorical_strategy': 'most_frequent', 'sigma': 4.63557130226213, 'regularization': 3.6198456517538324, 'randomized': False, 'depth': 5, 'learning_rate': 0.059143502399963147, 'subsample': 0.25093177897830277, 'colsample_bylevel': 0.2311157322626548, 'min_data_in_leaf': 29} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_64219/785271184.py", line 6, in <lambda>
    lambda trial: objective(trial, X_train, y_train),
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_64219/1012677992.py", line 22, in objective
    scores = cross_val_score(model, X, y, scoring=roc_auc_scorer, cv=kf)
             ^^^

549:	learn: 0.3637495	total: 24.2s	remaining: 19.8s


KeyboardInterrupt: 