In [1]:
# Load (or reload) the Kedro IPython extension; run this cell when the kernel is first started.
# NOTE(review): presumably this is what exposes the `catalog` object used further down — confirm.
%reload_ext kedro.ipython

## Librerías

In [2]:
import warnings

# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")

In [3]:
#Import de librerias basicas tablas y matrices
import numpy as np 
import pandas as pd 

#Gradient Boosting
import lightgbm as lgb

#Funciones auxiliares sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold #Split y cross Validation
from sklearn.metrics import cohen_kappa_score, accuracy_score, balanced_accuracy_score #Metricas
from sklearn.utils import shuffle 

#Visualization
from plotly import express as px

#Plot de matriz de confusion normalizada en actuals
from utils import plot_confusion_matrix

import os

#Optimizacion de hiperparametros
import optuna
from optuna.artifacts import FileSystemArtifactStore, upload_artifact

#Guardado de objetos en archivos joblib
from joblib import load, dump

## Definimos paths y parámetros de configuración

In [4]:
# Root used to build every path below, relative to the notebook's working directory
BASE_DIR = '../'

# Output directory for trained models
PATH_TO_MODELS = os.path.join(BASE_DIR, "src/models")

# Files written locally before being uploaded to Optuna as artifacts
PATH_TO_TEMP_FILES = os.path.join(BASE_DIR, "src/optuna_temp_artifacts")

# Directory backing the artifact store that Optuna manages
PATH_TO_OPTUNA_ARTIFACTS = os.path.join(BASE_DIR, "src/optuna_artifacts")



# Random seed passed to the train/test split
SEED = 37
# Fraction of the dataset held out as the test split
TEST_SIZE = 0.30

In [21]:
def list_files_in_directory(path):
    """Return the names of the entries found directly inside `path`.

    The result comes straight from `os.listdir`, so sub-directories are
    listed alongside regular files and the order is not guaranteed.
    """
    return os.listdir(path)

# Show what is currently stored in the Optuna artifacts directory
list_files_in_directory(PATH_TO_OPTUNA_ARTIFACTS)

[1m[[0m[1m][0m

## Carga de datos

In [5]:
# Load the "train" entry through `catalog` (presumably the Kedro data catalog exposed by kedro.ipython)
dataset = catalog.load("train")

In [6]:
# Column names, non-null counts and dtypes of the loaded frame
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14993 entries, 0 to 14992
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Type           14993 non-null  int64  
 1   Name           13728 non-null  object 
 2   Age            14993 non-null  int64  
 3   Breed1         14993 non-null  int64  
 4   Breed2         14993 non-null  int64  
 5   Gender         14993 non-null  int64  
 6   Color1         14993 non-null  int64  
 7   Color2         14993 non-null  int64  
 8   Color3         14993 non-null  int64  
 9   MaturitySize   14993 non-null  int64  
 10  FurLength      14993 non-null  int64  
 11  Vaccinated     14993 non-null  int64  
 12  Dewormed       14993 non-null  int64  
 13  Sterilized     14993 non-null  int64  
 14  Health         14993 non-null  int64  
 15  Quantity       14993 non-null  int64  
 16  Fee            14993 non-null  int64  
 17  State          14993 non-null  int64  
 18  Rescue

Variables que se descartan (no se usan para entrenar el modelo):

* Name
* Description
* PetID
* RescuerID

In [7]:
# NOTE: despite its name, `features` lists the columns that are DROPPED before modelling
# (it is the argument given to `dataset.drop` in the train/test split), not the model inputs.
features = ["Name", "Description", "PetID", "RescuerID"]
# Name of the column to predict
target = "AdoptionSpeed"

In [8]:
# Hold out TEST_SIZE of the rows as the test split, stratifying on the target,
# after removing the discarded columns.
train, test = train_test_split(
    dataset.drop(columns=features),
    test_size=TEST_SIZE,
    random_state=SEED,
    stratify=dataset[target],
)

In [9]:
# Separate predictors (X) from the target (y) for each partition
X_train, y_train = train.drop(columns=target), train[target]
X_test, y_test = test.drop(columns=target), test[target]

## LGBM

In [10]:
# Objective for Optuna: hyperparameters must be sampled through the `trial` object it passes in
def lgb_objective(trial):
    """Fit a LightGBM multiclass model with sampled hyperparameters and score it.

    Reads the notebook-level `X_train`, `y_train`, `X_test` and `y_test`.

    Parameters
    ----------
    trial : optuna trial
        Object used to sample the hyperparameters of this run.

    Returns
    -------
    float
        Quadratic-weighted Cohen's kappa of the fitted model on `X_test` / `y_test`.
    """
    # Settings that stay the same for every trial
    fixed_params = {
        'objective': 'multiclass',
        'verbosity': -1,
        'num_class': len(y_train.unique()),
    }

    # Search space. Each suggest_* call takes (name, low, high);
    # log=True samples the value on a logarithmic scale.
    tuned_params = {
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    # Fit on the whole training split
    booster = lgb.train(
        fixed_params | tuned_params,
        lgb.Dataset(data=X_train, label=y_train),
    )

    # NOTE(review): the value Optuna optimises here is computed on the test split.
    predicted_class = booster.predict(X_test).argmax(axis=1)
    return cohen_kappa_score(y_test, predicted_class, weights='quadratic')

In [11]:
# Define the study to optimise
study = optuna.create_study(direction='maximize', # we want to maximise the metric
                            storage="sqlite:///../src/db.sqlite3",  # Specify the storage URL here.
                            study_name="04 - LGB Multiclass", # experiment name
                            load_if_exists=True) # resume the study if it already exists

# Run 100 trials looking for better hyperparameters
study.optimize(lgb_objective, n_trials=100)

[I 2024-08-23 16:13:49,752] Using an existing study with name '04 - LGB Multiclass' instead of creating a new one.


[I 2024-08-23 16:13:51,004] Trial 200 finished with value: 0.35548663398583547 and parameters: {'lambda_l1': 4.788494053523894, 'lambda_l2': 2.717268256229093e-06, 'num_leaves': 222, 'feature_fraction': 0.9777051564835152, 'bagging_fraction': 0.7156186950523904, 'bagging_freq': 7, 'min_child_samples': 87}. Best is trial 195 with value: 0.3710338845595085.
[I 2024-08-23 16:13:52,754] Trial 201 finished with value: 0.3625529154757887 and parameters: {'lambda_l1': 6.442557894292415, 'lambda_l2': 3.1455605976448493e-06, 'num_leaves': 226, 'feature_fraction': 0.9903244785264335, 'bagging_fraction': 0.7326190585951554, 'bagging_freq': 7, 'min_child_samples': 93}. Best is trial 195 with value: 0.3710338845595085.
[I 2024-08-23 16:13:54,265] Trial 202 finished with value: 0.3548774333349385 and parameters: {'lambda_l1': 6.56203246736933, 'lambda_l2': 8.364440911960863e-07, 'num_leaves': 217, 'feature_fraction': 0.9976792696979697, 'bagging_fraction': 0.7011178734944339, 'bagging_freq': 7, 'min

In [12]:
# Hyperparameters of the best trial recorded in the study
study.best_params


[1m{[0m
    [32m'lambda_l1'[0m: [1;36m9.9811567627623[0m,
    [32m'lambda_l2'[0m: [1;36m4.063506237329485e-06[0m,
    [32m'num_leaves'[0m: [1;36m234[0m,
    [32m'feature_fraction'[0m: [1;36m0.963385272686421[0m,
    [32m'bagging_fraction'[0m: [1;36m0.7240942432705437[0m,
    [32m'bagging_freq'[0m: [1;36m7[0m,
    [32m'min_child_samples'[0m: [1;36m91[0m
[1m}[0m

### Entreno con los mejores parámetros de Optuna

In [13]:
# Reproduce the optimisation result by retraining with the best hyperparameter set.
# The parameters combine the fixed settings with the best solution found by Optuna.
lgb_params = {
    'objective': 'multiclass',
    'verbosity': -1,
    'num_class': len(y_train.unique()),
} | study.best_params

lgb_train_dataset = lgb.Dataset(data=X_train,
                                label=y_train)

# Train on the whole training split
lgb_model = lgb.train(lgb_params,
                      lgb_train_dataset)

# Predict the test split once and reuse the result for both the plot and the metric
y_test_pred = lgb_model.predict(X_test).argmax(axis=1)

# Show the confusion matrix, then the quadratic-weighted kappa as the cell output
display(plot_confusion_matrix(y_test, y_test_pred))

cohen_kappa_score(y_test, y_test_pred, weights='quadratic')


[1;36m0.37299163521315637[0m

In [19]:
# Class distribution of the target in the test split
test[target].value_counts()


AdoptionSpeed
[1;36m4[0m    [1;36m1259[0m
[1;36m2[0m    [1;36m1211[0m
[1;36m3[0m     [1;36m978[0m
[1;36m1[0m     [1;36m927[0m
[1;36m0[0m     [1;36m123[0m
Name: count, dtype: int64

## Modelo con CV 

In [14]:
# Custom evaluation metric so LightGBM can evaluate kappa (and early-stop) during cross validation
def lgb_custom_metric_kappa(dy_pred, dy_true):
    """Quadratic-weighted Cohen's kappa packaged as a LightGBM `feval` result.

    Parameters
    ----------
    dy_pred : array-like
        Predicted scores; the predicted class is the argmax along axis 1.
    dy_true : object exposing `get_label()`
        Evaluation data holding the true labels (presumably a `lgb.Dataset`).

    Returns
    -------
    tuple
        `('kappa', value, True)`; the last item flags that higher is better.
    """
    predicted_class = dy_pred.argmax(axis=1)
    kappa = cohen_kappa_score(dy_true.get_label(), predicted_class, weights='quadratic')
    return 'kappa', kappa, True

# Objective to optimise: 5-fold cross validation over the training split.
# The CV score is what Optuna optimises. The 5 fold models are also used to predict the test split,
# and the predictions, the confusion matrix and the test score are recorded in Optuna.
# CV score   -> measures how good the hyperparameters are
# Test score -> sanity check only; those rows are not (and must not be) used for training
#               or for hyperparameter optimisation
def cv_es_lgb_objective(trial):
    """Score one hyperparameter set with stratified 5-fold CV and log test-set artifacts.

    Reads the notebook-level `X_train`, `y_train`, `X_test`, `y_test`, `test`,
    `PATH_TO_TEMP_FILES` and `artifact_store` (the latter must be defined before
    the study is run).

    Parameters
    ----------
    trial : optuna trial
        Used to sample hyperparameters, to name the artifact files and to attach
        the artifacts and the `test_score` user attribute.

    Returns
    -------
    float
        Mean over the folds of the out-of-fold quadratic-weighted Cohen's kappa.
    """
    n_classes = len(y_train.unique())

    # LightGBM parameters
    lgb_params = {
        # Fixed settings
        'objective': 'multiclass',
        'verbosity': -1,
        'num_class': n_classes,
        # Search space. Each suggest_* call takes (name, low, high);
        # log=True samples the value on a logarithmic scale.
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    # Class scores of the fold models on the test split, summed across folds
    scores_ensemble = np.zeros((len(y_test), n_classes))

    # Running mean of the out-of-fold kappa
    score_folds = 0

    # Number of CV splits
    n_splits = 5

    # Stratified splitter for the cross validation
    skf = StratifiedKFold(n_splits=n_splits)

    for if_index, oof_index in skf.split(X_train, y_train):
        X_if, y_if = X_train.iloc[if_index], y_train.iloc[if_index]
        X_oof, y_oof = X_train.iloc[oof_index], y_train.iloc[oof_index]

        # In-fold dataset (used for training)
        lgb_if_dataset = lgb.Dataset(data=X_if,
                                     label=y_if,
                                     free_raw_data=False)

        # Out-of-fold dataset (used to measure CV performance and for early stopping)
        lgb_oof_dataset = lgb.Dataset(data=X_oof,
                                      label=y_oof,
                                      free_raw_data=False)

        # Train with early stopping (patience of 10 rounds) on the out-of-fold data.
        # NOTE(review): no 'metric' is set in lgb_params, so LightGBM presumably also evaluates its
        # default multiclass metric and early stopping may trigger on it rather than on kappa — confirm.
        lgb_model = lgb.train(lgb_params,
                              lgb_if_dataset,
                              valid_sets=[lgb_oof_dataset],
                              callbacks=[lgb.early_stopping(10, verbose=False)],
                              feval=lgb_custom_metric_kappa)

        # Accumulate this fold model's class scores on the held-out test split
        # (the TEST_SIZE share of the data that no fold ever trains on)
        scores_ensemble = scores_ensemble + lgb_model.predict(X_test)

        # Kappa on the rows left out of this fold, weighted so the total is the fold mean
        oof_pred = lgb_model.predict(X_oof).argmax(axis=1)
        score_folds += cohen_kappa_score(y_oof, oof_pred, weights='quadratic') / n_splits

    # The temp directory may not exist on a fresh checkout; create it before writing files
    os.makedirs(PATH_TO_TEMP_FILES, exist_ok=True)

    # Save this trial's predictions on the test split
    predicted_filename = os.path.join(PATH_TO_TEMP_FILES, f'test_{trial.study.study_name}_{trial.number}.joblib')
    # Copy of the test frame with one extra column holding, per row, the summed class scores
    predicted_df = test.copy()
    predicted_df['pred'] = list(scores_ensemble)
    dump(predicted_df, predicted_filename)
    # Attach the file to the trial (keyword arguments: positional use is deprecated in newer Optuna)
    upload_artifact(study_or_trial=trial, file_path=predicted_filename, artifact_store=artifact_store)

    # Ensemble prediction: class with the highest summed score
    test_pred = scores_ensemble.argmax(axis=1)

    # Save the confusion matrix image and attach it to the trial
    cm_filename = os.path.join(PATH_TO_TEMP_FILES, f'cm_{trial.study.study_name}_{trial.number}.jpg')
    plot_confusion_matrix(y_test, test_pred).write_image(cm_filename)
    upload_artifact(study_or_trial=trial, file_path=cm_filename, artifact_store=artifact_store)

    # Record the test-split score as an additional metric in Optuna
    test_score = cohen_kappa_score(y_test, test_pred, weights='quadratic')
    trial.set_user_attr("test_score", test_score)

    # Optuna optimises the 5-fold CV score
    return score_folds

In [22]:
# Initialise Optuna's artifact (file) store
artifact_store = FileSystemArtifactStore(base_path=PATH_TO_OPTUNA_ARTIFACTS)

# Create the study, or resume it if it already exists
study = optuna.create_study(direction='maximize',
                            storage="sqlite:///../src/db.sqlite3",  # Specify the storage URL here.
                            study_name="04 - LGB Multiclass CV",
                            load_if_exists = True)
# Run the optimisation
study.optimize(cv_es_lgb_objective, n_trials=100)

[I 2024-08-23 16:24:26,427] Using an existing study with name '04 - LGB Multiclass CV' instead of creating a new one.
[I 2024-08-23 16:24:32,357] Trial 100 finished with value: 0.32965043926574034 and parameters: {'lambda_l1': 1.3543941505437676e-07, 'lambda_l2': 0.0033351246620067403, 'num_leaves': 4, 'feature_fraction': 0.6134754113323936, 'bagging_fraction': 0.8761652060734261, 'bagging_freq': 3, 'min_child_samples': 55}. Best is trial 83 with value: 0.36524086892355434.
[I 2024-08-23 16:24:37,796] Trial 101 finished with value: 0.3337981061111496 and parameters: {'lambda_l1': 3.950007126853082e-08, 'lambda_l2': 0.0013268282135546878, 'num_leaves': 151, 'feature_fraction': 0.5267479386989775, 'bagging_fraction': 0.7869354345321885, 'bagging_freq': 3, 'min_child_samples': 62}. Best is trial 83 with value: 0.36524086892355434.
[I 2024-08-23 16:24:42,147] Trial 102 finished with value: 0.3319678141651714 and parameters: {'lambda_l1': 6.280896995995947e-07, 'lambda_l2': 0.00025382776645

In [23]:
!optuna-dashboard sqlite:///../work/db.sqlite3 --artifact-dir ../work/optuna_artifacts --port 8081

Traceback (most recent call last):
  File "/home/frank/maestria_mcd/labo2/Competencia/env/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 146, in __init__
    self._dbapi_connection = engine.raw_connection()
  File "/home/frank/maestria_mcd/labo2/Competencia/env/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 3302, in raw_connection
    return self.pool.connect()
  File "/home/frank/maestria_mcd/labo2/Competencia/env/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 449, in connect
    return _ConnectionFairy._checkout(self)
  File "/home/frank/maestria_mcd/labo2/Competencia/env/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 1263, in _checkout
    fairy = _ConnectionRecord.checkout(pool)
  File "/home/frank/maestria_mcd/labo2/Competencia/env/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 712, in checkout
    rec = pool._do_get()
  File "/home/frank/maestria_mcd/labo2/Competencia/env/lib/python3.10/site-packages/sqlalchemy/