In [289]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb

from sklearn.model_selection import train_test_split, StratifiedKFold
#from sklearn.metrics import cohen_kappa_score, accuracy_score,balanced_accuracy_score
from sklearn.utils import shuffle
from sklearn.model_selection import KFold # Use KFold for regression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

#from plotly import express as px

#from utils import plot_confusion_matrix

import os

import optuna
from optuna.artifacts import FileSystemArtifactStore, upload_artifact

from joblib import load, dump
import pyarrow.parquet as pq
import joblib


import gc
import shutil
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin


## Armado Dataset

In [290]:
# Paths (all relative to BASE_DIR, one level above the notebook's working directory)
BASE_DIR = '../'

# Alternative inputs used previously: "Dataset/train.parquet", "Dataset/train_producto.parquet"
PATH_TO_TRAIN = os.path.join(BASE_DIR, "Dataset/1_c_producto_train.parquet")
PATH_TO_MODELS = os.path.join(BASE_DIR, "LGBM/models")
PATH_TO_TEMP_FILES = os.path.join(BASE_DIR, "LGBM/optuna_/optuna_temp_artifacts")
PATH_TO_OPTUNA_ARTIFACTS = os.path.join(BASE_DIR, "LGBM/optuna_/optuna_artifacts")
# Backup folder for previously trained models (the original line assigned this name twice)
PATH_TO_MODELS_OLD = os.path.join(BASE_DIR, "LGBM/models/old")

# Optuna storage URL and study name
nombrebase = "sqlite:///db.sqlite33"
nombreestudio = "producto_linear"

# Reproducibility / split configuration
SEED = 42
BATCH_SIZE = 50
TEST_SIZE = 0.2
PATH_TO_TRAIN

'../Dataset/1_c_producto_train.parquet'

In [291]:
# Load the full modelling table from the parquet file configured in PATH_TO_TRAIN.
dataset = pd.read_parquet(PATH_TO_TRAIN)

In [292]:
# Drop the 'ultima_tn' column in place so it is not used as a feature.
# NOTE(review): the reason is not stated here — presumably it leaks the target; confirm.
del dataset['ultima_tn']

In [293]:
dataset['clase_producto'].dtype

dtype('float16')

In [294]:
dataset.shape

(29652, 311)

In [295]:
# Names of the categorical features: every column stored as object or category dtype.
_categorical_view = dataset.select_dtypes(include=['object', 'category'])
char_feats = list(_categorical_view.columns)
del _categorical_view
print(char_feats)

['cat1', 'cat2', 'cat3', 'brand', 'categoria', 'estado_producto']


In [296]:
# Names of every numeric column (integers and floats).
_numeric_view = dataset.select_dtypes(include=['number'])
numeric_feats = list(_numeric_view.columns)
del _numeric_view
print(numeric_feats)

['periodo', 'product_id', 'producto_total_tn', 'avg_tn', 'std_tn', 'clientes_distintos', 'cust_request_qty', 'cust_request_tn', 'inicio_vida_p', 'fin_vida_p', 'sku_size', 'stock_final', 'dias_mes_4', 'dias_mes_3', 'anio', 'mes', 'trimestre', 'outlier', 'outlier-2', 'tn_lag_1', 'tn_lag_2', 'tn_lag_3', 'tn_lag_4', 'tn_lag_5', 'tn_lag_6', 'tn_lag_7', 'tn_lag_8', 'tn_lag_9', 'tn_lag_10', 'tn_lag_11', 'tn_lag_12', 'tn_lag_13', 'tn_lag_14', 'tn_lag_15', 'tn_lag_16', 'tn_lag_17', 'tn_lag_18', 'tn_lag_19', 'tn_lag_20', 'tn_lag_21', 'tn_lag_22', 'tn_lag_23', 'tn_lag_24', 'tn_lag_25', 'tn_lag_26', 'tn_lag_27', 'tn_lag_28', 'tn_lag_29', 'tn_lag_30', 'tn_lag_31', 'tn_lag_32', 'tn_lag_33', 'tn_lag_34', 'tn_lag_35', 'tn_lag_36', 'tn_media_movil_3', 'tn_media_movil_6', 'tn_media_movil_9', 'tn_media_movil_12', 'tn_std_movil_3', 'tn_std_movil_6', 'tn_std_movil_9', 'tn_std_movil_12', 'tn_min_movil_3', 'tn_min_movil_6', 'tn_min_movil_9', 'tn_min_movil_12', 'delta_media_movil_12', 'delta_media_movil_3', '

In [297]:
dataset.shape

(29652, 311)

In [298]:
gc.collect()

3074

In [299]:
def reduce_mem_usage(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    """Re-type the columns of ``df`` in place and return the same DataFrame.

    Per-column rules:
      * datetime64 columns -> int64 epoch values (in the column's native unit,
        nanoseconds for ``datetime64[ns]``) -> float32. float32 cannot represent
        those integers exactly, so this conversion is lossy.
      * non-numeric, non-categorical columns -> ``category``.
      * integer columns without missing values -> the narrowest of
        int8 / int16 / int32 / int64 whose range contains [min, max].
      * integer columns with missing values -> left unchanged.
      * every other numeric column -> float32.

    Because every non-integer numeric column becomes float32, a frame that
    already stores float16 columns is *widened* and the reported "reduction"
    can be negative.

    Args:
        df: DataFrame to convert; it is modified in place.
        verbose: when True, print memory usage before/after and the relative change.

    Returns:
        The same ``df`` object, after conversion.
    """
    start_mem = df.memory_usage(deep=True).sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        # Datetime -> int64 epoch value -> float32.
        # ``astype('int64')`` replaces the deprecated ``Series.view('int64')``.
        if pd.api.types.is_datetime64_any_dtype(col_type):
            df[col] = df[col].astype('int64').astype('float32')
            continue

        # Non-numeric columns are only turned into categoricals.
        if not pd.api.types.is_numeric_dtype(col_type):
            # isinstance check replaces the deprecated ``is_categorical_dtype``.
            if not isinstance(col_type, pd.CategoricalDtype):
                df[col] = df[col].astype('category')
            continue

        c_min, c_max = df[col].min(), df[col].max()
        has_na = df[col].isnull().any()

        # --- INTEGERS ---
        if pd.api.types.is_integer_dtype(col_type):
            # Only columns without missing values are downcast to plain numpy ints.
            if not has_na:
                for candidate in (np.int8, np.int16, np.int32):
                    limits = np.iinfo(candidate)
                    if c_min >= limits.min and c_max <= limits.max:
                        df[col] = df[col].astype(candidate)
                        break
                else:
                    df[col] = df[col].astype(np.int64)

        # --- FLOATS (and any other numeric dtype, e.g. bool) ---
        else:
            df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f'Uso de memoria inicial del DataFrame: {start_mem:.2f} MB')
        print(f'Uso de memoria final del DataFrame:   {end_mem:.2f} MB')
        print(f'Memoria reducida en un {reduction:.2f}%')
    return df

In [300]:
# Re-type the columns in place (reduce_mem_usage returns the same DataFrame object).
# The recorded output shows memory growing (19.68 MB -> 33.22 MB): the function casts
# every float column to float32, and this frame holds float16 data (e.g. 'clase_producto').
dataset = reduce_mem_usage(dataset)

Uso de memoria inicial del DataFrame: 19.68 MB
Uso de memoria final del DataFrame:   33.22 MB
Memoria reducida en un -68.84%


  if not pd.api.types.is_categorical_dtype(col_type):
  if not pd.api.types.is_categorical_dtype(col_type):
  if not pd.api.types.is_categorical_dtype(col_type):
  df[col] = df[col].view('int64').astype('float32')


In [301]:
gc.collect()

0

In [302]:
dataset.shape

(29652, 311)

In [303]:
# Random hold-out split. No stratification is applied because this is a
# regression problem rather than a classification one.
_split_options = {"test_size": TEST_SIZE, "random_state": SEED}
train, test = train_test_split(dataset, **_split_options)
del _split_options

In [304]:
train.shape

(23721, 311)

In [305]:
# Delete the full DataFrame now that it has been split into train / test,
# then ask the garbage collector to reclaim the memory.
del dataset

gc.collect()

0

In [306]:
# Target column, and the feature list: every other column, in the frame's current order.
label = 'clase_producto'
features = train.columns[train.columns != label].tolist()

In [307]:
# Round-trip the train split through a parquet file in the working directory.
# index=False means the shuffled row index is not stored, so the reloaded frame
# comes back with a fresh default index.
# NOTE(review): presumably done to release/defragment memory — confirm intent.
train.to_parquet("train_tmp.parquet", index=False)
del train
gc.collect()

train = pd.read_parquet("train_tmp.parquet")


In [308]:
train['producto_total_tn'].dtype

dtype('float32')

In [309]:
# ⚙️ definís tu scaler
class StdDivisorScaler(BaseEstimator, TransformerMixin):
    """Scale every column by its standard deviation, without centering.

    Missing values (NaN) are ignored when estimating the standard deviation and
    are left as NaN by ``transform``.

    Attributes:
        stds_: per-column divisor learned by ``fit``. Columns whose standard
            deviation is 0 (constant) or NaN (no observed values) get a divisor
            of 1.0, so they pass through unchanged instead of producing
            inf/NaN output.
    """

    @staticmethod
    def _as_float_array(X):
        """Return ``X`` as a floating-point ndarray.

        Float inputs keep their dtype. Anything else (e.g. integers) is promoted
        to float64 so that divided values are not truncated back to integers.
        """
        X = np.asarray(X)
        if not np.issubdtype(X.dtype, np.floating):
            X = X.astype(np.float64)
        return X

    def fit(self, X, y=None):
        """Learn the per-column standard deviation of ``X`` (NaN-aware).

        Args:
            X: 2-D array-like of numeric values.
            y: ignored; present for scikit-learn API compatibility.

        Returns:
            self
        """
        X = self._as_float_array(X)
        stds = np.nanstd(X, axis=0)
        # Constant columns (std == 0) and all-NaN columns (std is NaN) would turn
        # valid values into inf/NaN at transform time; use a neutral divisor instead.
        stds[(stds == 0) | np.isnan(stds)] = 1.0
        self.stds_ = stds
        return self

    def transform(self, X):
        """Divide each column of ``X`` by the divisor learned in ``fit``.

        Args:
            X: 2-D array-like with the same column layout used in ``fit``.

        Returns:
            A new float ndarray of the same shape as ``X``; NaN entries stay NaN.
        """
        X = self._as_float_array(X)
        # NaN / divisor is still NaN, so no explicit mask is needed; broadcasting
        # applies one divisor per column. Keep the input's float dtype.
        return (X / self.stds_).astype(X.dtype, copy=False)

# Columns: identifiers, the target and the categorical features are passed through
# untouched; every other column of train is scaled.
cols_to_keep = ['periodo', 'product_id', 'clase_producto'] + char_feats
cols_to_scale = [col for col in train.columns if col not in cols_to_keep]

# Original dtypes of both column groups, recorded so they can be restored later
dtypes_originales = train[cols_to_keep].dtypes.to_dict()
dtypes_escala = train[cols_to_scale].dtypes.to_dict()

# Build and fit the preprocessor (scaler on cols_to_scale, passthrough on cols_to_keep)
preprocessor = ColumnTransformer(
    transformers=[
        ('custom_scaler', StdDivisorScaler(), cols_to_scale),
        ('passthrough', 'passthrough', cols_to_keep)
    ]
)

preprocessor.fit(train)

# Once fitted, the scaler step can be used on its own: transform only the scaled columns
X_scaled = preprocessor.named_transformers_['custom_scaler'].transform(train[cols_to_scale])

scaled_df = pd.DataFrame(X_scaled, columns=cols_to_scale, index=train.index)

# Columns that were integers before scaling are explicitly stored as float
for col in cols_to_scale:
    if pd.api.types.is_integer_dtype(dtypes_escala[col]):
        scaled_df[col] = scaled_df[col].astype(float)

# The passthrough columns are copied straight from train and cast to the dtypes
# recorded above from those same columns, so this cast does not change them.
passthrough_df = train[cols_to_keep].copy()
for col in cols_to_keep:
    passthrough_df[col] = passthrough_df[col].astype(dtypes_originales[col])

# NOTE: `train` is overwritten here (scaled columns first, then passthrough columns),
# so re-running this cell would scale already-scaled data.
train = pd.concat([scaled_df, passthrough_df], axis=1)

print(train.dtypes)
print(train.head())

# Persist the fitted preprocessor plus column lists and dtypes; this bundle is
# reloaded later in the notebook to transform the test split.
joblib.dump((preprocessor, cols_to_scale, cols_to_keep, dtypes_originales, dtypes_escala), 'escalador_completo_stddiv.joblib')

del X_scaled
gc.collect()

  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


producto_total_tn      float64
avg_tn                 float64
std_tn                 float64
clientes_distintos     float64
cust_request_qty       float64
                        ...   
cat2                  category
cat3                  category
brand                 category
categoria             category
estado_producto       category
Length: 311, dtype: object
   producto_total_tn    avg_tn    std_tn  clientes_distintos  \
0           0.139069  0.141088  0.177842           17.227564   
1           0.727734  0.762872  0.652542           16.673937   
2           0.243405  0.255158  0.241928           16.673937   
3           0.078218  0.073109  0.072461           18.693046   
4           0.045483  0.046849  0.034551           16.967033   

   cust_request_qty  cust_request_tn  inicio_vida_p   fin_vida_p  sku_size  \
0          1.440458         0.134560    4965.009136  6259.601069  0.467223   
1          2.480789         0.704141    4964.787604  6262.888968  0.408820   
2          1.

35

In [310]:
train['clase_producto'].dtype

dtype('float32')

In [311]:
# Round-trip the test split through a parquet file in the working directory.
# index=False means the row index is not stored, so the reloaded frame comes back
# with a fresh default index.
# NOTE(review): presumably done to release/defragment memory — confirm intent.
test.to_parquet("test_tmp.parquet", index=False)
del test
gc.collect()

test = pd.read_parquet("test_tmp.parquet")

In [312]:
# Load the bundle persisted when the scaler was fitted on train.
# NOTE: joblib.load unpickles arbitrary objects; only load files this notebook wrote itself.
(preprocessor, cols_to_scale, cols_to_keep, dtypes_originales, dtypes_escala) = joblib.load(
    'escalador_completo_stddiv.joblib'
)

# Transform the whole test split with the fitted preprocessor
X_test_transformed = preprocessor.transform(test)

# The result is expected to be a numpy.ndarray whose columns come in the order
# cols_to_scale (scaled) followed by cols_to_keep (passthrough); rebuild a DataFrame.
all_cols_order = cols_to_scale + cols_to_keep
test = pd.DataFrame(X_test_transformed, columns=all_cols_order, index=test.index)

# Restore dtypes.
# Scaled columns: anything that was integer or float before scaling becomes float
# (integers must stay float because of NaN and the division). The original code had
# two identical if/elif branches for this; they are merged here.
for col in cols_to_scale:
    original_dtype = dtypes_escala[col]
    if pd.api.types.is_integer_dtype(original_dtype) or pd.api.types.is_float_dtype(original_dtype):
        test[col] = pd.to_numeric(test[col], errors='coerce').astype(float)

# Passthrough columns: cast back to exactly the dtype recorded on train.
for col in cols_to_keep:
    original_dtype = dtypes_originales[col]
    # Categorical / object columns go through str first. The isinstance check
    # replaces the deprecated pd.api.types.is_categorical_dtype.
    if isinstance(original_dtype, pd.CategoricalDtype) or pd.api.types.is_object_dtype(original_dtype):
        test[col] = test[col].astype(str)
    test[col] = test[col].astype(original_dtype)

print("\n✅ DataFrame final TEST con std divisor scaler:\n")
print(test.dtypes)
print(test.head())

# Free memory
del X_test_transformed
gc.collect()


✅ DataFrame final TEST con std divisor scaler:

producto_total_tn      float64
avg_tn                 float64
std_tn                 float64
clientes_distintos     float64
cust_request_qty       float64
                        ...   
cat2                  category
cat3                  category
brand                 category
categoria             category
estado_producto       category
Length: 311, dtype: object
   producto_total_tn    avg_tn    std_tn  clientes_distintos  \
0           0.008812  0.008344  0.006130           18.465082   
1           0.004923  0.005129  0.004617           16.771636   
2           0.018745  0.018873  0.043925           17.357829   
3           0.241062  0.246344  0.289846           17.097299   
4           0.005793  0.005545  0.006115           18.269685   

   cust_request_qty  cust_request_tn  inicio_vida_p   fin_vida_p  sku_size  \
0          1.088346         0.008527    4964.787604  6262.888968  0.233611   
1          0.776247         0.004763    49

  if pd.api.types.is_categorical_dtype(dtypes_originales[col]) or pd.api.types.is_object_dtype(dtypes_originales[col]):
  if pd.api.types.is_categorical_dtype(dtypes_originales[col]) or pd.api.types.is_object_dtype(dtypes_originales[col]):
  if pd.api.types.is_categorical_dtype(dtypes_originales[col]) or pd.api.types.is_object_dtype(dtypes_originales[col]):
  if pd.api.types.is_categorical_dtype(dtypes_originales[col]) or pd.api.types.is_object_dtype(dtypes_originales[col]):
  if pd.api.types.is_categorical_dtype(dtypes_originales[col]) or pd.api.types.is_object_dtype(dtypes_originales[col]):
  if pd.api.types.is_categorical_dtype(dtypes_originales[col]) or pd.api.types.is_object_dtype(dtypes_originales[col]):


0

In [313]:
train.shape

(23721, 311)

In [314]:
# Feature matrices and target vectors for both splits, using the same feature order.
X_train, y_train = train[features], train[label]
X_test, y_test = test[features], test[label]

In [315]:
X_train.shape

(23721, 310)

In [316]:
# The combined train frame is no longer needed once X_train / y_train exist.
# `test` is intentionally kept (its deletion is commented out).
del train
#del test

gc.collect()

0

In [317]:
print("X_train shape:", X_train.shape)

X_train shape: (23721, 310)


## Entrenamiento

In [324]:
def lgb_objective(trial):
    """Optuna objective: mean 5-fold CV MSE of a linear-tree LightGBM regressor.

    For the hyper-parameters sampled from ``trial`` a model is trained on each
    in-fold partition of ``X_train`` / ``y_train`` with early stopping on the
    matching out-of-fold partition. The fold models' predictions on ``X_test``
    are averaged and scored as a diagnostic.

    Stored on the trial as user attributes:
        * ``test_mse``: MSE of the averaged fold predictions on ``X_test`` / ``y_test``.
        * ``best_iteration``: mean (truncated to int) of the folds' best iterations.

    Reads the notebook-level names ``X_train``, ``y_train``, ``X_test``,
    ``y_test``, ``char_feats`` and ``SEED``.

    Args:
        trial: the ``optuna`` trial used to sample hyper-parameters.

    Returns:
        Mean out-of-fold MSE across the folds (the value Optuna minimises).
    """
    # LightGBM parameters: fixed settings plus the sampled search space.
    # The order of the suggest_* calls is kept as-is.
    lgb_params = {
        'objective': 'regression',
        'metric': 'mse',
        'verbosity': -1,
        'linear_tree': True,
        'seed': SEED,
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 10, 150),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 0, 200),
        'max_bin': trial.suggest_int('max_bin', 64, 1024),
        'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 1e-3, 10.0, log=True)
    }

    def _fold_dataset(row_idx):
        """Build the LightGBM Dataset for the given positional rows of the train split."""
        return lgb.Dataset(data=X_train.iloc[row_idx],
                           label=y_train.iloc[row_idx],
                           free_raw_data=True,
                           categorical_feature=char_feats)

    # KFold (not StratifiedKFold) because the target is continuous.
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    scores_ensemble = np.zeros(len(y_test), dtype=np.float32)  # running sum of test predictions
    score_folds = 0                                             # running mean of out-of-fold MSE
    best_iterations = []                                        # best_iteration of every fold

    for if_index, oof_index in kf.split(X_train, y_train):
        lgb_if_dataset = _fold_dataset(if_index)     # in-fold rows: training data
        lgb_oof_dataset = _fold_dataset(oof_index)   # out-of-fold rows: early stopping / scoring

        lgb_model = lgb.train(lgb_params,
                              lgb_if_dataset,
                              valid_sets=[lgb_oof_dataset],
                              num_boost_round=10000,
                              callbacks=[lgb.early_stopping(200, verbose=False)])

        # Accumulate this fold's predictions on the hold-out test split.
        scores_ensemble = scores_ensemble + lgb_model.predict(X_test)

        # Out-of-fold MSE contributes 1/n_splits to the CV score.
        oof_preds = lgb_model.predict(X_train.iloc[oof_index])
        score_folds += mean_squared_error(y_train.iloc[oof_index], oof_preds) / n_splits

        best_iterations.append(lgb_model.best_iteration)

        # Release the heavy per-fold objects before the next fold.
        del lgb_model, lgb_if_dataset, lgb_oof_dataset, oof_preds
        gc.collect()

    # Average the fold predictions and score them on the hold-out test split.
    # (Uploading per-trial prediction / metric files as Optuna artifacts is disabled.)
    scores_ensemble = scores_ensemble / n_splits
    test_mse = float(mean_squared_error(y_test, scores_ensemble))
    trial.set_user_attr("test_mse", test_mse)

    avg_best_iteration = int(np.mean(best_iterations))
    trial.set_user_attr("best_iteration", avg_best_iteration)

    del scores_ensemble
    gc.collect()

    # Optuna minimises the returned value: the mean out-of-fold MSE.
    return score_folds

In [325]:
# Initialise Optuna's artifact (file) store. It is created here but the objective's
# upload_artifact calls are currently commented out.
artifact_store = FileSystemArtifactStore(base_path=PATH_TO_OPTUNA_ARTIFACTS)

# Create the study, or resume the existing one with the same name in the same storage
study = optuna.create_study(direction='minimize',
                            storage=nombrebase,  # Specify the storage URL here.
                            study_name=nombreestudio,
                            load_if_exists = True)
# Run the optimisation (up to 2000 additional trials; garbage-collect after each trial)
study.optimize(lgb_objective, n_trials=2000, gc_after_trial=True)

[I 2025-07-17 12:53:06,957] Using an existing study with name 'producto_linear' instead of creating a new one.
[I 2025-07-17 12:57:55,431] Trial 268 finished with value: 891.8910174659568 and parameters: {'lambda_l1': 2.325916090763849, 'lambda_l2': 0.0001437194737808914, 'num_leaves': 148, 'feature_fraction': 0.9687817423298575, 'learning_rate': 0.01887976400286657, 'bagging_fraction': 0.9379939595657532, 'bagging_freq': 1, 'min_child_samples': 189, 'max_bin': 946, 'min_sum_hessian_in_leaf': 0.04159730162210393}. Best is trial 135 with value: 872.4531699475713.
[I 2025-07-17 13:04:15,261] Trial 269 finished with value: 900.3588286581573 and parameters: {'lambda_l1': 1.5215125164342471, 'lambda_l2': 0.00025443723673679894, 'num_leaves': 147, 'feature_fraction': 0.9842392608520609, 'learning_rate': 0.014338406476159535, 'bagging_fraction': 0.9264616889088194, 'bagging_freq': 1, 'min_child_samples': 149, 'max_bin': 849, 'min_sum_hessian_in_leaf': 0.051951256409332104}. Best is trial 135 

KeyboardInterrupt: 

## Guardar modelos con los mejores parámetros

In [326]:
# Pick the trial with the lowest "test_mse" user attribute; trials that never
# recorded it (e.g. interrupted ones) are skipped.
# NOTE(review): the study itself minimises the CV MSE, so selecting by hold-out MSE
# makes the hold-out split part of model selection — `study.best_trial` would select
# by the CV objective instead. Confirm this is intended.
# NOTE(review): min() raises ValueError if no trial has a "test_mse" attribute.
best_trial = min(
    [t for t in study.trials if t.user_attrs.get("test_mse") is not None],
    key=lambda t: t.user_attrs["test_mse"]
)

In [327]:
print(best_trial)

FrozenTrial(number=274, state=1, values=[892.2291823657164], datetime_start=datetime.datetime(2025, 7, 17, 13, 27, 23, 271155), datetime_complete=datetime.datetime(2025, 7, 17, 13, 34, 7, 85281), params={'lambda_l1': 1.8348606478985292, 'lambda_l2': 0.0005072564366547975, 'num_leaves': 149, 'feature_fraction': 0.9891126494356512, 'learning_rate': 0.015276483431226933, 'bagging_fraction': 0.9348515783259492, 'bagging_freq': 1, 'min_child_samples': 142, 'max_bin': 920, 'min_sum_hessian_in_leaf': 0.04897745909878601}, user_attrs={'best_iteration': 826, 'test_mse': 872.5765521635892}, system_attrs={}, intermediate_values={}, distributions={'lambda_l1': FloatDistribution(high=10.0, log=True, low=1e-08, step=None), 'lambda_l2': FloatDistribution(high=10.0, log=True, low=1e-08, step=None), 'num_leaves': IntDistribution(high=150, log=False, low=10, step=1), 'feature_fraction': FloatDistribution(high=1.0, log=False, low=0.4, step=None), 'learning_rate': FloatDistribution(high=0.1, log=False, lo

In [None]:
# trial.params only contains the sampled hyper-parameters, so the fixed
# (non-optimised) settings have to be written out again here.
best_params = {
    **best_trial.params,
    "objective": "regression",
    "metric": "mse",
    "verbosity": -1,
    'seed': SEED,
    'linear_tree': True,
}

In [None]:
print( best_trial.params)

{'lambda_l1': 0.043301223417331454, 'lambda_l2': 3.3597498505196954e-06, 'num_leaves': 20, 'feature_fraction': 0.9578449292753943, 'learning_rate': 0.03626001394797111, 'bagging_fraction': 0.7840929446849241, 'bagging_freq': 1, 'min_child_samples': 11, 'max_bin': 418}


In [None]:
# Number of boosting rounds for the final models: the fold-averaged best iteration
# stored on the selected trial.
best_iteration = best_trial.user_attrs["best_iteration"]
# NOTE(review): this prints the hyper-parameters again, not best_iteration — confirm intent.
print( best_trial.params)

{'lambda_l1': 0.043301223417331454, 'lambda_l2': 3.3597498505196954e-06, 'num_leaves': 20, 'feature_fraction': 0.9578449292753943, 'learning_rate': 0.03626001394797111, 'bagging_fraction': 0.7840929446849241, 'bagging_freq': 1, 'min_child_samples': 11, 'max_bin': 418}


In [None]:
def entrenar_lgbm_final_todo(base_params, seed, num_boost_round=None):
    """Train one final LightGBM model with a given seed and save it to disk.

    The model is fitted on the notebook-level ``X_train`` / ``y_train`` for a
    fixed number of boosting rounds (no validation set, hence no early stopping)
    and dumped to ``PATH_TO_MODELS/lgb_final_model_todo_seed_{seed}.joblib``.

    NOTE(review): the function name and the printed banner say "all the dataset"
    (train + test), but only ``X_train`` / ``y_train`` are used — ``X_test`` is
    not included. Confirm which behaviour is intended.

    Args:
        base_params (dict): LightGBM parameters (e.g. the best Optuna parameters).
            Not modified; a copy with ``'seed'`` overridden is used.
        seed (int): seed for this run; also used in the output file name.
        num_boost_round (int, optional): number of boosting rounds. Defaults to
            the notebook-level ``best_iteration``.

    Returns:
        The trained LightGBM model.
    """
    run_params = {**base_params, 'seed': seed}

    # Fall back to the globally selected iteration count when none is given.
    if num_boost_round is None:
        num_boost_round = best_iteration

    print("\n🚀 Entrenando modelo final en TODO el dataset...")

    all_dataset = lgb.Dataset(
        data=X_train,
        label=y_train,
        free_raw_data=True,
        categorical_feature=char_feats
    )

    # Fixed number of rounds: there is no validation data to early-stop on.
    final_model = lgb.train(
        run_params,
        train_set=all_dataset,
        num_boost_round=num_boost_round
    )

    # Save the model (create the target folder if it does not exist yet).
    os.makedirs(PATH_TO_MODELS, exist_ok=True)
    model_path = os.path.join(PATH_TO_MODELS, f"lgb_final_model_todo_seed_{seed}.joblib")
    dump(final_model, model_path)
    print(f"✅ Modelo final entrenado y guardado en: {model_path}")

    gc.collect()

    return final_model

In [None]:
# Move previously saved models to the backup folder before the training loop
# writes new ones. The backup folder lives inside PATH_TO_MODELS, so creating it
# also guarantees that the folder being listed exists.
os.makedirs(PATH_TO_MODELS_OLD, exist_ok=True)
for fname in os.listdir(PATH_TO_MODELS):
    if fname.endswith('.joblib'):
        src_path = os.path.join(PATH_TO_MODELS, fname)
        dst_path = os.path.join(PATH_TO_MODELS_OLD, fname)
        shutil.move(src_path, dst_path)
        print(f"🔁 Modelo movido a backup: {fname}")

In [None]:
# Seeds for the final models: one model is trained and saved per seed.
SEEDS = [42,5,915,15,666,9999,37,45,125,90,1000,3,753,159,852,10,7,1,1050,654,11,21,33,69,8008,88,111,222,314,420,512,777,808,999,1024,2048,4096,17,19,23,29,31,25]

# Each call trains with the shared best parameters and the current seed; the
# returned model is not kept in memory because the function saves it to disk.
for current_seed in SEEDS:
    entrenar_lgbm_final_todo(best_params, current_seed)


🚀 Entrenando modelo final en TODO el dataset...
✅ Modelo final entrenado y guardado en: ../LGBM/models\lgb_final_model_todo_seed_42.joblib

🚀 Entrenando modelo final en TODO el dataset...
✅ Modelo final entrenado y guardado en: ../LGBM/models\lgb_final_model_todo_seed_5.joblib

🚀 Entrenando modelo final en TODO el dataset...
✅ Modelo final entrenado y guardado en: ../LGBM/models\lgb_final_model_todo_seed_915.joblib

🚀 Entrenando modelo final en TODO el dataset...
✅ Modelo final entrenado y guardado en: ../LGBM/models\lgb_final_model_todo_seed_15.joblib

🚀 Entrenando modelo final en TODO el dataset...
✅ Modelo final entrenado y guardado en: ../LGBM/models\lgb_final_model_todo_seed_666.joblib

🚀 Entrenando modelo final en TODO el dataset...
✅ Modelo final entrenado y guardado en: ../LGBM/models\lgb_final_model_todo_seed_9999.joblib

🚀 Entrenando modelo final en TODO el dataset...
✅ Modelo final entrenado y guardado en: ../LGBM/models\lgb_final_model_todo_seed_37.joblib

🚀 Entrenando mod