In [1]:
# 1. Configuración inicial y carga de paquetes
using Pkg
Pkg.activate(".")



[32m[1m  Activating[22m[39m project at `e:\faa`


In [2]:
using MLJ, DataFrames, Statistics, Random
using CSV, Dates, CategoricalArrays
using MLJLinearModels, DecisionTree, NearestNeighborModels, LightGBM



┌ Info: lib_lightgbm found in system dirs!
└ @ LightGBM C:\Users\abelf\.julia\packages\LightGBM\z8ahL\src\LightGBM.jl:32


In [3]:
using Pkg
#Pkg.status(["MLJ", "DecisionTree", "MLJBase"])

In [4]:
#Pkg.add("EvoTrees")

In [4]:
using EvoTrees

In [6]:
# CÓDIGO COMPLETO FINAL - Con encoding de variables categóricas
using MLJ, DataFrames, CSV, Statistics, Dates, Random, CategoricalArrays
using MLJ: @load, machine, fit!, predict
using EvoTrees
using LightGBM 
import LightGBM.MLJInterface: LGBMRegressor  # define the regressor type directly

# ========================================
# 1. CARGA Y PROCESAMIENTO DE DATOS
# ========================================

# Cargar datos desde carpeta dataO
df_train = DataFrame(CSV.File("dataO/train_data.csv"))
df_week = DataFrame(CSV.File("dataO/date_to_week_id_map.csv"))
df_prices = DataFrame(CSV.File("dataO/product_prices.csv"))

# Realizar joins
df = innerjoin(df_prices, df_week, on = :week_id)
df_final = innerjoin(df_train, df, on = [:date, :product_identifier, :outlet])

# Preparar features con encoding de variables categóricas
using CategoricalArrays

# Convertir a categóricas
df_final.category_of_product = categorical(df_final.category_of_product)
df_final.state = categorical(df_final.state)

# Crear encodings
df_final.state_encoded = levelcode.(df_final.state)
df_final.category_encoded = levelcode.(df_final.category_of_product)
df_final.Month = month.(df_final.date)

# Seleccionar features incluyendo las variables categóricas encodificadas
X = select(df_final, [:product_identifier, :department_identifier, :outlet, 
                      :sell_price, :Month, :state_encoded, :category_encoded])
y = df_final.sales

# División train/test reproducible (70/30)
Random.seed!(42)
n = nrow(X)
train_size = floor(Int, 0.7 * n)
perm = randperm(n)

X_train = X[perm[1:train_size], :]
X_test = X[perm[train_size+1:end], :]
y_train = y[perm[1:train_size]]
y_test = y[perm[train_size+1:end]]


# Verificar dimensiones de los datos
data_summary = (
    total_rows = nrow(df_final),
    features = ncol(X),
    train_size = nrow(X_train),
    test_size = nrow(X_test),
    feature_names = names(X)
)

# ========================================
# 2. CARGA DE MODELOS
# ========================================

# Cargar todos los modelos funcionales
@load LinearRegressor pkg=MLJLinearModels verbosity=0
@load RidgeRegressor pkg=MLJLinearModels verbosity=0
@load LassoRegressor pkg=MLJLinearModels verbosity=0
@load KNNRegressor pkg=NearestNeighborModels verbosity=0
@load EvoTreeRegressor pkg=EvoTrees verbosity=0
#Nuevo
@load LGBMRegressor        pkg=LightGBM verbosity=0

# ========================================
# 3. DEFINICIÓN DE MODELOS
# ========================================

# Arsenal completo de modelos con diferentes configuraciones
all_models = Dict(
    # Modelos lineales
    "LinearRegression" => LinearRegressor(),
    "Ridge_0.1" => RidgeRegressor(lambda=0.1),
    "Ridge_1.0" => RidgeRegressor(lambda=1.0),
    "Ridge_10.0" => RidgeRegressor(lambda=10.0),
    "Lasso_0.1" => LassoRegressor(lambda=0.1),
    "Lasso_1.0" => LassoRegressor(lambda=1.0),
    "Lasso_10.0" => LassoRegressor(lambda=10.0),
    
    # K-Nearest Neighbors
    "KNN_5" => KNNRegressor(K=5),
    "KNN_10" => KNNRegressor(K=10),
    "KNN_15" => KNNRegressor(K=15),
    "KNN_25" => KNNRegressor(K=25),
    "KNN_35" => KNNRegressor(K=35),
    
    # Evolutionary Trees (Gradient Boosting)
    "EvoTree_50" => EvoTreeRegressor(nrounds=50),
    "EvoTree_100" => EvoTreeRegressor(nrounds=100),
    "EvoTree_200" => EvoTreeRegressor(nrounds=200),


    #Nuevo - LightGBM
    "LGBM_Default"     => LGBMRegressor(),
    "LGBM_100"         => LGBMRegressor(num_iterations=100, learning_rate=0.1, num_leaves=31),
    "LGBM_200"         => LGBMRegressor(num_iterations=200, learning_rate=0.05, num_leaves=50)
)

# ========================================
# 4. ENTRENAMIENTO Y EVALUACIÓN
# ========================================

# DataFrame para almacenar resultados
ultimate_results = DataFrame(
    Model = String[],
    MAE = Float64[],
    MSE = Float64[],
    RMSE = Float64[],
    R2 = Float64[],
    Training_Time = Float64[]
)

# Entrenar y evaluar cada modelo
for (name, model) in all_models
    start_time = time()
    
    # Entrenar modelo
    mach = machine(model, X_train, y_train)
    fit!(mach, verbosity=0)
    
    training_time = time() - start_time
    
    # Realizar predicciones
    ŷ = predict(mach, X_test)
    
    # Calcular métricas
    mae_val = mean(abs.(ŷ .- y_test))
    mse_val = mean((ŷ .- y_test).^2)
    rmse_val = sqrt(mse_val)
    r2_val = 1 - sum((y_test .- ŷ).^2) / sum((y_test .- mean(y_test)).^2)
    
    # Agregar resultados
    push!(ultimate_results, (name, mae_val, mse_val, rmse_val, r2_val, training_time))
end

# ========================================
# 5. RESULTADOS FINALES
# ========================================

# Ordenar por RMSE (menor es mejor)
final_ranking = sort(ultimate_results, :RMSE)

# Mostrar resumen de datos y ranking final
(data_summary = data_summary, ranking = final_ranking)

│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc EvoTrees.EvoTreeRegressor` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.
│ 
│ In general, data in `machine(model, data...)` is expected to satisfy
│ 
│     scitype(data) <: MLJ.fit_data_scitype(model)
│ 
│ In the present case:
│ 
│ scitype(data) = Tuple{Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}, AbstractVector{Count}}
│ 
│ fit_data_scitype(model) = Union{Tuple{Union{Table{<:Union{AbstractVector{<:Continuous}, AbstractVector{<:Count}, AbstractVector{<:OrderedFactor}, AbstractVector{<:Multiclass}}}, AbstractMatrix{Continuous}}, AbstractVector{<:Continuous}}, Tuple{Union{Table{<:Union{AbstractVector{

(data_summary = (total_rows = 395000,
                 features = 7,
                 train_size = 276500,
                 test_size = 118500,
                 feature_names = ["product_identifier", "department_identifier", "outlet", "sell_price", "Month", "state_encoded", "category_encoded"],),
 ranking = [1m18×6 DataFrame[0m
[1m Row [0m│[1m Model            [0m[1m MAE     [0m[1m MSE      [0m[1m RMSE    [0m[1m R2         [0m[1m Training_Time[0m ⋯
     │[90m String           [0m[90m Float64 [0m[90m Float64  [0m[90m Float64 [0m[90m Float64    [0m[90m Float64      [0m ⋯
─────┼──────────────────────────────────────────────────────────────────────────
   1 │ LGBM_200          1.0903    8.05997  2.83901   0.381742      4.359      ⋯
   2 │ LGBM_100          1.10681   8.11152  2.84807   0.377788      2.776
   3 │ LGBM_Default      1.10681   8.11152  2.84807   0.377788      1.532
   4 │ KNN_35            1.07139   8.16223  2.85696   0.373898      0.0640001
   5 │ E

In [7]:
X

Row,product_identifier,department_identifier,outlet,sell_price,Month,state_encoded,category_encoded
Unnamed: 0_level_1,Int64,Int64,Int64,Float64,Int64,Int64,Int64
1,74,11,111,2.94,1,2,3
2,74,11,111,2.94,1,2,3
3,74,11,111,2.94,1,2,3
4,74,11,111,2.94,1,2,3
5,74,11,111,2.94,1,2,3
6,74,11,111,2.94,1,2,3
7,74,11,111,2.94,1,2,3
8,74,11,111,2.94,1,2,3
9,74,11,111,2.94,1,2,3
10,74,11,111,2.94,1,2,3


In [8]:

using MLJ, DataFrames, CSV, Statistics, Dates, Random, CategoricalArrays
using MLJ: @load, machine, fit!, predict
using EvoTrees

# ========================================
# 1. CARGA Y PREPARACIÓN INICIAL
# ========================================

df_train = DataFrame(CSV.File("dataO/train_data.csv"))
df_week = DataFrame(CSV.File("dataO/date_to_week_id_map.csv"))
df_prices = DataFrame(CSV.File("dataO/product_prices.csv"))

# Joins iniciales
df = innerjoin(df_prices, df_week, on = :week_id)
df_final = innerjoin(df_train, df, on = [:date, :product_identifier, :outlet])

# ========================================
# 2. FEATURES TEMPORALES AVANZADAS
# ========================================

# Features de tiempo básicas
df_final.Month = month.(df_final.date)
df_final.DayOfWeek = dayofweek.(df_final.date)
df_final.DayOfMonth = day.(df_final.date)
df_final.Quarter = quarterofyear.(df_final.date)
df_final.WeekOfYear = week.(df_final.date)

# Features estacionales
df_final.IsWeekend = df_final.DayOfWeek .>= 6
df_final.IsMonthStart = df_final.DayOfMonth .<= 7
df_final.IsMonthEnd = df_final.DayOfMonth .>= 25
df_final.IsHolidayMonth = (df_final.Month .== 12) .|| (df_final.Month .== 1)  # Diciembre/Enero

# Features cíclicas (importantes para capturar patrones estacionales)
df_final.Month_sin = sin.(2π * df_final.Month / 12)
df_final.Month_cos = cos.(2π * df_final.Month / 12)
df_final.DayOfWeek_sin = sin.(2π * df_final.DayOfWeek / 7)
df_final.DayOfWeek_cos = cos.(2π * df_final.DayOfWeek / 7)

# ========================================
# 3. FEATURES DE PRECIOS
# ========================================

# Features de precios por grupos
price_stats = combine(groupby(df_final, :product_identifier), 
    :sell_price => mean => :price_product_mean,
    :sell_price => std => :price_product_std,
    :sell_price => minimum => :price_product_min,
    :sell_price => maximum => :price_product_max
)

outlet_price_stats = combine(groupby(df_final, :outlet),
    :sell_price => mean => :price_outlet_mean,
    :sell_price => std => :price_outlet_std
)

category_price_stats = combine(groupby(df_final, :category_of_product),
    :sell_price => mean => :price_category_mean,
    :sell_price => std => :price_category_std
)

# Unir estadísticas de precios
df_final = leftjoin(df_final, price_stats, on = :product_identifier)
df_final = leftjoin(df_final, outlet_price_stats, on = :outlet)
df_final = leftjoin(df_final, category_price_stats, on = :category_of_product)

# Features derivadas de precios
df_final.price_vs_product_mean = df_final.sell_price ./ df_final.price_product_mean
df_final.price_vs_outlet_mean = df_final.sell_price ./ df_final.price_outlet_mean
df_final.price_vs_category_mean = df_final.sell_price ./ df_final.price_category_mean

# Reemplazar NaN y valores infinitos con 1.0
#for col in [:price_vs_product_mean, :price_vs_outlet_mean, :price_vs_category_mean]
#    df_final[!, col] = replace(df_final[!, col], NaN => 1.0, Inf => 1.0, -Inf => 1.0)
#end

# Limpieza rápida de NaN/Inf:
for col in [:price_vs_product_mean, :price_vs_outlet_mean, :price_vs_category_mean]
    df_final[!, col] = [ (v isa Missing || isnan(v) || isinf(v)) ? 1.0 : v
                         for v in df_final[!, col] ]
end



# ========================================
# 4. FEATURES DE VENTAS HISTÓRICAS (LAG FEATURES)
# ========================================

# Ordenar por producto, outlet y fecha
sort!(df_final, [:product_identifier, :outlet, :date])

# Lag features de ventas (ventas de días anteriores)
df_final.sales_lag_1 = [missing; df_final.sales[1:end-1]]
df_final.sales_lag_7 = [fill(missing, 7); df_final.sales[1:end-7]]

# Promedios móviles de ventas
function rolling_mean(x, window)
    result = similar(x, Float64)
    for i in 1:length(x)
        start_idx = max(1, i - window + 1)
        result[i] = mean(x[start_idx:i])
    end
    return result
end

df_final.sales_rolling_3 = rolling_mean(df_final.sales, 3)
df_final.sales_rolling_7 = rolling_mean(df_final.sales, 7)
df_final.sales_rolling_30 = rolling_mean(df_final.sales, 30)

# ========================================
# 5. FEATURES AGREGADAS POR ENTIDADES
# ========================================

# Estadísticas por producto
product_stats = combine(groupby(df_final, :product_identifier),
    :sales => mean => :product_sales_mean,
    :sales => std => :product_sales_std,
    :sales => sum => :product_total_sales,
    nrow => :product_frequency
)

# Estadísticas por outlet
outlet_stats = combine(groupby(df_final, :outlet),
    :sales => mean => :outlet_sales_mean,
    :sales => std => :outlet_sales_std,
    :sales => sum => :outlet_total_sales,
    nrow => :outlet_frequency
)

# Estadísticas por categoría
category_stats = combine(groupby(df_final, :category_of_product),
    :sales => mean => :category_sales_mean,
    :sales => std => :category_sales_std,
    :sales => sum => :category_total_sales
)

# Estadísticas por estado
state_stats = combine(groupby(df_final, :state),
    :sales => mean => :state_sales_mean,
    :sales => std => :state_sales_std,
    :sales => sum => :state_total_sales
)

# Unir todas las estadísticas
df_final = leftjoin(df_final, product_stats, on = :product_identifier)
df_final = leftjoin(df_final, outlet_stats, on = :outlet)
df_final = leftjoin(df_final, category_stats, on = :category_of_product)
df_final = leftjoin(df_final, state_stats, on = :state)

# ========================================
# 6. FEATURES DE INTERACCIONES
# ========================================

# Interacciones importantes
df_final.outlet_x_category = df_final.outlet .* 1000 .+ hash.(df_final.category_of_product) .% 1000
df_final.state_x_category = hash.(df_final.state) .% 100 .* 1000 .+ hash.(df_final.category_of_product) .% 1000
df_final.month_x_category = df_final.Month .* 1000 .+ hash.(df_final.category_of_product) .% 1000

# Features de densidad/competencia
df_final.products_per_outlet = df_final.outlet_frequency ./ length(unique(df_final.product_identifier))
df_final.outlets_per_state = [length(unique(df_final[df_final.state .== s, :outlet])) for s in df_final.state]

# ========================================
# 7. ENCODING DE VARIABLES CATEGÓRICAS
# ========================================

# Convertir a categóricas
df_final.category_of_product = categorical(df_final.category_of_product)
df_final.state = categorical(df_final.state)

# Crear encodings
df_final.state_encoded = levelcode.(df_final.state)
df_final.category_encoded = levelcode.(df_final.category_of_product)


# NUEVAS FEATURES

# ========================================
# 8. Tendencia y descomposición
# ========================================

# (a) Diferencia entre media móvil de 30 y 90 días
#df_final.sales_roll30 = rolling_mean(df_final.sales, 30)      # si ya tienes rolling_mean definido
#df_final.sales_roll90 = rolling_mean(df_final.sales, 90)
#df_final.sales_trend = df_final.sales_roll30 .- df_final.sales_roll90

# (b) Slope (pendiente) en ventana deslizante
#function rolling_slope(x::AbstractVector{<:Number}, window::Int)
#    n = length(x)
#    slopes = Vector{Float64}(undef, n)
#    for i in 1:n
#        start = max(1, i - window + 1)
#        y = x[start:i]
#        t = collect(1:length(y))
#        slopes[i] = var(t) == 0 ? 0.0 : cov(t, y) / var(t)
#    end
#    return slopes
#end

#df_final.sales_slope_7 = rolling_slope(df_final.sales, 7)

# ========================================
# 8. SELECCIÓN DE FEATURES FINALES
# ========================================

# Eliminar filas con missing values en lag features (para simplificar)
df_final = df_final[.!ismissing.(df_final.sales_lag_1), :]

# Seleccionar features finales
feature_columns = [
    # Features básicas
    :product_identifier, :department_identifier, :outlet, :sell_price,
    
    # Features temporales
    :Month, :DayOfWeek, :DayOfMonth, :Quarter, :WeekOfYear,
    :IsWeekend, :IsMonthStart, :IsMonthEnd, :IsHolidayMonth,
    :Month_sin, :Month_cos, :DayOfWeek_sin, :DayOfWeek_cos,
    
    # Features de precios
    :price_product_mean, :price_outlet_mean, :price_category_mean,
    :price_vs_product_mean, :price_vs_outlet_mean, :price_vs_category_mean,
    
    # Features de ventas históricas
    :sales_lag_1, :sales_rolling_3, :sales_rolling_7, :sales_rolling_30,
    
    # Features agregadas
    :product_sales_mean, :product_frequency, :outlet_sales_mean, :outlet_frequency,
    :category_sales_mean, :state_sales_mean,
    
    # Features de interacciones
    :outlet_x_category, :state_x_category, :month_x_category,
    :products_per_outlet, :outlets_per_state,
    
    # Encodings categóricos
    :state_encoded, :category_encoded

    #NUEVAS
    #:sell_price_lag_7, :price_pct_change_7, :sales_pct_change_7, :price_elasticity,
    #:weekly_sales, :weekly_sales_lag1, :weekly_growth,
    #:monthly_sales, :monthly_sales_lag1, :monthly_growth,
    #:sales_rollmax_7, :sales_rollmin_7

]

X_engineered = df_final[!, feature_columns]
y_engineered = df_final.sales

# Verificar que no hay missing values
for col in names(X_engineered)
    if any(ismissing.(X_engineered[!, col]))
        println("Warning: Missing values en $col")
    end
end

# ========================================
# 9. DIVISIÓN Y EVALUACIÓN
# ========================================

# División train/test
Random.seed!(42)
n = nrow(X_engineered)
train_size = floor(Int, 0.7 * n)
perm = randperm(n)

X_train = X_engineered[perm[1:train_size], :]
X_test = X_engineered[perm[train_size+1:end], :]
y_train = y_engineered[perm[1:train_size]]
y_test = y_engineered[perm[train_size+1:end]]

# Información sobre las features
feature_info = (
    total_features = ncol(X_engineered),
    total_samples = nrow(X_engineered),
    train_samples = nrow(X_train),
    test_samples = nrow(X_test),
    feature_names = names(X_engineered)
)

feature_info

(total_features = 40,
 total_samples = 394999,
 train_samples = 276499,
 test_samples = 118500,
 feature_names = ["product_identifier", "department_identifier", "outlet", "sell_price", "Month", "DayOfWeek", "DayOfMonth", "Quarter", "WeekOfYear", "IsWeekend"  …  "outlet_frequency", "category_sales_mean", "state_sales_mean", "outlet_x_category", "state_x_category", "month_x_category", "products_per_outlet", "outlets_per_state", "state_encoded", "category_encoded"],)

In [9]:
missing_summary = Dict()
for col in names(X_train)
    missing_count = sum(ismissing.(X_train[!, col]))
    if missing_count > 0
        missing_summary[col] = missing_count
    end
end

missing_summary

Dict{Any, Any}()

In [10]:
# Convertir todos los datos a tipos numéricos estándar
function fix_data_types(X, y)
    X_fixed = DataFrame()
    
    for col in names(X)
        if eltype(X[!, col]) <: Union{Missing, Number}
            # Convertir a Float64, reemplazando missing con 0.0
            X_fixed[!, col] = convert(Vector{Float64}, coalesce.(X[!, col], 0.0))
        else
            # Para otros tipos, convertir directamente
            X_fixed[!, col] = convert(Vector{Float64}, X[!, col])
        end
    end
    
    y_fixed = convert(Vector{Float64}, y)
    
    return X_fixed, y_fixed
end

# Aplicar la corrección
X_train_fixed, y_train_fixed = fix_data_types(X_train, y_train)
X_test_fixed, y_test_fixed = fix_data_types(X_test, y_test)

# Verificar tipos
(
    X_train_types = eltype.(eachcol(X_train_fixed)),
    y_train_type = eltype(y_train_fixed),
    dimensions = (nrow(X_train_fixed), ncol(X_train_fixed))
)

(X_train_types = DataType[Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64  …  Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64],
 y_train_type = Float64,
 dimensions = (276499, 40),)

In [11]:
#Pkg.add("MLJLightGBMInterface")

In [11]:
# Modelos completos con feature engineering

using MLJ
#using  MLJLightGBMInterface


fe_models = Dict(
    "LinearRegression_FE" => LinearRegressor(),
    "Ridge_1.0_FE" => RidgeRegressor(lambda=1.0),
    "KNN_25_FE" => KNNRegressor(K=25),
    "KNN_35_FE" => KNNRegressor(K=35),
    "EvoTree_100_FE" => EvoTreeRegressor(nrounds=100),
    "EvoTree_200_FE" => EvoTreeRegressor(nrounds=200),
    # LightGBM con feature engineering
    "LGBM_Default_FE"     => LGBMRegressor(),
    "LGBM_100_FE"         => LGBMRegressor(
                               num_iterations=100,
                               learning_rate=0.1,
                               num_leaves=31
                            ),
    "LGBM_200_FE"         => LGBMRegressor(
                               num_iterations=200,
                               learning_rate=0.05,
                               num_leaves=50
                            )
)


# Evaluar todos
results_final_fe = DataFrame(
    Model = String[],
    MAE = Float64[],
    RMSE = Float64[],
    R2 = Float64[],
    Features = Int64[]
)

for (name, model) in fe_models
    mach = machine(model, X_train_fixed, y_train_fixed)
    fit!(mach, verbosity=0)
    ŷ = predict(mach, X_test_fixed)
    
    mae_val = mean(abs.(ŷ .- y_test_fixed))
    rmse_val = sqrt(mean((ŷ .- y_test_fixed).^2))
    r2_val = 1 - sum((y_test_fixed .- ŷ).^2) / sum((y_test_fixed .- mean(y_test_fixed)).^2)
    
    push!(results_final_fe, (name, mae_val, rmse_val, r2_val, ncol(X_train_fixed)))
end

sort(results_final_fe, :RMSE)

Row,Model,MAE,RMSE,R2,Features
Unnamed: 0_level_1,String,Float64,Float64,Float64,Int64
1,LGBM_Default_FE,0.594825,1.62456,0.775698,40
2,LGBM_100_FE,0.594825,1.62456,0.775698,40
3,LGBM_200_FE,0.591386,1.63568,0.772614,40
4,LinearRegression_FE,0.655862,1.65888,0.766118,40
5,Ridge_1.0_FE,0.713618,1.81293,0.720664,40
6,EvoTree_100_FE,0.619101,1.90706,0.690904,40
7,EvoTree_200_FE,0.624935,1.95064,0.676617,40
8,KNN_25_FE,0.993251,2.35886,0.527102,40
9,KNN_35_FE,1.02405,2.45056,0.489617,40


In [None]:
#Pkg.add("MLJTuning")

[32m[1m    Updating[22m[39m registry at `C:\Users\natal\.julia\registries\General.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m    Updating[22m[39m `C:\Users\natal\Maestria\Aprendizaje automático\TP-FINAL\archive (5)\Project.toml`
  [90m[03970b2e] [39m[92m+ MLJTuning v0.8.8[39m
[32m[1m  No Changes[22m[39m to `C:\Users\natal\Maestria\Aprendizaje automático\TP-FINAL\archive (5)\Manifest.toml`


In [None]:
#Pkg.add("LLVM")

[32m[1m   Resolving[22m[39m package versions...
[32m[1m   Installed[22m[39m CEnum ───────── v0.5.0
[32m[1m   Installed[22m[39m LLVMExtra_jll ─ v0.0.37+2
[32m[1m   Installed[22m[39m LLVM ────────── v9.4.2
[32m[1m    Updating[22m[39m `C:\Users\natal\Maestria\Aprendizaje automático\TP-FINAL\archive (5)\Project.toml`
  [90m[929cbde3] [39m[92m+ LLVM v9.4.2[39m
[32m[1m    Updating[22m[39m `C:\Users\natal\Maestria\Aprendizaje automático\TP-FINAL\archive (5)\Manifest.toml`
  [90m[fa961155] [39m[92m+ CEnum v0.5.0[39m
  [90m[929cbde3] [39m[92m+ LLVM v9.4.2[39m
  [90m[dad2f222] [39m[92m+ LLVMExtra_jll v0.0.37+2[39m
[32m[1mPrecompiling[22m[39m packages...
   1857.2 ms[33m  ✓ [39m[90mStatisticalTraits[39m
   1826.0 ms[33m  ✓ [39m[90mNaNMath[39m
   1881.8 ms[32m  ✓ [39m[90mCEnum[39m
   1858.9 ms[32m  ✓ [39m[90mCoverageTools[39m
   2622.7 ms[32m  ✓ [39m[90mURIParser[39m
   1569.0 ms[32m  ✓ [39m[90mDensityInterface[39m
   1588.0 ms[3

In [None]:
#Pkg.add("TreeParzen")

[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `C:\Users\natal\Maestria\Aprendizaje automático\TP-FINAL\archive (5)\Project.toml`
[32m[1m  No Changes[22m[39m to `C:\Users\natal\Maestria\Aprendizaje automático\TP-FINAL\archive (5)\Manifest.toml`
[32m[1mPrecompiling[22m[39m packages...
   1467.0 ms[32m  ✓ [39m[90mWidgets[39m
         [91m  ✗ [39m[90mBinaryProvider[39m
         [91m  ✗ [39m[90mSpecialFunctions[39m
         [91m  ✗ [39m[90mStatsFuns[39m
         [91m  ✗ [39m[90mColorVectorSpace → SpecialFunctionsExt[39m
         [91m  ✗ [39m[90mDifferentiationInterface → DifferentiationInterfaceForwardDiffExt[39m
         [91m  ✗ [39mDistributions
         [91m  ✗ [39m[90mLineSearches[39m
         [91m  ✗ [39m[90mKernelDensity[39m
         [91m  ✗ [39m[90mMathOptInterface[39m
         [91m  ✗ [39m[90mStatisticalMeasures[39m
         [91m  ✗ [39m[90mOptim[39m
         [91m  ✗ [39mGadfly
         

In [12]:
using MLJ, DataFrames, CSV, Statistics, Dates, Random, CategoricalArrays
using MLJ: @load, machine, fit!, predict, CV
using MLJBase: rms, CV           # ← importa aquí rms
using MLJTuning              # TunedModel, range, RandomSearch, etc.
using BayesianOptimization   # si vas por el paquete oficial
using TreeParzen             # si prefieres Tree-Parzen
using LightGBM
import LightGBM.MLJInterface: LGBMRegressor
using EvoTrees

# ——————————————
# 6. TUNING “BAYESIANO” PARA LIGHTGBM
# ——————————————

# 6.1. Asegúrate de tener instalado TreeParzen.jl:
# ] add TreeParzen

using MLJTuning, TreeParzen   # MLJTuning va con MLJ y TreeParzen aporta TreeParzenTuning()
@load LGBMRegressor pkg=LightGBM verbosity=0

# 6.2. Definir modelo base y rangos de búsqueda
lgbm = LGBMRegressor()

ranges = [
    range(lgbm, :num_iterations, lower=50,  upper=300),
    range(lgbm, :learning_rate,   lower=0.01, upper=0.2),
    range(lgbm, :num_leaves,      lower=10,   upper=100),
]

# 6.3. Configurar TunedModel con Tree-Parzen (bayesiano)
bayes_tuner = TunedModel(
    model      = lgbm,
    tuning     = TreeParzenTuning(),    # estrategia bayesiana
    resampling = CV(nfolds=5, shuffle=true),
    measure    = MLJ.rms,                   # raíz del error cuadrático
    ranges     = ranges,
    n          = 25                     # número de iteraciones
)

# 6.4. Entrenar el tuner
tmach = machine(bayes_tuner, X_train_fixed, y_train_fixed)
fit!(tmach, verbosity=1)

# 6.5. Extraer mejores parámetros y evaluar en test
best_lgbm = fitted_params(tmach).best_model
println("Mejores hiperparámetros bayesianos para LGBM:\n", best_lgbm)

ŷ_bayes = predict(tmach, X_test_fixed) |> collect
rmse_bayes = sqrt(mean((ŷ_bayes .- y_test_fixed).^2))
println("RMSE en test tras tuning: ", round(rmse_bayes, digits=4))

UndefVarError: UndefVarError: `rms` not defined

In [None]:
# Este script incluye una bayesiana

using MLJ, DataFrames, CSV, Statistics, Dates, Random, CategoricalArrays
using MLJ: @load, machine, fit!, predict, CV
using MLJBase: rms, CV           # ← importa aquí rms
using MLJTuning              # TunedModel, range, RandomSearch, etc.
using BayesianOptimization   # si vas por el paquete oficial
using TreeParzen             # si prefieres Tree-Parzen
using LightGBM
import LightGBM.MLJInterface: LGBMRegressor
using EvoTrees

# ========================================
# 1. MODELOS CON FEATURE ENGINEERING
# ========================================

@load LinearRegressor      pkg=MLJLinearModels verbosity=0
@load RidgeRegressor       pkg=MLJLinearModels verbosity=0
@load KNNRegressor         pkg=NearestNeighborModels verbosity=0
@load EvoTreeRegressor     pkg=EvoTrees verbosity=0

fe_models = Dict(
    "LinearRegression_FE" => LinearRegressor(),
    "Ridge_1.0_FE"        => RidgeRegressor(lambda=1.0),
    "KNN_25_FE"           => KNNRegressor(K=25),
    "KNN_35_FE"           => KNNRegressor(K=35),
    "EvoTree_100_FE"      => EvoTreeRegressor(nrounds=100),
    "EvoTree_200_FE"      => EvoTreeRegressor(nrounds=200),
    # LightGBM
    "LGBM_Default_FE"     => LGBMRegressor(),
    "LGBM_100_FE"         => LGBMRegressor(
                               num_iterations=100,
                               learning_rate=0.1,
                               num_leaves=31
                            ),
    "LGBM_200_FE"         => LGBMRegressor(
                               num_iterations=200,
                               learning_rate=0.05,
                               num_leaves=50
                            )
)

# ========================================
# 2. EVALUACIÓN SIMPLE DE MODELOS
# ========================================

results_final_fe = DataFrame(
    Model    = String[],
    MAE      = Float64[],
    RMSE     = Float64[],
    R2       = Float64[],
    Features = Int64[]
)

for (name, model) in fe_models
    mach = machine(model, X_train_fixed, y_train_fixed)
    fit!(mach, verbosity=0)
    ŷ = predict(mach, X_test_fixed)
    mae_val  = mean(abs.(ŷ .- y_test_fixed))
    rmse_val = sqrt(mean((ŷ .- y_test_fixed).^2))
    r2_val   = 1 - sum((y_test_fixed .- ŷ).^2) / sum((y_test_fixed .- mean(y_test_fixed)).^2)
    push!(results_final_fe, (name, mae_val, rmse_val, r2_val, ncol(X_train_fixed)))
end

sort(results_final_fe, :RMSE)

# ========================================
# 3. TUNING BAYESIANO DE HYPERPARÁMETROS
# ========================================
# Aquí definimos y ejecutamos una búsqueda bayesiana para LightGBM:

# 3.1. Instanciar el modelo base
lgbm = LGBMRegressor()

# 3.2. Definir rangos para hiperparámetros
ranges = [
    range(lgbm, :num_iterations, lower=50,  upper=300),
    range(lgbm, :learning_rate,   lower=0.01, upper=0.2),
    range(lgbm, :num_leaves,      lower=10,   upper=100)
]

# 3.3. Configurar el TunedModel con Bayesian Optimization
bayes_tuner = TunedModel(
    model = lgbm,
    tuning = BayesianOptimization(n_iter=25, acquisition = :ei),
    resampling = CV(nfolds=5, shuffle=true),
    measure = rms,
    ranges = ranges,
    acceleration = CPUThreads()
)

# 3.4. Entrenar el modelo tunado
tmach = machine(bayes_tuner, X_train_fixed, y_train_fixed)
fit!(tmach, verbosity=1)

# 3.5. Extraer y mostrar los mejores parámetros
best_params = fitted_params(tmach).best_model
println("Mejores parámetros encontrados para LightGBM:")
println(best_params)

# 3.6. Evaluar rendimiento en test set
ŷ_test = predict(tmach, X_test_fixed) |> collect
rmse_test = sqrt(mean((ŷ_test .- y_test_fixed).^2))
println("RMSE en test con modelo tunado: ", round(rmse_test, digits=4))


UndefVarError: UndefVarError: `rms` not defined

In [13]:
using MLJ, DataFrames

# Entrenar Ridge
model_ridge = RidgeRegressor(lambda=1.0)
mach_ridge  = machine(model_ridge, X_train_fixed, y_train_fixed)
fit!(mach_ridge, verbosity=0)

# Obtener pares (feature => coef)
coef_pairs = fitted_params(mach_ridge).coefs

# Extraer nombres y valores absolutos
features = String[]
coefs    = Float64[]
for (feat,coef) in coef_pairs
    push!(features, string(feat))
    push!(coefs, abs(coef))
end

# Calcular ranking de importancia
ranks = sortperm(coefs, rev=true)

# Crear DataFrame
feature_importance = DataFrame(
    Feature         = features,
    Coefficient     = coefs,
    Importance_Rank = ranks
)

# Mostrar top 10 por coeficiente
first(sort(feature_importance, :Coefficient, rev=true), 10)


Row,Feature,Coefficient,Importance_Rank
Unnamed: 0_level_1,String,Float64,Int64
1,sales_rolling_3,0.860157,40
2,sales_rolling_7,0.199064,22
3,sales_lag_1,0.182801,13
4,sales_rolling_30,0.077704,7
5,DayOfWeek,0.0490341,28
6,product_sales_mean,0.0356142,19
7,DayOfWeek_sin,0.0211705,20
8,DayOfWeek_cos,0.0157785,12
9,IsWeekend,0.0157271,9
10,WeekOfYear,0.0143109,10


In [14]:
#Pkg.add("MLJLightGBMInterface")

In [15]:
using MLJ
using LightGBM, DataFrames

# Importar explícitamente
const LGBMRegressor = LightGBM.MLJInterface.LGBMRegressor

model = LGBMRegressor(
    objective = "regression",
    learning_rate = 0.1,
    num_iterations = 100,
    num_leaves = 31,
    metric = ["l2"] # 👈 CORREGIDO: vector de string
)

#X = MLJ.table(X_train_fixed)
y = y_train_vector

#mach = machine(model, X, y)

mach = machine(model, X_train_fixed, y)

fit!(mach)

X_test_df = DataFrame(X_test_matrix, names(X_train_fixed))

ŷ_native = predict(mach, MLJ.table(X_test_df)) |> collect

mae_native = mean(abs.(ŷ_native .- y_test_vector))
r2_native = 1 - sum((y_test_vector .- ŷ_native).^2) / sum((y_test_vector .- mean(y_test_vector)).^2)

("LightGBM_MLJ", "SUCCESS", mae_native, r2_native)


ErrorException: cannot assign a value to imported variable Main.LGBMRegressor

In [16]:
using Pkg
# Pkg.status()