In [7]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Carregar datasets
data_treino = pd.read_csv('/home/caio/github/k-3/data/train.csv')
data_teste = pd.read_csv('/home/caio/github/k-3/data/test.csv')

# Definir features e target
X = data_treino.drop(columns=['rainfall'])  # Substitua pelo nome real da coluna alvo
y = data_treino['rainfall']
X_test = data_teste  # Apenas features

# Dividir treino em treino principal (60%) e validação (40%)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.4, random_state=42)

# Inicializar modelos base
modelo_catboost = CatBoostRegressor(verbose=0, random_state=42)
modelo_xgboost = XGBRegressor(n_estimators=100, random_state=42)
modelo_lightgbm = LGBMRegressor(n_estimators=100, random_state=42)

# Treinar modelos base no treino principal
modelo_catboost.fit(X_train, y_train)
modelo_xgboost.fit(X_train, y_train)
modelo_lightgbm.fit(X_train, y_train)

# Obter previsões dos modelos base no conjunto de validação
preds_catboost = modelo_catboost.predict(X_valid)
preds_xgboost = modelo_xgboost.predict(X_valid)
preds_lightgbm = modelo_lightgbm.predict(X_valid)

# Calcular RMSE para cada modelo individual
rmse_catboost = np.sqrt(mean_squared_error(y_valid, preds_catboost))
rmse_xgboost = np.sqrt(mean_squared_error(y_valid, preds_xgboost))
rmse_lightgbm = np.sqrt(mean_squared_error(y_valid, preds_lightgbm))

# Criar novo conjunto de treino para o modelo meta
X_meta_train = np.column_stack([preds_catboost, preds_xgboost, preds_lightgbm])
y_meta_train = y_valid

# Treinar modelo meta (Regressão Ridge)
modelo_meta = Ridge()
modelo_meta.fit(X_meta_train, y_meta_train)

# Fazer previsões no conjunto de validação usando o modelo meta
y_valid_pred = modelo_meta.predict(X_meta_train)

# Calcular RMSE do modelo meta
rmse_meta = np.sqrt(mean_squared_error(y_meta_train, y_valid_pred))

# Exibir RMSEs
print(f"RMSE do CatBoost: {rmse_catboost:.4f}")
print(f"RMSE do XGBoost: {rmse_xgboost:.4f}")
print(f"RMSE do LightGBM: {rmse_lightgbm:.4f}")
print(f"RMSE do Modelo Meta: {rmse_meta:.4f}")

# Obter previsões dos modelos base no conjunto de teste
preds_catboost_test = modelo_catboost.predict(X_test)
preds_xgboost_test = modelo_xgboost.predict(X_test)
preds_lightgbm_test = modelo_lightgbm.predict(X_test)

# Criar novo conjunto de teste para o modelo meta
X_meta_test = np.column_stack([preds_catboost_test, preds_xgboost_test, preds_lightgbm_test])

# Fazer previsão final com o modelo meta
y_pred_final = modelo_meta.predict(X_meta_test)

# Salvar resultados
output = pd.DataFrame({'Id': data_teste['id'], 'rainfall': y_pred_final})
output.to_csv('submission.csv', index=False)

print("Blending concluído! Resultados salvos em submission.csv")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000108 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1711
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 12
[LightGBM] [Info] Start training from score 0.751142
RMSE do CatBoost: 0.3464
RMSE do XGBoost: 0.3602
RMSE do LightGBM: 0.3451
RMSE do Modelo Meta: 0.3378
Blending concluído! Resultados salvos em submission.csv


In [8]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Carregar datasets
data_treino = pd.read_csv('/home/caio/github/k-3/data/train.csv')
data_teste = pd.read_csv('/home/caio/github/k-3/data/test.csv')

# Definir features e target
X = data_treino.drop(columns=['rainfall'])  # Substitua pelo nome real da coluna alvo
y = data_treino['rainfall']
X_test = data_teste  # Apenas features

# Dividir treino em treino principal (60%) e validação (40%)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.4, random_state=42)

# Inicializar modelos base
modelo_catboost = CatBoostRegressor(verbose=0, random_state=42)
modelo_xgboost = XGBRegressor(n_estimators=100, random_state=42)
modelo_lightgbm = LGBMRegressor(n_estimators=100, random_state=42)
modelo_knn = KNeighborsRegressor(n_neighbors=5)  # KNN com 5 vizinhos

# Treinar modelos base no treino principal
modelo_catboost.fit(X_train, y_train)
modelo_xgboost.fit(X_train, y_train)
modelo_lightgbm.fit(X_train, y_train)
modelo_knn.fit(X_train, y_train)

# Obter previsões dos modelos base no conjunto de validação
preds_catboost = modelo_catboost.predict(X_valid)
preds_xgboost = modelo_xgboost.predict(X_valid)
preds_lightgbm = modelo_lightgbm.predict(X_valid)
preds_knn = modelo_knn.predict(X_valid)

# Calcular RMSE para cada modelo individual
rmse_catboost = np.sqrt(mean_squared_error(y_valid, preds_catboost))
rmse_xgboost = np.sqrt(mean_squared_error(y_valid, preds_xgboost))
rmse_lightgbm = np.sqrt(mean_squared_error(y_valid, preds_lightgbm))
rmse_knn = np.sqrt(mean_squared_error(y_valid, preds_knn))

# Criar novo conjunto de treino para o modelo meta
X_meta_train = np.column_stack([preds_catboost, preds_xgboost, preds_lightgbm, preds_knn])
y_meta_train = y_valid

# Treinar modelo meta (Regressão Ridge)
modelo_meta = Ridge()
modelo_meta.fit(X_meta_train, y_meta_train)

# Fazer previsões no conjunto de validação usando o modelo meta
y_valid_pred = modelo_meta.predict(X_meta_train)

# Calcular RMSE do modelo meta
rmse_meta = np.sqrt(mean_squared_error(y_meta_train, y_valid_pred))

# Exibir RMSEs
print(f"RMSE do CatBoost: {rmse_catboost:.4f}")
print(f"RMSE do XGBoost: {rmse_xgboost:.4f}")
print(f"RMSE do LightGBM: {rmse_lightgbm:.4f}")
print(f"RMSE do KNN: {rmse_knn:.4f}")
print(f"RMSE do Modelo Meta: {rmse_meta:.4f}")

# Obter previsões dos modelos base no conjunto de teste
preds_catboost_test = modelo_catboost.predict(X_test)
preds_xgboost_test = modelo_xgboost.predict(X_test)
preds_lightgbm_test = modelo_lightgbm.predict(X_test)
preds_knn_test = modelo_knn.predict(X_test)

# Criar novo conjunto de teste para o modelo meta
X_meta_test = np.column_stack([preds_catboost_test, preds_xgboost_test, preds_lightgbm_test, preds_knn_test])

# Fazer previsão final com o modelo meta
y_pred_final = modelo_meta.predict(X_meta_test)

# Salvar resultados
output = pd.DataFrame({'Id': data_teste['id'], 'rainfall': y_pred_final})
output.to_csv('submission.csv', index=False)

print("Blending concluído! Resultados salvos em submission.csv")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1711
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 12
[LightGBM] [Info] Start training from score 0.751142
RMSE do CatBoost: 0.3464
RMSE do XGBoost: 0.3602
RMSE do LightGBM: 0.3451
RMSE do KNN: 0.3773
RMSE do Modelo Meta: 0.3360


ValueError: Input X contains NaN.
KNeighborsRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [9]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Carregar datasets
data_treino = pd.read_csv('/home/caio/github/k-3/data/train.csv')
data_teste = pd.read_csv('/home/caio/github/k-3/data/test.csv')

# Definir features e target
X = data_treino.drop(columns=['rainfall'])  # Substitua pelo nome real da coluna alvo
y = data_treino['rainfall']
X_test = data_teste  # Apenas features

# Dividir treino em treino principal (60%) e validação (40%)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.4, random_state=42)

# Inicializar modelos base
modelo_catboost = CatBoostRegressor(verbose=0, random_state=42)
modelo_xgboost = XGBRegressor(n_estimators=100, random_state=42)
modelo_lightgbm = LGBMRegressor(n_estimators=100, random_state=42)
modelo_knn = KNeighborsRegressor(n_neighbors=5)
modelo_rf = RandomForestRegressor(n_estimators=100, random_state=42)
modelo_extra_trees = ExtraTreesRegressor(n_estimators=100, random_state=42)
modelo_gb = GradientBoostingRegressor(n_estimators=100, random_state=42)
modelo_dt = DecisionTreeRegressor(random_state=42)
modelo_hgb = HistGradientBoostingRegressor(random_state=42)
modelo_mlp = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)

# Treinar modelos base no treino principal
modelos = [
    modelo_catboost, modelo_xgboost, modelo_lightgbm, modelo_knn, 
    modelo_rf, modelo_extra_trees, modelo_gb, modelo_dt, modelo_hgb, modelo_mlp
]

for modelo in modelos:
    modelo.fit(X_train, y_train)

# Obter previsões dos modelos base no conjunto de validação
predicoes_valid = np.column_stack([modelo.predict(X_valid) for modelo in modelos])

# Calcular RMSE para cada modelo individual
rmse_individuais = [np.sqrt(mean_squared_error(y_valid, pred)) for pred in predicoes_valid.T]

# Criar novo conjunto de treino para o modelo meta
X_meta_train = predicoes_valid
y_meta_train = y_valid

# Treinar modelo meta (Regressão Ridge)
modelo_meta = Ridge()
modelo_meta.fit(X_meta_train, y_meta_train)

# Fazer previsões no conjunto de validação usando o modelo meta
y_valid_pred = modelo_meta.predict(X_meta_train)

# Calcular RMSE do modelo meta
rmse_meta = np.sqrt(mean_squared_error(y_meta_train, y_valid_pred))

# Exibir RMSEs
nomes_modelos = ["CatBoost", "XGBoost", "LightGBM", "KNN", "RandomForest", "ExtraTrees", "GradientBoosting", "DecisionTree", "HistGradientBoosting", "MLP"]
for nome, rmse in zip(nomes_modelos, rmse_individuais):
    print(f"RMSE do {nome}: {rmse:.4f}")
print(f"RMSE do Modelo Meta: {rmse_meta:.4f}")

# Obter previsões dos modelos base no conjunto de teste
predicoes_teste = np.column_stack([modelo.predict(X_test) for modelo in modelos])

# Criar novo conjunto de teste para o modelo meta
X_meta_test = predicoes_teste

# Fazer previsão final com o modelo meta
y_pred_final = modelo_meta.predict(X_meta_test)

# Salvar resultados
output = pd.DataFrame({'Id': data_teste['id'], 'rainfall': y_pred_final})
output.to_csv('submission.csv', index=False)

print("Blending concluído! Resultados salvos em submission.csv")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000110 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1711
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 12
[LightGBM] [Info] Start training from score 0.751142
RMSE do CatBoost: 0.3464
RMSE do XGBoost: 0.3602
RMSE do LightGBM: 0.3451
RMSE do KNN: 0.3773
RMSE do RandomForest: 0.3398
RMSE do ExtraTrees: 0.3376
RMSE do GradientBoosting: 0.3373
RMSE do DecisionTree: 0.4558
RMSE do HistGradientBoosting: 0.3457
RMSE do MLP: 0.8605
RMSE do Modelo Meta: 0.3292


ValueError: Input X contains NaN.
KNeighborsRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [15]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import (
    RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, 
    HistGradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import (
    Lasso, ElasticNet, BayesianRidge, HuberRegressor, PassiveAggressiveRegressor, SGDRegressor
)
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Carregar datasets
data_treino = pd.read_csv('/home/caio/github/k-3/data/train_tratado.csv')
data_teste = pd.read_csv('/home/caio/github/k-3/data/teste_tratado.csv')

# Definir features e target
X = data_treino.drop(columns=['rainfall'])  # Substitua pelo nome real da coluna alvo
y = data_treino['rainfall']
X_test = data_teste  # Apenas features

# Dividir treino em treino principal (60%) e validação (40%)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.4, random_state=42)

# Inicializar modelos base
modelos = {
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
    "XGBoost":n_estimators=100, random_state=42
    "LightGBM": LGBMRegressor(n_estimators=100, random_state=42),
    #"KNN": KNeighborsRegressor(n_neighbors=5),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "ExtraTrees": ExtraTreesRegressor(n_estimators=100, random_state=42),
    #"GradientBoosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
    "HistGradientBoosting": HistGradientBoostingRegressor(random_state=42),
    "MLP": MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42),
    "AdaBoost": AdaBoostRegressor(n_estimators=100, random_state=42),
    "Bagging": BaggingRegressor(n_estimators=100, random_state=42),
    "Lasso": Lasso(alpha=0.1),
    "ElasticNet": ElasticNet(alpha=0.1),
    "BayesianRidge": BayesianRidge(),
    "SVR": SVR(kernel='rbf', C=100),
    "GaussianProcess": GaussianProcessRegressor(),
    "HuberRegressor": HuberRegressor(),
    "PassiveAggressive": PassiveAggressiveRegressor(max_iter=1000, random_state=42),
    #"SGDRegressor": SGDRegressor(max_iter=1000, random_state=42),
}

# Treinar modelos base no treino principal
for nome, modelo in modelos.items():
    print(f"Treinando {nome}...")
    modelo.fit(X_train, y_train)

# Obter previsões dos modelos base no conjunto de validação
predicoes_valid = np.column_stack([modelo.predict(X_valid) for modelo in modelos.values()])

# Calcular RMSE para cada modelo individual
rmse_individuais = {nome: np.sqrt(mean_squared_error(y_valid, pred)) 
                     for nome, pred in zip(modelos.keys(), predicoes_valid.T)}

# Criar novo conjunto de treino para o modelo meta
X_meta_train = predicoes_valid
y_meta_train = y_valid

# **NOVO META-MODELO: XGBoostRegressor**
modelo_meta = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=5, random_state=42)
modelo_meta.fit(X_meta_train, y_meta_train)

# Fazer previsões no conjunto de validação usando o modelo meta
y_valid_pred = modelo_meta.predict(X_meta_train)

# Calcular RMSE do modelo meta
rmse_meta = np.sqrt(mean_squared_error(y_meta_train, y_valid_pred))

# Exibir RMSEs
for nome, rmse in rmse_individuais.items():
    print(f"RMSE do {nome}: {rmse:.4f}")
print(f"\n🔥 RMSE do Modelo Meta (XGBoost): {rmse_meta:.4f}")

# Obter previsões dos modelos base no conjunto de teste
predicoes_teste = np.column_stack([modelo.predict(X_test) for modelo in modelos.values()])

# Criar novo conjunto de teste para o modelo meta
X_meta_test = predicoes_teste

# Fazer previsão final com o modelo meta
y_pred_final = modelo_meta.predict(X_meta_test)

# Salvar resultados
output = pd.DataFrame({'Id': data_teste['id'], 'rainfall': y_pred_final})
output.to_csv('submission.csv', index=False)

print("Blending concluído! Resultados salvos em submission.csv")


Treinando CatBoost...
Treinando XGBoost...
Treinando LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000083 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1350
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 9
[LightGBM] [Info] Start training from score 0.751142
Treinando RandomForest...
Treinando ExtraTrees...
Treinando DecisionTree...
Treinando HistGradientBoosting...
Treinando MLP...
Treinando AdaBoost...
Treinando Bagging...
Treinando Lasso...
Treinando ElasticNet...
Treinando BayesianRidge...
Treinando SVR...
Treinando GaussianProcess...
Treinando HuberRegressor...
Treinando PassiveAggressive...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


RMSE do CatBoost: 0.3446
RMSE do XGBoost: 0.3607
RMSE do LightGBM: 0.3455
RMSE do RandomForest: 0.3398
RMSE do ExtraTrees: 0.3355
RMSE do DecisionTree: 0.4790
RMSE do HistGradientBoosting: 0.3484
RMSE do MLP: 0.6616
RMSE do AdaBoost: 0.3476
RMSE do Bagging: 0.3405
RMSE do Lasso: 0.3334
RMSE do ElasticNet: 0.3313
RMSE do BayesianRidge: 0.3314
RMSE do SVR: 0.3382
RMSE do GaussianProcess: 0.8700
RMSE do HuberRegressor: 0.3699
RMSE do PassiveAggressive: 0.3711

🔥 RMSE do Modelo Meta (XGBoost): 0.0367
Blending concluído! Resultados salvos em submission.csv


In [16]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import (
    RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor, 
    AdaBoostRegressor, BaggingRegressor
)
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso, ElasticNet, BayesianRidge
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Carregar datasets
data_treino = pd.read_csv('/home/caio/github/k-3/data/train_tratado.csv')
data_teste = pd.read_csv('/home/caio/github/k-3/data/teste_tratado.csv')

# Definir features e target
X = data_treino.drop(columns=['rainfall'])  # Substitua pelo nome real da coluna alvo
y = data_treino['rainfall']
X_test = data_teste

# Novo Split: Treino (50%), Validação para Base (30%), Teste para Meta (20%)
X_train_base, X_rest, y_train_base, y_rest = train_test_split(X, y, test_size=0.5, random_state=42)
X_valid_base, X_test_meta, y_valid_base, y_test_meta = train_test_split(X_rest, y_rest, test_size=0.4, random_state=42)

# Inicializar modelos base
modelos = {
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42),
    "LightGBM": LGBMRegressor(n_estimators=100, random_state=42),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "ExtraTrees": ExtraTreesRegressor(n_estimators=100, random_state=42),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
    "HistGradientBoosting": HistGradientBoostingRegressor(random_state=42),
    "MLP": MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42),
    "AdaBoost": AdaBoostRegressor(n_estimators=100, random_state=42),
    "Bagging": BaggingRegressor(n_estimators=100, random_state=42),
    "Lasso": Lasso(alpha=0.1),
    "ElasticNet": ElasticNet(alpha=0.1),
    "BayesianRidge": BayesianRidge(),
    "SVR": SVR(kernel='rbf', C=100),
    "GaussianProcess": GaussianProcessRegressor()
}

# Treinar modelos base no treino principal
for nome, modelo in modelos.items():
    print(f"Treinando {nome}...")
    modelo.fit(X_train_base, y_train_base)

# Obter previsões dos Modelos Base na Validação Base (para treinar o meta-modelo)
predicoes_valid_base = np.column_stack([modelo.predict(X_valid_base) for modelo in modelos.values()])

# Criar e treinar o Meta-Modelo
modelo_meta = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=5, random_state=42)
modelo_meta.fit(predicoes_valid_base, y_valid_base)

# Avaliar o Meta-Modelo num conjunto nunca visto (Teste Meta)
predicoes_test_meta = np.column_stack([modelo.predict(X_test_meta) for modelo in modelos.values()])
y_pred_meta = modelo_meta.predict(predicoes_test_meta)

# Calcular RMSE real do modelo meta
rmse_meta_real = np.sqrt(mean_squared_error(y_test_meta, y_pred_meta))
print(f"\n✅ RMSE Real do Modelo Meta: {rmse_meta_real:.4f}")

# Obter previsões dos modelos base no conjunto de teste
predicoes_teste = np.column_stack([modelo.predict(X_test) for modelo in modelos.values()])

# Fazer previsão final com o modelo meta
y_pred_final = modelo_meta.predict(predicoes_teste)

# Salvar resultados
output = pd.DataFrame({'Id': data_teste['id'], 'rainfall': y_pred_final})
output.to_csv('submission.csv', index=False)

print("Blending concluído! Resultados salvos em submission.csv")


Treinando CatBoost...
Treinando XGBoost...
Treinando LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000091 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1316
[LightGBM] [Info] Number of data points in the train set: 1095, number of used features: 9
[LightGBM] [Info] Start training from score 0.749772
Treinando RandomForest...
Treinando ExtraTrees...
Treinando DecisionTree...
Treinando HistGradientBoosting...
Treinando MLP...
Treinando AdaBoost...
Treinando Bagging...
Treinando Lasso...
Treinando ElasticNet...
Treinando BayesianRidge...
Treinando SVR...
Treinando GaussianProcess...

✅ RMSE Real do Modelo Meta: 0.3585
Blending concluído! Resultados salvos em submission.csv


In [None]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split
# Definir o modelo
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

# Definir os hiperparâmetros para a busca
param_dist = {
    'n_estimators': np.arange(100, 7000, 10),  # Aumentado até 7000 e passo reduzido para testar mais variações
    'learning_rate': np.linspace(0.00005, 0.8, 100),  # 100 valores para mais refinamento da taxa de aprendizado
    'max_depth': np.arange(2, 30),  # Aumentado até 30 para testar árvores ainda mais profundas
    'subsample': np.linspace(0.1, 1, 30),  # Mais valores para melhor ajuste da fração de amostras
    'colsample_bytree': np.linspace(0.1, 1, 30)  # Mais valores para ajuste fino da fração de features
}



# Criar a busca com RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=20,  # Número de combinações a testar
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1
)

# Treinar a busca
random_search.fit(feature_treino_base, target_treino_base)

# Exibir apenas os melhores hiperparâmetros
print(random_search.best_params_)

In [None]:
import optuna
from catboost import CatBoostRegressor
import numpy as np
from sklearn.model_selection import cross_val_score

# Definição da função objetivo para otimização
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 3000),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 1.0, log=True),
        'depth': trial.suggest_int('depth', 2, 8),
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.2, 1.0),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.0, 20.0),
        'random_seed': 42,
        'verbose': 0
    }



    # Criar modelo CatBoostRegressor com os hiperparâmetros sugeridos
    model = CatBoostRegressor(**params)

    # Avaliação do modelo com validação cruzada
    scores = cross_val_score(model, feature_treino_base, target_treino_base, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    
    return np.mean(scores)  # Média do erro quadrático negativo

# Criando um estudo e rodando a otimização
study = optuna.create_study(direction='maximize')  # Maximizar -MSE (equivalente a minimizar MSE)
study.optimize(objective, n_trials=50)  # Ajuste o número de trials

# Exibir os melhores hiperparâmetros encontrados
print("Melhores hiperparâmetros:", study.best_params)
