## Importações

In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import (
    RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor, 
    AdaBoostRegressor, BaggingRegressor
)
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso, ElasticNet, BayesianRidge
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import zscore


## Carga de Dados // Tratamento Básico

### Carga de Dados

In [2]:
data_treino = pd.read_csv('/home/caio/github/k-3/data/train.csv')
data_teste = pd.read_csv('/home/caio/github/k-3/data/test.csv')

### EDA

In [3]:
print('-'*50, " Valores NaN em data_treino", "-"*50)
print(data_treino.isnull().sum())
print('-'*50, " Valores NaN em data_teste", "-"*50)
print(data_teste.isnull().sum())

--------------------------------------------------  Valores NaN em data_treino --------------------------------------------------
id               0
day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    0
windspeed        0
rainfall         0
dtype: int64
--------------------------------------------------  Valores NaN em data_teste --------------------------------------------------
id               0
day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    1
windspeed        0
dtype: int64


In [4]:
print('-'*50, " Outliers em data_treino", "-"*50)
z_scores = data_treino.select_dtypes(include=np.number).apply(zscore)
outliers_z = (abs(z_scores) > 3).sum()
print(outliers_z)

print('-'*50, " Outliers em data_teste", "-"*50)
z_scores = data_teste.select_dtypes(include=np.number).apply(zscore)
outliers_z = (abs(z_scores) > 3).sum()
print(outliers_z)


--------------------------------------------------  Outliers em data_treino --------------------------------------------------
id                0
day               0
pressure          4
maxtemp           0
temparature       1
mintemp           2
dewpoint         26
humidity         17
cloud            31
sunshine          0
winddirection     0
windspeed        15
rainfall          0
dtype: int64
--------------------------------------------------  Outliers em data_teste --------------------------------------------------
id                0
day               0
pressure          1
maxtemp           1
temparature       1
mintemp           2
dewpoint         10
humidity          7
cloud            11
sunshine          0
winddirection     0
windspeed         8
dtype: int64


### Tratamento Básico

In [5]:
# Remoção do valor NaN de winddirection
mediana = data_teste['winddirection'].median()
data_teste['winddirection'] = data_teste['winddirection'].fillna(mediana)

### Divisão FEATURE / TARGET

In [6]:
FEATURE = data_treino.drop(columns=['rainfall']) 
TARGET = data_treino['rainfall']

### Divisão Meta-Modelo

In [7]:
# Treino (50%), Validação para Base (30%), Teste para Meta (20%)
feature_treino_base, feature_rest, target_treino_base, target_rest = train_test_split(FEATURE, TARGET, test_size=0.5, random_state=42)
feature_valid_base, feature_test_meta, target_valid_base, target_test_meta = train_test_split(feature_rest, target_rest, test_size=0.4, random_state=42)

## Modelos

### Catboost

#### Iniciando Modelo

In [8]:
#### Iniciando o Modelo
modelo_c1 = CatBoostRegressor(
    iterations=1955,
    depth=5,
    learning_rate=0.00878515482955258,
    loss_function='RMSE',
    subsample= 0.7024609548921839,
    colsample_bylevel= 0.24231948861828084,
    l2_leaf_reg= 13.987621237707176,
    verbose=0
)

#### Treinando Modelo

In [9]:
modelo_c1.fit(feature_treino_base, target_treino_base)

<catboost.core.CatBoostRegressor at 0x7ffb52cc3bd0>

#### Fazendo Previsão na Base (30%) // Gerando Feature Para Meta Modelo

In [10]:
feature_meta_c1 = modelo_c1.predict(feature_valid_base)
rmse_cat1 = np.sqrt(mean_squared_error(feature_meta_c1, target_valid_base))

### XGBRegressor

#### Iniciando Modelo

In [11]:
modelo_x1 = XGBRegressor(n_estimators=2476,
                         learning_rate= 0.002713567157357714,
                         max_depth= 10,
                         subsample= 0.8412329047093903,
                         colsample_bytree= 0.8118853098655872,
                         gamma= 0.6454795935460096,
                         reg_alpha=0.023901381799989727,
                         reg_lambda= 3.298548751382269,
                         min_child_weight= 1,
                         random_state=42)

#### Treinando Modelo

In [12]:
modelo_x1.fit(feature_treino_base, target_treino_base)

#### Fazendo Previsão na Base (30%) // Gerando Feature Para Meta Modelo

In [13]:
feature_meta_x1 = modelo_x1.predict(feature_valid_base)
rmse_x1 = np.sqrt(mean_squared_error(feature_meta_x1, target_valid_base))

### LightGBM

#### Iniciando o Modelo

In [14]:
modelo_lgbm = LGBMRegressor(num_iterations= 1900,
                            learning_rate= 0.10809966608948998,
                            max_depth= 4,
                            num_leaves= 140,
                            min_data_in_leaf=11,
                            subsample= 0.9,
                            colsample_bytree= 0.7,
                            lambda_l1= 1.5229710918593735,
                            lambda_l2= 1.55538055462489,
                            min_gain_to_split= 0.11592155573870366,
                            random_state=69)

#### Treinando o Modelo

In [15]:
modelo_lgbm.fit(feature_treino_base, target_treino_base)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000138 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1640
[LightGBM] [Info] Number of data points in the train set: 1095, number of used features: 12
[LightGBM] [Info] Start training from score 0.749772


#### Fazendo Previsão na Base (30%) // Gerando Feature Para Meta Modelo

In [16]:
feature_meta_lgbm = modelo_lgbm.predict(feature_valid_base)
rmse_lgbm = np.sqrt(mean_squared_error(feature_meta_lgbm, target_valid_base))



## Teste dos Modelos

In [18]:
print(f"CatBoost RMSE .......{rmse_cat1}")
print(f"XGBRegressor RMSE ...{rmse_x1}")
print(f"LightLGBM RMSE ......{rmse_lgbm}")  ## 0.34832636264866634

CatBoost RMSE .......0.3290486490320497
XGBRegressor RMSE ...0.3294914833518092
LightLGBM RMSE ......0.3265194255839313


## Meta-Modelo

In [21]:
feature_meta_unificada = pd.DataFrame()  # Corrigido aqui
feature_meta_unificada['CatBoost'] = feature_meta_c1
feature_meta_unificada['XGBRegressor'] = feature_meta_x1
feature_meta_unificada['LightGBM'] = feature_meta_lgbm

##target_valid_base

In [30]:
meta_modelo = CatBoostRegressor(verbose=0, random_state=42)
meta_modelo.fit(feature_meta_unificada, target_valid_base)
previsao = meta_modelo.predict(feature_meta_unificada)
rmse_meta = np.sqrt(mean_squared_error(target_valid_base, previsao))
print(rmse_meta)

previsao_final = meta_modelo.predict(data_teste)

0.17147957666593586


CatBoostError: catboost/libs/data/model_dataset_compatibility.cpp:81: At position 0 should be feature with name CatBoost (found id).