## Importações

In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import (
    RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor, 
    AdaBoostRegressor, BaggingRegressor
)
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso, ElasticNet, BayesianRidge
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import zscore
from sklearn.linear_model import LinearRegression


## Carga de Dados // Tratamento Básico

### Carga de Dados

In [2]:
data_treino = pd.read_csv('/home/caio/github/k-3/data/train.csv')
data_teste = pd.read_csv('/home/caio/github/k-3/data/test.csv')

### EDA

In [3]:
print('-'*50, " Valores NaN em data_treino", "-"*50)
print(data_treino.isnull().sum())
print('-'*50, " Valores NaN em data_teste", "-"*50)
print(data_teste.isnull().sum())

--------------------------------------------------  Valores NaN em data_treino --------------------------------------------------
id               0
day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    0
windspeed        0
rainfall         0
dtype: int64
--------------------------------------------------  Valores NaN em data_teste --------------------------------------------------
id               0
day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    1
windspeed        0
dtype: int64


In [4]:
print('-'*50, " Outliers em data_treino", "-"*50)
z_scores = data_treino.select_dtypes(include=np.number).apply(zscore)
outliers_z = (abs(z_scores) > 3).sum()
print(outliers_z)

print('-'*50, " Outliers em data_teste", "-"*50)
z_scores = data_teste.select_dtypes(include=np.number).apply(zscore)
outliers_z = (abs(z_scores) > 3).sum()
print(outliers_z)


--------------------------------------------------  Outliers em data_treino --------------------------------------------------
id                0
day               0
pressure          4
maxtemp           0
temparature       1
mintemp           2
dewpoint         26
humidity         17
cloud            31
sunshine          0
winddirection     0
windspeed        15
rainfall          0
dtype: int64
--------------------------------------------------  Outliers em data_teste --------------------------------------------------
id                0
day               0
pressure          1
maxtemp           1
temparature       1
mintemp           2
dewpoint         10
humidity          7
cloud            11
sunshine          0
winddirection     0
windspeed         8
dtype: int64


### Tratamento Básico

In [5]:
def outliers(data, fator=1.5):
    data_limpa = data.copy()
    
    for col in data_limpa.select_dtypes(include=[np.number]).columns:
        Q1 = data_limpa[col].quantile(0.25)
        Q3 = data_limpa[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - (fator * IQR)
        upper_bound = Q3 + (fator * IQR)
        data_limpa[col] = np.clip(data_limpa[col], lower_bound, upper_bound)
    
    return data_limpa

In [6]:
# Remoção do valor NaN de winddirection
mediana = data_teste['winddirection'].median()
data_teste['winddirection'] = data_teste['winddirection'].fillna(mediana)

### Divisão FEATURE / TARGET

In [7]:
FEATURE = data_treino.drop(columns=['rainfall']) 
TARGET = data_treino['rainfall']

### Divisão Meta-Modelo

In [8]:
# Treino (50%), Validação para Base (30%), Teste para Meta (20%)
feature_treino_base, feature_rest, target_treino_base, target_rest = train_test_split(FEATURE, TARGET, test_size=0.5, random_state=42)
feature_valid_base, feature_test_meta, target_valid_base, target_test_meta = train_test_split(feature_rest, target_rest, test_size=0.4, random_state=42)

## Modelos

### Catboost

#### Iniciando Modelo

In [9]:
#### Iniciando o Modelo
modelo_c1 = CatBoostRegressor(
    iterations=1955,
    depth=5,
    learning_rate=0.00878515482955258,
    loss_function='RMSE',
    subsample= 0.7024609548921839,
    colsample_bylevel= 0.24231948861828084,
    l2_leaf_reg= 13.987621237707176,
    verbose=0
)

#### Treinando Modelo

In [10]:
modelo_c1.fit(feature_treino_base, target_treino_base)

<catboost.core.CatBoostRegressor at 0x7f8fe3870e50>

#### Fazendo Previsão na Base (30%) // Gerando Feature Para Meta Modelo

In [11]:
feature_meta_c1 = modelo_c1.predict(feature_valid_base)
rmse_cat1 = np.sqrt(mean_squared_error(feature_meta_c1, target_valid_base))

### XGBRegressor

#### Iniciando Modelo

In [12]:
modelo_x1 = XGBRegressor(n_estimators=2476,
                         learning_rate= 0.002713567157357714,
                         max_depth= 10,
                         subsample= 0.8412329047093903,
                         colsample_bytree= 0.8118853098655872,
                         gamma= 0.6454795935460096,
                         reg_alpha=0.023901381799989727,
                         reg_lambda= 3.298548751382269,
                         min_child_weight= 1,
                         random_state=42)

#### Treinando Modelo

In [13]:
modelo_x1.fit(feature_treino_base, target_treino_base)

#### Fazendo Previsão na Base (30%) // Gerando Feature Para Meta Modelo

In [14]:
feature_meta_x1 = modelo_x1.predict(feature_valid_base)
rmse_x1 = np.sqrt(mean_squared_error(feature_meta_x1, target_valid_base))

### LightGBM

#### Iniciando o Modelo

In [15]:
modelo_lgbm = LGBMRegressor(num_iterations= 1900,
                            learning_rate= 0.10809966608948998,
                            max_depth= 4,
                            num_leaves= 140,
                            min_data_in_leaf=11,
                            subsample= 0.9,
                            colsample_bytree= 0.7,
                            lambda_l1= 1.5229710918593735,
                            lambda_l2= 1.55538055462489,
                            min_gain_to_split= 0.11592155573870366,
                            random_state=69)

#### Treinando o Modelo

In [16]:
modelo_lgbm.fit(feature_treino_base, target_treino_base)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000132 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1640
[LightGBM] [Info] Number of data points in the train set: 1095, number of used features: 12
[LightGBM] [Info] Start training from score 0.749772


#### Fazendo Previsão na Base (30%) // Gerando Feature Para Meta Modelo

In [17]:
feature_meta_lgbm = modelo_lgbm.predict(feature_valid_base)
rmse_lgbm = np.sqrt(mean_squared_error(feature_meta_lgbm, target_valid_base))



### LinearRegression

#### Iniciando Modelo

In [18]:
modelo_linear = LinearRegression()

#### Treinando Modelo

In [19]:
modelo_linear.fit(feature_treino_base, target_treino_base)

#### Fazendo Previsão na Base (30%) // Gerando Feature Para Meta Modelo

In [20]:
feature_meta_linear = modelo_linear.predict(feature_valid_base)
rmse_linear = np.sqrt(mean_squared_error(feature_meta_linear, target_valid_base))

### AdaBoost

#### Inciando Modelo

In [21]:
modelo_ada = AdaBoostRegressor(n_estimators=117,
                               learning_rate = 0.009320996743568632, 
                               estimator=DecisionTreeRegressor(max_depth=4),
                               loss="square",
                               random_state=69)


#### Treinamento do Modelo

In [22]:
modelo_ada.fit(feature_treino_base, target_treino_base)

#### Fazendo Previsão na Base (30%) // Gerando Feature Para Meta Modelo

In [23]:
feature_meta_ada = modelo_ada.predict(feature_valid_base)
rmse_ada = np.sqrt(mean_squared_error(feature_meta_ada, target_valid_base))

### Neural

In [24]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
import keras.backend as K
from sklearn.model_selection import train_test_split

# Função de erro RMSE corrigida
def rmse(y_true, y_pred):
    y_true = K.cast(y_true, dtype="float32")
    y_pred = K.cast(y_pred, dtype="float32")
    return K.sqrt(K.mean(K.square(y_true - y_pred)))

# Separar os dados em treino e validação
X_train, X_val, y_train, y_val = train_test_split(feature_treino_base, target_treino_base, test_size=0.2, random_state=42)

# Criar o modelo com os melhores hiperparâmetros
def criar_modelo():
    model = Sequential()
    
    # Primeira camada oculta
    model.add(Dense(160, activation="relu", input_shape=(X_train.shape[1],)))
    model.add(BatchNormalization())
    model.add(Dropout(0.49034062968575254))  # Dropout otimizado
    
    # Camada de saída
    model.add(Dense(1, activation="linear"))

    # Configurar otimizador com learning rate otimizado
    optimizer = Adam(learning_rate=0.0021515376914683355)
    
    model.compile(optimizer=optimizer, loss=rmse, metrics=["mae", rmse])
    return model

# Criar e treinar o modelo
modelo_otimizado = criar_modelo()
history = modelo_otimizado.fit(X_train, y_train, 
                               epochs=125, 
                               batch_size=16, 
                               validation_data=(X_val, y_val), 
                               verbose=1)

# Avaliação final
loss, mae, rmse_score = modelo_otimizado.evaluate(X_val, y_val, verbose=1)
print(f"RMSE final no conjunto de validação: {rmse_score}")


2025-03-19 02:53:22.856762: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-03-19 02:53:24.183505: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Epoch 1/125
Epoch 2/125
Epoch 3/125
Epoch 4/125
Epoch 5/125
Epoch 6/125
Epoch 7/125
Epoch 8/125
Epoch 9/125
Epoch 10/125
Epoch 11/125
Epoch 12/125
Epoch 13/125
Epoch 14/125
Epoch 15/125
Epoch 16/125
Epoch 17/125
Epoch 18/125
Epoch 19/125
Epoch 20/125
Epoch 21/125
Epoch 22/125
Epoch 23/125
Epoch 24/125
Epoch 25/125
Epoch 26/125
Epoch 27/125
Epoch 28/125
Epoch 29/125
Epoch 30/125
Epoch 31/125
Epoch 32/125
Epoch 33/125
Epoch 34/125
Epoch 35/125
Epoch 36/125
Epoch 37/125
Epoch 38/125
Epoch 39/125
Epoch 40/125
Epoch 41/125
Epoch 42/125
Epoch 43/125
Epoch 44/125
Epoch 45/125
Epoch 46/125
Epoch 47/125
Epoch 48/125
Epoch 49/125
Epoch 50/125
Epoch 51/125
Epoch 52/125
Epoch 53/125
Epoch 54/125
Epoch 55/125
Epoch 56/125
Epoch 57/125
Epoch 58/125
Epoch 59/125
Epoch 60/125
Epoch 61/125
Epoch 62/125
Epoch 63/125
Epoch 64/125
Epoch 65/125
Epoch 66/125
Epoch 67/125
Epoch 68/125
Epoch 69/125
Epoch 70/125
Epoch 71/125
Epoch 72/125
Epoch 73/125
Epoch 74/125
Epoch 75/125
Epoch 76/125
Epoch 77/125
Epoch 78

In [29]:
feature_meta_neural = modelo_otimizado.predict(feature_valid_base)
rmse_neural = np.sqrt(mean_squared_error(feature_meta_neural, target_valid_base))



## Teste dos Modelos

In [30]:
print(f"CatBoost RMSE ...........{rmse_cat1}")
print(f"XGBRegressor RMSE .......{rmse_x1}")
print(f"LightLGBM RMSE ..........{rmse_lgbm}") 
print(f"LinearRegression RMSE ...{rmse_linear}") 
print(f"AdaBoost RMSE............{rmse_ada}")
print(f"Neural RMSE:............ {rmse_neural}")

CatBoost RMSE ...........0.3290486490320497
XGBRegressor RMSE .......0.3294914833518092
LightLGBM RMSE ..........0.3265194255839313
LinearRegression RMSE ...0.3311290087954679
AdaBoost RMSE............0.3325683449975139
Neural RMSE:............ 0.3600888625712664


## Meta-Modelo

#### Feature para Treino do Meta-Modelo

In [31]:
feature_meta_unificada = pd.DataFrame()

feature_meta_c1 = pd.Series(feature_meta_c1)
feature_meta_x1 = pd.Series(feature_meta_x1)
feature_meta_lgbm = pd.Series(feature_meta_lgbm)
feature_meta_linear = pd.Series(feature_meta_linear)
feature_meta_ada = pd.Series(feature_meta_ada)

feature_meta_unificada['CatBoost'] = feature_meta_c1
feature_meta_unificada['XGBRegressor'] = feature_meta_x1
feature_meta_unificada['LightGBM'] = feature_meta_lgbm
feature_meta_unificada['Linear'] = feature_meta_linear
feature_meta_unificada['Ada'] = feature_meta_ada
feature_meta_unificada['Neural'] = feature_meta_neural

##target_valid_base

#### Feature para Previsão Final

In [32]:
feature_final1 = modelo_c1.predict(data_teste)
feature_final2 = modelo_x1.predict(data_teste)
feature_final3 = modelo_lgbm.predict(data_teste)
feature_final4 = modelo_linear.predict(data_teste)
feature_final5 = modelo_ada.predict(data_teste)
feature_final6 = modelo_otimizado.predict(data_teste)



In [34]:
feature_meta_unificada_final = pd.DataFrame({
    'CatBoost': feature_final1,
    'XGBRegressor': feature_final2,
    'LightGBM': feature_final3,
    'Linear': feature_final4,
    'Ada': feature_final5,
    'Neural': feature_final6.ravel()
})

#### Criação do Meta-Modelo

In [38]:
meta_modelo = CatBoostRegressor(
    iterations=1973,
    depth=4,
    learning_rate=0.0031470236892576554,
    loss_function='RMSE',
    subsample= 0.8163554441989388,
    colsample_bylevel= 0.4701500303061581,
    l2_leaf_reg= 19.72460023484443,
    verbose=0
)

meta_modelo.fit(feature_meta_unificada, target_valid_base)


<catboost.core.CatBoostRegressor at 0x7f8f852bac10>

#### Previsão Final e CSV

In [36]:
previsao_final = meta_modelo.predict(feature_meta_unificada_final)

output = pd.DataFrame({'id': data_teste['id'], 'rainfall': previsao_final})
output.to_csv('testeeee.csv', index=False)

In [None]:
Melhores hiperparâmetros: {'iterations': 1973, 'learning_rate': 0.0031470236892576554, 'depth': 4, 'subsample': 0.8163554441989388, 'colsample_bylevel': 0.4701500303061581, 'l2_leaf_reg': 19.72460023484443}
