following - https://www.kaggle.com/code/ayanabil11/lightgbm-xgboost-and-catboost-stacking

In [18]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score

import lightgbm as lgb
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.ensemble import StackingRegressor
from lightgbm.basic import LightGBMError

In [19]:
df_train = pd.read_csv("/kaggle/input/playground-series-s4e5/train.csv")
df_test = pd.read_csv("/kaggle/input/playground-series-s4e5/test.csv")

In [20]:
y = df_train['FloodProbability']
X_train = df_train.drop(columns=['id', 'FloodProbability'])
X_test = df_test.drop(columns=['id'])

In [21]:
def feature_engineering(df):
    df['feature_sum'] = df.sum(axis=1)
    df['feature_mean'] = df.mean(axis=1)
    df['feature_std'] = df.std(axis=1)
    df['feature_min'] = df.min(axis=1)
    df['feature_max'] = df.max(axis=1)
    
    # Polynomial features
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    poly_features = poly.fit_transform(df)
    poly_df = pd.DataFrame(poly_features, columns=poly.get_feature_names_out(df.columns))
    df = pd.concat([df, poly_df], axis=1)
    
    return df

In [22]:
X_train = feature_engineering(X_train)
X_test = feature_engineering(X_test)

In [23]:
X_train.columns

Index(['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       ...
       'feature_sum feature_mean', 'feature_sum feature_std',
       'feature_sum feature_min', 'feature_sum feature_max',
       'feature_mean feature_std', 'feature_mean feature_min',
       'feature_mean feature_max', 'feature_std feature_min',
       'feature_std feature_max', 'feature_min feature_max'],
      dtype='object', length=350)

In [24]:
X_train = X_train.loc[:, ~X_train.columns.duplicated()]
X_test = X_test.loc[:, ~X_test.columns.duplicated()]

In [25]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y, test_size=0.2, random_state=42)

In [26]:
lgb_model = lgb.LGBMRegressor(device = "gpu")
xgb_model = XGBRegressor(tree_method="gpu_hist")
cat_model = CatBoostRegressor(verbose=0, task_type="GPU", devices="0")

In [27]:
estimators = [
    ('lgb', lgb_model),
    ('xgb', xgb_model),
    ('cat', cat_model)
]
stacking_model = StackingRegressor(
    estimators=estimators,
    final_estimator=lgb.LGBMRegressor()
)

In [28]:
stacking_model.fit(X_train, y_train)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 38314
[LightGBM] [Info] Number of data points in the train set: 894365, number of used features: 325
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 325 dense feature groups (279.76 MB) transferred to GPU in 0.166131 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.504480
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 38257
[LightGBM] [Info] Number of data points in the train set: 715492, number of used features: 325
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 325 dense feature groups (223.81 MB) transferred to GPU in 0.133146 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.504504
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 38296
[LightGBM] [Info] Number of data points in the train set: 715492, number of used feat

In [29]:
val_preds = stacking_model.predict(X_val)
val_score = r2_score(y_val, val_preds)
print(f"Validation R2 Score: {val_score}")

Validation R2 Score: 0.867430996177503
