# TL;DR
## Final submission

No cross-validation \
Hard-coded hyperparameters \
High score

In [1]:
# main 
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# viz
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
cmap = sns.color_palette("ch:s=.75,rot=-0.6",as_cmap=True)

from colorama import Fore, Style

# deal with warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Directory that holds the competition CSV files.
path = '/kaggle/input/playground-series-s4e5'

# Read both splits; the first CSV column becomes the DataFrame index.
train_csv, test_csv = f'{path}/train.csv', f'{path}/test.csv'
train = pd.read_csv(train_csv, index_col=0)
test = pd.read_csv(test_csv, index_col=0)

# Names of all training columns except the target.
initial_features = train.columns.drop('FloodProbability')

In [3]:
# Integer values 0..19 that get a per-row occurrence count in `engineer`.
unique_vals = list(range(20))

In [4]:
def engineer(df: pd.DataFrame, features=None, unique_values=None) -> pd.DataFrame:
    """Return a copy of ``df`` with interaction, row-statistic and count features added.

    The input frame is not modified; use the return value.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain every column in ``features`` plus the named columns used
        by the two interaction terms below.
    features : sequence of column labels, optional
        Columns over which the per-row statistics are computed. Defaults to
        the notebook-level ``initial_features``.
    unique_values : iterable, optional
        Values that each get a ``cnt_<value>`` column holding how many of the
        ``features`` columns equal that value in the row. Defaults to the
        notebook-level ``unique_vals``.

    Returns
    -------
    pd.DataFrame
        ``df``'s columns followed by the engineered columns (a column that
        already exists under the same name is overwritten in place).

    Notes
    -----
    * ``sum_72_76`` is True for row sums 72, 73, 74 and 75 only: the upper
      bound of ``np.arange(72, 76)`` is exclusive.
    * ``harmonic`` is ``n / mean(1/x)``, i.e. ``n`` times the textbook harmonic
      mean; kept as-is so the feature values do not change.
    * ``zscore`` is the mean of row-standardised values, which is zero up to
      floating-point rounding (NaN when the row's standard deviation is 0).
    * All per-row features are computed column-wise instead of through
      ``DataFrame.apply(axis=1)``; results agree up to floating-point rounding.
    """
    if features is None:
        features = initial_features
    if unique_values is None:
        unique_values = unique_vals

    vals = df[features]
    n_features = len(features)

    # Statistics that are reused by several features below.
    row_sum = vals.sum(axis=1)
    row_std = vals.std(axis=1)
    row_mean = vals.mean(axis=1)
    q25 = vals.quantile(0.25, axis=1)
    q75 = vals.quantile(0.75, axis=1)

    # Insertion order of this dict defines the order of the new columns.
    new_cols = {}

    new_cols['ClimateAnthropogenicInteraction'] = (
        (df['MonsoonIntensity'] + df['ClimateChange'])
        * (df['Deforestation'] + df['Urbanization']
           + df['AgriculturalPractices'] + df['Encroachments'])
    )
    new_cols['InfrastructurePreventionInteraction'] = (
        (df['DamsQuality'] + df['DrainageSystems'] + df['DeterioratingInfrastructure'])
        * (df['RiverManagement'] + df['IneffectiveDisasterPreparedness']
           + df['InadequatePlanning'])
    )

    new_cols['sum'] = row_sum                      # Sum of features
    new_cols['std'] = row_std                      # Standard deviation
    new_cols['mean'] = row_mean                    # Mean
    new_cols['max'] = vals.max(axis=1)             # Maximum value
    new_cols['min'] = vals.min(axis=1)             # Minimum value
    new_cols['mode'] = vals.mode(axis=1)[0]        # First mode returned per row

    new_cols['q_25th'] = q25                       # 25th percentile
    new_cols['q_75th'] = q75                       # 75th percentile
    new_cols['skew'] = vals.skew(axis=1)           # Skewness
    new_cols['kurt'] = vals.kurt(axis=1)           # Kurtosis

    # Range flag: row sum is one of 72, 73, 74, 75 (see Notes).
    new_cols['sum_72_76'] = row_sum.isin(np.arange(72, 76))

    # Deciles of each row.
    for i in range(10, 100, 10):
        new_cols[f'{i}th'] = vals.quantile(i / 100, axis=1)

    # Other mathematical features.
    new_cols['harmonic'] = n_features / (1 / vals).mean(axis=1)
    # Multiply as float64: an int64 product wraps around silently once it
    # exceeds ~9.2e18, which corrupts the root for rows with large values.
    new_cols['geometric'] = vals.astype('float64').prod(axis=1) ** (1 / n_features)
    new_cols['zscore'] = vals.sub(row_mean, axis=0).div(row_std, axis=0).mean(axis=1)
    new_cols['cv'] = row_std / row_mean                   # Coefficient of variation
    new_cols['Skewness_75'] = (q75 - row_mean) / row_std  # 75th percentile skewness
    new_cols['Skewness_25'] = (q25 - row_mean) / row_std  # 25th percentile skewness
    new_cols['2ndMoment'] = (vals ** 2).mean(axis=1)      # Second moment
    new_cols['3rdMoment'] = (vals ** 3).mean(axis=1)      # Third moment
    # log(0) is -inf and 0 * -inf is NaN; the NaN terms are skipped by sum(),
    # so zeros contribute nothing. Silence the corresponding numpy warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        new_cols['entropy'] = -1 * (vals * np.log(vals)).sum(axis=1)

    # Per-row occurrence count of each requested value.
    for v in unique_values:
        new_cols['cnt_{}'.format(v)] = (vals == v).sum(axis=1)

    # assign() works on a copy, so the caller's frame is left untouched.
    return df.assign(**new_cols)

In [5]:
# Add the engineered columns to both splits (train first, then test).
train, test = engineer(train), engineer(test)

# Split predictors from the target. The target is kept as a one-column
# DataFrame because its column name is read again when the submission is built.
target_column = 'FloodProbability'
X_train = train.drop(columns=target_column).copy()
y_train = train[[target_column]].copy()

## params

In [6]:
# Hard-coded hyperparameters for the three base learners of the stack.
# Each dict selects a GPU backend and its own random_state; none of them sets
# a number of trees/iterations, so each library's default applies.

# XGBoost
xgb_params = dict(
    max_depth=10,
    tree_method='gpu_hist',
    learning_rate=0.01,
    random_state=2,
    verbose=1,
)

# CatBoost
cat_params = dict(
    random_state=1,
    learning_rate=0.011277016304363601,
    depth=8,
    subsample=0.8675506657380021,
    min_data_in_leaf=98,
    task_type='GPU',
    bootstrap_type='Bernoulli',
    verbose=1,
)

# LightGBM
lgbm_params = dict(
    boosting_type='gbdt',
    device='gpu',
    learning_rate=0.012,
    num_leaves=250,
    subsample_for_bin=165700,
    min_child_samples=114,
    reg_alpha=2.075e-06,
    reg_lambda=3.839e-07,
    colsample_bytree=0.9634,
    subsample=0.9592,
    max_depth=10,
    random_state=3,
    verbose=1,
)

## model

In [7]:
from sklearn.ensemble import StackingRegressor # RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler

from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

# from sklearn.ensemble import HistGradientBoostingRegressor as HGB
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Preprocessing: median-impute missing values, then standardise each column.
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Base learners of the stack, built from the hyperparameter dicts above.
base_learners = [
    ('cat', CatBoostRegressor(**cat_params)),
    ('xgb', XGBRegressor(**xgb_params)),
    ('lgbm', LGBMRegressor(**lgbm_params)),
    # ('hgb', HGB(random_state=4))  # disabled experiment
]

# Meta-learner that combines the base learners' predictions.
meta_learner = ElasticNet(alpha=1e-5, l1_ratio=0.4, random_state=0)

stacked = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_learner,
)

# Full model: preprocessing followed by the stacked regressor.
model = make_pipeline(default_num_pipeline, stacked)

model.fit(X_train, y_train)

0:	learn: 0.0505390	total: 282ms	remaining: 4m 42s
1:	learn: 0.0500583	total: 302ms	remaining: 2m 30s
2:	learn: 0.0495849	total: 321ms	remaining: 1m 46s
3:	learn: 0.0491166	total: 341ms	remaining: 1m 25s
4:	learn: 0.0486531	total: 361ms	remaining: 1m 11s
5:	learn: 0.0481957	total: 380ms	remaining: 1m 2s
6:	learn: 0.0477443	total: 399ms	remaining: 56.6s
7:	learn: 0.0473027	total: 418ms	remaining: 51.9s
8:	learn: 0.0468630	total: 438ms	remaining: 48.2s
9:	learn: 0.0464318	total: 457ms	remaining: 45.2s
10:	learn: 0.0460037	total: 476ms	remaining: 42.8s
11:	learn: 0.0455817	total: 495ms	remaining: 40.7s
12:	learn: 0.0451669	total: 514ms	remaining: 39s
13:	learn: 0.0447576	total: 531ms	remaining: 37.4s
14:	learn: 0.0443538	total: 547ms	remaining: 35.9s
15:	learn: 0.0439543	total: 564ms	remaining: 34.7s
16:	learn: 0.0435581	total: 580ms	remaining: 33.6s
17:	learn: 0.0431672	total: 598ms	remaining: 32.6s
18:	learn: 0.0427826	total: 616ms	remaining: 31.8s
19:	learn: 0.0424024	total: 632ms	rema



[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 59 dense feature groups (63.97 MB) transferred to GPU in 0.048457 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 0.504480
0:	learn: 0.0505505	total: 19.9ms	remaining: 19.9s
1:	learn: 0.0500697	total: 38.1ms	remaining: 19s
2:	learn: 0.0495962	total: 56.9ms	remaining: 18.9s
3:	learn: 0.0491279	total: 74.8ms	remaining: 18.6s
4:	learn: 0.0486643	total: 92.4ms	remaining: 18.4s
5:	learn: 0.0482068	total: 110ms	remaining: 18.3s
6:	learn: 0.0477554	total: 128ms	remaining: 18.1s
7:	learn: 0.0473152	total: 146ms	remaining: 18.1s
8:	learn: 0.0468758	total: 178ms	remaining: 19.6s
9:	learn: 0.0464448	total: 195ms	remaining: 19.3s
10:	learn: 0.0460191	total: 212ms	remaining: 19.1s
11:	learn: 0.0455966	total: 234ms	remaining: 19.3s
12:	learn: 0.0451795	total: 250ms	remaining: 19s
13:	learn: 0.0447706	total: 266ms	remaining: 18.7s
14:	learn: 0.0443650	total: 281m

In [8]:
# Score the engineered test frame with the fitted pipeline.
test_pred = model.predict(test)

# Single-column frame named after the training target, indexed like `test`.
target_name = y_train.columns[0]
sub = pd.DataFrame(data={target_name: test_pred}, index=test.index)

sub

Unnamed: 0_level_0,FloodProbability
id,Unnamed: 1_level_1
1117957,0.577540
1117958,0.455126
1117959,0.449323
1117960,0.467093
1117961,0.467105
...,...
1863257,0.475943
1863258,0.442900
1863259,0.618979
1863260,0.548860


In [9]:
# Switch for writing the submission file; set to False to skip the write.
MAKE_SUBMISSION = True

if MAKE_SUBMISSION:
    # index=True writes the frame's index as the first CSV column.
    sub.to_csv(path_or_buf='submission.csv', index=True)