In [1]:
import warnings
# Blanket suppression: every warning category is hidden for the rest of the session.
warnings.simplefilter("ignore")

In [2]:
!pip install -U scikit-learn -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spopt 0.6.0 requires shapely>=2.0.1, but you have shapely 1.8.5.post1 which is incompatible.[0m[31m
[0m

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, mean_squared_error, root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import VotingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.base import copy
import pandas as pd
import numpy as np
import random
import optuna

In [4]:
# --- Reproducibility ---------------------------------------------------------
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# --- Pipeline switches -------------------------------------------------------
# Run the Optuna searches. When False the studies are created but never optimised.
FIND_BEST_PARAMS = False
# Add a `<column>_log` feature for every numeric measurement.
APPLY_LOG_TRANSFORMATION = True
# Drop the hand-picked column subsets before the final cross-validated fit.
APPLY_FEATURE_ENGINEERING = True
# Blend the final predictions with an external submission file.
# Author's caveat: this may be dangerous for the private leaderboard score.
MAKING_ENSEMBLE = True

In [5]:
# Competition train/test splits plus the separate abalone dataset that is
# appended to the training rows in the next cell.
test = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv')
train = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv')
orginal = pd.read_csv('/kaggle/input/abalone-dataset/abalone.csv')

In [6]:
# Give the competition frame the same column names as the original dataset,
# then stack the two into a single training frame.
train = train.drop(columns=['id'])
train.columns = orginal.columns
train = pd.concat([train, orginal], ignore_index=True)

# The score is RMSLE, so the models are fit on log(1 + Rings).
# Predictions are mapped back at the end with exp(prediction) - 1.
y = train['Rings']
y_log = np.log(y + 1)

train = train.drop(columns=['Rings'])
train.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24
1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32
2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005
3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25
4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975


In [7]:
# Keep the ids for the submission file; the feature frame takes the
# training frame's column names.
test_id = test['id']
test = test.drop(columns='id')
test.columns = train.columns
test.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
0,M,0.645,0.475,0.155,1.238,0.6185,0.3125,0.3005
1,M,0.58,0.46,0.16,0.983,0.4785,0.2195,0.275
2,M,0.56,0.42,0.14,0.8395,0.3525,0.1845,0.2405
3,M,0.57,0.49,0.145,0.874,0.3525,0.1865,0.235
4,I,0.415,0.325,0.11,0.358,0.1575,0.067,0.105


In [8]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')


def _with_sex_dummies(frame, encoded):
    """Return `frame` without its first column, followed by the integer one-hot columns.

    `encoded` is the encoder output for `frame[['Sex']]`; the new columns are
    named after the categories learned by the module-level `encoder`.
    """
    dummies = pd.DataFrame(encoded.astype('int'), columns=encoder.categories_[0])
    # The first column is skipped positionally (it is `Sex` in the frames shown above).
    return pd.concat([frame.iloc[:, 1:], dummies], axis=1)


# Fit the categories on the training frame only, then reuse them for the test frame.
train = _with_sex_dummies(train, encoder.fit_transform(train[['Sex']]))
test = _with_sex_dummies(test, encoder.transform(test[['Sex']]))

In [9]:
def log_transformation(data, columns, reference=None):
    """Append a shifted natural-log version of each listed column.

    For every ``column`` a new ``<column>_log`` feature is computed as
    ``log(x - offset + 1)``, where ``offset`` is that column's minimum in
    ``reference``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the raw columns. It is not modified.
    columns : iterable of str
        Names of the columns to transform.
    reference : pandas.DataFrame, optional
        Frame whose column minima define the shift. Defaults to ``data``
        itself. Pass the training frame when transforming the test frame so
        that a given raw value maps to the same feature value in both.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the ``<column>_log`` columns added.

    Notes
    -----
    With an external ``reference``, values of ``data`` at or below
    ``offset - 1`` have no finite logarithm.
    """
    if reference is None:
        reference = data
    # Work on a copy so the caller's frame is never changed behind its back.
    transformed = data.copy()
    for column in columns:
        shifted = data[column] - reference[column].min() + 1
        transformed[f'{column}_log'] = np.log(shifted)
    return transformed


if APPLY_LOG_TRANSFORMATION:
    log_columns = ['Length', 'Diameter', 'Height', 'Whole weight',
                   'Shucked weight', 'Viscera weight', 'Shell weight']
    # The test frame is shifted by the *training* minima. Using each frame's
    # own minimum would give train and test different transforms.
    test  = log_transformation(test, log_columns, reference=train)
    train = log_transformation(train, log_columns)

In [10]:
%%time


def objective(trial):
    """Optuna objective for CatBoost: mean 5-fold RMSE on the log-transformed target.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean RMSE between `y_log` and the predictions over the five
        validation folds (lower is better).

    Notes
    -----
    Each validation fold is also the early-stopping `eval_set`, so the
    returned score is somewhat optimistic.
    """
    params = {
        "verbose": False,
        "iterations": 1000,
        "loss_function":'RMSE',
        "random_state": RANDOM_SEED,
        # CatBoost documents `min_data_in_leaf` as usable only with the
        # Lossguide and Depthwise grow policies, so the policy is set
        # explicitly (Lossguide matches the final `catboost_params`).
        "grow_policy": "Lossguide",
        "depth": trial.suggest_int("depth", 3, 15),
        "subsample": trial.suggest_float("subsample", 0.01, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 1.0),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
    }

    # Folds are stratified on the raw integer target `y`; models are fit on `y_log`.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    scores = []
    for train_index, valid_index in cv.split(train, y):
        X_train, y_train = train.iloc[train_index], y_log.iloc[train_index]
        X_valid, y_valid = train.iloc[valid_index], y_log.iloc[valid_index]
        model = CatBoostRegressor(**params)

        model.fit(X_train, y_train,
                  eval_set=(X_valid, y_valid),
                  early_stopping_rounds=100)

        y_pred = model.predict(X_valid)
        scores.append(root_mean_squared_error(y_valid, y_pred))
    return np.mean(scores)


# The study object is always created; the search itself only runs on request.
study = optuna.create_study(study_name="optuna_catboost", direction='minimize')
if FIND_BEST_PARAMS:
    study.optimize(objective, n_trials=50)
    print(f"Best trial average RMSE: {study.best_value:.4f}")
    for param_name, best_setting in study.best_params.items():
        print(f"{param_name}: {best_setting}")

[I 2024-04-09 19:58:26,590] A new study created in memory with name: optuna_catboost


CPU times: user 1.97 ms, sys: 69 µs, total: 2.04 ms
Wall time: 1.69 ms


In [11]:
%%time


def objective(trial):
    """Optuna objective for LightGBM: mean 5-fold RMSE on the log-transformed target.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean RMSE between `y_log` and the predictions over the five
        validation folds (lower is better).
    """
    params = {
        'n_jobs':-1,
        "metric":'rmse',
        "verbosity": -1,
        "bagging_freq": 1,
        "boosting_type": "gbdt",
        "objective":'regression',
        'random_state':RANDOM_SEED,
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "n_estimators": trial.suggest_int('n_estimators', 400, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.01),
        # `min_data_in_leaf` is a LightGBM alias of `min_child_samples`.
        # Sampling both made two search dimensions fight over one setting,
        # so only the scikit-learn-style name is tuned.
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 60),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 1.0),
    }

    # Folds are stratified on the raw integer target `y`; models are fit on `y_log`.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    scores = []
    for train_index, valid_index in cv.split(train, y):
        X_train, y_train = train.iloc[train_index], y_log.iloc[train_index]
        X_valid, y_valid = train.iloc[valid_index], y_log.iloc[valid_index]
        model = LGBMRegressor(**params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_valid)
        scores.append(root_mean_squared_error(y_valid, y_pred))
    return np.mean(scores)


# The study object is always created; the search itself only runs on request.
study = optuna.create_study(study_name="optuna_lgbm", direction='minimize')
if FIND_BEST_PARAMS:
    study.optimize(objective, n_trials=50)
    print(f"Best trial average RMSE: {study.best_value:.4f}")
    for param_name, best_setting in study.best_params.items():
        print(f"{param_name}: {best_setting}")

[I 2024-04-09 19:58:26,617] A new study created in memory with name: optuna_lgbm


CPU times: user 488 µs, sys: 874 µs, total: 1.36 ms
Wall time: 1.14 ms


In [12]:
%%time


def objective(trial):
    """Optuna objective for XGBoost: mean 5-fold RMSE on the log-transformed target.

    `trial` samples the hyper-parameters; the return value is the average RMSE
    between `y_log` and the fold predictions (lower is better).
    """
    # Sampled in the same order as the dictionary is written.
    search_space = {
        'gamma': trial.suggest_float("gamma", 1e-2, 1.0),
        'max_depth': trial.suggest_int('max_depth',2, 20),
        'subsample': trial.suggest_float("subsample", 0.05, 1.0),
        'n_estimators': trial.suggest_int('n_estimators',100, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight',2, 20),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.05, 1.0),
        'learning_rate': trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
    }
    model_params = {
        'eval_metric': 'rmse',
        'random_state': RANDOM_SEED,
        'objective': 'reg:squarederror',
        **search_space,
    }

    # Folds are stratified on the raw integer target `y`; models are fit on `y_log`.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    fold_rmse = []
    for fit_rows, holdout_rows in splitter.split(train, y):
        model = XGBRegressor(**model_params)
        model.fit(train.iloc[fit_rows], y_log.iloc[fit_rows])
        holdout_pred = model.predict(train.iloc[holdout_rows])
        fold_rmse.append(root_mean_squared_error(y_log.iloc[holdout_rows], holdout_pred))
    return np.mean(fold_rmse)


# The study object is always created; the search itself only runs on request.
study = optuna.create_study(study_name="optuna_xgboost", direction='minimize')
if FIND_BEST_PARAMS:
    study.optimize(objective, n_trials=50)
    print(f"Best trial average RMSE: {study.best_value:.4f}")
    for param_name, best_setting in study.best_params.items():
        print(f"{param_name}: {best_setting}")

[I 2024-04-09 19:58:26,644] A new study created in memory with name: optuna_xgboost


CPU times: user 551 µs, sys: 660 µs, total: 1.21 ms
Wall time: 1.1 ms


In [13]:
# Fixed XGBoost settings used for the final ensemble.
# NOTE(review): `device='cuda'` presumably requires a GPU runtime — confirm before running elsewhere.
xgboost_params = dict(
    max_depth=10,
    verbosity=0,
    random_state=RANDOM_SEED,
    device='cuda',
    booster='gbtree',
    n_estimators=1137,
    tree_method='hist',
    min_child_weight=7,
    grow_policy='lossguide',
    gamma=0.03816426816838989,
    subsample=0.486382907668344,
    objective='reg:squarederror',
    reg_lambda=1.7487237399420372,
    reg_alpha=0.013043045359306716,
    learning_rate=0.011733966748427322,
    colsample_bytree=0.5748511749872887,
)

# Fixed LightGBM settings used for the final ensemble.
# NOTE(review): `device='gpu'` presumably requires a GPU-enabled LightGBM build — confirm.
lgbm_params = dict(
    metric='rmse',
    device='gpu',
    verbosity=-1,
    max_depth=15,
    random_state=RANDOM_SEED,
    num_leaves=138,
    n_estimators=913,
    boosting_type='gbdt',
    min_child_samples=34,
    objective='regression',
    subsample_for_bin=185680,
    subsample=0.799314727120346,
    reg_alpha=5.916235901972299e-09,
    reg_lambda=6.943912907338958e-08,
    learning_rate=0.01851440025520457,
    colsample_bytree=0.4339090795122026,
)

# Fixed CatBoost settings used for the final ensemble (trained on CPU).
catboost_params = dict(
    depth=15,
    max_bin=464,
    verbose=False,
    random_state=RANDOM_SEED,
    task_type='CPU',
    eval_metric='RMSE',
    min_data_in_leaf=78,
    loss_function='RMSE',
    grow_policy='Lossguide',
    bootstrap_type='Bernoulli',
    subsample=0.83862137638162,
    l2_leaf_reg=8.365422739510098,
    random_strength=3.296124856352495,
    learning_rate=0.09992185242598203,
)

In [14]:
# (name, estimator) pairs in the order the blend weights are listed: lgbm, xgboost, catboost.
cv_estimators = [
    (label, estimator_cls(**estimator_params))
    for label, estimator_cls, estimator_params in (
        ('lgbm', LGBMRegressor, lgbm_params),
        ('xgboost', XGBRegressor, xgboost_params),
        ('catboost', CatBoostRegressor, catboost_params),
    )
]

In [15]:
%%time


def objective(trial):
    """Optuna objective for the blend weights of the three-model VotingRegressor.

    `trial` samples one weight in [0, 5] per base model; the return value is the
    mean 5-fold RMSE between `y_log` and the weighted-average predictions.
    """
    lgbm_weight = trial.suggest_float('lgbm_weight', 0.0, 5.0)
    xgboost_weight = trial.suggest_float('xgboost_weight', 0.0, 5.0)
    catboost_weight = trial.suggest_float('catboost_weight', 0.0, 5.0)

    # One ensemble object is enough; it is re-fit on each fold's training rows.
    voting_regressor = VotingRegressor(
        estimators=cv_estimators,
        weights=[lgbm_weight, xgboost_weight, catboost_weight],
    )

    # Folds are stratified on the raw integer target `y`; models are fit on `y_log`.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    fold_rmse = []
    for fit_rows, holdout_rows in splitter.split(train, y):
        voting_regressor.fit(train.iloc[fit_rows], y_log.iloc[fit_rows])
        holdout_pred = voting_regressor.predict(train.iloc[holdout_rows])
        fold_rmse.append(root_mean_squared_error(y_log.iloc[holdout_rows], holdout_pred))
    return np.mean(fold_rmse)


# The study object is always created; the search itself only runs on request.
study = optuna.create_study(study_name="voting_regressor_optuna", direction='minimize')
if FIND_BEST_PARAMS:
    study.optimize(objective, n_trials=100)
    print(f"Best trial average RMSE: {study.best_value:.4f}")
    for param_name, best_setting in study.best_params.items():
        print(f"{param_name}: {best_setting}")


[I 2024-04-09 19:58:26,719] A new study created in memory with name: voting_regressor_optuna


CPU times: user 433 µs, sys: 765 µs, total: 1.2 ms
Wall time: 1.01 ms


In [16]:
%%time

# Used unchanged when APPLY_FEATURE_ENGINEERING is off.
train2 = train.copy()
test2  = test.copy()

# Column subsets to drop, one list per ensemble member. The author found them
# with a genetic-algorithm-based feature selection.
lst_drop_cols = [
    ['Shucked weight', 'Shell weight', 'Length_log', 'Diameter_log', 'Height_log', 'Viscera weight_log'],
    ['Shell weight', 'I', 'Length_log', 'Height_log', 'Viscera weight_log'],
]

# The blend weights and the ensemble do not depend on the column subset,
# so they are built once instead of on every iteration.
weight_best_params = {
    'lgbm_weight': 4.104966149239676,
    'xgboost_weight': 0.48550637896530635,
    'catboost_weight': 4.189724537494019,
}

voting_regressor = VotingRegressor(
    estimators=cv_estimators,
    weights=[weight_best_params['lgbm_weight'],
             weight_best_params['xgboost_weight'],
             weight_best_params['catboost_weight']]
)

lst_y_pred_test = []
for drop_cols in lst_drop_cols:
    if APPLY_FEATURE_ENGINEERING:
        # The `*_log` columns only exist when APPLY_LOG_TRANSFORMATION is on.
        # Dropping only the columns that are present keeps every combination
        # of the two switches runnable instead of raising a KeyError.
        present_cols = [col for col in drop_cols if col in train.columns]
        train2 = train.drop(columns=present_cols)
        test2  = test.drop(columns=present_cols)

    # Folds are stratified on the raw integer target `y`; models are fit on `y_log`.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    scores = []
    y_pred_test = []
    for fold_i, (train_index, valid_index) in enumerate(cv.split(train2, y)):
        X_train, y_train = train2.iloc[train_index], y_log.iloc[train_index]
        X_valid, y_valid = train2.iloc[valid_index], y_log.iloc[valid_index]
        voting_regressor.fit(X_train, y_train)
        y_pred = voting_regressor.predict(X_valid)
        score = root_mean_squared_error(y_valid, y_pred)
        scores.append(score)
        # Every fold model also predicts the test set; the folds are averaged below.
        y_pred_test.append(voting_regressor.predict(test2))
        print(f"FOLD {fold_i} Done. RMSE : {score}")
    print(f"All FOLD. Mean RMSE : {np.mean(scores)}")
    lst_y_pred_test.append(np.mean(y_pred_test, axis=0))



FOLD 0 Done. RMSE : 0.14846438931642345
FOLD 1 Done. RMSE : 0.1470052260803556
FOLD 2 Done. RMSE : 0.14815568475829008
FOLD 3 Done. RMSE : 0.1490412006038859
FOLD 4 Done. RMSE : 0.14817057414995863
All FOLD. Mean RMSE : 0.14816741498178274
FOLD 0 Done. RMSE : 0.14849932345345693
FOLD 1 Done. RMSE : 0.14703735176517257
FOLD 2 Done. RMSE : 0.14809290486302978
FOLD 3 Done. RMSE : 0.1491554269362705
FOLD 4 Done. RMSE : 0.14828814879962343
All FOLD. Mean RMSE : 0.14821463116351064
CPU times: user 27min 28s, sys: 1min 29s, total: 28min 58s
Wall time: 10min 55s


In [17]:
# Average the per-subset test predictions and undo the log(1 + y) target transform.
predictions = np.mean(lst_y_pred_test, axis=0)
sub = pd.DataFrame({'id': test_id, 'Rings': np.exp(predictions) - 1})
sub.to_csv('submission_0.14550.csv', index=False)

In [18]:
import pandas as pd

# df2: the predictions built above; df1: an externally produced submission to blend with.
df2 = sub
df1 = pd.read_csv('/kaggle/input/ps4e4-feature-engineering-regression/submission_comb.csv')

In [19]:
if MAKING_ENSEMBLE:
    # Blend weights: zr1 for the external submission (df1), zr2 for this notebook's predictions (df2).
    zr1 = 0.9
    zr2 = 0.1

    # Match rows on `id` rather than on positional index, so a differently
    # ordered external file cannot silently blend the wrong rows.
    # Assumes the external file carries an `id` column like `sub` — TODO confirm.
    own_rings = df2.set_index('id')['Rings']
    matched_rings = df1['id'].map(own_rings)
    if matched_rings.isna().any():
        raise ValueError("external submission contains ids without a prediction in `sub`")

    df3 = df1.copy()
    df3['Rings'] = df1['Rings']*zr1 + matched_rings*zr2
    df3.to_csv('submission.csv', index=False)

else:
    sub.to_csv('submission.csv', index = False)