In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from optuna.samplers import TPESampler
import pickle
from IPython.core.display import HTML
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_log_error

In [None]:
# Competition files.
train = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv')
sample_submission = pd.read_csv('/kaggle/input/playground-series-s4e4/sample_submission.csv')

# Original abalone data: the raw file is read without a header row, so its
# columns carry no names until they are assigned later in the notebook.
original_train = pd.read_csv('/kaggle/input/ps-4-e-2-abalone-dataset-from-uci/abalone.data', header=None)

In [None]:
# Lift pandas' cap on displayed columns so wide frames are shown in full.
pd.options.display.max_columns = None

train.head()

In [None]:
# Preview the first five rows of the test frame.
test.head()

In [None]:
# Size of the original dataset, followed by its column labels as a plain list.
print(original_train.shape)
list(original_train.columns)

In [None]:
# Save original dataset into .csv
# The raw file was read without a header, so borrow the competition's column
# names. Dropping 'id' by name (instead of slicing off position 0) keeps this
# cell working when it is re-run after `id` has already been removed from `train`.
original_train.columns = train.columns.drop('id', errors='ignore')
original_train.to_csv('🌊🐚⛵🍤 PS4 E4 original abalone dataset from UCI.csv', index=False)
original_train.tail()

In [None]:
# Size of the competition training data, followed by its column names.
print(train.shape)
list(train.columns)

In [None]:
# Keep the test ids before the column is removed. The guard keeps the cell
# re-runnable: once 'id' has been dropped, the ids captured on the first run
# are left untouched instead of raising AttributeError.
if 'id' in test.columns:
    submission_id = test['id']

# `id` is removed from both frames. Rebinding with errors='ignore' (instead of
# inplace=True) turns a second execution of this cell into a no-op rather
# than a KeyError; `columns=` already implies axis=1.
train = train.drop(columns='id', errors='ignore')
test = test.drop(columns='id', errors='ignore')

In [None]:
# Append the original dataset rows to the competition training data.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of both inputs are kept, so labels repeat and any later
# label-based selection or alignment would match more than one row.
train = pd.concat(objs=[train, original_train], ignore_index=True)
train.shape

In [None]:
# Rows that are exact repeats of an earlier row, per frame.
train_duplicates_number = train.loc[train.duplicated()]
test_duplicates_number = test.loc[test.duplicated()]

# Print how many such rows each frame contains (train first, then test).
for duplicated_rows in (train_duplicates_number, test_duplicates_number):
    print(len(duplicated_rows))

#### There are no duplicates in the data, but if there were, the code below would remove them

In [None]:
# Keep the first occurrence of every row and discard exact repeats.
train = train.drop_duplicates(keep='first')

# Sanity check: the number of remaining duplicated rows should print as 0.
train_duplicates = train.loc[train.duplicated()]
print(len(train_duplicates))

In [None]:
print(f'Train data: {train.shape}')
print(f'Test data: {test.shape}\n')

# Fraction of all rows (train + test) that belong to the training set.
train_data_percentage = np.round(train.shape[0] / (train.shape[0] + test.shape[0]), 4)

# Use the percent format spec instead of multiplying by 100 inside the
# f-string: the float product can pick up representation noise
# (e.g. 0.07 * 100 == 7.000000000000001), which was printed verbatim.
print(f'Train data consists of {train_data_percentage:.2%} of all observations')
print(f'Test data consists of {1 - train_data_percentage:.2%} of all observations')

In [None]:
# Summary statistics of the training frame, transposed so each described column is a row.
train.describe().T

In [None]:
# Number of NaN entries per column, computed once for each frame.
train_missing_counts = train.isna().sum()
test_missing_counts = test.isna().sum()

print('TRAIN data\n')
print(f'{train_missing_counts}\n\n\n')

print('TEST data\n')
print(test_missing_counts)

#### There are no missing values in the data

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

#### The `Sex` feature has dtype `object`, so we have to convert it into a numeric representation

In [None]:
# Same de-duplication as performed earlier in the notebook; kept so that the
# frame is guaranteed duplicate-free right before encoding.
train = train.drop_duplicates(keep='first')

# Number of duplicated rows still present (shown as the cell output).
duplicates = train.loc[train.duplicated()]
len(duplicates)

In [None]:
# One-hot encode the non-numeric columns as 0/1 integers, dropping the first
# level of each encoded column.
# NOTE(review): train and test are encoded independently, which presumably
# relies on both frames containing the same set of categories — confirm.
encoding_options = dict(drop_first=True, dtype=int)
X = pd.get_dummies(train, **encoding_options)
test = pd.get_dummies(test, **encoding_options)

In [None]:
# Large default figure size so the grid of per-column histograms stays readable.
figure_settings = {'figure.figsize': (20, 16)}
sns.set(rc=figure_settings)

# One histogram per column of X; the trailing semicolon hides the axes repr.
X.hist(color='orange');

In [None]:
# Absolute frequency of each Rings value, then the same as a share of all rows.
ring_counts = train['Rings'].value_counts()
print(f'{ring_counts}\n\n')
print(ring_counts / len(train))

In [None]:
# Target: the Rings column of the (de-duplicated) training frame.
y = train['Rings']
# Features: the encoded frame without the target column.
X = X.drop(columns=['Rings'])

# for column in X.columns.tolist():
#     X[column] = X[column].apply(lambda x: (x - X[column].min()) / (X[column].max() - X[column].min()))

# # Transform test data
# for column in test.columns.tolist():
#     test[column] = test[column].apply(lambda x: (x - test[column].min()) / (test[column].max() - test[column].min()))
    
# X.hist(color='LightSeaGreen');

In [None]:
%%time
# Random forest used only to rank features. Apart from the fixed seed it is
# built with the library's default hyperparameters.
best_forest = RandomForestRegressor(random_state=27)
best_forest.fit(X, y)

# Importance scores of the fitted forest, one per column of X.
importance = best_forest.feature_importances_

# Label each score with its feature name and sort ascending, so the most
# important feature ends up at the top of the horizontal bar chart.
feature_importance = pd.DataFrame({'importance': importance}, index=X.columns)
feature_importance = feature_importance.sort_values(by='importance', ascending=True)

feature_importance.plot.barh(figsize=(12, 8), color='orange');

In [None]:
# Column labels of the feature frame X at this point in the notebook.
X.columns

In [None]:
# Pairwise Pearson correlations between the numeric training features.
corr_train = X.select_dtypes(include=np.number).corr(method='pearson')

# Mask the upper triangle (diagonal included) so each pair is drawn only once.
mask_train = np.triu(np.ones_like(corr_train))

sns.heatmap(
    corr_train,
    mask=mask_train,
    annot=True,
    fmt='.2f',
    cmap='Spectral',
    cbar=None,
    linewidth=2,
)
plt.tight_layout()
plt.show()

#### Let's drop highly correlated features and look at the metric

In [None]:
# Columns removed from both the training features and the test frame so the
# two keep the same set of columns.
highly_correlated = ['Diameter', 'Whole weight.2', 'Whole weight']
X = X.drop(columns=highly_correlated)
test = test.drop(columns=highly_correlated)

In [None]:
# Pearson correlations of the numeric features that remain in X.
corr_train = X.select_dtypes(include=np.number).corr(method='pearson')

# Hide the upper triangle and the diagonal; only the lower triangle is annotated.
mask_train = np.triu(np.ones_like(corr_train))

sns.heatmap(
    corr_train,
    mask=mask_train,
    annot=True,
    fmt='.2f',
    cmap='summer',
    cbar=None,
    linewidth=2,
)
plt.tight_layout()
plt.show()

In [None]:
# Split data into train and val
# Hold out 20% of the rows for validation; the seed fixes which rows those are.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=27,
)

In [None]:
# %%time
# def objective(trial):
#     model = RandomForestRegressor(
#         n_estimators=trial.suggest_int("n_estimators", 100, 1000),
# #         criterion=trial.suggest_categorical("criterion", ['poisson', 'absolute_error', 'friedman_mse', 'squared_error']),
#         min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 100),
#         max_depth=trial.suggest_int("max_depth", 1, 100),
#         min_samples_split=trial.suggest_int("min_samples_split", 2, 100),
#         random_state=27
#     )
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     return np.sqrt(mean_squared_log_error(y_test, y_pred))


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

# optuna.logging.set_verbosity(optuna.logging.WARNING)

# sampler = TPESampler(seed=27)
# study = optuna.create_study(study_name="random_forest", direction="maximize", sampler=sampler)
# study.optimize(objective, n_trials=5)

# print("Number of finished trials: ", len(study.trials))
# print("Best trial:")
# trial = study.best_trial
# print("  Value: ", trial.value)
# print("  Params: ")
# for key, value in trial.params.items():
#     print(f"    {key}: {value}")
# print()

# Recorded output of the commented-out random-forest Optuna study above.
# NOTE(review): that study is created with direction="maximize" while its
# objective returns RMSLE (a loss, lower is better), so the "best trial" below is
# the highest-error trial of the 5, not the best one. Re-run the search with
# direction="minimize" before relying on these parameters.
"""
Number of finished trials:  5
Best trial:
  Value:  0.15237508927103727
  Params: 
    n_estimators: 445
    min_samples_leaf: 98
    max_depth: 90
    min_samples_split: 22

CPU times: user 11min 43s, sys: 171 ms, total: 11min 44s
Wall time: 11min 44s
"""

In [None]:
# %%time
# def objective(trial):
#     model = XGBRegressor(
#         max_depth=trial.suggest_int('max_depth', 1, 100),
#         learning_rate=trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
#         n_estimators=trial.suggest_int('n_estimators', 50, 1000),
#         min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
#         gamma=trial.suggest_float('gamma', 1e-8, 1.0, log=True),
#         subsample=trial.suggest_float('subsample', 0.01, 1.0, log=True),
#         colsample_bytree=trial.suggest_float('colsample_bytree', 0.01, 1.0, log=True),
#         reg_alpha=trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
#         reg_lambda=trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
#         use_label_encoder=False,
#         random_state=27
#     )
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     try:
#         return np.sqrt(mean_squared_log_error(y_test, y_pred))
#     except Exception as e:
#         print(e)


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

# optuna.logging.set_verbosity(optuna.logging.WARNING)

# sampler = TPESampler(seed=27)
# study = optuna.create_study(study_name="xgb", direction="maximize", sampler=sampler)
# study.optimize(objective, n_trials=1)

# print("Number of finished trials: ", len(study.trials))
# print("Best trial:")
# trial = study.best_trial
# print("  Value: ", trial.value)
# print("  Params: ")
# for key, value in trial.params.items():
#     print(f"    {key}: {value}")
# print()

# Recorded output of the commented-out XGBoost Optuna study above.
# NOTE(review): that study uses direction="maximize" on an RMSLE objective and
# ran a single trial, so the values below are not a tuned optimum. Its objective
# also prints and swallows any exception from the metric, in which case it
# returns None instead of a score.
"""
Number of finished trials:  1
Best trial:
  Value:  0.1775845058982026
  Params: 
    max_depth: 43
    learning_rate: 0.42576257222865277
    n_estimators: 749
    min_child_weight: 9
    gamma: 1.1669337024772915e-05
    subsample: 0.9097315662154742
    colsample_bytree: 0.6114890625963008
    reg_alpha: 4.761254082318455e-07
    reg_lambda: 0.008602430632882225

CPU times: user 24.5 s, sys: 667 ms, total: 25.2 s
Wall time: 25.2 s
"""

In [None]:
# # Searching for best parameters of XGBoost
# from sklearn.model_selection import RandomizedSearchCV

# xgb_regressor = XGBRegressor(random_state=27)

# xgb_parameters = {
#     'n_estimators': range(5, 1001, 10),
#     'learning_rate': [0.001, 0.05, 0.01],
#     'max_depth': range(2, 100, 4),
# }

# xgb_random_search = RandomizedSearchCV(estimator=xgb_regressor, param_distributions=xgb_parameters, n_iter=5, n_jobs=-1, cv=5, verbose=4, random_state=27)
# xgb_random_search.fit(X_val, y_val)
# print(f'Best params: {xgb_random_search.best_params_}')

# Recorded output of the commented-out RandomizedSearchCV above.
# NOTE(review): that search was fitted on X_val / y_val (the validation split,
# not the training split) and passes no `scoring`, so it ranks candidates by the
# estimator's default score rather than RMSLE — treat the result as indicative only.
"""
Best params: {'n_estimators': 395, 'max_depth': 6, 'learning_rate': 0.01}
"""

#### I found that XGBoost can produce negative predictions in this competition, and `mean_squared_log_error` cannot be computed on negative values, so the metric sometimes fails during the XGBoost hyperparameter search

In [None]:
# %%time
# def objective(trial):
#     model = CatBoostRegressor(
#         iterations=trial.suggest_int("iterations", 100, 1000),
#         learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
#         colsample_bylevel=trial.suggest_float("colsample_bylevel", 0.05, 1.0),
#         min_data_in_leaf=trial.suggest_int("min_data_in_leaf", 1, 100),
#         depth=trial.suggest_int("depth", 4, 16),
#         l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
#         verbose=False,
#         random_state=27
#     )
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     return np.sqrt(mean_squared_log_error(y_test, y_pred))


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

# optuna.logging.set_verbosity(optuna.logging.WARNING)

# sampler = TPESampler(seed=27)
# study = optuna.create_study(study_name="catboost", direction="maximize", sampler=sampler)
# study.optimize(objective, n_trials=20)

# print("Number of finished trials: ", len(study.trials))
# print("Best trial:")
# trial = study.best_trial
# print("  Value: ", trial.value)
# print("  Params: ")

# for key, value in trial.params.items():
#     print(f"    {key}: {value}")
# print()

# Recorded output of the commented-out CatBoost Optuna study above.
# NOTE(review): that study is created with direction="maximize" while its
# objective returns RMSLE (a loss, lower is better), so the "best trial" below is
# the highest-error trial of the 20. The selected settings (about 100 iterations
# at a learning rate of about 0.001) are consistent with that. Re-run with
# direction="minimize" before relying on these parameters.
"""
Number of finished trials:  20
Best trial:
  Value:  0.27250015755480833
  Params: 
    iterations: 101
    learning_rate: 0.0010172906333606835
    colsample_bylevel: 0.4796381789116622
    min_data_in_leaf: 42
    depth: 13
    l2_leaf_reg: 2.895211427077531e-08

CPU times: user 18min 10s, sys: 9min 21s, total: 27min 31s
Wall time: 13min 5s
"""

In [None]:
# %%time
# def objective(trial):
#     model = LGBMRegressor(
#         n_estimators=trial.suggest_int("n_estimators", 100, 1000),
#         max_depth=trial.suggest_int("max_depth", 1, 100),
#         learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
#         verbosity=-1,
#         boosting_type=trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
#         num_leaves=trial.suggest_int('num_leaves', 2, 256),
#         min_child_samples=trial.suggest_int('min_child_samples', 5, 100),
#         random_state=27
#     )
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     return np.sqrt(mean_squared_log_error(y_test, y_pred))


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

# optuna.logging.set_verbosity(optuna.logging.WARNING)

# sampler = TPESampler(seed=27)
# study = optuna.create_study(study_name="lgbm", direction="maximize", sampler=sampler)
# study.optimize(objective, n_trials=10)

# print("Number of finished trials: ", len(study.trials))
# print("Best trial:")
# trial = study.best_trial
# print("  Value: ", trial.value)
# print("  Params: ")
# for key, value in trial.params.items():
#     print(f"    {key}: {value}")
# print()

# Recorded output of the commented-out LightGBM Optuna study above.
# NOTE(review): that study is created with direction="maximize" while its
# objective returns RMSLE (a loss, lower is better), so the "best trial" below is
# the highest-error trial of the 10, not the best one. Re-run with
# direction="minimize" before relying on these parameters.
"""
Number of finished trials:  10
Best trial:
  Value:  0.793442492293447
  Params: 
    n_estimators: 374
    max_depth: 79
    learning_rate: 0.002117446099339252
    boosting_type: dart
    num_leaves: 48
    min_child_samples: 62

CPU times: user 12min 4s, sys: 269 ms, total: 12min 4s
Wall time: 12min 5s
"""

In [None]:
# First-level learners of the stack as (name, estimator) pairs. Each one uses
# its library defaults; only the seed is pinned.
base_models = [
    ('XGBoost', XGBRegressor(random_state=27)),
    ('LightGBM', LGBMRegressor(random_state=27)),
    ('Catboost', CatBoostRegressor(random_state=27)),
    ('RandomForest', RandomForestRegressor(random_state=27)),
]

In [None]:
# Final estimator of the stack.
#
# The hyperparameters previously hard-coded here (iterations=101,
# learning_rate~0.001, depth=13, ...) were copied from the CatBoost Optuna study
# earlier in the notebook. That study was created with direction="maximize" on
# an RMSLE objective, so they describe the WORST trial found (RMSLE 0.2725), and
# about 100 boosting steps at a learning rate of 0.001 leave the model severely
# underfit. Until the search is re-run with direction="minimize", use the library
# defaults with the same fixed seed, as the base models do.
meta_model = CatBoostRegressor(
    random_state=27,
)

In [None]:
%%time
# Combine the base learners under the meta model and fit the stack on all
# available training rows (X, y).
stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
)
stacking_model.fit(X, y)

In [None]:
# `stacking_model` was fitted on all of X, and X_val is a subset of X, so
# scoring it on X_val would report (optimistic) training error. Fit a separate
# stack on the training split only and score that one on the hold-out rows.
# StackingRegressor fits clones of the estimators it is given, so reusing
# `base_models` / `meta_model` here does not disturb `stacking_model`.
# This costs one extra fit of the whole stack.
validation_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)
validation_model.fit(X_train, y_train)
y_pred_val = validation_model.predict(X_val)

# RMSLE on the hold-out split.
rmsle_val = np.sqrt(mean_squared_log_error(y_val, y_pred_val))
print(f"Validation Root mean squared logarithmic error regression loss: {rmsle_val:.8f}")

In [None]:
# Predictions of the fitted stack for every row of the prepared test frame.
y_pred_test = stacking_model.predict(test)

# Show the first ten predictions as a quick sanity check.
first_predictions = y_pred_test[:10]
first_predictions

![](https://en.australia51.com/Attach/Download/EA3D1916-8F57-1147-653F-329276FBB95A/0656DE71-5A1F-235F-E504-F21533E82969.jpg)

In [None]:
# Pair each prediction with the id saved from the test frame itself
# (`submission_id`, captured before the `id` column was dropped). Taking the ids
# from the same frame that produced the predictions guarantees they line up row
# for row, instead of relying on sample_submission.csv being in the same order.
submission = pd.DataFrame({
    'id': submission_id,
    'Rings': y_pred_test
})

submission.to_csv('submission.csv', index=False)
submission.head(10)

In [None]:
# Persist the fitted stack. The context manager closes the file handle as soon
# as the dump finishes, so the pickle is flushed to disk and the handle is not
# left open for the rest of the session.
with open("Kapturov_stacking_model.pkl", "wb") as model_file:
    pickle.dump(stacking_model, model_file)