**Regression with an Abalone Dataset Kaggle Competition - Just for Fun**

Author: Tihoc Andrei

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.2.1


In [None]:
#Imports and Setup
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
import optuna
import warnings

warnings.filterwarnings("ignore")


In [None]:
#Load Data
# Read the train and test CSV files from the current working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [None]:
#Preprocessing: One-Hot Encoding (get_dummies)
# One-hot encode the non-numeric columns of each frame. The two frames are
# encoded independently of each other.
train_df = train_df.pipe(pd.get_dummies)
test_df = test_df.pipe(pd.get_dummies)

In [None]:
#Feature/Target Split
# The target is the "Rings" column; every remaining column is a predictor.
y = train_df["Rings"]
X = train_df.drop("Rings", axis=1)

In [None]:
# Hold out 20% of the rows for validation. The split is stratified on the
# target and uses a fixed seed so it is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [None]:
#Machine Learning Pipeline (Scaler + XGBoost)
def build_pipeline(params):
    """Create an unfitted scaler + XGBoost regression pipeline.

    Args:
        params: Mapping of keyword arguments forwarded to ``XGBRegressor``.

    Returns:
        A ``Pipeline`` with two steps: ``"scaler"`` (``StandardScaler``)
        followed by ``"xgb"`` (``XGBRegressor(**params)``).
    """
    steps = [
        ("scaler", StandardScaler()),
        ("xgb", XGBRegressor(**params)),
    ]
    return Pipeline(steps)

In [None]:
#Optuna Objective Function for RMSLE
def objective(trial):
    """Optuna objective: validation RMSLE of one sampled XGBoost configuration.

    Samples hyperparameters from ``trial``, fits the scaler + XGBoost pipeline
    on ``X_train``/``y_train`` and scores its predictions on ``X_val``/``y_val``.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The root mean squared logarithmic error on the validation split
        (lower is better).
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        # Fixed (not tuned) settings.
        "random_state": 42,
        "n_jobs": -1,
    }
    model = build_pipeline(params)
    model.fit(X_train, y_train)
    # The log-error metric cannot be computed on negative values, so floor the
    # predictions at zero before scoring.
    preds = np.maximum(model.predict(X_val), 0)
    # mean_squared_log_error returns MSLE; take the square root so the value
    # reported by the study really is the RMSLE.
    return float(np.sqrt(mean_squared_log_error(y_val, preds)))

In [None]:
#Run Optuna Tuning
# Seed the sampler so that Restart & Run All proposes the same sequence of
# trials; without a seed each run explores different hyperparameters.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

[I 2025-04-03 11:32:05,704] A new study created in memory with name: no-name-98da21f8-a02a-442d-a364-306067f1b623
[I 2025-04-03 11:32:06,700] Trial 0 finished with value: 0.02332470193505287 and parameters: {'n_estimators': 121, 'max_depth': 5, 'learning_rate': 0.02865470203929136, 'subsample': 0.6509459870638813, 'colsample_bytree': 0.9607526693517585, 'reg_alpha': 0.3977616374456865, 'reg_lambda': 0.6435084123766777}. Best is trial 0 with value: 0.02332470193505287.
[I 2025-04-03 11:32:07,736] Trial 1 finished with value: 0.023129085078835487 and parameters: {'n_estimators': 292, 'max_depth': 3, 'learning_rate': 0.05077729991884031, 'subsample': 0.9584235215618857, 'colsample_bytree': 0.929721545335434, 'reg_alpha': 0.6152097614211661, 'reg_lambda': 0.38106975143657384}. Best is trial 1 with value: 0.023129085078835487.
[I 2025-04-03 11:32:13,035] Trial 2 finished with value: 0.022953232750296593 and parameters: {'n_estimators': 315, 'max_depth': 10, 'learning_rate': 0.06191898589638

In [None]:
#Train Final Model on Full Data
best_params = study.best_params
print("\n Best Parameters:", best_params)
print(" Best RMSLE:", study.best_value)

# `study.best_params` only contains the values sampled through `trial`, so the
# fixed settings used during tuning are merged back in. Otherwise the final
# model would be trained with a different seed/thread configuration than the
# one that was evaluated.
final_params = {**best_params, "random_state": 42, "n_jobs": -1}
final_pipeline = build_pipeline(final_params)
final_pipeline.fit(X, y)


 Best Parameters: {'n_estimators': 240, 'max_depth': 9, 'learning_rate': 0.03764204693419036, 'subsample': 0.8800825020661489, 'colsample_bytree': 0.9045744967333789, 'reg_alpha': 0.7467842898555911, 'reg_lambda': 0.6927102409659385}
 Best RMSLE: 0.022503267973661423


In [None]:
#Predict on Test Set
# Train and test were one-hot encoded independently, so their columns may
# differ in order or content. Align the test frame to the training feature
# columns: missing dummy columns are filled with 0, and columns the model
# never saw are dropped.
X_test = test_df.reindex(columns=X.columns, fill_value=0)
test_preds = final_pipeline.predict(X_test)
# Clamp predictions to the [0, 30] range and round to whole ring counts.
test_preds = np.round(np.clip(test_preds, 0, 30)).astype(int)

In [None]:
#Build Submission File
# Pair each test id with its predicted ring count and write the result to a
# CSV file without the index column.
submission_df = test_df[["id"]].assign(Rings=test_preds)
submission_df.to_csv("submission.csv", index=False)
print("\n Final submission saved as submission.csv")


 Final submission saved as submission.csv
