In [15]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error


def _frequency_encode(values, reference):
    """Map each entry of ``values`` to its relative frequency in ``reference``.

    Entries that are present in ``values`` but never occur in ``reference``
    get a frequency of 0.0; entries that are missing in ``values`` stay NaN.
    """
    freqs = reference.value_counts(normalize=True)
    encoded = values.map(freqs)
    # `map` leaves NaN for unseen keys; only turn those into 0.0 when the
    # input itself was not missing.
    return encoded.where(encoded.notna() | values.isna(), 0.0)


def manual_openfe_features(df_raw, freq_reference=None):
    """Build the hand-written ratio / difference / frequency features.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Input rows. Must contain the columns "Length", "Diameter", "Height",
        "Whole weight", "Whole weight.1", "Whole weight.2" and "Shell weight".
        The frame is only read, never modified.
    freq_reference : pandas.DataFrame, optional
        Frame whose "Shell weight" and "Whole weight" columns define the value
        frequencies used for ``f07``/``f10``. Defaults to ``df_raw`` itself.
        Passing the training rows here lets held-out rows be encoded without
        contributing to the frequencies.

    Returns
    -------
    pandas.DataFrame
        Nine feature columns (``f01`` ... ``f11``) sharing ``df_raw``'s index.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    df = df_raw
    reference = df if freq_reference is None else freq_reference

    shell = df["Shell weight"]
    whole = df["Whole weight"]

    # A zero shell weight would turn the ratio features into +/-inf; mask the
    # denominator so those rows become NaN instead.
    shell_denominator = shell.where(shell != 0)
    # log(0) is -inf and emits a RuntimeWarning; non-positive weights are
    # masked so the result is NaN.
    positive_whole = whole.where(whole > 0)

    df_manual = pd.DataFrame({
        "f01_Length_div_ShellWeight": df["Length"] / shell_denominator,
        "f02_Whole1_div_ShellWeight": df["Whole weight.1"] / shell_denominator,
        "f03_Diameter_div_ShellWeight": df["Diameter"] / shell_denominator,
        "f05_Length_minus_Shell": df["Length"] - shell,
        "f07_freq_ShellWeight": _frequency_encode(shell, reference["Shell weight"]),
        "f08_Max_Whole2_Shell": df[["Whole weight.2", "Shell weight"]].max(axis=1),
        "f09_log_Whole_weight": np.log(positive_whole),
        "f10_freq_WholeWeight": _frequency_encode(whole, reference["Whole weight"]),
        "f11_Shell_plus_Height": shell + df["Height"],
    })

    return df_manual



# Load the pre-cleaned training table.
df = pd.read_csv("train_cleaned_sex_binary.csv")

# Target is the ring count; the predictors are every other column except the
# row identifier (dropped only if it is present).
y = df["Rings"]
X_raw = df.drop(columns=["Rings", "id"], errors="ignore")

# NOTE(review): the frequency features are computed on the full frame before
# the train/test split below, so held-out rows contribute to the encoding.
X_manual = manual_openfe_features(X_raw)

# Append the engineered columns, then remove the two raw columns that are
# excluded from the final feature matrix.
X_final = pd.concat(
    [X_raw.reset_index(drop=True), X_manual],
    axis=1,
).drop(columns=["Length", "Whole weight"], errors="ignore")

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y,
    test_size=0.2,
    random_state=42,
)


def xgb_objective(trial):
    """Optuna objective: mean negated MSE of an XGBRegressor over 3-fold CV.

    Samples one hyper-parameter configuration from ``trial``, cross-validates
    it on the module-level ``X_train`` / ``y_train`` and returns the average
    ``neg_mean_squared_error`` score (higher is better).
    """
    # Hyper-parameters proposed by the trial; suggestion order is kept fixed.
    tuned = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "gamma": trial.suggest_float("gamma", 0, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 1),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 2),
        "max_depth": trial.suggest_int("max_depth", 4, 20),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
    }
    # Settings that are identical for every trial.
    fixed = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "random_state": 42,
        "n_jobs": -1,
    }

    regressor = xgb.XGBRegressor(**tuned, **fixed)
    fold_scores = cross_val_score(
        regressor,
        X_train,
        y_train,
        cv=3,
        scoring="neg_mean_squared_error",
    )
    return fold_scores.mean()


# Seed the sampler so repeated runs propose the same sequence of trials,
# in line with the fixed random_state=42 used for the split and the models.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(xgb_objective, n_trials=100)


# The objective value is a negated MSE, so the sign is flipped for reporting.
best_params = study.best_trial.params
print(" Best Parameters:", best_params)
print(" Best CV MSE:", -study.best_value)

# Refit a single model with the winning configuration on the whole training split.
best_model = xgb.XGBRegressor(**best_params, objective='reg:squarederror', random_state=42)
best_model.fit(X_train, y_train)

# Score the refitted model on the held-out split.
y_pred = best_model.predict(X_test)
final_mse = mean_squared_error(y_test, y_pred)
print("Final Test MSE:", final_mse)






[I 2025-05-24 16:18:01,050] A new study created in memory with name: no-name-01cc8111-b2f3-48b2-a629-0959a2db9526
[I 2025-05-24 16:18:02,714] Trial 0 finished with value: -3.655890850120734 and parameters: {'n_estimators': 398, 'learning_rate': 0.27770670226641003, 'gamma': 0.06335706036984334, 'reg_alpha': 0.26020931447393936, 'reg_lambda': 1.4830552867301958, 'max_depth': 7, 'min_child_weight': 6, 'subsample': 0.9676477194058462, 'colsample_bytree': 0.682037447226046}. Best is trial 0 with value: -3.655890850120734.
[I 2025-05-24 16:18:04,378] Trial 1 finished with value: -3.280203542024269 and parameters: {'n_estimators': 454, 'learning_rate': 0.01773382882067011, 'gamma': 0.3810729505691772, 'reg_alpha': 0.6462500903000221, 'reg_lambda': 0.1315872464391119, 'max_depth': 6, 'min_child_weight': 10, 'subsample': 0.8021994082536499, 'colsample_bytree': 0.7476190122472745}. Best is trial 1 with value: -3.280203542024269.
[I 2025-05-24 16:18:11,400] Trial 2 finished with value: -3.603618


✅ Best Parameters: {'n_estimators': 471, 'learning_rate': 0.026503466142953375, 'gamma': 0.028678259085637783, 'reg_alpha': 0.932619054483705, 'reg_lambda': 1.0667272742633322, 'max_depth': 7, 'min_child_weight': 6, 'subsample': 0.8297580203948203, 'colsample_bytree': 0.7719655510820679}
✅ Best CV MSE: 3.255136251692843
✅ Final Test MSE: 3.2871988329405113


In [23]:
from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score

# MSE and R^2 are computed on the unmodified predictions.
test_mse = mean_squared_error(y_test, y_pred)
print("Final Test MSE:", test_mse)

r2 = r2_score(y_test, y_pred)
print("Final Test R^2:", r2)

# Clip both sides at 0 so the log-based metric only sees non-negative values.
# The clipped predictions get their own name instead of overwriting `y_pred`,
# so re-running this cell still scores MSE / R^2 on the original predictions.
y_pred_clipped = np.maximum(y_pred, 0)
y_test_clipped = np.maximum(y_test, 0)
rmlse = np.sqrt(mean_squared_log_error(y_test_clipped, y_pred_clipped))
print("Final Test RMSLE:", rmlse)

Final Test MSE: 3.2871988329405113
Final Test R^2: 0.6544767618179321
Final Test RMSLE: 0.14648721809270995
