<a href="https://colab.research.google.com/github/Foysal348/Crab-Age-Prediction-with-Highest-Score/blob/main/Crab_Age_Prediction_Using_CatBoost_%26_HuberRegressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install opendatasets



In [None]:
import opendatasets as od
# Fetch the competition files; per the recorded output they land in
# "./phitron-contest" and the download is skipped when they already exist.
competition_url = "https://www.kaggle.com/competitions/phitron-contest/data"
od.download(competition_url)

Skipping, found downloaded files in "./phitron-contest" (use force=True to force download)


In [None]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import HuberRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from catboost import CatBoostRegressor


# 1. Load the Dataset
# opendatasets downloads into "./phitron-contest" (relative to the working
# directory), so read from that same relative location instead of an absolute
# "/content/..." path that is specific to Colab's default working directory.
DATA_DIR = "phitron-contest"
df_train = pd.read_csv(f"{DATA_DIR}/train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test.csv")



In [None]:
!pip install catboost




In [None]:
# Target to predict.
y = df_train["Age"]
# Training features: everything except the target and the row identifier.
X = df_train.drop(columns=["Age", "id"])

# Keep the test ids aside for the submission file; the model only sees the features.
test_ids = df_test["id"]
X_test = df_test.drop(columns="id")


In [None]:
# Group the feature names by dtype: object columns are treated as categorical,
# int64/float64 columns as numeric.
cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(include=["int64", "float64"]).columns


In [None]:
# Preprocessing template: standardise the numeric columns and one-hot encode the
# categorical ones (first level dropped; categories unseen during fit are ignored).
# Columns listed in neither num_cols nor cat_cols are dropped, which is
# ColumnTransformer's default `remainder` behaviour.
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), cat_cols)
])


def huber_pipeline():
    """Build a fresh, unfitted preprocessing + HuberRegressor pipeline.

    Pipeline does not copy its steps, so passing the module-level
    ``preprocessor`` directly would make every pipeline share (and refit)
    the same transformer object. Cloning it gives each pipeline its own
    unfitted copy with identical configuration.

    Returns:
        Pipeline: step ``"prep"`` (a clone of ``preprocessor``) followed by
        step ``"model"`` (``HuberRegressor(epsilon=1.1)``).
    """
    return Pipeline([
        ("prep", clone(preprocessor)),
        ("model", HuberRegressor(epsilon=1.1))
    ])


In [None]:
# Positions of the categorical columns within X, in cat_cols order.
cat_idx = list(map(X.columns.get_loc, cat_cols))

# CatBoost regressor trained on MAE with a fixed seed and silent logging.
# NOTE(review): the cross-validation loop builds its own CatBoostRegressor with
# these same settings and does not reference cat_model.
cat_model = CatBoostRegressor(
    loss_function="MAE",
    iterations=1200,
    depth=5,
    learning_rate=0.03,
    random_seed=42,
    verbose=0,
)


In [None]:
# Reproducible shuffled 5-fold split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
n_folds = kf.n_splits

# Out-of-fold predictions: one slot per training row, filled by the fold
# in which that row was held out.
oof_huber, oof_cat = np.zeros(len(X)), np.zeros(len(X))

# Test-set predictions, accumulated as the average over the fold models.
test_huber, test_cat = np.zeros(len(X_test)), np.zeros(len(X_test))

# Identical CatBoost settings for every fold.
catboost_params = dict(
    iterations=1200,
    learning_rate=0.03,
    depth=5,
    loss_function="MAE",
    random_seed=42,
    verbose=0,
)

for train_idx, val_idx in kf.split(X):
    X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Huber: fit on the training folds, then predict the held-out fold and the test set.
    huber_fold = huber_pipeline()
    huber_fold.fit(X_tr, y_tr)
    oof_huber[val_idx] = huber_fold.predict(X_val)
    test_huber += huber_fold.predict(X_test) / n_folds

    # CatBoost: same protocol, with the categorical columns passed by position.
    cat_fold = CatBoostRegressor(**catboost_params)
    cat_fold.fit(X_tr, y_tr, cat_features=cat_idx)
    oof_cat[val_idx] = cat_fold.predict(X_val)
    test_cat += cat_fold.predict(X_test) / n_folds


In [None]:
# Weighted blend of the out-of-fold predictions: 65% CatBoost, 35% Huber.
blend_oof = 0.35 * oof_huber + 0.65 * oof_cat
# Score the blend against the true training target.
print("OOF MAE:", mean_absolute_error(y_true=y, y_pred=blend_oof))


OOF MAE: 1.3337837522643377


In [None]:
# Blend the fold-averaged test predictions with the same 65/35 CatBoost/Huber weights.
final_pred = 0.35 * test_huber + 0.65 * test_cat


In [None]:
# Grid of 1000 evenly spaced empirical quantiles of the training target.
q = np.quantile(y, np.linspace(0, 1, 1000))
# Snap every blended prediction to its closest grid value
# (argmin keeps the first grid point when two are equally close).
nearest_idx = [np.abs(q - p).argmin() for p in final_pred]
final_pred = np.array([q[i] for i in nearest_idx])


In [None]:
# Assemble the two-column submission table: test ids plus post-processed predictions.
submission_columns = {"id": test_ids, "Age": final_pred}
submission = pd.DataFrame(submission_columns)
# Write it without the index column, then preview the first rows.
submission.to_csv(path_or_buf="submission.csv", index=False)
submission.head()


Unnamed: 0,id,Age
0,15000,13.0
1,15001,12.0
2,15002,9.0
3,15003,11.0
4,15004,9.0
