# Model Comparison
I will compare CatBoost, XGBoost, and LightGBM.

Expectations:

CatBoost is expected to perform best with limited

*   CatBoost is expected to perform best with limited preprocessing, but to be outperformed by XGBoost when both models are tuned.






In [2]:
!pip install -q gdown
! pip install -q catboost
import warnings
warnings.filterwarnings('ignore')
import gdown
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
file_id = "1lvt_himfQapYiUPbaS07dONMZ718cfk0"
gdown.download(id=file_id, output="Mercari-dataset.tsv", quiet=False)

df = pd.read_csv("Mercari-dataset.tsv", sep="\t")
df.head()

Downloading...
From (original): https://drive.google.com/uc?id=1lvt_himfQapYiUPbaS07dONMZ718cfk0
From (redirected): https://drive.google.com/uc?id=1lvt_himfQapYiUPbaS07dONMZ718cfk0&confirm=t&uuid=2c5445e8-e520-4399-b863-d618990fd365
To: /content/Mercari-dataset.tsv
100%|██████████| 338M/338M [00:03<00:00, 97.1MB/s]


Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [4]:
df.drop_duplicates(inplace=True)

df['log_price'] = np.log1p(df['price'])

X = df.drop(columns=['price', 'log_price', 'train_id'])
y = df['log_price']

X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessing – XGBoost & LightGBM
Due to the lack of numeric features and the majority being binary, categorical, and text features, the preprocessing used in the linear regression baseline can be reused for a simple comparison.

In [5]:
class ShippingToInt64(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        if self.column in X.columns:
            X[self.column] = pd.to_numeric(X[self.column], errors='coerce').astype('Int64')
        return X

class ModeImputer(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self, column):
        self.column = column
        self.mode_ = None

    def fit(self, X, y=None):
        self.mode_ = X[self.column].mode(dropna=True)[0]
        return self

    def transform(self, X):
        X = X.copy()
        X[self.column] = X[self.column].fillna(self.mode_)
        return X

class ShippingToint64(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self, column='shipping'):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        if self.column in X.columns:
            X[self.column] = pd.to_numeric(X[self.column], errors='coerce').astype('int64')
        return X

shipping_pipeline = Pipeline([
    ('coerce_Int64', ShippingToInt64(column='shipping')),
    ('mode_imputer', ModeImputer(column='shipping')),
    ('coerce_int64', ShippingToInt64(column='shipping')),
])

shipping_pipeline.set_output(transform="pandas")
_ = shipping_pipeline

In [6]:
class ItemConditionOrdinalEncoder(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self, column='item_condition_id'):
        self.column = column

    def fit(self, X, y=None):
        self.categories_ = sorted(X[self.column].unique())
        return self

    def transform(self, X):
        X = X.copy()
        mapping = {cat: i for i, cat in enumerate(self.categories_)}
        X[self.column] = X[self.column].map(mapping).astype('int64')
        return X

ordinal_pipeline = Pipeline([
    ('coerce_Int64', ShippingToInt64(column='item_condition_id')),
    ('mode_imputer', ModeImputer(column='item_condition_id')),
    ('coerce_int64', ShippingToInt64(column='item_condition_id')),
    ('ordinal', ItemConditionOrdinalEncoder(column='item_condition_id')),
])

ordinal_pipeline.set_output(transform="pandas")
_ = ordinal_pipeline

In [7]:
class NormalizeTextColumn(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self, column, fill_value="missing"):
        self.column = column
        self.fill_value = fill_value

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.column] = (
            X[self.column]
            .fillna(self.fill_value)
            .astype(str)
            .str.strip()
            .str.lower()
        )
        return X

class FillMissingCategory(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self, column, fill_value='missing'):
        self.column = column
        self.fill_value = fill_value

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.column] = X[self.column].fillna(self.fill_value)
        return X

class TargetEncoder(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self, column, smoothing=5):
        self.column = column
        self.smoothing = smoothing
        self.target_means_ = None
        self.global_mean_ = None

    def fit(self, X, y):
        X = X.copy()
        y = pd.Series(y)

        self.global_mean_ = y.mean()

        stats = (
            X.groupby(self.column)[self.column]
            .count()
            .to_frame("count")
            .join(y.groupby(X[self.column]).mean().to_frame("mean"))
        )

        smoothing = 1 / (1 + np.exp(-(stats["count"] - self.smoothing)))
        stats["smoothed"] = self.global_mean_ * (1 - smoothing) + stats["mean"] * smoothing
        self.target_means_ = stats["smoothed"]
        return self

    def transform(self, X):
        X = X.copy()
        X[self.column + "_encoded"] = X[self.column].map(self.target_means_)
        X[self.column + "_encoded"] = X[self.column + "_encoded"].fillna(self.global_mean_)
        return X[[self.column + "_encoded"]]

target_pipeline = Pipeline([
    ('normalise', NormalizeTextColumn(column='brand_name')),
    ('impute', FillMissingCategory(column='brand_name')),
    ('target', TargetEncoder(column='brand_name')),
])

target_pipeline.set_output(transform="pandas")
_ = target_pipeline

In [8]:
class SplitAndLabelEncodeCategory(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self, column='category_name', drop_original=True, fill_value='missing'):
        self.column = column
        self.drop_original = drop_original
        self.fill_value = fill_value
        self.label_maps_ = {}

    def fit(self, X, y=None):
        X = X.copy()
        splits = X[self.column].str.split('/', expand=True)

        for i, cat_col in enumerate(['cat1', 'cat2', 'cat3']):
            if i < splits.shape[1]:
                values = splits[i].fillna(self.fill_value).str.strip().str.lower()
            else:
                values = pd.Series([self.fill_value] * len(X))
            unique_vals = values.dropna().unique()
            self.label_maps_[cat_col] = {val: idx for idx, val in enumerate(sorted(unique_vals))}
        return self

    def transform(self, X):
        X = X.copy()
        splits = X[self.column].str.split('/', expand=True)

        for i, cat_col in enumerate(['cat1', 'cat2', 'cat3']):
            if i < splits.shape[1]:
                values = splits[i].fillna(self.fill_value).str.strip().str.lower()
            else:
                values = pd.Series([self.fill_value] * len(X), index=X.index)
            mapping = self.label_maps_[cat_col]
            X[cat_col + "_encoded"] = values.map(mapping).fillna(-1).astype("int64")

        if self.drop_original and self.column in X.columns:
            X = X.drop(columns=[self.column])

        return X[[col + "_encoded" for col in ['cat1', 'cat2', 'cat3']]]



label_pipeline = Pipeline([
    ('split_encode', SplitAndLabelEncodeCategory(column='category_name'))
])
label_pipeline.set_output(transform="pandas")

label_pipeline.set_output(transform="pandas")
_ = label_pipeline

In [9]:
class TextFeatureExtractor(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            X[f"{col}_char_len"] = X[col].apply(lambda x: len(str(x)))
            X[f"{col}_word_count"] = X[col].apply(lambda x: len(str(x).split()))
        return X

class TFIDFVectorizerWrapper(BaseEstimator, TransformerMixin):
    def __init__(self, column, max_features=1000, stop_words=None):
        self.column = column
        self.max_features = max_features
        self.stop_words = stop_words
        self.vectorizer = None

    def fit(self, X, y=None):
        self.vectorizer = TfidfVectorizer(
            max_features=self.max_features,
            stop_words=self.stop_words
        )
        self.vectorizer.fit(X[self.column])
        return self

    def transform(self, X):
        return self.vectorizer.transform(X[self.column])

name_pipeline = Pipeline([
    ('normalize', NormalizeTextColumn(column='name')),
    ('extract_features', TextFeatureExtractor(columns=['name'])),
    ('tfidf', TFIDFVectorizerWrapper(column='name', max_features=300))
])

item_pipeline = Pipeline([
    ('normalize', NormalizeTextColumn(column='item_description')),
    ('extract_features', TextFeatureExtractor(columns=['item_description'])),
    ('tfidf', TFIDFVectorizerWrapper(column='item_description', max_features=1000, stop_words='english'))
])

_ = item_pipeline

In [10]:
preprocessor = ColumnTransformer([
    ('shipping', shipping_pipeline, ['shipping']),
    ('item_condition', ordinal_pipeline, ['item_condition_id']),
    ('brand', target_pipeline, ['brand_name']),
    ('category', label_pipeline, ['category_name']),
    ('name_text', name_pipeline, ['name']),
    ('description_text', item_pipeline, ['item_description']),
])

final_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('regressor', LinearRegression())
])

_ = final_pipeline

# XGBoost vs. LightGBM
XGBoost substantially outperformed LightGBM. This is expected, as XGBoost is the industry preference over LightGBM, and there was no indication in the EDA or baseline that this dataset fits LightGBM’s niche.

In [11]:
models = {
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42, verbosity=0),
    'LightGBM': LGBMRegressor(n_estimators=100, random_state=42, verbose=-1)
}

for name, regressor in models.items():
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('regressor', regressor)
    ])

    X_train_copy = X_train.copy()
    X_holdout_copy = X_holdout.copy()
    y_train_copy = y_train.copy()
    y_holdout_copy = y_holdout.copy()

    X_subtrain, X_val, y_subtrain, y_val = train_test_split(
        X_train_copy, y_train_copy, test_size=0.2, random_state=42
    )

    pipeline.fit(X_subtrain, y_subtrain)
    y_val_pred = pipeline.predict(X_val)

    r2_val = r2_score(y_val, y_val_pred)
    mse_val = mean_squared_error(y_val, y_val_pred)
    rmse_val = np.sqrt(mse_val)
    mae_val = mean_absolute_error(y_val, y_val_pred)

    pipeline.fit(X_train_copy, y_train_copy)
    y_test_pred = pipeline.predict(X_holdout_copy)

    r2_test = r2_score(y_holdout_copy, y_test_pred)
    mse_test = mean_squared_error(y_holdout_copy, y_test_pred)
    rmse_test = np.sqrt(mse_test)
    mae_test = mean_absolute_error(y_holdout_copy, y_test_pred)

    print(f"\n{name}")
    print("R2:", r2_val)
    print("MSE:", mse_val)
    print("RMSE:", rmse_val)
    print("MAE:", mae_val)
    print("Hold-Out Test Metrics:")
    print("R2", r2_test)
    print("MSE:", mse_test)
    print("RMSE:", rmse_test)
    print("MAE:", mae_test)


XGBoost
R2: 0.4999932234103832
MSE: 0.28064260709173905
RMSE: 0.5297571208504318
MAE: 0.3988648906624664
Hold-Out Test Metrics:
R2 0.5036049632575783
MSE: 0.2800196807324655
RMSE: 0.5291688584303366
MAE: 0.3976330484245302

LightGBM
R2: 0.48716360062589614
MSE: 0.28784358706805085
RMSE: 0.5365105656630174
MAE: 0.4042934023920858
Hold-Out Test Metrics:
R2 0.4880068438727124
MSE: 0.2888186817032255
RMSE: 0.53741853494574
MAE: 0.40481049205338576


# CatBoost

XGBoost outperforms CatBoost:


*   This matched expectations, XGBoost had an advantage in this comparison due to the reused preprocessing being higher quality than the CatBoost preprocessing. However, CatBoost is generally known for its native handling of categoricals and text, which makes preprocessing easier but limits tuning flexibility. This limitation gives XGBoost a tuning edge.



In [12]:
class FillMissingCategory(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self, fill_value='missing'):
        self.fill_value = fill_value

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.fillna(self.fill_value).astype(str)

class TextNaNFiller(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.fillna("")

cat_features_to_fill = ['brand_name', 'category_name']
text_features_to_fill = ['item_description', 'name']

cat_preprocessor = ColumnTransformer(transformers=[
    ('brand', FillMissingCategory(), cat_features_to_fill),
    ('text', TextNaNFiller(), text_features_to_fill)
], remainder='passthrough', verbose_feature_names_out=False)
cat_preprocessor.set_output(transform="pandas")

X_train_copy = X_train.copy()
X_holdout_copy = X_holdout.copy()
y_train_copy = y_train.copy()
y_holdout_copy = y_holdout_copy.copy()

X_subtrain, X_val, y_subtrain, y_val = train_test_split(
    X_train_copy, y_train_copy, test_size=0.2, random_state=42
)

X_subtrain = cat_preprocessor.fit_transform(X_subtrain)
X_val = cat_preprocessor.transform(X_val)
X_holdout_copy = cat_preprocessor.transform(X_holdout_copy)

cat_features = ['shipping', 'item_condition_id', 'brand_name', 'category_name']
text_features = ['name', 'item_description']

catboost_model = CatBoostRegressor(
    iterations=100,
    random_state=42,
    verbose=0,
    cat_features=cat_features,
    text_features=text_features
)

catboost_model.fit(X_subtrain, y_subtrain, eval_set=(X_val, y_val))

y_val_pred = catboost_model.predict(X_val)
y_test_pred = catboost_model.predict(X_holdout_copy)

r2_val = r2_score(y_val, y_val_pred)
mse_val = mean_squared_error(y_val, y_val_pred)
rmse_val = np.sqrt(mse_val)
mae_val = mean_absolute_error(y_val, y_val_pred)

r2_test = r2_score(y_holdout_copy, y_test_pred)
mse_test = mean_squared_error(y_holdout_copy, y_test_pred)
rmse_test = np.sqrt(mse_test)
mae_test = mean_absolute_error(y_holdout_copy, y_test_pred)

print("\nCatBoost")
print("R2:", r2_val)
print("MSE:", mse_val)
print("RMSE:", rmse_val)
print("MAE:", mae_val)
print("Hold-Out Test Metrics:")
print("R2", r2_test)
print("MSE:", mse_test)
print("RMSE:", rmse_test)
print("MAE:", mae_test)



CatBoost
R2: 0.4926583828309743
MSE: 0.28475948886051783
RMSE: 0.533628605736722
MAE: 0.403284155145839
Hold-Out Test Metrics:
R2 0.49350282296847237
MSE: 0.2857183640171211
RMSE: 0.5345262987142176
MAE: 0.4039754687716302


# Summary
The metrics from XGBoost are a good starting point for further tuning.

Plan going forward:


*   Tune preprocessing specifically for XGBoost.
*   Test different encoding and tokenization strategies.
*   Tune hyperparameters and perform feature engineering.



