# XGBoost final model

In [58]:
''' !pip install -q gdown
! pip install -q category_encoders
! pip install -q optuna '''
import warnings
warnings.filterwarnings('ignore')
import gdown
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
import re
from sklearn.pipeline import Pipeline, FeatureUnion
from category_encoders import TargetEncoder
from sklearn.model_selection import KFold
from scipy.sparse import hstack, vstack, csr_matrix
import psutil
import optuna

# Load data

In [59]:
file_id = "1lvt_himfQapYiUPbaS07dONMZ718cfk0"
gdown.download(id=file_id, output="Mercari-dataset.tsv", quiet=False)

df = pd.read_csv("Mercari-dataset.tsv", sep="\t")
''' df.head() '''

Downloading...
From (original): https://drive.google.com/uc?id=1lvt_himfQapYiUPbaS07dONMZ718cfk0
From (redirected): https://drive.google.com/uc?id=1lvt_himfQapYiUPbaS07dONMZ718cfk0&confirm=t&uuid=994cfaa1-9d74-429b-b167-713b3e9c6aaf
To: /content/Mercari-dataset.tsv
100%|██████████| 338M/338M [00:03<00:00, 89.8MB/s]


' df.head() '

# Setup
clipping outliers in target improved error metrics

In [60]:
df.drop_duplicates(inplace=True)

df['log_price'] = np.log1p(df['price'])

X = df.drop(columns=['price', 'log_price', 'train_id'])
y = df['log_price']

X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Shipping feature
After testing a couple stratedies mode gave the best results

In [61]:
class ShippingToInt64(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        if self.column in X.columns:
            X[self.column] = pd.to_numeric(X[self.column], errors='coerce').astype('Int64')
        return X

class FillWithMode(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column
        self.mode = None

    def fit(self, X, y=None):
        self.mode = X[self.column].mode(dropna=True)[0]
        return self

    def transform(self, X):
        X = X.copy()
        X[self.column] = X[self.column].fillna(self.mode)
        return X[[self.column]]

shipping_pipeline = Pipeline([
    ('coerce_Int64', ShippingToInt64(column='shipping')),
    ('fill_mode', FillWithMode(column='shipping')),
])

# Item condition feature
Created missingness indicator

In [62]:
class CoerceToInt64(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.column] = X[self.column].astype("Int64")
        return X[[self.column]]

class FillNAInt64(BaseEstimator, TransformerMixin):
    def __init__(self, column, fill_value=-1):
        self.column = column
        self.fill_value = fill_value

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.column] = X[self.column].fillna(self.fill_value)
        return X[[self.column]]

condition_pipeline = Pipeline([
    ('coerce_Int64', CoerceToInt64(column='item_condition_id')),
    ('fill_Int64', FillNAInt64(column='item_condition_id')),

])

# Brand name
Target encoding for high cardinality

In [63]:
class NormalizeTextColumn(BaseEstimator, TransformerMixin):
    def __init__(self, column, lower=True, fill_value='missing'):
        self.column = column
        self.lower = lower
        self.fill_value = fill_value

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        col = X.columns[0] if len(X.columns) == 1 else self.column

        X[col] = X[col].astype(str)
        X[col] = X[col].str.strip()
        X[col] = X[col].replace(r'^\s*$', np.nan, regex=True)
        X[col] = X[col].fillna(self.fill_value)

        if self.lower:
            X[col] = X[col].str.lower()

        return X

class SafeTargetEncoderColumn(BaseEstimator, TransformerMixin):
    def __init__(self, column, fill_value='missing', smoothing=1.0):
        self.column = column
        self.fill_value = fill_value
        self.smoothing = smoothing
        self.encoder = None

    def fit(self, X, y):
        X = X.copy()
        col = self.column

        X[col] = X[col].fillna(self.fill_value).astype(str)
        self.encoder = TargetEncoder(
            cols=[col],
            smoothing=self.smoothing,
            handle_missing='value',
            handle_unknown='value'
        )
        self.encoder.fit(X[[col]], y)
        return self

    def transform(self, X):
        X = X.copy()
        col = self.column

        X[col] = X[col].fillna(self.fill_value).astype(str)
        X[col] = self.encoder.transform(X[[col]])[col]
        return X

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

class MissingIndicator(BaseEstimator, TransformerMixin):
    def __init__(self, column, output_column=None):
        self.column = column
        self.output_column = output_column or f'{column}_missing_flag'

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.output_column] = X[self.column].isna().astype(int)
        return X

brand_pipeline = Pipeline([
    ('missing_flag', MissingIndicator(column='brand_name')),
    ('normalize', NormalizeTextColumn(column='brand_name')),
    ('label_encode', SafeTargetEncoderColumn(column='brand_name'))
])

# Category name feature
Tree target encoding to preserve hierarchy, dynamically batch to minimise ram usage

In [64]:
class NormalizeCategoryName(BaseEstimator, TransformerMixin):
    def __init__(self, column='category_name', lower=True):
        self.column = column
        self.lower = lower

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        col = X.columns[0]

        X[col] = X[col].astype(str)
        X[col] = X[col].str.strip()
        X[col] = X[col].replace(r'^\s*$', np.nan, regex=True)

        if self.lower:
            X[col] = X[col].str.lower()

        X[col] = X[col].str.replace(r'/+', '/', regex=True)
        X[col] = X[col].str.strip('/')
        X[col] = X[col].str.replace(r'[^a-z0-9/ &+-]', '', regex=True)

        return pd.DataFrame({self.column: X[col]})

class SafeCategorySplitter(BaseEstimator, TransformerMixin):
    def __init__(self, source_column='category_name', fill_value='missing'):
        self.source_column = source_column
        self.fill_value = fill_value
        self.output_columns = ['cat_lvl_1', 'cat_lvl_2', 'cat_lvl_3']

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        default_fill = f"{self.fill_value}/{self.fill_value}/{self.fill_value}"
        X[self.source_column] = X[self.source_column].fillna(default_fill)

        splits = X[self.source_column].str.split('/', n=2, expand=True)

        for i in range(3):
            if i not in splits.columns:
                splits[i] = self.fill_value

        splits.columns = self.output_columns

        splits = splits.fillna(self.fill_value)

        return splits



class TreeBasedTargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self,
                 col_lvl_1='cat_lvl_1',
                 col_lvl_2='cat_lvl_2',
                 col_lvl_3='cat_lvl_3',
                 smoothing=5,
                 n_splits=5,
                 random_state=42):
        self.col_lvl_1 = col_lvl_1
        self.col_lvl_2 = col_lvl_2
        self.col_lvl_3 = col_lvl_3
        self.smoothing = smoothing
        self.n_splits = n_splits
        self.random_state = random_state
        self.encoding_maps_ = {}
        self.global_mean_ = None

    def _combine_levels(self, X):
        X = X.copy()
        X['cat1_2'] = X[self.col_lvl_1] + '/' + X[self.col_lvl_2]
        X['cat1_2_3'] = X[self.col_lvl_1] + '/' + X[self.col_lvl_2] + '/' + X[self.col_lvl_3]
        return X

    def _fit_target_encoding(self, series, y):
        series = series.astype(str)
        mean = y.mean()
        stats = y.groupby(series).agg(['mean', 'count'])
        smooth = (stats['mean'] * stats['count'] + mean * self.smoothing) / (stats['count'] + self.smoothing)
        return smooth.to_dict()

    def fit(self, X, y):
        X = X.copy()
        y = pd.Series(y, index=X.index)
        X = self._combine_levels(X)

        self.global_mean_ = y.mean()
        self.encoding_maps_ = {}

        for col in [self.col_lvl_1, 'cat1_2', 'cat1_2_3']:
            oof_encoded = pd.Series(np.nan, index=X.index, dtype=float)
            kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)

            for train_idx, val_idx in kf.split(X):
                X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
                X_val = X.iloc[val_idx]
                enc_map = self._fit_target_encoding(X_train[col], y_train)
                oof_encoded.iloc[val_idx] = X_val[col].astype(str).map(enc_map)

            oof_encoded = oof_encoded.fillna(self.global_mean_)
            X[f'{col}_enc'] = oof_encoded

            self.encoding_maps_[col] = self._fit_target_encoding(X[col], y)

        self.fitted_columns_ = [f'{col}_enc' for col in [self.col_lvl_1, 'cat1_2', 'cat1_2_3']]
        self.fitted_data_ = X[self.fitted_columns_].copy()

        return self

    def transform(self, X):
        X = self._combine_levels(X)
        X_encoded = pd.DataFrame(index=X.index)

        for col in [self.col_lvl_1, 'cat1_2', 'cat1_2_3']:
            encoded = X[col].astype(str).map(self.encoding_maps_[col])
            X_encoded[f'{col}_enc'] = encoded.fillna(self.global_mean_).astype(float)

        return X_encoded


category_pipeline = Pipeline([
    ('normalize', NormalizeCategoryName(column='category_name')),
    ('split', SafeCategorySplitter(source_column='category_name')),
    ('target_encode', TreeBasedTargetEncoder(
        col_lvl_1='cat_lvl_1',
        col_lvl_2='cat_lvl_2',
        col_lvl_3='cat_lvl_3',
        smoothing=5,
        n_splits=5,
        random_state=42
    ))
])

# Name & item description feature
Tokenization remains the same simply expirimented with tags, range and max features

In [65]:
class TFIDFVectorizerWrapper(BaseEstimator, TransformerMixin):
    def __init__(self, column, max_features=1000, stop_words=None, ngram_range=(1, 1)):
        self.column = column
        self.max_features = max_features
        self.stop_words = stop_words
        self.ngram_range = ngram_range
        self.vectorizer = None

    def fit(self, X, y=None):
        X = X.copy()
        col = self.column
        X_col = X[col].fillna('').astype(str)

        self.vectorizer = TfidfVectorizer(
            max_features=self.max_features,
            stop_words=self.stop_words,
            ngram_range=self.ngram_range
        )
        self.vectorizer.fit(X_col)
        return self

    def transform(self, X):
        X = X.copy()
        col = self.column
        X_col = X[col].fillna('').astype(str)

        tfidf_matrix = self.vectorizer.transform(X_col)

        word_count = X_col.apply(lambda x: len(x.split())).values.reshape(-1, 1)
        word_count_sparse = csr_matrix(word_count)

        nnz = (tfidf_matrix != 0).sum(axis=1).A1.reshape(-1, 1)
        nnz_sparse = csr_matrix(nnz)

        if col == 'name':
            common_keywords = ['new', 'used', 'bundle']
            name_common_word_flag = X_col.apply(
                lambda text: int(any(word in text.lower() for word in common_keywords))
            ).values.reshape(-1, 1)
            name_common_word_sparse = csr_matrix(name_common_word_flag)

            max_tfidf = tfidf_matrix.max(axis=1).toarray().reshape(-1, 1)
            max_tfidf_sparse = csr_matrix(max_tfidf)

            return hstack([
                tfidf_matrix,
                word_count_sparse,
                max_tfidf_sparse,
                name_common_word_sparse
            ])

        elif col == 'item_description':
            mean_tfidf = tfidf_matrix.sum(axis=1).A1.reshape(-1, 1) / (nnz + 1e-6)
            mean_tfidf_sparse = csr_matrix(mean_tfidf)

            return hstack([
                tfidf_matrix,
                word_count_sparse,
                nnz_sparse,
                mean_tfidf_sparse
            ])

        return hstack([tfidf_matrix, word_count_sparse])

name_pipeline = Pipeline([
    ('normalize', NormalizeTextColumn(column='name')),
    ('vectorize', TFIDFVectorizerWrapper(
        column='name',
        max_features=400,
        stop_words=None,
        ngram_range=(1, 2)
    ))
])

item_pipeline = Pipeline([
    ('normalize', NormalizeTextColumn(column='item_description')),
    ('vectorize', TFIDFVectorizerWrapper(
        column='item_description',
        max_features=1500,
        stop_words='english',
        ngram_range=(1, 2)
    ))
])

# Preproccessor

In [66]:
preprocessor = ColumnTransformer([
    ('shipping', shipping_pipeline, ['shipping']),
    ('item_condition', condition_pipeline, ['item_condition_id']),
    ('brand', brand_pipeline, ['brand_name']),
    ('category', category_pipeline, ['category_name']),
    ('name_text', name_pipeline, ['name']),
    ('description_text', item_pipeline, ['item_description']),
])

# Metrics

In [67]:

xgb_model = XGBRegressor(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.28876729035886295,
    subsample=0.6249548549636976,
    colsample_bytree=0.5162030917981775,
    reg_alpha=1.623450951405694,
    reg_lambda=4.2760846903011895,
    random_state=42,
    verbosity=0
)

pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('regressor', xgb_model)
])

X_subtrain, X_val, y_subtrain, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

pipeline.fit(X_subtrain, y_subtrain)
y_val_pred = pipeline.predict(X_val)

r2_val = r2_score(y_val, y_val_pred)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
mae_val = mean_absolute_error(y_val, y_val_pred)

pipeline.fit(X_train, y_train)
y_test_pred = pipeline.predict(X_holdout)

r2_test = r2_score(y_holdout, y_test_pred)
rmse_test = np.sqrt(mean_squared_error(y_holdout, y_test_pred))
mae_test = mean_absolute_error(y_holdout, y_test_pred)

print("XGBoost")
print("R2:", r2_val)
print("RMSE:", rmse_val)
print("MAE:", mae_val)
print("Hold-Out Test Metrics:")
print("R2:", r2_test)
print("RMSE:", rmse_test)
print("MAE:", mae_test)


XGBoost
R2: 0.5513238354153979
RMSE: 0.5018285404550334
MAE: 0.3762923001617534
Hold-Out Test Metrics:
R2: 0.5528617939627936
RMSE: 0.502228614257657
MAE: 0.37595349203995004
