# Baseline - Linear regression

In [None]:
!pip install -q gdown
import warnings
warnings.filterwarnings('ignore')
import gdown
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
# Fetch the Mercari listings file from Google Drive and load it into a DataFrame.
file_id = "1lvt_himfQapYiUPbaS07dONMZ718cfk0"
dataset_path = "Mercari-dataset.tsv"
gdown.download(id=file_id, output=dataset_path, quiet=False)

# The file is tab-separated.
df = pd.read_csv(dataset_path, sep="\t")
# Last expression of the cell: displays the first rows.
df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


# Split data

In [None]:
# `Unnamed: 0` and `train_id` look like per-row identifiers in the preview of
# the data. If they are unique per row, a plain drop_duplicates() (which
# compares every column) can never remove anything, so duplicates are detected
# on the remaining, content-bearing columns instead.
id_columns = [col for col in ('Unnamed: 0', 'train_id') if col in df.columns]
content_columns = [col for col in df.columns if col not in id_columns]
df.drop_duplicates(subset=content_columns, inplace=True)

# Model the price on a log scale; log1p(price) = log(1 + price).
df['log_price'] = np.log1p(df['price'])

# Features exclude both forms of the target and the row id.
X = df.drop(columns=['price', 'log_price', 'train_id'])
y = df['log_price']

# Keep 20% of the rows aside as a hold-out set for the final evaluation.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Binary feature - shipping
Cast to pandas' nullable `Int64` so that all NaNs are represented as `<NA>` for easy imputation, then converted back to `int64` for consistency and memory efficiency.

In [None]:
class ShippingToInt64(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Cast one column to pandas' nullable ``Int64`` dtype.

    The column is first parsed with ``pd.to_numeric(errors='coerce')``, so
    values that cannot be parsed as numbers end up missing. The transformer
    is stateless: ``fit`` learns nothing.

    Parameters
    ----------
    column : str
        Name of the column to cast.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (required by the transformer API)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``self.column`` cast to ``Int64``.

        If ``X`` has no such column, the copy is returned unchanged.
        """
        result = X.copy()
        if self.column not in result.columns:
            return result
        as_number = pd.to_numeric(result[self.column], errors='coerce')
        result[self.column] = as_number.astype('Int64')
        return result

class ModeImputer(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Fill missing values of one column with that column's most frequent value.

    Parameters
    ----------
    column : str
        Name of the column to impute.

    Attributes
    ----------
    mode_ : scalar
        Most frequent non-missing value of ``column`` seen in ``fit``
        (the first one returned by ``Series.mode`` when there are ties).
    """

    def __init__(self, column):
        # Only the hyper-parameter is stored here; the learned ``mode_`` is
        # created by fit(), so an unfitted instance does not look fitted.
        self.column = column

    def fit(self, X, y=None):
        """Learn the mode of ``self.column``.

        Raises
        ------
        ValueError
            If the column has no non-missing value, so no mode exists.
        """
        modes = X[self.column].mode(dropna=True)
        if modes.empty:
            raise ValueError(
                f"Cannot compute the mode of column {self.column!r}: "
                "it contains no non-missing values."
            )
        self.mode_ = modes.iloc[0]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values of the column set to ``mode_``."""
        X = X.copy()
        X[self.column] = X[self.column].fillna(self.mode_)
        return X

class ShippingToint64(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Cast one column to the plain NumPy ``int64`` dtype.

    The column is parsed with ``pd.to_numeric(errors='coerce')`` and then cast
    with ``astype('int64')``. The transformer is stateless.

    Parameters
    ----------
    column : str, default='shipping'
        Name of the column to cast.
    """

    def __init__(self, column='shipping'):
        self.column = column

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (required by the transformer API)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``self.column`` cast to ``int64``.

        A frame without that column is returned as an unchanged copy.
        """
        frame = X.copy()
        if self.column in frame.columns:
            numeric = pd.to_numeric(frame[self.column], errors='coerce')
            frame[self.column] = numeric.astype('int64')
        return frame

# shipping: nullable Int64 -> mode imputation -> plain int64.
shipping_pipeline = Pipeline([
    ('coerce_Int64', ShippingToInt64(column='shipping')),
    ('mode_imputer', ModeImputer(column='shipping')),
    # Final cast uses ShippingToint64 (lower-case "int64"): once the missing
    # values are imputed the column can be stored as a plain int64.
    ('coerce_int64', ShippingToint64(column='shipping')),
])

# Make every step hand a DataFrame to the next one.
shipping_pipeline.set_output(transform="pandas")
_ = shipping_pipeline


# Ordinal feature - condition
Cast to pandas' nullable `Int64` so that all NaNs are represented as `<NA>` for easy imputation, then converted back to `int64` for consistency and memory efficiency, followed by ordinal encoding, which implicitly defines the order.

In [None]:
class ItemConditionOrdinalEncoder(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Replace each value of a column by its rank among the values seen in ``fit``.

    Parameters
    ----------
    column : str, default='item_condition_id'
        Name of the column to encode.

    Attributes
    ----------
    categories_ : list
        Sorted distinct values of ``column`` seen in ``fit``; a value's
        position in this list is its code.
    """

    def __init__(self, column='item_condition_id'):
        self.column = column

    def fit(self, X, y=None):
        """Record the sorted distinct values of ``self.column``."""
        seen = X[self.column].unique()
        self.categories_ = sorted(seen)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``self.column`` replaced by int64 codes."""
        codes = dict(zip(self.categories_, range(len(self.categories_))))
        encoded = X.copy()
        # Values absent from ``categories_`` map to NaN, which the int64 cast
        # cannot represent.
        encoded[self.column] = encoded[self.column].map(codes).astype('int64')
        return encoded

# item_condition_id: nullable Int64 -> mode imputation -> plain int64 -> ordinal codes.
ordinal_pipeline = Pipeline([
    ('coerce_Int64', ShippingToInt64(column='item_condition_id')),
    ('mode_imputer', ModeImputer(column='item_condition_id')),
    # Final cast uses ShippingToint64 (lower-case "int64"): once the missing
    # values are imputed the column can be stored as a plain int64.
    ('coerce_int64', ShippingToint64(column='item_condition_id')),
    ('ordinal', ItemConditionOrdinalEncoder(column='item_condition_id')),
])

# Make every step hand a DataFrame to the next one.
ordinal_pipeline.set_output(transform="pandas")
_ = ordinal_pipeline

# High cardinality feature - brand name
1. Normalise strings
2. Impute
3. Target encode (smoothed mean of the target per brand)

In [None]:
class NormalizeTextColumn(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Fill, stringify, strip and lower-case one text column.

    Parameters
    ----------
    column : str
        Name of the column to normalise.
    fill_value : str, default="missing"
        Replacement for missing values, applied before the other steps.
    """

    def __init__(self, column, fill_value="missing"):
        self.column = column
        self.fill_value = fill_value

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer is stateless."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``self.column`` has been normalised."""
        out = X.copy()
        # Order matters: missing values are filled first, then everything is
        # turned into ``str`` before whitespace and case are normalised.
        as_text = out[self.column].fillna(self.fill_value).astype(str)
        out[self.column] = as_text.str.strip().str.lower()
        return out

class FillMissingCategory(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Replace the missing values of one column by a constant.

    Parameters
    ----------
    column : str
        Name of the column to fill.
    fill_value : object, default='missing'
        Value written in place of missing entries.
    """

    def __init__(self, column, fill_value='missing'):
        self.column = column
        self.fill_value = fill_value

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer is stateless."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing entries of the column filled."""
        filled_frame = X.copy()
        filled_column = filled_frame[self.column].fillna(self.fill_value)
        filled_frame[self.column] = filled_column
        return filled_frame

class TargetEncoder(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Encode a categorical column by a smoothed per-category mean of the target.

    For a category seen ``n`` times with target mean ``m`` the encoding is
    ``w * m + (1 - w) * global_mean`` with
    ``w = 1 / (1 + exp(-(n - smoothing)))``. Values without a learned encoding
    (categories not seen in ``fit``, missing values) get the global mean.

    Parameters
    ----------
    column : str
        Name of the categorical column to encode.
    smoothing : float, default=5
        Category count at which the category mean and the global mean are
        weighted equally.

    Attributes
    ----------
    global_mean_ : float
        Mean of the target over all rows seen in ``fit``.
    target_means_ : pandas.Series
        Smoothed target mean per category, indexed by category value.
    """

    def __init__(self, column, smoothing=5):
        # Only hyper-parameters are stored here; the learned attributes are
        # created by fit(), so an unfitted instance does not look fitted.
        self.column = column
        self.smoothing = smoothing

    def fit(self, X, y):
        """Learn the smoothed target mean of every category.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``self.column``.
        y : array-like
            Target values, one per row of ``X`` and in the same order.

        Raises
        ------
        ValueError
            If ``y`` is ``None`` or its length differs from ``X``.
        """
        if y is None:
            raise ValueError("TargetEncoder needs the target `y` to be fitted.")

        keys = X[self.column]
        # Pair the target with X by position. ``pd.Series(y)`` would give a
        # NumPy/list target a fresh 0..n-1 index, which the index-aligned
        # groupby below would then mismatch with X's own index.
        target = pd.Series(np.asarray(y), index=keys.index)

        self.global_mean_ = target.mean()

        per_category = target.groupby(keys)
        stats = pd.DataFrame({
            "count": per_category.size(),
            "mean": per_category.mean(),
        })

        # Sigmoid weight: close to 0 for rare categories, close to 1 for
        # frequent ones.
        weight = 1 / (1 + np.exp(-(stats["count"] - self.smoothing)))
        stats["smoothed"] = self.global_mean_ * (1 - weight) + stats["mean"] * weight
        self.target_means_ = stats["smoothed"]
        return self

    def transform(self, X):
        """Return a one-column DataFrame ``<column>_encoded`` indexed like ``X``."""
        encoded = X[self.column].map(self.target_means_).fillna(self.global_mean_)
        return encoded.to_frame(self.column + "_encoded")

# brand_name: normalise text -> fill missing -> smoothed target encoding.
brand_steps = [
    ('normalise', NormalizeTextColumn(column='brand_name')),
    # NormalizeTextColumn already fills missing values with the same default,
    # so this step leaves the column unchanged here.
    ('impute', FillMissingCategory(column='brand_name')),
    ('target', TargetEncoder(column='brand_name')),
]
target_pipeline = Pipeline(steps=brand_steps)

# Make every step hand a DataFrame to the next one.
target_pipeline.set_output(transform="pandas")
_ = target_pipeline

# Messy string - category name
1. Split on `/` into three category levels
2. Impute missing levels and normalise strings
3. Label encode each level

In [None]:
class SplitAndLabelEncodeCategory(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Split a ``/``-separated category path into three levels and label-encode each.

    Every level is filled with ``fill_value`` where missing, stripped and
    lower-cased. Its distinct values are numbered in sorted order. Values not
    seen in ``fit`` are encoded as ``-1``.

    Parameters
    ----------
    column : str, default='category_name'
        Name of the column holding the category path.
    drop_original : bool, default=True
        Kept for backward compatibility. The output only ever contains the
        three encoded columns, so this flag does not change the result.
    fill_value : str, default='missing'
        Stand-in for a missing level.

    Attributes
    ----------
    label_maps_ : dict
        ``{'cat1' | 'cat2' | 'cat3': {level value: integer code}}`` learned in ``fit``.
    """

    # Names of the encoded levels; path components beyond the third are ignored.
    _LEVELS = ('cat1', 'cat2', 'cat3')

    def __init__(self, column='category_name', drop_original=True, fill_value='missing'):
        # Only hyper-parameters are stored here; ``label_maps_`` is created by fit().
        self.column = column
        self.drop_original = drop_original
        self.fill_value = fill_value

    def _split_levels(self, X):
        """Return ``{level name: normalised Series indexed like X}`` for each level."""
        parts = X[self.column].str.split('/', expand=True)
        levels = {}
        for position, level in enumerate(self._LEVELS):
            if position < parts.shape[1]:
                values = parts[position].fillna(self.fill_value).str.strip().str.lower()
            else:
                # No row has this many components: the whole level is missing.
                values = pd.Series(self.fill_value, index=X.index)
            levels[level] = values
        return levels

    def fit(self, X, y=None):
        """Learn one value -> code mapping per category level."""
        label_maps = {}
        for level, values in self._split_levels(X).items():
            distinct = sorted(values.dropna().unique())
            label_maps[level] = {value: code for code, value in enumerate(distinct)}
        self.label_maps_ = label_maps
        return self

    def transform(self, X):
        """Return a DataFrame with int64 columns ``cat1_encoded``..``cat3_encoded``.

        The result is indexed like ``X``; values unseen in ``fit`` become ``-1``.
        """
        encoded = {
            level + "_encoded": values.map(self.label_maps_[level]).fillna(-1).astype("int64")
            for level, values in self._split_levels(X).items()
        }
        return pd.DataFrame(encoded, index=X.index)



# category_name: split into three levels and label-encode each level.
label_pipeline = Pipeline([
    ('split_encode', SplitAndLabelEncodeCategory(column='category_name'))
])

# Configure DataFrame output once (the call was previously issued twice).
label_pipeline.set_output(transform="pandas")
_ = label_pipeline

# Messy strings - name, item description
Due to RAM constraints, the DataFrame was split into smaller parts, which limited global interpretability but enabled more efficient tuning and model development.

In [None]:
class TextFeatureExtractor(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Add character-length and word-count columns for the given text columns.

    For every name ``c`` in ``columns`` the output gains ``c_char_len`` and
    ``c_word_count``. Each value is passed through ``str()`` first, and words
    are the pieces returned by ``str.split()``.

    Parameters
    ----------
    columns : list of str
        Names of the text columns to measure.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer is stateless."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` extended with the length features."""

        def char_len(value):
            return len(str(value))

        def word_count(value):
            return len(str(value).split())

        enriched = X.copy()
        for name in self.columns:
            enriched[f"{name}_char_len"] = enriched[name].apply(char_len)
            enriched[f"{name}_word_count"] = enriched[name].apply(word_count)
        return enriched

class TFIDFVectorizerWrapper(BaseEstimator, TransformerMixin):
    """Apply a ``TfidfVectorizer`` to a single column of a DataFrame.

    Parameters
    ----------
    column : str
        Name of the text column to vectorise.
    max_features : int, default=1000
        Forwarded to ``TfidfVectorizer``.
    stop_words : default=None
        Forwarded to ``TfidfVectorizer``.
    """

    def __init__(self, column, max_features=1000, stop_words=None):
        self.column = column
        self.max_features = max_features
        self.stop_words = stop_words
        # Holds the fitted vectorizer after fit(); None until then.
        self.vectorizer = None

    def fit(self, X, y=None):
        """Create a fresh vectorizer and fit it on ``X[self.column]``."""
        options = {
            "max_features": self.max_features,
            "stop_words": self.stop_words,
        }
        self.vectorizer = TfidfVectorizer(**options)
        self.vectorizer.fit(X[self.column])
        return self

    def transform(self, X):
        """Return the vectorizer's output for ``X[self.column]``.

        Only that one column is read; other columns of ``X`` are not forwarded.
        """
        documents = X[self.column]
        return self.vectorizer.transform(documents)

# NOTE(review): in both pipelines below the last step, TFIDFVectorizerWrapper,
# reads only the text column. The `<col>_char_len` / `<col>_word_count` columns
# added by 'extract_features' are therefore not part of the pipeline output.

# name: normalise -> length features -> TF-IDF (300 terms).
name_steps = [
    ('normalize', NormalizeTextColumn(column='name')),
    ('extract_features', TextFeatureExtractor(columns=['name'])),
    ('tfidf', TFIDFVectorizerWrapper(column='name', max_features=300)),
]
name_pipeline = Pipeline(steps=name_steps)

# item_description: normalise -> length features -> TF-IDF (1000 terms, English stop words).
item_steps = [
    ('normalize', NormalizeTextColumn(column='item_description')),
    ('extract_features', TextFeatureExtractor(columns=['item_description'])),
    ('tfidf', TFIDFVectorizerWrapper(column='item_description', max_features=1000, stop_words='english')),
]
item_pipeline = Pipeline(steps=item_steps)

_ = item_pipeline

# Pipeline

In [None]:
# One (name, pipeline, columns) entry per feature group. Columns are passed as
# lists so that every sub-pipeline receives a DataFrame, which the custom
# transformers index by column name.
feature_branches = [
    ('shipping', shipping_pipeline, ['shipping']),
    ('item_condition', ordinal_pipeline, ['item_condition_id']),
    ('brand', target_pipeline, ['brand_name']),
    ('category', label_pipeline, ['category_name']),
    ('name_text', name_pipeline, ['name']),
    ('description_text', item_pipeline, ['item_description']),
]
preprocessor = ColumnTransformer(transformers=feature_branches)

# Full model: feature preprocessing followed by ordinary least squares.
model_steps = [
    ('preprocessing', preprocessor),
    ('regressor', LinearRegression()),
]
final_pipeline = Pipeline(steps=model_steps)

_ = final_pipeline

# Metrics
## Primary metric - RMSE
RMSE is ideal for Mercari price prediction as it penalizes large errors, aligning with the goal of accurate pricing to ensure buyer trust and seller fairness. Because the model is trained on `log1p(price)`, the RMSE reported below is measured on the log scale.

Due to the large size of the dataset, a train/test split was the optimal evaluation strategy.

In [None]:
def _regression_scores(y_true, y_pred):
    """Return ``(r2, mse, rmse, mae)`` for one set of predictions."""
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    return r2, mse, rmse, mae


def _print_scores(title, r2, mse, rmse, mae):
    """Print a titled block with the four metrics."""
    print(title)
    print("R²:", r2)
    print("MSE:", mse)
    print("RMSE:", rmse)
    print("MAE:", mae)


# The target is log1p(price), so every error below is on the log scale.

# 1) Internal validation: fit on 80% of the training data, score on the rest.
X_subtrain, X_val, y_subtrain, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)
final_pipeline.fit(X_subtrain, y_subtrain)
y_val_pred = final_pipeline.predict(X_val)
r2_val, mse_val, rmse_val, mae_val = _regression_scores(y_val, y_val_pred)
_print_scores("Internal Validation Metrics:", r2_val, mse_val, rmse_val, mae_val)

# 2) Hold-out test: refit on the full training data, score on the hold-out set.
final_pipeline.fit(X_train, y_train)
y_test_pred = final_pipeline.predict(X_holdout)
r2_test, mse_test, rmse_test, mae_test = _regression_scores(y_holdout, y_test_pred)
_print_scores("Hold-Out Test Metrics:", r2_test, mse_test, rmse_test, mae_test)

Internal Validation Metrics:
R²: 0.42926196100157765
MSE: 0.32034248080282934
RMSE: 0.5659880571203153
MAE: 0.42933798424301517
Hold-Out Test Metrics:
R²: 0.42997560546139824
MSE: 0.3215544821235625
RMSE: 0.5670577414369391
MAE: 0.4298897606035548


# Analysis
1. For a linear model with string/text-based features, this is a solid baseline to build on.
2. The model generalizes well: the validation and hold-out metrics are nearly identical, and the large size of the dataset makes this estimate reliable.

### Summary:
After establishing this baseline, I am in a good position to build further.