In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import math

In [2]:
# Widen pandas' display limits so large frames are shown in full when inspected.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

# Used Cars Model Trainer

This notebook trains an XGBoost regression model on the train/test splits produced by the `clean_car_data` notebook.

In [3]:
def _load_features_and_labels(parquet_file, label_col):
    """
    Reads a parquet file and splits it into a feature frame and a label series.

    :param parquet_file: Path to the parquet file to load.
    :param label_col: The name of the label/target column.

    :return: Tuple ``(features, labels)``; the features exclude both the label
             column and the 'listing_id' column.
    """
    frame = pd.read_parquet(parquet_file)
    # 'listing_id' is excluded from the features (presumably a row identifier
    # with no predictive meaning -- confirm against the clean_car_data notebook).
    features = frame.drop(columns=[label_col, 'listing_id'])
    return features, frame[label_col]


def train_xgboost_separate(train_file, test_file, label_col, random_state = 42):
    """
    Trains an XGBoost regression model on parquet data and reports its error.

    The model is fitted on the training file only. After training, the root
    mean squared error on both the training and the test data is printed.

    :param train_file: Path to the parquet file containing the training data.
    :param test_file: Path to the parquet file containing the test data.
    :param label_col: The name of the label/target column.
    :param random_state: Random seed for reproducibility.

    :return: Trained XGBoost model.
    """
    # Load the train data, separate features and labels, convert to DMatrix
    X_train, y_train = _load_features_and_labels(train_file, label_col)
    dtrain_matrix = xgb.DMatrix(X_train, label=y_train)

    # Set up XGBoost parameters
    params = {
        'objective': 'reg:squarederror',
        'max_depth': 7,
        'eta': 0.3,
        'device': 'cpu',
        'subsample': 1.0,
        'sampling_method': 'uniform',
        'lambda': 2.0,
        'alpha': 0.0,
        'tree_method': 'hist',
        'num_parallel_tree': 1,
        'nthread': 2,
        'seed': random_state
    }

    # Train the model
    booster = xgb.train(
        params,
        dtrain_matrix,
        num_boost_round=400
    )

    # Load the test data for evaluation
    X_test, y_test = _load_features_and_labels(test_file, label_col)
    dtest_matrix = xgb.DMatrix(X_test, label=y_test)

    # Predict on both splits so train and test error can be compared
    y_test_pred = booster.predict(dtest_matrix)
    y_train_pred = booster.predict(dtrain_matrix)

    # Compute the MSE on each split
    test_loss = mean_squared_error(y_test, y_test_pred)
    train_loss = mean_squared_error(y_train, y_train_pred)

    # Report results as RMSE (square root of the MSE)
    print(f"train: {math.sqrt(train_loss)}, test: {math.sqrt(test_loss)}")

    return booster

In [4]:
# Fix the seed and train on the prepared used-cars splits, predicting 'price'.
seed = 42
booster = train_xgboost_separate(
    train_file='used_cars_train.parquet',
    test_file='used_cars_test.parquet',
    label_col='price',
    random_state=seed,
)

train: 6943.2990000780255, test: 8328.89817910091


In [5]:
# Persist the trained booster to disk in JSON format.
model_path = '../Temp/Models/usedcars_model.json'
booster.save_model(model_path)