In [2]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [3]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Used Cars Model Trainer

This notebook is used to train an XGBoost regression model using the train/test splits generated using the `clean_car_data` notebook. 

In [4]:
df = pd.read_csv('/home/jwc/Data/UsedCars/used_cars_train.csv', nrows=10)
df

Unnamed: 0,frame_damaged,has_accidents,is_new,daysonmarket,height,horsepower,length,mileage,seller_rating,price,listing_id
0,0.009502,0.154409,1.0,0.053904,0.331984,0.2537,0.47234,0.0,0.697368,43352.0,267163014
1,0.009502,0.154409,1.0,0.095026,0.178138,0.140592,0.461702,1.2e-07,0.892857,21638.0,254107702
2,0.009502,0.154409,1.0,0.081134,0.489879,0.412262,0.765426,0.000311469,0.826389,64806.0,258636661
3,0.0,0.0,0.0,0.001667,0.337382,0.289641,0.49734,0.00010059,0.836957,52499.0,281158730
4,0.009502,0.154409,1.0,0.010836,0.458839,0.359408,0.721809,0.0,0.805556,57860.0,278179471
5,0.009502,0.154409,1.0,0.00389,0.197031,0.133192,0.455319,3e-08,0.875,21765.0,280287500
6,0.009502,0.154409,1.0,0.001667,0.327935,0.216702,0.46117,0.0,0.857143,44945.0,281101232
7,0.009502,0.154409,1.0,0.061406,0.367072,0.269556,0.52234,2e-08,0.63,39500.0,264774176
8,0.009502,0.154409,1.0,0.011948,0.295547,0.121564,0.409574,0.0,1.0,31988.0,277779585
9,0.009502,0.154409,1.0,0.015838,0.290148,0.133192,0.373936,1e-07,0.8125,26163.0,276519687


In [5]:
def train_xgboost_separate(train_file, test_file, label_col, num_boost_round=400):
    """
    Trains an XGBoost model by loading either the train or test set at a time to save memory.
    
    Parameters:
    - train_file (str): Path to the CSV file containing the training data.
    - test_file (str): Path to the CSV file containing the test data.
    - label_col (str): The name of the label/target column.
    - num_boost_round (int): Number of boosting rounds for training.
    - random_state (int): Random seed for reproducibility.
    
    Returns:
    - test_loss (float): Mean Squared Error on the test set.
    - booster (xgb.Booster): Trained XGBoost model.
    """
    # Load the train data first, perform training
    print("Loading and training on the training set...")
    dtrain = pd.read_csv(train_file)

    # Separate features and labels for training
    X_train = dtrain.drop(columns=[label_col, 'listing_id'])
    y_train = dtrain[label_col]

    # Free up memory by deleting the training dataframe after extracting data
    del dtrain

    # Convert to DMatrix, which is XGBoost's optimized data structure
    dtrain_matrix = xgb.DMatrix(X_train, label=y_train)

    # Free up memory by deleting feature/label data
    del X_train, y_train

    # Set up XGBoost parameters
    params = {
        'objective': 'reg:squarederror',
        'max_depth': 8,
        'eta': 0.1,
    }

    # Train the model using XGBoost's standard train method
    booster = xgb.train(
        params,
        dtrain_matrix,
        num_boost_round=num_boost_round
    )

    # Free up memory by deleting the training DMatrix
    del dtrain_matrix

    # Load the test data for evaluation
    print("Loading and evaluating on the test set...")
    dtest = pd.read_csv(test_file)

    # Separate features and labels for testing
    X_test = dtest.drop(columns=[label_col, 'listing_id'])
    y_test = dtest[label_col]

    # Free up memory by deleting the test dataframe after extracting data
    del dtest

    # Convert test data to DMatrix
    dtest_matrix = xgb.DMatrix(X_test, label=y_test)

    # Predict on the test set
    y_pred = booster.predict(dtest_matrix)

    # Free up memory by deleting the test DMatrix
    del dtest_matrix

    # Compute and return the Mean Squared Error
    test_loss = mean_squared_error(y_test, y_pred)

    # Free up memory by deleting feature/label data
    del X_test, y_test
    
    return test_loss, booster, y_pred

In [6]:
test_loss, booster, y_pred = train_xgboost_separate('/home/jwc/Data/UsedCars/used_cars_train.csv', '/home/jwc/Data/UsedCars/used_cars_test.csv', 'price')

Loading and training on the training set...
Loading and evaluating on the test set...


In [7]:
list(y_pred)

[20477.0,
 17144.709,
 11175.6875,
 23081.91,
 35378.64,
 26587.191,
 22622.402,
 47296.508,
 23967.443,
 34407.99,
 20734.873,
 46760.844,
 38200.742,
 33912.41,
 18834.047,
 11501.015,
 38769.47,
 27594.156,
 6317.7656,
 32797.28,
 10358.697,
 20813.137,
 30251.893,
 33730.44,
 64863.934,
 20068.752,
 38786.965,
 27622.975,
 47411.44,
 35915.113,
 5180.9155,
 36605.715,
 33001.098,
 16773.393,
 14443.729,
 40593.656,
 37516.742,
 21492.615,
 47947.035,
 60166.77,
 34774.383,
 45617.312,
 44171.48,
 22164.582,
 11487.635,
 24736.734,
 26886.646,
 18668.88,
 25361.877,
 26804.111,
 16346.108,
 7680.1484,
 56956.164,
 42178.836,
 23378.96,
 26151.799,
 35479.05,
 48167.94,
 12551.323,
 26174.123,
 54280.85,
 22523.664,
 19682.09,
 8382.487,
 34698.098,
 27202.922,
 42459.348,
 32017.252,
 53681.15,
 89479.97,
 17546.217,
 26687.023,
 20363.555,
 30859.479,
 48981.797,
 12326.495,
 52491.586,
 6994.8003,
 24371.518,
 29047.266,
 23341.41,
 35978.25,
 14025.969,
 46976.73,
 43085.887,
 31

In [8]:
test_loss

68061980.35196608

In [9]:
booster

<xgboost.core.Booster at 0x7fa73ec02710>

In [10]:
booster.save_model('../Temp/Models/usedcars_model.json')