In [2]:
# Import necessary modules
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
# Read the three competition CSV files; each one uses its 'id' column as the row index
train = pd.read_csv(
    "../data/playground-series-s5e2/train.csv",
    index_col="id",
)
train_extra = pd.read_csv(
    "../data/playground-series-s5e2/training_extra.csv",
    index_col="id",
)
test = pd.read_csv(
    "../data/playground-series-s5e2/test.csv",
    index_col="id",
)

In [5]:
# Feature engineering function
def add_features(df):
    """Return a copy of ``df`` with the engineered features added.

    Adds ``Weight_per_Compartment`` (``Weight Capacity (kg)`` divided by
    ``Compartments``) and converts the ``Laptop Compartment`` and
    ``Waterproof`` columns from 'Yes'/'No' to 1/0.

    The input frame is left untouched, and calling the function again on its
    own output gives the same result, so the notebook cell that applies it can
    be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Weight Capacity (kg)', 'Compartments',
        'Laptop Compartment' and 'Waterproof'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``df``.
    """
    # Work on a copy so the caller's frame is never modified in place
    out = df.copy()

    # Interaction feature. A compartment count of 0 is masked to NaN first so
    # the ratio comes out as missing rather than +/-inf.
    compartments = out['Compartments'].where(out['Compartments'] != 0)
    out['Weight_per_Compartment'] = out['Weight Capacity (kg)'] / compartments

    # Convert binary features to numeric. Any value other than 'Yes'/'No'
    # (including missing values) becomes NaN.
    binary_map = {'Yes': 1, 'No': 0}
    for col in ('Laptop Compartment', 'Waterproof'):
        # A column that is already numeric was encoded by an earlier call;
        # mapping it again would turn every value into NaN, so skip it.
        if not pd.api.types.is_numeric_dtype(out[col]):
            out[col] = out[col].map(binary_map)

    return out

In [6]:
# Run each of the three frames through the shared feature-engineering step
train, train_extra, test = (
    add_features(frame) for frame in (train, train_extra, test)
)

In [8]:
# Stack the main and extra training sets row-wise, replacing the 'id' index with a fresh 0..n-1 range
df = pd.concat((train, train_extra), ignore_index=True)
# Print the first 10 rows of df
print(df.head(10))

          Brand   Material    Size  Compartments  Laptop Compartment  \
0      Jansport    Leather  Medium           7.0                 1.0   
1      Jansport     Canvas   Small          10.0                 1.0   
2  Under Armour    Leather   Small           2.0                 1.0   
3          Nike      Nylon   Small           8.0                 1.0   
4        Adidas     Canvas  Medium           1.0                 1.0   
5          Nike     Canvas  Medium          10.0                 0.0   
6          Nike        NaN   Large           3.0                 0.0   
7          Puma     Canvas   Small           1.0                 1.0   
8  Under Armour  Polyester  Medium           8.0                 1.0   
9  Under Armour      Nylon  Medium           2.0                 1.0   

   Waterproof      Style  Color  Weight Capacity (kg)      Price  \
0         0.0       Tote  Black             11.611723  112.15875   
1         1.0  Messenger  Green             27.078537   68.88056   
2  

In [9]:
# Define target column: the name of the label column that the predictor is trained on
target = 'Price'

In [10]:
# Hold out 20% of the combined rows for validation; the fixed seed makes the split repeatable
train_data, val_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Bare expression: show the training split as this cell's output
train_data

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price,Weight_per_Compartment
2730815,Puma,Nylon,Small,7.0,1.0,1.0,Backpack,Green,8.812586,143.97928,1.258941
2453018,Adidas,Canvas,Small,5.0,0.0,0.0,Backpack,Pink,29.273634,89.07518,5.854727
3789433,Adidas,Canvas,,9.0,0.0,0.0,Tote,Green,22.482912,65.10618,2.498101
1359411,Adidas,Leather,Small,8.0,1.0,0.0,Messenger,Red,18.566531,90.37096,2.320816
1312217,Puma,Polyester,Large,2.0,0.0,1.0,Backpack,Green,15.181152,139.50037,7.590576
...,...,...,...,...,...,...,...,...,...,...,...
2356330,Puma,Leather,Small,10.0,1.0,1.0,Tote,Green,25.292187,102.07108,2.529219
3511566,Jansport,Nylon,Large,6.0,0.0,1.0,Tote,Green,13.608859,47.48547,2.268143
2229084,Puma,Polyester,Small,8.0,1.0,1.0,Backpack,Gray,20.959452,57.86485,2.619931
2768307,Nike,Canvas,Small,8.0,0.0,0.0,Messenger,Gray,28.873280,110.44275,3.609160


In [12]:
# Configure an AutoGluon regressor for the target column, scored by RMSE;
# trained models are saved under the 'ag_models_backpack' directory
predictor = TabularPredictor(
    label=target,
    problem_type='regression',
    eval_metric='root_mean_squared_error',
    path='ag_models_backpack',
)

# Train under a fixed time budget, using the held-out split as tuning data
predictor = predictor.fit(
    train_data=train_data,
    tuning_data=val_data,
    presets='medium_quality',  # chosen over best_quality for faster training
    time_limit=600,  # strict 10-minute budget, in seconds
    hyperparameters='default',  # default model settings; no hyperparameter search, to save time
    excluded_model_types=['KNN', 'NN_TORCH', 'FASTAI'],  # leave out these model families to keep training fast
    verbosity=2,
)

print("Done")

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Tue Nov 5 00:21:55 UTC 2024
CPU Count:          12
Memory Avail:       52.82 GB / 57.48 GB (91.9%)
Disk Space Avail:   3552.19 GB / 6519.49 GB (54.5%)
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 600s
AutoGluon will save models to "/mnt/n/code/competition/kaggle/backpack_prediction/notebook/ag_models_backpack"
Train Data Rows:    3195454
Train Data Columns: 10
Tuning Data Rows:    798864
Tuning Data Columns: 10
Label Column:       Price
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    54446.78 MB
	Train Data (Original)  Memory Usage: 1336.38 MB (2.5% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_i

[1000]	valid_set's rmse: 38.8607


	-38.8606	 = Validation score   (-root_mean_squared_error)
	73.41s	 = Training   runtime
	5.48s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 502.31s of the 502.30s of remaining time.
	-38.8467	 = Validation score   (-root_mean_squared_error)
	36.01s	 = Training   runtime
	3.44s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 462.77s of the 462.77s of remaining time.
	-38.897	 = Validation score   (-root_mean_squared_error)
	254.24s	 = Training   runtime
	0.8s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 207.10s of the 207.09s of remaining time.
	Ran out of time, early stopping on iteration 256.
	-38.866	 = Validation score   (-root_mean_squared_error)
	206.97s	 = Training   runtime
	0.58s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 359.99s of the -6.73s of remaining time.
	Ensemble Weights: {'LightGBM': 0.864, 'RandomForestMSE': 0.136}
	-38.8454	 = Val

In [13]:
# Evaluate on validation data
# NOTE(review): val_data was also passed to fit() as tuning_data, so these
# scores are likely optimistic — confirm on a separate hold-out set.
performance = predictor.evaluate(val_data)
# Show the metrics; without this the result is computed but never displayed
print(f"Validation performance: {performance}")


Validation performance: {'root_mean_squared_error': -38.84539802950375, 'mean_squared_error': -1508.9649480705739, 'mean_absolute_error': -33.58569063863002, 'r2': 0.0036086977358599226, 'pearsonr': 0.06024042027658161, 'median_absolute_error': -33.5429857421875}


In [15]:
# Generate predictions on test data with the fitted predictor
test_pred = predictor.predict(test)
# Bare expression: show the predictions as this cell's output
test_pred

id
300000    80.746819
300001    82.589264
300002    82.403572
300003    81.244743
300004    79.295647
            ...    
499995    78.400635
499996    78.490158
499997    82.478157
499998    81.968971
499999    81.648521
Name: Price, Length: 200000, dtype: float32

In [16]:
# Build the submission table (predicted Price indexed by test id) and write it to CSV
submission = pd.DataFrame({'id': test.index, 'Price': test_pred}).set_index('id')
submission.to_csv('../output/submission.csv')
print("Submission file created")

Submission file created
