## Install AutoGluon

Installs the latest version of AutoGluon, a library for automated machine learning on tabular data.

In [1]:
!pip install -U autogluon.tabular

Collecting autogluon.tabular
  Downloading autogluon.tabular-1.4.0-py3-none-any.whl.metadata (16 kB)
Collecting scikit-learn<1.8.0,>=1.4.0 (from autogluon.tabular)
  Downloading scikit_learn-1.7.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting autogluon.core==1.4.0 (from autogluon.tabular)
  Downloading autogluon.core-1.4.0-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.features==1.4.0 (from autogluon.tabular)
  Downloading autogluon.features-1.4.0-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.common==1.4.0 (from autogluon.core==1.4.0->autogluon.tabular)
  Downloading autogluon.common-1.4.0-py3-none-any.whl.metadata (11 kB)
Downloading autogluon.tabular-1.4.0-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.3/487.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading autogluon.core-1.4.0-py3-none-any.whl (225 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Import Libraries

Imports essential libraries for data manipulation, visualization, and modeling, including AutoGluon, LightGBM, CatBoost, XGBoost, and scikit-learn.

In [2]:
from autogluon.tabular import TabularPredictor

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import re

from lightgbm import log_evaluation, early_stopping
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool

from xgboost import XGBRegressor


import random

from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

from autogluon.tabular import TabularPredictor



## Import Datasets

Loads the training and test datasets for the competition, along with the original used car dataset. Preprocesses the 'milage' and 'price' columns in the original dataset to extract numeric values. Removes the 'id' column from both train and test datasets, and concatenates the original dataset to the training data to increase sample size.

In [3]:
# Competition train/test files plus the original used car dataset.
train = pd.read_csv("/kaggle/input/hackathon-qualification/archive/train.csv")
test = pd.read_csv("/kaggle/input/hackathon-qualification/archive/test.csv")
Original = pd.read_csv('/kaggle/input/used-car-price-prediction-dataset/used_cars.csv')

# 'milage' and 'price' of the original dataset are parsed by concatenating every
# run of digits in the value and converting the result to int.
text_numeric_cols = ['milage', 'price']
Original[text_numeric_cols] = Original[text_numeric_cols].map(
    lambda raw: int(''.join(re.findall(r'\d+', raw)))
)

# Drop the 'id' column from both competition frames, then append the original
# rows to the training data.
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])
train = pd.concat([train, Original], ignore_index=True)


## Feature Engineering

Defines functions to create new features:
- `extract_age_features` calculates vehicle age and mileage per year, plus the mean mileage and mean mileage per year of all vehicles with the same age.
- `extract_other_features` flags luxury brands.

Applies these feature engineering steps to both the train and test datasets.

In [4]:
def extract_age_features(df, current_year=2024):
    """Add age- and mileage-derived columns to ``df`` and return it.

    The frame is modified in place and also returned. Columns written:

    - ``Vehicle_Age``: ``current_year - model_year``.
    - ``Mileage_per_Year``: ``milage`` divided by the vehicle age, with the
      divisor floored at 1 so that a vehicle whose ``model_year`` equals (or
      exceeds) ``current_year`` does not yield ``inf``/``NaN`` or a negative rate.
    - ``milage_with_age``: mean ``milage`` over all rows with the same
      ``Vehicle_Age``.
    - ``Mileage_per_Year_with_age``: mean ``Mileage_per_Year`` over all rows
      with the same ``Vehicle_Age``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``model_year`` and ``milage``.
    current_year : int, default 2024
        Reference year used to compute the vehicle age.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the four columns above added.
    """
    df['Vehicle_Age'] = current_year - df['model_year']

    # Age 0 would make the division blow up to inf (or NaN for 0 / 0), which in
    # turn makes the per-age group mean below meaningless; treat such vehicles
    # as having been driven for one year.
    years_in_use = df['Vehicle_Age'].clip(lower=1)
    df['Mileage_per_Year'] = df['milage'] / years_in_use

    # Group-level averages, broadcast back to every row of the group.
    df['milage_with_age'] = df.groupby('Vehicle_Age')['milage'].transform('mean')
    df['Mileage_per_Year_with_age'] = (
        df.groupby('Vehicle_Age')['Mileage_per_Year'].transform('mean')
    )

    return df

# Add the age/mileage features to both frames (train first, then test).
train, test = (extract_age_features(frame) for frame in (train, test))

In [5]:
def extract_other_features(df):
    """Add the binary ``Is_Luxury_Brand`` column to ``df`` and return it.

    The frame is modified in place and also returned. ``Is_Luxury_Brand`` is 1
    when the row's ``brand`` is one of the names listed below and 0 otherwise
    (including when ``brand`` is missing).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column ``brand``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``Is_Luxury_Brand`` added.
    """
    luxury_brands = ['Mercedes-Benz', 'BMW', 'Audi', 'Porsche', 'Land',
                     'Lexus', 'Jaguar', 'Bentley', 'Maserati', 'Lamborghini',
                     'Rolls-Royce', 'Ferrari', 'McLaren', 'Aston', 'Maybach']

    # Vectorised membership test: avoids a Python-level call per row and always
    # yields an integer column, even for an empty or categorical ``brand``.
    df['Is_Luxury_Brand'] = df['brand'].isin(luxury_brands).astype(int)

    return df
# Add the luxury-brand flag to both frames (train first, then test).
train, test = (extract_other_features(frame) for frame in (train, test))


## Categorical Feature Processing

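Replaces rare values (fewer than 100 occurrences) in the `model`, `engine`, `transmission`, `ext_col`, and `int_col` columns with 'noise', fills missing values in the categorical columns with 'missing', and converts those columns to the 'category' dtype for modeling. Finally, splits the training data into features `X` and target `y`.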

In [6]:
def update(df, min_count=100):
    """Collapse rare labels, fill missing values and cast categorical columns.

    The frame is modified in place and also returned.

    Steps, in order:

    1. Columns that are already of ``category`` dtype are cast back to
       ``object`` so that the new labels ``"noise"`` and ``"missing"`` can be
       written. Without this, calling the function on an already processed
       frame (for example when the notebook cell is re-run) fails because a
       Categorical rejects values outside its categories.
    2. In ``model``, ``engine``, ``transmission``, ``ext_col`` and ``int_col``,
       every value that occurs fewer than ``min_count`` times in ``df`` is
       replaced by ``"noise"``.
    3. In all nine categorical columns, missing values become ``"missing"``
       and the column is converted to ``category`` dtype.

    Occurrences are counted within ``df`` itself, so calling this separately
    on two frames can mark different labels as rare in each of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the nine categorical columns listed in ``cat_c`` below.
    min_count : int, default 100
        Minimum number of occurrences for a label to be kept as is.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object after the transformations.
    """
    cat_c = ['brand', 'model', 'fuel_type', 'engine', 'transmission',
             'ext_col', 'int_col', 'accident', 'clean_title']
    re_ = ['model', 'engine', 'transmission', 'ext_col', 'int_col']

    # Make the function safe to call on frames it has already processed.
    for col in cat_c:
        if isinstance(df[col].dtype, pd.CategoricalDtype):
            df[col] = df[col].astype(object)

    for col in re_:
        # Per-row occurrence count of the row's own label (NaN counted too).
        label_counts = df[col].value_counts(dropna=False)
        is_rare = label_counts[df[col]].values < min_count
        df.loc[is_rare, col] = "noise"

    for col in cat_c:
        df[col] = df[col].fillna('missing')
        df[col] = df[col].astype('category')

    return df

# Process the categorical columns of both frames (train first, then test).
train, test = (update(frame) for frame in (train, test))

# Separate the target from the predictors.
y = train['price']
X = train.drop(columns='price')

## Model Training (LightGBM & CatBoost)

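Defines a helper that trains LightGBM or CatBoost models using 5-fold cross-validation and returns the out-of-fold predictions together with the trained models. The helper is run twice with LightGBM — once with an MAE objective and once with an MSE objective — and the resulting predictions are added to the train and test data as the features `LGBM_MAE` and `LGBM_MSE_diff` (the MSE prediction minus the MAE prediction).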

In [7]:
# LightGBM callbacks shared by every fold: log the evaluation every 300 rounds
# and stop once the validation score has not improved for 200 rounds.
callbacks = [
    log_evaluation(period=300),
    early_stopping(stopping_rounds=200),
]

# Names of all object/category columns of the training frame.
cat_cols = list(train.select_dtypes(include=['object', 'category']).columns)

print(f"cat_cols--------{cat_cols}")


def get_MAE_oof(df, target, lgb_params, cat_params=None, model_type='LGBM',
                n_splits=5, random_state=1):
    """Train one model per K-fold split and collect out-of-fold predictions.

    Despite the name, the training objective is whatever the supplied
    parameters specify; the score printed for each fold is always RMSE.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix. Rows are selected positionally for each fold.
    target : pandas.Series
        Target values, positionally aligned with ``df``.
    lgb_params : dict
        Parameters passed to ``lgb.train`` when ``model_type == 'LGBM'``.
    cat_params : dict, optional
        Keyword arguments for ``CatBoostRegressor`` when
        ``model_type == 'CAT'``. ``None`` means no extra arguments.
    model_type : {'LGBM', 'CAT'}, default 'LGBM'
        Which library to train with.
    n_splits : int, default 5
        Number of shuffled K-fold splits.
    random_state : int, default 1
        Seed for the fold shuffling.

    Returns
    -------
    oof_predictions : numpy.ndarray
        One prediction per row of ``df``, each made by the model of the fold
        in which that row was held out.
    models : list
        The trained models, one per fold, in fold order.

    Raises
    ------
    ValueError
        If ``model_type`` is neither ``'LGBM'`` nor ``'CAT'``.

    Notes
    -----
    Reads the module-level ``callbacks`` (LightGBM) and ``cat_cols``
    (CatBoost) defined earlier in this cell.
    """
    # Fail early with a clear message instead of a NameError on ``model`` after
    # the first fold has already been announced.
    if model_type not in ('LGBM', 'CAT'):
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'LGBM' or 'CAT'"
        )

    oof_predictions = np.zeros(len(df))
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    models = []
    rmse_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
        print(f"Training fold {fold + 1}/{n_splits} with {model_type}")

        X_train, X_val = df.iloc[train_idx], df.iloc[val_idx]
        y_train, y_val = target.iloc[train_idx], target.iloc[val_idx]

        if model_type == 'LGBM':
            train_data = lgb.Dataset(X_train, label=y_train)
            val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

            model = lgb.train(
                lgb_params,
                train_data,
                valid_sets=[train_data, val_data],
                valid_names=['train', 'valid'],
                callbacks=callbacks
            )
            pred = model.predict(X_val, num_iteration=model.best_iteration)
        else:  # 'CAT'
            train_data = Pool(data=X_train, label=y_train, cat_features=cat_cols)
            val_data = Pool(data=X_val, label=y_val, cat_features=cat_cols)

            # ``cat_params`` defaults to None; unpacking None would raise.
            model = CatBoostRegressor(**(cat_params or {}))
            model.fit(train_data, eval_set=val_data, verbose=150, early_stopping_rounds=200)
            pred = model.predict(X_val)

        models.append(model)

        rmse = np.sqrt(mean_squared_error(y_val, pred))
        rmse_scores.append(rmse)

        print(f'{model_type} Fold RMSE: {rmse}')

        oof_predictions[val_idx] = pred

    print(f'Mean RMSE: {np.mean(rmse_scores)}')
    return oof_predictions, models




def _mean_fold_prediction(fold_models, frame):
    """Return the average of every fold model's prediction on ``frame``."""
    averaged = np.zeros(len(frame))
    for fold_model in fold_models:
        averaged += fold_model.predict(frame) / len(fold_models)
    return averaged


# Columns this cell itself creates, plus the target that a later cell adds to X.
# Excluding them keeps the cell idempotent: re-running it never feeds earlier
# stacking outputs (or the target) back in as features.
_derived_cols = {'LGBM_MAE', 'LGBM_MSE_diff', 'price'}
base_features = [col for col in X.columns if col not in _derived_cols]

# --- Stage 1: LightGBM with an MAE objective on the base features ------------
lgb_params = {
    'objective': 'MAE',
    'n_estimators': 1000,
    'random_state': 1,
}

oof_predictions_lgbm, models_lgbm = get_MAE_oof(
    X[base_features], y, lgb_params, model_type='LGBM'
)
X['LGBM_MAE'] = oof_predictions_lgbm

LGBM_preds = _mean_fold_prediction(models_lgbm, test[base_features])
test['LGBM_MAE'] = LGBM_preds

# --- Stage 2: LightGBM with an MSE objective, with LGBM_MAE as extra input ---
stage2_features = base_features + ['LGBM_MAE']

lgb_params = {
    'objective': 'MSE',
    'n_estimators': 1000,
    'random_state': 1,
}

oof_predictions_lgbm, models_lgbm = get_MAE_oof(
    X[stage2_features], y, lgb_params, model_type='LGBM'
)

# Stored as the difference between the MSE-model and MAE-model predictions.
X['LGBM_MSE_diff'] = oof_predictions_lgbm - X['LGBM_MAE']

LGBM_preds = _mean_fold_prediction(models_lgbm, test[stage2_features])
test['LGBM_MSE_diff'] = LGBM_preds - test['LGBM_MAE']

test.head()

cat_cols--------['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']
Training fold 1/5 with LGBM
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010217 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1724
[LightGBM] [Info] Number of data points in the train set: 154033, number of used features: 16
[LightGBM] [Info] Start training from score 30825.000000
Training until validation scores don't improve for 200 rounds
[300]	train's l1: 16379.4	valid's l1: 16910.1
Early stopping, best iteration is:
[369]	train's l1: 16314.7	valid's l1: 16906.2
LGBM Fold RMSE: 67861.26982972873
Training fold 2/5 with LGBM
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006859 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enou

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,Vehicle_Age,Mileage_per_Year,milage_with_age,Mileage_per_Year_with_age,Is_Luxury_Brand,LGBM_MAE,LGBM_MSE_diff
0,Land,noise,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes,9,10888.888889,81078.503981,9008.722665,1,16402.458527,4280.396289
1,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes,4,2285.5,34258.886442,8564.721611,1,57252.126782,15875.930352
2,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,missing,2,14060.5,17877.043403,8938.521702,0,49451.472824,8181.763131
3,Audi,noise,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,noise,Black,None reported,missing,8,7657.25,75999.679762,9499.95997,1,25768.824542,3535.516228
4,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes,6,9833.333333,52105.532436,8684.255406,1,27677.661661,4005.789445


## AutoGluon TabularPredictor

Fits an AutoGluon TabularPredictor on the engineered features, using RMSE as the evaluation metric and restricting models to GBM (LightGBM) and CatBoost. Training uses the `best_quality` preset with a one-hour time limit and runs on CPU only (`num_gpus=0`).

In [8]:
# The label is passed to AutoGluon as a column of the training frame.
X['price'] = y

# Options forwarded to TabularPredictor.fit.
fit_options = dict(
    presets='best_quality',
    time_limit=60 * 60,
    verbosity=2,
    num_gpus=0,
    included_model_types=['GBM', 'CAT'],
)

predictor = TabularPredictor(
    label='price',
    eval_metric='rmse',
    problem_type='regression',
).fit(X, **fit_options)




No path specified. Models will be saved in: "AutogluonModels/ag-20250912_180942"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.11.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Sun Nov 10 10:07:59 UTC 2024
CPU Count:          4
Memory Avail:       29.96 GB / 31.35 GB (95.6%)
Disk Space Avail:   19.50 GB / 19.52 GB (99.9%)
Presets specified: ['best_quality']
Using hyperparameters preset: hyperparameters='zeroshot'
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon wil

[1000]	valid_set's rmse: 71332


	-72593.096	 = Validation score   (-root_mean_squared_error)
	31.19s	 = Training   runtime
	3.74s	 = Validation runtime
Fitting model: CatBoost_r137_BAG_L1 ... Training model for up to 1189.39s of the 2085.80s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy (sequential: cpus=2, gpus=0)
	-72583.8924	 = Validation score   (-root_mean_squared_error)
	183.05s	 = Training   runtime
	0.22s	 = Validation runtime
Fitting model: CatBoost_r13_BAG_L1 ... Training model for up to 1005.91s of the 1902.32s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy (sequential: cpus=2, gpus=0)
	Ran out of time, early stopping on iteration 735.
	Ran out of time, early stopping on iteration 810.
	-72554.9866	 = Validation score   (-root_mean_squared_error)
	717.12s	 = Training   runtime
	0.81s	 = Validation runtime
Fitting model: LightGBM_r188_BAG_L1 ... Training model for up to 287.58s of the 118

## Prediction & Submission Blending

Generates predictions on the test set, blends them with an existing Kaggle submission (55% AutoGluon prediction, 45% existing submission), and writes the final submission file for the competition.

In [9]:
y_pred = predictor.predict(test)

sub_blend = pd.read_csv('/kaggle/input/top-5-blended-car-prices/submission_9.csv')
sample_sub = pd.read_csv('/kaggle/input/hackathon-qualification/archive/sample_submission.csv')

# The 'id' column was dropped from `test`, so the model predictions can only be
# matched to the submission rows by position; at least make sure the sizes agree.
if len(y_pred) != len(sample_sub):
    raise ValueError(
        f"Got {len(y_pred)} predictions for {len(sample_sub)} submission rows"
    )

# Match the external submission on 'id' instead of relying on its row order.
# NOTE(review): assumes submission_9.csv has 'id' and 'price' columns like the
# sample submission does - confirm.
blend_price = sub_blend.set_index('id')['price'].reindex(sample_sub['id'])
if blend_price.isna().any():
    raise ValueError("The blended submission has missing prices for some submission ids")

# Weighted blend: 55% AutoGluon prediction, 45% external submission.
sample_sub['price'] = np.asarray(y_pred) * 0.55 + blend_price.to_numpy() * 0.45
sample_sub.to_csv('/kaggle/working/./submission.csv', index=False)
sample_sub.head()

Unnamed: 0,id,price
0,188533,18975.962045
1,188534,78099.202956
2,188535,57735.001267
3,188536,29839.036537
4,188537,31191.854321
