# DSN Bootcamp Qualification Hackathon
### Car Price Prediction Pipeline


## Initial Setup and Imports

In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
import random
from typing import Dict, List, Tuple
import numpy as np
import pandas as pd
import re

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import Ridge
from sklearn.base import clone

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

# Base seed; SEED below is an alias of this value.
RANDOM_STATE = 42
# Reference year subtracted from `model_year` to derive `car_age`.
CURRENT_YEAR = 2024
# Number of cross-validation folds passed to the stacking trainer in main().
N_SPLITS = 5
# Default seed argument for seed_everything, get_base_models and ImprovedStacking.
SEED = RANDOM_STATE

## Utility Functions

In [None]:
def seed_everything(seed: int = SEED) -> None:
    """Seed the global random number generators used in this file.

    Exports ``PYTHONHASHSEED`` and seeds both Python's ``random`` module and
    NumPy's legacy global generator with ``seed``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

def rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the root mean squared error between two equally sized arrays.

    The value is computed directly with NumPy instead of
    ``mean_squared_error(..., squared=False)``: the ``squared`` keyword was
    deprecated and later removed from scikit-learn, so the old call fails on
    current releases.

    Args:
        y_true: Ground-truth values (array-like).
        y_pred: Predicted values (array-like) with the same number of elements.

    Returns:
        ``sqrt(mean((y_true - y_pred) ** 2))`` as a Python float.

    Raises:
        ValueError: If the inputs are empty or differ in size.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    # Guard explicitly: NumPy would otherwise broadcast mismatched shapes silently.
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {actual.size} and {predicted.size}"
        )
    if actual.size == 0:
        raise ValueError("rmse requires at least one sample")
    return float(np.sqrt(np.mean(np.square(actual - predicted))))

## Feature Engineering Functions

In [None]:
def extract_advanced_engine_features(engine_str):
    """Parse a free-text engine description into numeric features.

    Returns a 6-tuple ``(horsepower, displacement, cylinders, is_turbo,
    is_supercharged, is_diesel)``. Missing or empty input yields all zeros.
    Matching is done on the upper-cased text; for each quantity the patterns
    are tried in order and the first one that matches wins.
    """
    if pd.isna(engine_str) or engine_str == "":
        return 0.0, 0.0, 0, 0, 0, 0

    text = str(engine_str).upper()

    def first_capture(patterns, convert, default):
        # Try each pattern in order; convert group 1 of the first hit.
        for pat in patterns:
            found = re.search(pat, text)
            if found:
                return convert(found.group(1))
        return default

    horsepower = first_capture(
        (r'(\d+\.?\d*)\s*HP', r'(\d+\.?\d*)\s*HORSEPOWER'), float, 0.0
    )
    litres = first_capture(
        (r'(\d+\.?\d*)\s*L(?:\s|$)', r'(\d+\.?\d*)\s*LITER'), float, 0.0
    )
    cylinder_count = first_capture(
        (r'(\d+)\s*CYLINDER', r'V(\d+)', r'I(\d+)'), int, 0
    )

    # Keyword presence flags, encoded as 0/1 integers.
    turbo, supercharged, diesel = (
        int(token in text) for token in ('TURBO', 'SUPERCHARGED', 'DIESEL')
    )
    return horsepower, litres, cylinder_count, turbo, supercharged, diesel

def create_text_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive simple text features from the ``model`` and ``transmission`` columns.

    Works on a copy of ``df``; each group of features is only added when its
    source column exists.

    From ``model``: word count, digit-presence flag, character length and one
    ``model_has_<keyword>`` flag per premium keyword. A keyword only counts
    when it is not embedded in a longer alphabetic word, so ``M`` matches
    "M3" or "X5 M" but not "Camry", and ``GT`` matches "Mustang GT" or
    "GT500" but not a word that merely contains those letters.

    From ``transmission``: the first integer found (``trans_speed``, 0 when
    none) plus ``is_cvt`` and ``is_4wd`` keyword flags.

    Args:
        df: Input frame; not modified.

    Returns:
        A new DataFrame with the extra feature columns appended.
    """
    df = df.copy()
    if 'model' in df.columns:
        model_text = df['model'].fillna('')
        df['model_word_count'] = model_text.str.split().str.len()
        df['model_has_numbers'] = model_text.str.contains(r'\d', na=False).astype(int)
        df['model_length'] = model_text.str.len()
        premium_keywords = ['Limited', 'Premium', 'Sport', 'GT', 'AMG', 'M']
        for keyword in premium_keywords:
            # Letter-boundary lookarounds: a plain substring test would make
            # short keywords such as 'M' fire on almost every model name.
            pattern = rf'(?<![A-Za-z]){keyword}(?![A-Za-z])'
            df[f'model_has_{keyword.lower()}'] = (
                model_text.str.contains(pattern, case=False, na=False).astype(int)
            )
    if 'transmission' in df.columns:
        trans_text = df['transmission'].fillna('')
        # expand=False returns a Series; to_numeric turns "no match" into NaN
        # so the zero fill and integer cast never operate on mixed str/int data.
        first_number = trans_text.str.extract(r'(\d+)', expand=False)
        df['trans_speed'] = pd.to_numeric(first_number, errors='coerce').fillna(0).astype(int)
        df['is_cvt'] = trans_text.str.contains('CVT', case=False, na=False).astype(int)
        df['is_4wd'] = trans_text.str.contains('4WD|AWD', case=False, na=False).astype(int)
    return df

def create_interaction_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add ratio, transform and era features built from existing columns.

    Operates on a copy of ``df``. Each group of features is added only when
    every column it needs is present:

    * ``car_age`` + ``milage``: mileage per (age + 1), log1p and square root
      of mileage.
    * ``horsepower`` + ``car_age``: horsepower per (age + 1).
    * ``model_year``: squared year, offset from 2000 and pre-2010 /
      2020-or-later flags.
    """
    out = df.copy()
    available = set(out.columns)

    if {'car_age', 'milage'}.issubset(available):
        miles = out['milage']
        out['age_mileage_ratio'] = miles / (out['car_age'] + 1)
        out['log_mileage'] = np.log1p(miles)
        out['sqrt_mileage'] = np.sqrt(miles)

    if {'horsepower', 'car_age'}.issubset(available):
        out['power_age_ratio'] = out['horsepower'] / (out['car_age'] + 1)

    if 'model_year' in available:
        year = out['model_year']
        out['year_squared'] = year ** 2
        out['years_since_2000'] = year - 2000
        out['is_pre_2010'] = (year < 2010).astype(int)
        out['is_post_2020'] = (year >= 2020).astype(int)

    return out

def advanced_feature_engineering_no_te(df: pd.DataFrame) -> pd.DataFrame:
    """Build all target-independent features for a raw car-listing frame.

    No target encodings are computed here; those are added later, inside each
    fold, to avoid leakage. The input is copied, and every feature group is
    added only when its source column is present.

    Groups, in order: age features from ``model_year``; mileage transforms,
    per-year rates and bins from ``milage``; parsed engine attributes from
    ``engine``; frequency counts for ``brand`` and ``model``; flags derived
    from ``fuel_type``, ``transmission``, ``accident``, ``clean_title``,
    ``ext_col`` and ``int_col``; then ``create_text_features`` and
    ``create_interaction_features``.

    NOTE(review): the mileage median, the 99th-percentile clips and the
    brand/model counts are computed from whatever frame is passed in, so the
    same raw value can map to different features on train, validation and
    test frames. Confirm whether that is intended.

    Args:
        df: Raw listing frame; not modified.

    Returns:
        A new DataFrame containing the original columns (with some missing
        values filled) plus the engineered features.
    """
    df = df.copy()
    # year/age
    if 'model_year' in df.columns:
        df['car_age'] = CURRENT_YEAR - df['model_year']
        df['car_age_squared'] = df['car_age'] ** 2
        # Clip so a model year later than CURRENT_YEAR does not feed a negative into log1p.
        df['car_age_log'] = np.log1p(df['car_age'].clip(lower=0))
        df['is_new_car'] = (df['car_age'] <= 2).astype(int)
        df['is_old_car'] = (df['car_age'] >= 15).astype(int)
        df['is_classic'] = (df['car_age'] >= 25).astype(int)
    # mileage
    if 'milage' in df.columns:
        df['milage'] = df['milage'].fillna(df['milage'].median())
        df['log_milage'] = np.log1p(df['milage'])
        df['sqrt_milage'] = np.sqrt(df['milage'])
        if 'car_age' in df.columns:
            # Floor the age at half a year so brand-new cars do not divide by zero.
            df['mileage_per_year'] = df['milage'] / np.maximum(df['car_age'], 0.5)
            df['low_mileage_for_age'] = (df['mileage_per_year'] < 8000).astype(int)
            df['high_mileage_for_age'] = (df['mileage_per_year'] > 15000).astype(int)
        mileage_bins = [0, 15000, 50000, 100000, 150000, np.inf]
        # right=False makes the bins left-closed ([0, 15000), [15000, 50000), ...).
        # This keeps a mileage of exactly 0 inside the first bin (a right-closed
        # first bin would turn it into NaN and break the integer cast) and lines
        # the bin edges up with the very_low/low/medium/high flags below.
        df['mileage_bin'] = pd.cut(
            df['milage'], bins=mileage_bins, labels=[0, 1, 2, 3, 4], right=False
        ).astype(int)
        df['very_low_mileage'] = (df['milage'] < 15000).astype(int)
        df['low_mileage'] = ((df['milage'] >= 15000) & (df['milage'] < 50000)).astype(int)
        df['medium_mileage'] = ((df['milage'] >= 50000) & (df['milage'] < 100000)).astype(int)
        df['high_mileage'] = ((df['milage'] >= 100000) & (df['milage'] < 150000)).astype(int)
        df['very_high_mileage'] = (df['milage'] >= 150000).astype(int)
    # engine
    if 'engine' in df.columns:
        # Each element is the 6-tuple returned by extract_advanced_engine_features.
        eng = df['engine'].apply(extract_advanced_engine_features)
        df['horsepower'] = [t[0] for t in eng]
        df['engine_displacement'] = [t[1] for t in eng]
        df['cylinders'] = [t[2] for t in eng]
        df['is_turbo'] = [t[3] for t in eng]
        df['is_supercharged'] = [t[4] for t in eng]
        df['is_diesel_engine'] = [t[5] for t in eng]
        # A zero denominator means the value was not found in the text; emit 0 for the ratio.
        df['hp_per_cylinder'] = np.where(df['cylinders'] > 0, df['horsepower'] / df['cylinders'], 0)
        df['hp_per_liter'] = np.where(df['engine_displacement'] > 0, df['horsepower'] / df['engine_displacement'], 0)
        # Cap the power features at their 99th percentile to limit outliers.
        for col in ['hp_per_cylinder', 'hp_per_liter', 'horsepower']:
            if col in df.columns:
                p99 = df[col].quantile(0.99)
                df[col] = df[col].clip(0, p99)
        df['is_high_performance'] = (df['horsepower'] > 300).astype(int)
        df['is_economy'] = (df['horsepower'] < 150).astype(int)
        df['engine_displacement'] = df['engine_displacement'].fillna(0).clip(lower=0)
        engine_size_bins = [0, 1.5, 2.5, 3.5, 5.0, np.inf]
        # include_lowest keeps a displacement of 0 (not parsed) in the first bin.
        df['engine_size_category'] = pd.cut(df['engine_displacement'], bins=engine_size_bins, labels=[0,1,2,3,4], include_lowest=True).astype(int)
    # brand/model/textual features
    if 'brand' in df.columns:
        df['brand'] = df['brand'].fillna('Unknown')
        df['brand_frequency'] = df['brand'].map(df['brand'].value_counts()).fillna(1)
    if 'model' in df.columns:
        df['model'] = df['model'].fillna('Unknown')
        df['model_popularity'] = df['model'].map(df['model'].value_counts()).fillna(1)
        df['is_rare_model'] = (df['model_popularity'] < 5).astype(int)
    if 'fuel_type' in df.columns:
        df['fuel_type'] = df['fuel_type'].fillna('Gasoline')
        # Hand-assigned ordinal score per fuel type; unlisted types fall back to 2.
        fuel_efficiency = {'Electric':5, 'Hybrid':4, 'Diesel':3, 'Gasoline':2, 'E85 Flex Fuel':1.5}
        df['fuel_efficiency_score'] = df['fuel_type'].map(fuel_efficiency).fillna(2)
        df['is_electric'] = (df['fuel_type'] == 'Electric').astype(int)
        df['is_hybrid'] = df['fuel_type'].str.contains('Hybrid', case=False, na=False).astype(int)
    if 'transmission' in df.columns:
        df['transmission'] = df['transmission'].fillna('Unknown')
        df['is_automatic'] = df['transmission'].str.contains('A/T|Auto|CVT|Automatic', case=False, na=False).astype(int)
        df['is_manual'] = df['transmission'].str.contains('Manual|M/T', case=False, na=False).astype(int)
    if 'accident' in df.columns:
        df['accident'] = df['accident'].fillna('None reported')
        # Any text that does not contain "None" or "No" is treated as an accident record.
        df['has_accident'] = (~df['accident'].str.contains('None|No', case=False, na=False)).astype(int)
    if 'clean_title' in df.columns:
        df['clean_title'] = df['clean_title'].fillna('Yes')
        df['title_is_clean'] = (df['clean_title'] == 'Yes').astype(int)
    if 'ext_col' in df.columns:
        df['ext_col'] = df['ext_col'].fillna('Unknown')
        popular_colors = {'White', 'Black', 'Silver', 'Gray', 'Grey'}
        # Exact, case-sensitive match against the colour names above.
        df['has_popular_color'] = df['ext_col'].isin(popular_colors).astype(int)
    if 'int_col' in df.columns:
        df['int_col'] = df['int_col'].fillna('Unknown')
        luxury_interior = ['Leather', 'Premium']
        df['has_luxury_interior'] = df['int_col'].str.contains('|'.join(luxury_interior), case=False, na=False).astype(int)

    df = create_text_features(df)
    df = create_interaction_features(df)
    return df

## Target Encoding Functions

In [None]:
def smooth_target_encoding(train_series: pd.Series, target: pd.Series, apply_series: pd.Series,
                           min_samples_leaf: int = 100, smoothing: float = 10.0) -> pd.Series:
    """Smoothed target encoding for a single categorical column.

    Per-category statistics come from ``train_series`` / ``target`` (aligned
    on their index) and are looked up for every value of ``apply_series``.
    Each category's encoding blends the global target mean (the prior) with
    the category mean using a sigmoid weight:

        weight = 1 / (1 + exp(-(count - min_samples_leaf) / smoothing))
        encoding = prior * (1 - weight) + category_mean * weight

    Args:
        train_series: Categorical values used to fit the encoding. The Series
            name is irrelevant and may be ``None``.
        target: Target values matching ``train_series``.
        apply_series: Categorical values to encode.
        min_samples_leaf: Category count at which the weight equals 0.5.
        smoothing: Controls how quickly the weight moves from the prior to
            the category mean as the count grows.

    Returns:
        A Series indexed like ``apply_series``; categories not seen in
        ``train_series`` receive the prior.
    """
    tmp = pd.concat([train_series, target], axis=1)
    # Fixed internal names: grouping by the caller's Series name fails when
    # that name is None and collides when it happens to be 'target'.
    tmp.columns = ['category', 'target']
    agg = tmp.groupby('category')['target'].agg(['mean', 'count'])
    prior = target.mean()
    # Sigmoid weight: close to 0 for rare categories, close to 1 for frequent ones.
    agg['smoothing'] = 1 / (1 + np.exp(-(agg['count'] - min_samples_leaf) / smoothing))
    agg['te'] = prior * (1 - agg['smoothing']) + agg['mean'] * agg['smoothing']
    mapping = agg['te'].to_dict()
    return apply_series.map(mapping).fillna(prior)

## Label Encoding Functions

In [None]:
def fit_label_encoders(df: pd.DataFrame, cat_cols: List[str]) -> Dict[str, Dict]:
    """Build a value-to-integer mapping for each categorical column.

    Missing values are replaced by ``'Unknown'`` and all values are cast to
    ``str``; codes are assigned in order of first appearance, starting at 0.
    Returns ``{column: {value: code}}``.
    """
    def build_mapping(column: pd.Series) -> Dict:
        labels = column.fillna('Unknown').astype(str).unique().tolist()
        return dict(zip(labels, range(len(labels))))

    return {col: build_mapping(df[col]) for col in cat_cols}

def apply_label_encoders(df: pd.DataFrame, encoders: Dict[str, Dict], cat_cols: List[str]) -> pd.DataFrame:
    """Replace categorical values by their integer codes on a copy of ``df``.

    Values are normalised the same way as when the encoders were fitted
    (missing -> ``'Unknown'``, cast to ``str``). A value without a code, or a
    column without an encoder, is mapped to ``-1``.
    """
    encoded = df.copy()
    for col in cat_cols:
        lookup = encoders.get(col, {})
        labels = encoded[col].fillna('Unknown').astype(str)
        codes = [lookup.get(label, -1) for label in labels]
        encoded[col] = pd.Series(codes, index=encoded.index).astype(int)
    return encoded

## Model Definitions

In [None]:
def get_base_models(random_state=SEED):
    """Return the unfitted level-0 regressors keyed by short name.

    The dictionary is built in the order ``lgb``, ``xgb``, ``cat``, ``rf``,
    ``et``, ``ridge``; every estimator receives ``random_state``.
    """
    # Hyper-parameters shared by the LightGBM and XGBoost regressors.
    boosting_shared = dict(
        n_estimators=1000, learning_rate=0.03,
        subsample=0.8, colsample_bytree=0.8,
        reg_alpha=0.1, reg_lambda=0.1,
        random_state=random_state, n_jobs=-1,
    )
    # Hyper-parameters shared by the random-forest and extra-trees regressors.
    forest_shared = dict(
        n_estimators=500, max_depth=20,
        min_samples_split=5, min_samples_leaf=2,
        max_features='sqrt', random_state=random_state, n_jobs=-1,
    )

    models = {}
    models['lgb'] = lgb.LGBMRegressor(num_leaves=64, max_depth=10, verbosity=-1, **boosting_shared)
    models['xgb'] = xgb.XGBRegressor(max_depth=9, verbosity=0, **boosting_shared)
    models['cat'] = CatBoostRegressor(
        iterations=1000, learning_rate=0.03, depth=8, l2_leaf_reg=3,
        random_state=random_state, verbose=False,
    )
    models['rf'] = RandomForestRegressor(**forest_shared)
    models['et'] = ExtraTreesRegressor(**forest_shared)
    models['ridge'] = Ridge(alpha=8.0, random_state=random_state)
    return models

## Improved Stacking Class

In [None]:
class ImprovedStacking:
    """K-fold stacking regressor for car prices.

    ``fit`` runs a K-fold loop in which every preprocessing step (feature
    engineering, target encoding, label encoding, imputation, scaling and
    feature selection) is fitted on the training part of the fold only. Each
    base model from ``get_base_models`` is trained on ``log1p(price)`` and its
    out-of-fold predictions, converted back with ``expm1``, become the inputs
    of a Ridge meta-model. Afterwards the preprocessing and the base models
    are refitted on the whole training set, restricted to the union of the
    features selected across folds.

    ``predict`` applies the full-train preprocessing to new data, predicts
    with the full-train base models and combines them with the meta-model.

    NOTE(review): an ``id`` column is excluded from imputation and scaling but
    is not removed from the feature matrix, so it can be selected as a model
    input. Confirm whether that is intended.
    """

    # Raw text columns dropped once feature engineering has consumed them.
    _DROP_COLS = ('engine', 'accident', 'clean_title')
    # Numeric columns that are never imputed or scaled.
    _UNSCALED_COLS = ('id', 'price')
    # Categorical columns that receive a smoothed target encoding.
    _TE_COLS = ('brand', 'model')
    # Base models that support early stopping on a validation set.
    _EARLY_STOPPING_MODELS = ('lgb', 'xgb', 'cat')

    def __init__(self, n_splits=5, random_state=SEED):
        """Create an unfitted stacker.

        Args:
            n_splits: Number of cross-validation folds.
            random_state: Seed for fold shuffling and for the base models.
        """
        self.n_splits = n_splits
        self.random_state = random_state
        self.base_models = get_base_models(random_state)
        self.fold_models = {k: [] for k in self.base_models.keys()}
        self.meta_model = Ridge(alpha=1.0)
        self.selected_features_union = None
        # final encoders/scalers to be fitted on full train
        self.full_label_encoders = {}
        self.full_scaler = None
        self.full_target_encodings = {}
        self.final_base_models = {}
        # Full-train imputation values and the ordered column list the scaler was fitted on.
        self.full_fill_values = {}
        self.full_num_cols = []

    def _numeric_columns(self, df: pd.DataFrame) -> List[str]:
        """Return the numeric columns of ``df`` that are imputed and scaled."""
        return [
            c for c in df.select_dtypes(include=[np.number]).columns
            if c not in self._UNSCALED_COLS
        ]

    @staticmethod
    def _numeric_fill_values(df: pd.DataFrame, cols: List[str]) -> Dict:
        """Return one imputation value per column.

        Columns with more than 10 distinct values use the median; the others
        use the most frequent value, or 0 when the column has no values.
        """
        fills = {}
        for c in cols:
            if df[c].nunique() > 10:
                fills[c] = df[c].median()
            else:
                modes = df[c].mode()
                fills[c] = modes.iloc[0] if len(modes) > 0 else 0
        return fills

    def _drop_text_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` without the raw text columns listed in ``_DROP_COLS``."""
        return df.drop(columns=[c for c in self._DROP_COLS if c in df.columns])

    def _fit_fold_model(self, name, model, X_tr, y_tr, X_va, y_va):
        """Fit a fresh clone of ``model`` for one fold and return it.

        LightGBM, XGBoost and CatBoost are trained with early stopping
        (50 rounds) on the validation data. If that call fails, the failure is
        reported and the model is refitted without a validation set, so one
        incompatible library version does not abort the whole run.
        """
        mdl = clone(model)
        if name not in self._EARLY_STOPPING_MODELS:
            mdl.fit(X_tr, y_tr)
            return mdl
        try:
            if name == 'lgb':
                mdl.fit(X_tr, y_tr, eval_set=[(X_va, y_va)],
                        callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(0)])
            elif name == 'xgb':
                # Current XGBoost takes early_stopping_rounds as an estimator
                # parameter; passing it to fit() is no longer accepted.
                mdl.set_params(early_stopping_rounds=50)
                mdl.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
            else:
                mdl.fit(X_tr, y_tr, eval_set=(X_va, y_va),
                        early_stopping_rounds=50, verbose=False)
        except Exception as exc:
            # Best-effort fallback, but never silent.
            print(f" early stopping failed for {name} ({exc!r}); refitting without it ...", end="")
            mdl = clone(model)
            mdl.fit(X_tr, y_tr)
        return mdl

    def fit(self, raw_train: pd.DataFrame, target_col: str = 'price'):
        """Fit fold models, the meta-model and the final full-train models.

        Args:
            raw_train: Raw training frame that includes ``target_col``.
            target_col: Name of the target column.

        Returns:
            Dict mapping each base-model name to its out-of-fold RMSE
            (measured on the original price scale).
        """
        # raw_train includes price
        X_raw = raw_train.drop(columns=[target_col]).reset_index(drop=True)
        y = raw_train[target_col].reset_index(drop=True)
        n = len(X_raw)
        oof_preds = {name: np.zeros(n) for name in self.base_models.keys()}
        selected_features_per_fold = []

        # Start from a clean slate so a second fit() does not mix in stale state.
        self.fold_models = {k: [] for k in self.base_models.keys()}
        self.final_base_models = {}
        self.full_target_encodings = {}

        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
        for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X_raw, y), start=1):
            print(f"\n=== Fold {fold_idx}/{self.n_splits} ===")
            X_train_raw = X_raw.loc[train_idx].reset_index(drop=True)
            X_val_raw = X_raw.loc[val_idx].reset_index(drop=True)
            y_train = y.loc[train_idx].reset_index(drop=True)
            y_val = y.loc[val_idx].reset_index(drop=True)

            # Feature engineering without target encoding
            X_train_fe = advanced_feature_engineering_no_te(X_train_raw)
            X_val_fe = advanced_feature_engineering_no_te(X_val_raw)

            # Target encoding brand and model (smooth) inside fold: the
            # statistics always come from the training part of the fold.
            for col in self._TE_COLS:
                te_col = f'{col}_te'
                if col in X_train_fe.columns:
                    X_train_fe[te_col] = smooth_target_encoding(
                        X_train_fe[col], y_train, X_train_fe[col], min_samples_leaf=50, smoothing=10)
                    X_val_fe[te_col] = smooth_target_encoding(
                        X_train_fe[col], y_train, X_val_fe[col], min_samples_leaf=50, smoothing=10)
                else:
                    X_train_fe[te_col] = 0.0
                    X_val_fe[te_col] = 0.0

            # Drop large text columns we don't want as-is
            X_train_fe = self._drop_text_columns(X_train_fe)
            X_val_fe = self._drop_text_columns(X_val_fe)

            # Label encode the remaining object columns with encoders fitted on the train fold
            cat_cols = X_train_fe.select_dtypes(include=['object']).columns.tolist()
            fold_encoders = fit_label_encoders(X_train_fe, cat_cols)
            X_train_enc = apply_label_encoders(X_train_fe, fold_encoders, cat_cols)
            X_val_enc = apply_label_encoders(X_val_fe, fold_encoders, cat_cols)

            # Impute numeric columns with train-fold statistics
            num_cols = self._numeric_columns(X_train_enc)
            for c, fill in self._numeric_fill_values(X_train_enc, num_cols).items():
                X_train_enc[c] = X_train_enc[c].fillna(fill)
                X_val_enc[c] = X_val_enc[c].fillna(fill)

            # Scale numeric features
            scaler = RobustScaler()
            X_train_enc[num_cols] = scaler.fit_transform(X_train_enc[num_cols])
            X_val_enc[num_cols] = scaler.transform(X_val_enc[num_cols])

            # Feature selection on train fold (scored against the untransformed target)
            k = min(120, X_train_enc.shape[1])
            selector = SelectKBest(score_func=f_regression, k=k)
            try:
                sel_X_train = selector.fit_transform(X_train_enc, y_train)
                sel_X_val = selector.transform(X_val_enc)
            except Exception as exc:
                # fallback: no selection, but say so instead of hiding the failure
                print(f"Fold {fold_idx} feature selection failed ({exc!r}); using all features")
                selected_cols = X_train_enc.columns.tolist()
                sel_X_train = X_train_enc[selected_cols].values
                sel_X_val = X_val_enc[selected_cols].values
            else:
                selected_cols = X_train_enc.columns[selector.get_support()].tolist()

            selected_features_per_fold.append(selected_cols)
            print(f"Fold {fold_idx} selected {len(selected_cols)} features")

            # Train base models on log1p(y)
            y_train_log = np.log1p(y_train)
            y_val_log = np.log1p(y_val)
            for name, model in self.base_models.items():
                print(f" Training {name} on fold {fold_idx} ...", end="")
                mdl = self._fit_fold_model(name, model, sel_X_train, y_train_log, sel_X_val, y_val_log)

                # Keep the fold model and its preprocessing for later inspection
                self.fold_models[name].append((mdl, selected_cols, scaler, fold_encoders))
                # Predict on validation (expm1 to revert log)
                pred_val = np.expm1(mdl.predict(sel_X_val))
                pred_val = np.maximum(pred_val, 0)
                oof_preds[name][val_idx] = pred_val
                print(" done")

        # After folds: compute OOF scores and prepare meta features
        oof_df = pd.DataFrame(oof_preds)
        oof_scores = {name: rmse(y.values, oof_df[name].values) for name in oof_df.columns}
        for name, sc in oof_scores.items():
            print(f"OOF {name}: {sc:.6f}")
        # Train meta model (Ridge) on OOF predictions
        print("\nTraining meta-model (Ridge) on OOF predictions...")
        self.meta_model = Ridge(alpha=1.0)
        self.meta_model.fit(oof_df.values, y.values)
        meta_oof_pred = self.meta_model.predict(oof_df.values)
        print(f"Meta OOF RMSE: {rmse(y.values, meta_oof_pred):.6f}")

        # Determine union of selected features across folds
        union_feats = sorted(set().union(*selected_features_per_fold))
        self.selected_features_union = union_feats
        print(f"\nUnion selected features count: {len(self.selected_features_union)}")

        # Fit full-train encoders / scalers / target encodings and final base models
        print("\nFitting final encoders, target encodings and base models on full train...")
        X_full_fe = advanced_feature_engineering_no_te(X_raw)
        # compute target encodings on full train and keep the category -> value maps
        for col in self._TE_COLS:
            te_col = f'{col}_te'
            if col in X_full_fe.columns:
                X_full_fe[te_col] = smooth_target_encoding(
                    X_full_fe[col], y, X_full_fe[col], min_samples_leaf=50, smoothing=10)
                self.full_target_encodings[col] = (
                    X_full_fe[[col, te_col]].drop_duplicates().set_index(col)[te_col].to_dict()
                )
            else:
                self.full_target_encodings[col] = {}

        X_full_fe = self._drop_text_columns(X_full_fe)

        # full label encoders fitted on full train
        cat_cols_full = X_full_fe.select_dtypes(include=['object']).columns.tolist()
        self.full_label_encoders = fit_label_encoders(X_full_fe, cat_cols_full)
        X_full_enc = apply_label_encoders(X_full_fe, self.full_label_encoders, cat_cols_full)

        # Impute and scale exactly as in the folds. Without the imputation the
        # final models would see NaNs that the fold models never saw.
        num_cols_full = self._numeric_columns(X_full_enc)
        self.full_num_cols = list(num_cols_full)
        self.full_fill_values = self._numeric_fill_values(X_full_enc, num_cols_full)
        for c, fill in self.full_fill_values.items():
            X_full_enc[c] = X_full_enc[c].fillna(fill)
        self.full_scaler = RobustScaler()
        X_full_enc[num_cols_full] = self.full_scaler.fit_transform(X_full_enc[num_cols_full])

        # ensure union features exist (zero-filled when absent) and take them
        for c in self.selected_features_union:
            if c not in X_full_enc.columns:
                X_full_enc[c] = 0.0
        X_final_full = X_full_enc[self.selected_features_union].copy()

        # Fit final base models on the entire training set (log1p target).
        # Logging is already silenced through the estimators' constructor
        # arguments, so a plain fit() is used for every model.
        y_full_log = np.log1p(y.values)
        for name, model in self.base_models.items():
            print(f" Fitting final {name} on full train...", end="")
            mdl = clone(model)
            mdl.fit(X_final_full.values, y_full_log)
            self.final_base_models[name] = mdl
            print(" done")

        # Keep oof_df and y for inspection; predict() also uses y_train for the TE prior
        self.oof_preds_df = oof_df
        self.y_train = y.reset_index(drop=True)

        return oof_scores

    def predict(self, raw_test: pd.DataFrame) -> np.ndarray:
        """Predict prices for raw test rows.

        Args:
            raw_test: Raw test frame without the target column.

        Returns:
            Array of non-negative predictions, one per row of ``raw_test``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if not self.final_base_models:
            raise RuntimeError("ImprovedStacking.predict() called before fit()")

        # raw_test is the test dataframe without price
        X_test_raw = raw_test.reset_index(drop=True).copy()
        X_test_fe = advanced_feature_engineering_no_te(X_test_raw)

        # apply full-train target encodings to test; unseen categories get the train mean
        prior = self.y_train.mean()
        for col in self._TE_COLS:
            te_col = f'{col}_te'
            if col in X_test_fe.columns:
                X_test_fe[te_col] = X_test_fe[col].map(self.full_target_encodings.get(col, {})).fillna(prior)
            else:
                X_test_fe[te_col] = 0.0

        X_test_fe = self._drop_text_columns(X_test_fe)

        # label encode using full label encoders; skip columns the test frame lacks
        cat_cols_full = [c for c in self.full_label_encoders if c in X_test_fe.columns]
        X_test_enc = apply_label_encoders(X_test_fe, self.full_label_encoders, cat_cols_full)

        # impute with the full-train fill values
        for c, fill in self.full_fill_values.items():
            if c in X_test_enc.columns:
                X_test_enc[c] = X_test_enc[c].fillna(fill)

        # Scale exactly the columns the scaler was fitted on, in the fitted
        # order; a column missing from the test frame is created as zeros.
        scaler_cols = list(self.full_num_cols)
        for c in scaler_cols:
            if c not in X_test_enc.columns:
                X_test_enc[c] = 0.0
        X_test_enc[scaler_cols] = self.full_scaler.transform(X_test_enc[scaler_cols])

        # ensure selected features exist
        for c in self.selected_features_union:
            if c not in X_test_enc.columns:
                X_test_enc[c] = 0.0

        X_test_final = X_test_enc[self.selected_features_union].values

        # base model predictions from the single full-train model of each type
        base_preds = {}
        for name, mdl in self.final_base_models.items():
            pred_log = mdl.predict(X_test_final)
            pred = np.expm1(pred_log)
            pred = np.maximum(pred, 0)
            base_preds[name] = pred

        base_preds_df = pd.DataFrame(base_preds)
        # use meta-model to combine them
        final_pred = self.meta_model.predict(base_preds_df.values)
        final_pred = np.maximum(final_pred, 0)

        return final_pred

## Main Pipeline

In [None]:
def main():
    """Run the whole pipeline: load data, train the stacker, write ``submission.csv``.

    Reads the train, test and sample-submission CSV files from the Kaggle
    input directory, fits ``ImprovedStacking`` on the training data, predicts
    the test set, clips the predictions to the 0.5%-99.5% quantile range of
    the training prices (with a floor of 1000) and saves the result.
    """
    seed_everything(SEED)
    print("Loading data...")
    train_df = pd.read_csv("/kaggle/input/dsn-car-price/train.csv")
    test_df = pd.read_csv("/kaggle/input/dsn-car-price/test.csv")
    # Loaded but not used further below.
    sample_sub = pd.read_csv("/kaggle/input/dsn-car-price/sample_submission.csv")

    print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")

    # Normalise the column name: the rest of the file expects the 'milage' spelling.
    if 'mileage' in train_df.columns and 'milage' not in train_df.columns:
        for frame in (train_df, test_df):
            frame.rename(columns={'mileage': 'milage'}, inplace=True)

    print("\nFeature engineering and training with proper CV...")
    stacker = ImprovedStacking(n_splits=N_SPLITS, random_state=SEED)
    oof_scores = stacker.fit(train_df, target_col='price')

    print("\nMaking predictions on test set...")
    preds = stacker.predict(test_df)

    # Post-processing: clip to the central range of training prices, then floor at 1000.
    price_floor, price_cap = (train_df['price'].quantile(q) for q in (0.005, 0.995))
    preds = np.maximum(np.clip(preds, price_floor, price_cap), 1000)

    print("\nPrediction stats:")
    print(f"Min: {preds.min():.0f}, Max: {preds.max():.0f}, Mean: {preds.mean():.0f}, Median: {np.median(preds):.0f}")

    pd.DataFrame({'id': test_df['id'], 'Price': preds}).to_csv("submission.csv", index=False)
    print("Saved submission.csv")

## Execute Pipeline

In [None]:
# Run the pipeline only when this file is executed as the main module, not on import.
if __name__ == "__main__":
    main()