In [8]:
%load_ext cudf.pandas

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
import re

pd.set_option('display.float_format', lambda x: '%.6f' % x)

def categorize(fuel: str):
    cfv = ["Gasoline", "Diesel"]
    afv = ["E85 Flex Fuel"]
    hev = ["Hybrid", "Plug-In Hybrid"]

    if fuel in cfv:
        return "CFV"
    elif fuel in afv:
        return "AFV"
    elif fuel in hev:
        return "HEV"
    else:
        return None

def iqr_apply(df: pd.DataFrame, column_name: str):
    normalized_column = df[column_name]
    initial_size = len(normalized_column)

    q1 = np.percentile(normalized_column, 25)
    q3 = np.percentile(normalized_column, 75)
    iqr = q3 - q1
    lower_bound = q1 - iqr * 1.5
    higher_bound = q3 + iqr * 1.5

    df = df[lower_bound <= normalized_column][higher_bound >= normalized_column]
    return df, initial_size - len(df[column_name])

def get_regex_group_string(refs: list[str]):
    return r"(?=(" + "|".join(refs) + r"))"

def preprocess(file_path) -> pd.DataFrame:
    df = pd.read_csv(file_path)

    df["fuel_type"] = df["fuel_type"].apply(categorize)

    df['damage_occurred'] = df['accident'].map({'None reported': 0, 'At least 1 accident or damage reported': 1})
    df.drop(['accident'], axis=1, inplace=True)

    luxury_brands = ["mercedes-benz","bmw","audi","porsche","lexus","cadillac","jaguar","bentley","genesis","maserati","lamborghini","rolls-royce","ferrari","mclaren","aston","lotus","bugatti","maybach"]
    df["is_luxury_brand"] = df["brand"].apply(lambda x: 1.0 if x.lower() in luxury_brands else 0.0)

    milage = df["milage"]
    milage_bins = np.linspace(milage.min() - 1, milage.max(), 5)
    df["milage_bins"] = pd.cut(milage, bins=milage_bins, labels=range(1, 5)).astype(float) / len(df)
#     df, c4 = iqr_apply(df, column_name="milage")
    df["milage"] = (df["milage"] // 1000) * 1000

    df["age"] = 2024.0 - df["model_year"]

    df = pd.get_dummies(df, columns=["fuel_type"], dtype=np.float64).reset_index(drop=True)

    automatic_references = [
        "A/T", "AT", "Automatic", "Transmission w/Dual Shift Mode", "Transmission Overdrive Switch"
    ]
    manual_references = [
        "M/T", "Manual", "Mt"
    ]
    cvt_references = [
        "CVT Transmission", "Automatic CVT", "CVT-F"
    ]
    specialized_references = [
        "Single-Speed Fixed Gear", "Scheduled for or in Production", "Variable"
    ]

    automatic = get_regex_group_string(automatic_references)
    manual = get_regex_group_string(manual_references)
    cvt = get_regex_group_string(cvt_references)
    specialized = get_regex_group_string(specialized_references)

    transmission_map = {
        automatic: "Automatic",
        manual: "Manual",
        cvt: "CVT",
        specialized: "Specialized",
    }

    bins = []
    for transmission in df["transmission"]:
        for ref in transmission_map.keys():
            if len(re.findall(ref, transmission)) > 0:
                bins.append(transmission_map[ref])

    df["transmission_bins"] = pd.Series(bins)
    df = pd.get_dummies(df, columns=["transmission_bins"], dtype=np.float64).reset_index(drop=True)

    do_columns = ["is_luxury_brand", "age", "milage_bins"]
    ohe_columns = []
    for key in df.keys():
        if key.startswith(("fuel_type", "transmission_bins")):
            ohe_columns.append(key)

    imputer = KNNImputer(n_neighbors=5)
    df['damage_occurred'] = imputer.fit_transform(df[do_columns + ohe_columns], 'damage_occurred')

    df['engine_hp'] = df['engine'].str.extract(r'(\d+\.\d+)(?=HP)').astype(float)
    df['engine_cc'] = df['engine'].str.extract(r'(\d+\.\d+)(?=L)').astype(float)

    engine_columns = ["is_luxury_brand", "age"] + ohe_columns
    df['engine_hp'] = imputer.fit_transform(df[engine_columns], 'engine_hp')
    df['engine_cc'] = imputer.fit_transform(df[engine_columns], 'engine_cc')

    df['clean_title'] = df.fillna({'clean_title': 'No'})['clean_title'].map({'Yes': 1.0, 'No': 0.0})

    return df.select_dtypes(exclude=['object'])

The cudf.pandas extension is already loaded. To reload it, use:
  %reload_ext cudf.pandas


In [9]:
from sklearn.model_selection import train_test_split
import xgboost as xgb

fp = "/kaggle/input/playground-series-s4e9/train.csv"
df = preprocess(fp).drop(['model_year', 'milage', 'id'], axis=1)

target_feature = "price"

X = df.drop(target_feature, axis=1)
y = df[target_feature]

In [10]:
import lightgbm as lgb
from sklearn.model_selection import KFold, RandomizedSearchCV
from random import random, randint, randrange, uniform
from lightgbm import LGBMRegressor
from lightgbm import log_evaluation, early_stopping

#Model Evaluation
from sklearn.metrics import *
from sklearn.metrics import mean_squared_error

kf = KFold(n_splits=7, shuffle=True)

fold_rmse_scores = []

for fold, (train_index, val_index) in enumerate(kf.split(X)):

    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    
    params = {
        'subsample': 1.0, 
        'reg_lambda': 0.021544346900318832, 
        'reg_alpha': 10.0, 
        'num_leaves': 550, 
        'n_estimators': 500, 
        'min_data_in_leaf': 130, 
        'min_child_weight': 0.023000000000000003, 
        'max_depth': 12, 
        'learning_rate': 0.025000000000000005, 
        'feature_fraction': 0.9111111111111111, 
        'colsample_bytree': 1.0, 
        'cat_smooth': 60, 
        'bagging_freq': 9, 
        'bagging_fraction': 0.8333333333333333
    }
    
    
    lgb_model = lgb.LGBMRegressor(**params, verbose=-1, device='gpu')  
    lgb_model.fit(X_train, y_train)

    y_pred = lgb_model.predict(X_val)

    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    fold_rmse_scores.append(rmse)
    print(f"Fold {fold + 1} RMSE: {rmse}")

print(f"Average Valid RMSE: {np.mean(fold_rmse_scores)}")



Fold 1 RMSE: 72145.54726588765




Fold 2 RMSE: 70338.62162355665




Fold 3 RMSE: 73811.48705166085




Fold 4 RMSE: 78913.97601546702




Fold 5 RMSE: 72654.97321661568




Fold 6 RMSE: 76393.52374876899




Fold 7 RMSE: 81179.73561235105
Average Valid RMSE: 75062.55207632971


In [11]:
sub_fp = "/kaggle/input/playground-series-s4e9/test.csv"
sub_df = preprocess(sub_fp).drop(['model_year', 'milage'], axis=1)

print(sub_df.info())
# preds_y = lgb_model.predict(val_X)
# submission = pd.DataFrame({
#     'id': sub_df['id'], 
#     'price': preds_y 
# })

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 125690 entries, 0 to 125689
Data columns (total 15 columns):
 #   Column                         Non-Null Count   Dtype
---  ------                         --------------   -----
 0   id                             125690 non-null  int64
 1   clean_title                    125690 non-null  float64
 2   damage_occurred                125690 non-null  float64
 3   is_luxury_brand                125690 non-null  float64
 4   milage_bins                    125690 non-null  float64
 5   age                            125690 non-null  float64
 6   fuel_type_AFV                  125690 non-null  float64
 7   fuel_type_CFV                  125690 non-null  float64
 8   fuel_type_HEV                  125690 non-null  float64
 9   transmission_bins_Automatic    125690 non-null  float64
 10  transmission_bins_CVT          125690 non-null  float64
 11  transmission_bins_Manual       125690 non-null  float64
 12  transmission_bins_Specialized  125

In [15]:
preds_y = lgb_model.predict(sub_df.drop(['id'], axis=1))
submission = pd.DataFrame({
    'id': sub_df['id'], 
    'price': preds_y 
})
# (125690,), (125690,)
submission.shape



(125690, 2)

In [19]:
submission.to_csv('submission.csv', index=False)