In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgbm
from sklearn.model_selection import GroupKFold, KFold
from sklearn.metrics import mean_squared_error

In [None]:
# Directory that holds the competition CSV files.
path_in = "/kaggle/input/datahon-white-box-2024"

df_train = pd.read_csv(f'{path_in}/train.csv')
df_test = pd.read_csv(f'{path_in}/test.csv')
print(len(df_train))

# Keep only the rows whose price is at or below the 99th percentile of the
# training prices. The explicit .copy() detaches the filtered frame from the
# raw one, so the column assignments done in later cells operate on an
# independent DataFrame instead of a slice (no SettingWithCopyWarning).
price_cap = df_train["price"].quantile(0.99)
df_train = df_train.loc[df_train["price"] <= price_cap].copy()
print(len(df_train))


In [None]:
# Preprocessing
# The same transformations are applied to the train and the test frame.
# NOTE: this overwrites 'kms' with its own log1p, so running the cell a second
# time without reloading the data would apply the logarithm twice.
for frame in (df_train, df_test):
    # Bound the 'year' column to the closed interval [2000, 2024].
    frame['year'] = frame['year'].clip(2000, 2024)
    # log(1 + kms) as a single vectorized ufunc call instead of a per-row apply.
    frame['kms'] = np.log1p(frame['kms'])


# Raw fuel labels grouped by the integer code they are encoded as.
# NOTE(review): "electricity" (7) and "petrol" (8) get codes of their own
# instead of sharing the codes of "Eléctrico" (1) and "Gasolina" (4), while
# "diesel"/"gas"/"hybrid"/"other" do share the codes of their counterparts.
# Confirm whether that separation is intentional.
fuel_labels_by_code = {
    0: ["Diesel", "Diésel", "diesel"],
    1: ["Eléctrico"],
    2: ["GNC"],
    3: ["Gas", "Gas licuado (GLP)", "Gas natural (CNG)", "gas"],
    4: ["Gasolina", "Gasolina/Etanol"],
    5: [
        "Híbrido",
        "Híbrido Diésel",
        "Híbrido Enchufable",
        "Híbrido Gasolina",
        "Híbrido enchufable",
        "hybrid",
    ],
    6: ["Otros", "other"],
    7: ["electricity"],
    8: ["petrol"],
}
# Flatten the table into the label -> code lookup used by Series.map.
dict_map_fuel = {
    label: code
    for code, labels in fuel_labels_by_code.items()
    for label in labels
}
# Labels that are not in the lookup come out of .map() as NaN.
for frame in (df_train, df_test):
    frame['fuel'] = frame['fuel'].map(dict_map_fuel)



# Lookup from the raw 'shift' (gearbox) labels to integer codes:
#   0 -> the automatic variants, 1 -> the manual variants,
#   2 -> 'semiautomatic_gear', 3 -> the remaining / unusable labels.
# Labels that are not listed here come out of .map() as NaN.
dict_map_shift = {
    'Automática': 0,
    'automatic_gear': 0,
    # NOTE(review): this label is encoded with the same code as the manual
    # variants - confirm that is intended.
    'no link (0018)': 1,
    'automática secuencial': 0,
    'semiautomatic_gear': 2,
    # Whole-listing text that ended up in the 'shift' column; the key has to
    # match the raw value character for character, so do not reformat it.
    'Carlos Motor en Alcorcón /  Coches en Alcorcón /  Volkswagen en Alcorcón 1 / 8 VOLKSWAGEN - POLO 12.100 €  Alcorcón (Madrid) Detalles Kilómetros  26.500 kms  Año  2016  Combustible  Gasolina  Cambio  Manual  CV  75 CV  Puertas  5 puertas  Color  Blanco  D': 3,
    # NOTE(review): this key is the string 'nan'; it only matches rows that
    # contain that literal text - verify it is needed.
    'nan': 3,
    'Automático': 0,
    'automatic': 0,
    'automática continua': 0,
    'automática': 0,
    'Manual': 1,
    'manual_gear': 1,
    'manual automatizada': 1,
    'directo, sin caja de cambios': 3,
    'manual': 1
}
# Replace the raw labels by their codes in both frames.
df_train['shift'] = df_train['shift'].map(dict_map_shift)
df_test['shift'] = df_test['shift'].map(dict_map_shift)

# Nulls
# Missing values in the encoded columns fall back to a fixed code:
# 6 for 'fuel' and 3 for 'shift'.
fallback_codes = {'fuel': 6, 'shift': 3}
for frame in (df_train, df_test):
    for column, code in fallback_codes.items():
        frame[column] = frame[column].fillna(code)

In [None]:
# Columns fed to the models as features.
LIST_FEATURES = [
    'year',
    'kms',
    'power',
    'doors',
    'photos',
    'fuel',
    'dealer_is_professional',
    'shift',
]
# Identifier column(s) carried along with the features.
IDS = ['website']
# Column the models are trained to predict.
TARGET = ['price']
# Number of cross-validation folds.
NUM_FOLDS = 5

In [None]:
# Split the training data into two website groups; each group is modelled
# separately. Keep this grouping in sync with the split applied to the test set.
columns_kept = IDS + LIST_FEATURES + TARGET
in_group_1 = df_train['website'].isin(["website_1", "website_3", "website_4"])
in_group_2 = df_train['website'].isin(["website_2", "website_5"])
# A fresh 0..n-1 index per group; the old index is discarded.
df_train_1 = df_train.loc[in_group_1, columns_kept].reset_index(drop=True)
df_train_2 = df_train.loc[in_group_2, columns_kept].reset_index(drop=True)

In [None]:
def get_k_fold(df):
    """Add a ``fold`` column that assigns every row to one of NUM_FOLDS folds.

    The folds come from a shuffled ``KFold`` with a fixed ``random_state``, so
    the assignment is reproducible for a given row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place (the ``fold`` column is
        created or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``fold`` holding the id of the fold in
        which each row is used for validation.
    """
    df['fold'] = -1
    fold_col = df.columns.get_loc('fold')
    kfold = KFold(n_splits=NUM_FOLDS, random_state=12, shuffle=True)
    for id_fold, (_, val_index) in enumerate(kfold.split(df)):
        # KFold yields row *positions*, so write them with .iloc; label-based
        # .loc is only equivalent when the index is exactly 0..n-1.
        df.iloc[val_index, fold_col] = id_fold
    return df


def get_metric(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


def train_model(df):
    """Cross-validate a LightGBM regressor over the folds stored in ``df['fold']``.

    For every fold id in ``range(NUM_FOLDS)`` one model is fitted on the rows
    of all *other* folds and evaluated on the rows of that fold. Provided each
    row carries a fold id in that range, every row is therefore predicted
    exactly once, by a model that was not trained on it (out-of-fold).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``LIST_FEATURES`` columns, the ``TARGET`` column and
        a ``fold`` column.

    Returns
    -------
    list_models : list
        The fitted ``LGBMRegressor`` of each fold, in fold order.
    rmse_all : float
        RMSE over the concatenated out-of-fold predictions.
    y_val_all : numpy.ndarray
        True target values of the validation rows, concatenated fold by fold.
    y_val_pred_all : numpy.ndarray
        Out-of-fold predictions, in the same row order as ``y_val_all``.
    """
    # Hyper-parameters are identical for every fold, so build them once.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 31,
        "n_estimators": 600,
        'learning_rate': 0.055,
        'bagging_fraction': 0.75,
        "feature_fraction": 0.8,
        "n_jobs": 8,
    }
    list_models, list_true_all, list_pred_all = [], [], []
    for num_fold in range(NUM_FOLDS):
        # The current fold is held out for validation; all remaining folds
        # form the training set.
        is_val = df['fold'] == num_fold
        df_tr_fold = df.loc[~is_val].reset_index(drop=True)
        df_val_fold = df.loc[is_val].reset_index(drop=True)

        X_train, y_train = df_tr_fold[LIST_FEATURES], df_tr_fold[TARGET]
        X_val, y_val = df_val_fold[LIST_FEATURES], df_val_fold[TARGET]

        model = lgbm.LGBMRegressor(**params, random_state=12)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        rmse = get_metric(y_val, y_pred)
        print(f'Fold: {num_fold}, RMSE: {rmse}')
        list_models.append(model)
        # Collect the out-of-fold targets and predictions.
        list_true_all.append(y_val)
        list_pred_all.append(y_pred)

    # Overall score on the out-of-fold predictions of all folds together.
    y_val_all = np.concatenate(list_true_all)
    y_val_pred_all = np.concatenate(list_pred_all)
    rmse_all = get_metric(y_val_all, y_val_pred_all)
    print(f'RMSE ALL: {rmse_all}')
    return list_models, rmse_all, y_val_all, y_val_pred_all

In [None]:
# Assign cross-validation folds to each website group.
df_train_1, df_train_2 = get_k_fold(df_train_1), get_k_fold(df_train_2)
print(len(df_train_1), len(df_train_2))

# One set of fold models per website group.
results_1 = train_model(df_train_1)
results_2 = train_model(df_train_2)
list_models_1, metrics_1, y_val_1, y_val_pred_1 = results_1
list_models_2, metrics_2, y_val_2, y_val_pred_2 = results_2
print(metrics_1, metrics_2)

# Score of both groups pooled together.
y_val_all = np.concatenate([y_val_1, y_val_2])
y_val_pred_all = np.concatenate([y_val_pred_1, y_val_pred_2])
rmse_pooled = get_metric(y_val_all, y_val_pred_all)
print(rmse_pooled)

In [None]:
# Route every test row to the model group that was trained on its website.
# The grouping must mirror the training split: group 1 is trained on
# website_1, website_3 and website_4, group 2 on website_2 and website_5, so
# website_3 rows belong to group 1 (the only models fitted on that website).
test_columns = ['id'] + LIST_FEATURES
df_test_1 = df_test.loc[
    df_test['website'].isin(['website_1', 'website_3', 'website_4']),
    test_columns
].reset_index(drop=True)
df_test_2 = df_test.loc[
    df_test['website'].isin(['website_2', 'website_5']),
    test_columns
].reset_index(drop=True)

In [None]:
# Sum the test predictions of the fold models of each group, then turn the
# sums into averages by dividing by the number of folds.
preds_final_1, preds_final_2 = None, None
for fold_model_1, fold_model_2 in zip(list_models_1, list_models_2):
    fold_preds_1 = fold_model_1.predict(df_test_1[LIST_FEATURES])
    fold_preds_2 = fold_model_2.predict(df_test_2[LIST_FEATURES])
    if preds_final_1 is None:
        # First pair of models: start the running sums.
        preds_final_1, preds_final_2 = fold_preds_1, fold_preds_2
    else:
        preds_final_1 = preds_final_1 + fold_preds_1
        preds_final_2 = preds_final_2 + fold_preds_2

preds_final_1 = preds_final_1 / NUM_FOLDS
preds_final_2 = preds_final_2 / NUM_FOLDS

In [None]:
# Attach the averaged predictions to each group, then stack the two groups
# into a single frame with the 'id' and 'price' columns.
df_test_1['price'] = preds_final_1
df_test_2['price'] = preds_final_2
submission_columns = ['id', 'price']
df_test_sub = pd.concat(
    [group[submission_columns] for group in (df_test_1, df_test_2)]
)

In [None]:
# Write the submission file; the DataFrame index is not written.
df_test_sub.to_csv(
    './submission.csv',
    index=False,
)