In [None]:
import gc
import re
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from dython.nominal import associations
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [None]:
# Feature Engineering

def extract_horsepower(engine):
    """Extract the horsepower figure from an engine description string.

    Searches for a number immediately followed by the literal ``HP``
    (optionally separated by whitespace), e.g.
    ``"172.0HP 1.6L 4 Cylinder Engine"`` -> ``172.0``.

    Args:
        engine: Engine description. Non-string values (``None``, NaN, numbers)
            are tolerated and yield ``None``.

    Returns:
        The horsepower as ``float``, or ``None`` when the text contains no
        ``<number>HP`` token.
    """
    if not isinstance(engine, str):
        return None
    # Require the 'HP' marker explicitly: a string without it must not be
    # parsed as a bare number and reported as horsepower.
    match = re.search(r'(\d+(?:\.\d*)?|\.\d+)\s*HP', engine)
    return float(match.group(1)) if match else None
    
def extract_engine_size(engine):
    """Extract the engine displacement in litres from an engine description.

    Searches anywhere in the text for a number followed by ``L`` or ``Liter``
    as a whole word, e.g. ``"172.0HP 1.6L 4 Cylinder"`` -> ``1.6`` and
    ``"3.5L V6 24V"`` -> ``3.5``.

    Args:
        engine: Engine description. Non-string values (``None``, NaN, numbers)
            are tolerated and yield ``None``.

    Returns:
        The displacement as ``float``, or ``None`` when no litre figure is
        present in the text.
    """
    if not isinstance(engine, str):
        return None
    # Match on the litre marker rather than on token position, so the value is
    # found when the horsepower token is missing and an unrelated second token
    # (such as a cylinder count) is never mistaken for the displacement.
    match = re.search(r'(\d+(?:\.\d*)?|\.\d+)\s*L(?:iter)?\b', engine)
    return float(match.group(1)) if match else None
        
        
def feature_engineering(df_engine, current_year=None):
    """Add derived vehicle features to ``df_engine`` in place and return it.

    Columns added (or overwritten):
        age: ``current_year - model_year``.
        milage_per_year: ``milage`` divided by ``age``, with the divisor
            floored at one year.
        horsepower: parsed from ``engine`` by ``extract_horsepower``.
        engine_size: parsed from ``engine`` by ``extract_engine_size``.
        power_to_weight_ratio: ``horsepower / engine_size``. Despite the name
            no weight is involved; the column name is kept because downstream
            code and trained models refer to it.

    Args:
        df_engine: DataFrame with ``model_year``, ``milage`` and ``engine``
            columns. It is modified in place.
        current_year: Reference year for the age computation. Defaults to the
            current calendar year; pass a fixed value to make the features
            reproducible across runs.

    Returns:
        The same DataFrame object that was passed in.
    """
    if current_year is None:
        current_year = datetime.now().year

    df_engine['age'] = current_year - df_engine['model_year']
    # Cars of the current (or a future) model year have age <= 0; floor the
    # divisor at one year so the rate is never inf, NaN or negative.
    df_engine['milage_per_year'] = df_engine['milage'] / df_engine['age'].clip(lower=1)

    # astype(float) keeps the columns numeric even if every row parses to None.
    df_engine['horsepower'] = df_engine['engine'].apply(extract_horsepower).astype(float)
    df_engine['engine_size'] = df_engine['engine'].apply(extract_engine_size).astype(float)
    # A zero displacement is treated as missing instead of producing inf.
    df_engine['power_to_weight_ratio'] = (
        df_engine['horsepower'] / df_engine['engine_size'].replace(0, np.nan)
    )

    # TODO: a luxury-brand flag and an accident-impact feature were sketched
    # here but never enabled.
    return df_engine

In [None]:
# filtrar outliers en price

def filtrar_outliers_price(df, col='price', factor=1.5):
    """Remove outliers from a numeric column using the interquartile range.

    Rows are kept when ``Q1 - factor * IQR <= df[col] <= Q3 + factor * IQR``.
    Rows where ``df[col]`` is NaN fail the comparison and are removed as well.
    A one-line summary of how many rows were removed is printed.

    Args:
        df: Input DataFrame; it is not modified.
        col: Name of the numeric column to filter on.
        factor: Multiplier applied to the IQR to build the fences.

    Returns:
        A new DataFrame without the outlier rows and with a fresh 0..n-1 index.
    """
    total = len(df)
    if total == 0:
        # Nothing to filter; also avoids dividing by zero in the summary below.
        print("Filtrado: 0 filas eliminadas (0.00%)")
        return df.reset_index(drop=True)

    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    limite_inf = q1 - factor * iqr
    limite_sup = q3 + factor * iqr

    dentro = (df[col] >= limite_inf) & (df[col] <= limite_sup)
    df_filtrado = df[dentro].reset_index(drop=True)
    print(f"Filtrado: {total - len(df_filtrado)} filas eliminadas ({100*(1 - len(df_filtrado)/total):.2f}%)")
    return df_filtrado

In [None]:
def train_catboost(X_train, y_train, X_test, y_test, categorical_cols):
    """Fit a CatBoostRegressor on the training split and return it.

    Args:
        X_train, y_train: Features and target used for fitting.
        X_test, y_test: Passed to CatBoost as ``eval_set``.
        categorical_cols: Column names handed to CatBoost as ``cat_features``.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    hyperparams = {
        'iterations': 1500,
        'learning_rate': 0.07,
        'depth': 4,
        'eval_metric': 'RMSE',
        'random_seed': 42,
        'verbose': 100,  # log progress every 100 iterations
    }
    regressor = CatBoostRegressor(**hyperparams)

    # NOTE(review): the test split doubles as the evaluation set. CatBoost
    # presumably selects the best iteration on it (use_best_model), which would
    # make test metrics reported afterwards optimistic - confirm, and consider
    # a separate validation split.
    evaluation_split = (X_test, y_test)
    regressor.fit(
        X_train,
        y_train,
        cat_features=categorical_cols,
        eval_set=evaluation_split,
    )
    return regressor

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Evaluate a fitted model on the train and test splits.

    Prints RMSE and R2 for both splits plus the relative overfitting, defined
    as ``(rmse_test - rmse_train) / rmse_train * 100``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train, y_train: Training features and target.
        X_test, y_test: Held-out features and target.

    Returns:
        Tuple ``(rmse_train, r2_train, rmse_test, r2_test)``.
    """
    # predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # metrics
    rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
    r2_train = r2_score(y_train, y_pred_train)

    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
    r2_test = r2_score(y_test, y_pred_test)

    # Relative gap between test and train error; undefined for a perfect
    # training fit (rmse_train == 0), reported as NaN in that case.
    if rmse_train:
        overfitting_pct = (rmse_test - rmse_train) / rmse_train * 100
    else:
        overfitting_pct = float('nan')

    print(f"RMSE Train: {rmse_train:.2f} | R2 Train: {r2_train:.2f}")
    print(f"RMSE Test : {rmse_test:.2f} | R2 Test : {r2_test:.2f}")
    # Reported once; the original recomputed and printed this line twice.
    print(f"Overfitting relativo: {overfitting_pct:.2f}%")

    return rmse_train, r2_train, rmse_test, r2_test

In [None]:
# reentrenar el modelo 
def retrain_full_model(X, y, categorical_cols, save_path = '../models/catboost_final_model.cbm'):
    """Retrain the model on the full dataset (train + test) and save it.

    Args:
        X: Full feature matrix.
        y: Full target vector.
        categorical_cols: Column names handed to CatBoost as ``cat_features``.
        save_path: Destination file for the serialized model. Its parent
            directory is created if it does not exist.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    # Create the output directory before training so a missing folder cannot
    # make the save fail after the whole fit has already been paid for.
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)

    final_model = CatBoostRegressor(
        iterations=1500,
        learning_rate=0.07,
        depth=4,
        eval_metric='RMSE',
        random_seed=42,
        verbose=100
    )
    # No eval_set here: every row is used for fitting.
    final_model.fit(X, y, cat_features=categorical_cols)
    final_model.save_model(str(save_path))
    return final_model

In [None]:
def pretrain(train_df):
    """Prepare the raw training data for the CatBoost model.

    Steps: feature engineering, removal of the columns not used as features
    (``id``, ``model_year``, ``engine``, ``clean_title``), removal of rows with
    missing values, IQR outlier filtering on ``price``, and the split into
    features and target.

    Args:
        train_df: Raw training DataFrame including the ``price`` target. It is
            not modified.

    Returns:
        Tuple ``(X, y, categorical_cols)``: the feature DataFrame, the ``price``
        Series, and the list of object-dtype column names in ``X``.
    """
    # feature_engineering adds columns in place, so work on a copy to leave
    # the caller's DataFrame untouched.
    prepared = feature_engineering(train_df.copy())

    # Drop the unused columns BEFORE dropna, so rows are not discarded for
    # missing values in columns that are not features.
    prepared = (
        prepared
        .drop(['id', 'model_year', 'engine', 'clean_title'], axis=1)
        .dropna()
        .reset_index(drop=True)
    )

    # filter outliers in price
    prepared = filtrar_outliers_price(prepared)

    # NOTE: the former 'coches_ultralujo' column was derived from 'price' and
    # then left in X. That leaks the target into the features and cannot be
    # computed at prediction time, so it is no longer created.

    # split features and target
    X = prepared.drop('price', axis=1)
    y = prepared['price']

    # identify categorical columns
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

    return X, y, categorical_cols

    
        

In [None]:
# MAIN PIPELINE

# load data
train_df = pd.read_csv('../datasets/train.csv')

# prepare data
X, y, categorical_cols = pretrain(train_df)

# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 42)

# train model
catboost_model = train_catboost(X_train, y_train, X_test, y_test, categorical_cols)

# evaluate model
evaluate_model(catboost_model, X_train, y_train, X_test, y_test)

# Retrain on the full dataset and save the final model.
# retrain_full_model accepts (X, y, categorical_cols[, save_path]); the extra
# X_test / y_test arguments previously passed here raised a TypeError.
final_model = retrain_full_model(X, y, categorical_cols)
