In [None]:
import gc
import re
from datetime import datetime

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from dython.nominal import associations
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [None]:
# Load the training data and discard the 'id' column, which is not used as a feature
train_df = pd.read_csv('./datasets/train.csv').drop(columns='id')


In [None]:
# Object-dtype columns are treated as the categorical variables of the dataset
categorical_columns = train_df.select_dtypes(include=['object']).columns

# Report the cardinality of every categorical column while collecting it in a dict
unique_values = {}
for col in categorical_columns:
    unique_values[col] = train_df[col].nunique()
    print(f"{col}: {unique_values[col]} unique values")

gc.collect()

In [None]:
# Correlation matrix including categorical variables
def matriz_correlacion_categoricas(df):
    """Plot a heatmap of the pairwise association matrix of ``df``.

    The matrix is taken from the ``'corr'`` entry of the result returned by
    dython's ``associations``. Every object-dtype column of ``df`` is passed
    as a nominal (categorical) column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared. The nominal columns are derived
        from this frame itself, so the function also works after columns
        have been added or dropped.

    Returns
    -------
    None
        The heatmap is displayed with ``plt.show()``.
    """
    # Derive the nominal columns from the frame that was actually passed in,
    # instead of relying on notebook-level globals that may be stale.
    nominal_columns = df.select_dtypes(include=['object']).columns.tolist()
    associations_result = associations(df, nominal_columns=nominal_columns, plot=False)
    corr_matrix = associations_result['corr']

    plt.figure(figsize=(15, 10))
    plt.title('Correlation Matrix Including Categorical Variables', fontsize=16)
    sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, square=True, linewidths=.5)
    plt.show()

In [None]:
# Show the association heatmap for the training data before any feature engineering
matriz_correlacion_categoricas(train_df)

In [None]:
# Feature Engineering

def extract_horsepower(engine):
    '''Extract the horsepower figure from an engine description string.

    The text that precedes the first 'HP' marker is parsed as a float,
    e.g. '172.0HP 1.6L 4 Cylinder Engine' -> 172.0.

    Returns None when ``engine`` is not a string (for example a missing
    value) or when the text before 'HP' is not a number.
    '''
    try:
        return float(engine.split('HP')[0])
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: engine is not a str (NaN, None, ...)
        # ValueError: the leading text is not numeric (no horsepower given)
        return None
    
# A number directly followed by 'L' (e.g. '1.6L') or by the word 'Liter' (e.g. '2.0 Liter')
_ENGINE_SIZE_PATTERN = re.compile(r'(\d+(?:\.\d+)?)\s*(?:L\b|Liter)')


def extract_engine_size(engine):
    '''Extract the engine displacement in liters from an engine description string.

    The displacement is located by pattern rather than by token position, so
    it is found wherever it appears in the string:
    '172.0HP 1.6L 4 Cylinder Engine' -> 1.6, '3.5L V6 24V' -> 3.5,
    '2.0 Liter' -> 2.0.

    Returns None when ``engine`` is not a string (for example a missing
    value) or when no displacement is present.
    '''
    if not isinstance(engine, str):
        return None
    match = _ENGINE_SIZE_PATTERN.search(engine)
    if match is None:
        return None
    return float(match.group(1))
        
        
def feature_engineering(df_clean, current_year=None):
    """Add derived features to ``df_clean`` in place and return it.

    Columns added:
    - ``age``: ``current_year - model_year``.
    - ``milage_per_year``: ``milage`` divided by ``age`` (age floored at 1).
    - ``horsepower``: parsed from ``engine`` by ``extract_horsepower``.
    - ``engine_size``: parsed from ``engine`` by ``extract_engine_size``.
    - ``power_to_weight_ratio``: ``horsepower / engine_size``. Despite the
      name this is power per liter of displacement; the name is kept so
      existing column references keep working.

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Frame with at least the columns ``model_year``, ``milage`` and
        ``engine``. It is modified in place.
    current_year : int, optional
        Reference year used to compute ``age``. Defaults to the current
        calendar year; pass a fixed value for reproducible results.

    Returns
    -------
    pandas.DataFrame
        The same ``df_clean`` object, with the new columns added.
    """
    if current_year is None:
        current_year = datetime.now().year

    df_clean['age'] = current_year - df_clean['model_year']
    # Cars from the current (or a future) model year have age <= 0. Count them
    # as one year of use so the ratio stays finite instead of becoming inf.
    df_clean['milage_per_year'] = df_clean['milage'] / df_clean['age'].clip(lower=1)

    # to_numeric guarantees a float column even when every value failed to parse
    df_clean['horsepower'] = pd.to_numeric(
        df_clean['engine'].apply(extract_horsepower), errors='coerce'
    )
    df_clean['engine_size'] = pd.to_numeric(
        df_clean['engine'].apply(extract_engine_size), errors='coerce'
    )
    # A zero displacement would yield inf, which dropna() does not remove;
    # turn it into NaN so it is handled like any other missing value.
    df_clean['power_to_weight_ratio'] = (
        df_clean['horsepower'] / df_clean['engine_size'].replace(0, np.nan)
    )

    # TODO: candidate features not implemented yet: a luxury-brand flag based
    # on 'brand', and an accident-impact flag combining 'accident' and 'clean_title'.

    return df_clean

In [None]:

# Apply feature engineering to the training set
train_df = feature_engineering(train_df)

# Drop the columns that are no longer needed *before* removing incomplete rows.
# Otherwise rows are discarded only because a column we throw away anyway
# (e.g. 'clean_title') has a missing value.
train_df = (
    train_df
    .drop(columns=['model_year', 'engine', 'clean_title'])
    .dropna()
    .reset_index(drop=True)
)
train_df.head()

In [None]:
#matriz_correlacion_categoricas(train_df)

In [None]:
# Inspect the columns that remain after feature engineering and column removal
train_df.columns

In [None]:


# Filter outliers in price

def filtrar_outliers_price(df, col='price', factor=1.5):
    """Remove the rows whose ``col`` value lies outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``df[col]``. Values equal to
    a fence are kept. The number and percentage of removed rows is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : str, default 'price'
        Numeric column used to detect outliers.
    factor : float, default 1.5
        Multiplier applied to the IQR to obtain the fences.

    Returns
    -------
    pandas.DataFrame
        Filtered copy of ``df`` with a fresh 0..n-1 index.
    """
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    limite_inf = q1 - factor * iqr
    limite_sup = q3 + factor * iqr

    # between() is inclusive on both ends, matching ">= lower and <= upper"
    df_filtrado = df[df[col].between(limite_inf, limite_sup)].reset_index(drop=True)

    eliminadas = len(df) - len(df_filtrado)
    # Guard the percentage against an empty input frame (division by zero)
    porcentaje = 100 * (1 - len(df_filtrado) / len(df)) if len(df) else 0.0
    print(f"Filtrado: {eliminadas} filas eliminadas ({porcentaje:.2f}%)")
    return df_filtrado

train_df = filtrar_outliers_price(train_df, col='price', factor=1.5)


# Flag 'ultra-luxury' cars (price above 150000).
# NOTE: this flag is computed from the target itself, so it is kept in
# train_df for analysis only and must never be used as a model feature.
train_df['coches_ultralujo'] = np.where(train_df['price'] > 150000, 'ultralujo', 'normal')

# Separate features and target. The price-derived flag is excluded from the
# features to avoid leaking the target into the model.
y = train_df['price']
X = train_df.drop(columns=['price', 'coches_ultralujo'])

# Detect the categorical columns among the features
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()



# Hold out a test set that takes no part in fitting or model selection
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Carve a validation set out of the remaining data. CatBoost monitors the
# eval_set during training and, by default, keeps the best iteration found
# on it; using the test set for that would make the test metrics optimistic.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42
)


catboost_model = CatBoostRegressor(
    iterations=1500,
    learning_rate=0.07,
    depth=4,
    eval_metric='RMSE',
    random_seed=42,
    verbose=100
)

catboost_model.fit(
    X_train, y_train,
    cat_features=categorical_cols,
    eval_set=(X_val, y_val)
)

# Predictions and metrics, computed split by split

# Training split
y_pred_train = catboost_model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
r2_train = r2_score(y_train, y_pred_train)

# Test split
y_pred_test = catboost_model.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
r2_test = r2_score(y_test, y_pred_test)

# Relative gap between test and train RMSE, as a percentage of the train RMSE
rmse_gap = rmse_test - rmse_train
overfitting_pct = rmse_gap / rmse_train * 100

print(f"RMSE Train: {rmse_train:.2f} | R2 Train: {r2_train:.2f}")
print(f"RMSE Test : {rmse_test:.2f} | R2 Test : {r2_test:.2f}")
print(f"Overfitting relativo: {overfitting_pct:.2f}%")



# Measure overfitting from the best RMSE values CatBoost recorded during
# training: 'learn' is the training data, 'validation' is the eval_set.
# Note that rmse_train / rmse_test are reassigned here with those values.
best_scores = catboost_model.get_best_score()
rmse_train = best_scores['learn']['RMSE']
rmse_test = best_scores['validation']['RMSE']

rmse_gap = rmse_test - rmse_train
overfitting_pct = rmse_gap / rmse_train * 100
print(f"Overfitting relativo: {overfitting_pct:.2f}%")
