# 🚗 Predicción de Precios de Coches Usados
Este notebook contiene el análisis exploratorio (EDA), la ingeniería de variables y el entrenamiento de un modelo de ensamble para predecir el precio de vehículos.

In [8]:
import sys
!{sys.executable} -m pip install pandas numpy matplotlib seaborn scikit-learn xgboost optuna joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import optuna
import joblib
import warnings
warnings.filterwarnings('ignore')

sns.set(style="whitegrid")



## 1. Carga de Datos

In [12]:

# Read the train and test CSV files from the local data/ directory.
train_df = pd.read_csv(filepath_or_buffer='data/train.csv')
test_df = pd.read_csv(filepath_or_buffer='data/test.csv')

# Print the (rows, columns) shape of each frame as a quick sanity check.
print(f"Dimensiones Train: {train_df.shape}")
print(f"Dimensiones Test: {test_df.shape}")

Dimensiones Train: (188533, 13)
Dimensiones Test: (125690, 12)


## 2. Análisis Exploratorio (EDA)

In [None]:
# Distribution of the target variable (price): 50-bin histogram with a KDE
# overlay.
plt.figure(figsize=(10, 6))
sns.histplot(train_df['price'], kde=True, bins=50, color='royalblue')
# The accented characters in this title had been corrupted by a bad encoding
# round-trip, so the figure showed garbage; restored to proper Spanish.
plt.title('Distribución de Precios de Coches')
plt.xlabel('Precio')
plt.ylabel('Frecuencia')
plt.show()

# Correlation between the numeric columns only; numeric_only=True keeps
# the string columns out of DataFrame.corr().
plt.figure(figsize=(12, 8))
sns.heatmap(train_df.corr(numeric_only=True), annot=True, cmap='coolwarm', fmt='.2f')
# Same mis-encoded accent repaired here.
plt.title('Matriz de Correlación')
plt.show()

## 3. Ingeniería de Características (Feature Engineering)

In [None]:
def clean_engine(engine_str):
    """Extract the horsepower figure from a free-text engine description.

    The first number immediately followed by ``HP`` is used, e.g.
    ``"172.0HP 1.6L 4 Cylinder Engine"`` yields ``172.0``.

    Args:
        engine_str: Engine description. Anything that is not a string
            (``None``, ``NaN``, numbers, ...) is treated as missing.

    Returns:
        The horsepower as a float, or ``0.0`` when the value is missing or
        contains no ``<number>HP`` token.
    """
    # An explicit type guard replaces the former bare ``except:``, which also
    # swallowed KeyboardInterrupt/SystemExit and could mask genuine bugs.
    # Missing values (None/NaN) are not strings, so they land here too.
    if not isinstance(engine_str, str):
        return 0.0

    # The pattern only captures digits with an optional decimal part, so the
    # float() conversion below cannot fail.
    match = re.search(r'(\d+\.?\d*)HP', engine_str)
    return float(match.group(1)) if match else 0.0

def preprocess_data(df, reference_year=2024):
    """Build the model feature matrix from a raw car-listing DataFrame.

    The input frame is copied, so the caller's DataFrame is not modified.

    Args:
        df: Raw data; the columns ``engine``, ``model_year``, ``accident``,
            ``fuel_type``, ``brand``, ``transmission`` and ``milage`` are
            read.
        reference_year: Year used to turn ``model_year`` into an age.
            Defaults to 2024, the value that was previously hard-coded.

    Returns:
        A DataFrame with the numeric features ``hp``, ``car_age``,
        ``milage`` and ``has_accident`` plus one-hot columns for ``brand``,
        ``fuel_type`` and ``transmission`` (first level of each dropped).

    Note:
        The dummy columns are derived from the categories present in ``df``,
        so two different frames can produce different column sets.
    """
    df = df.copy()

    # Horsepower parsed out of the free-text engine description.
    df['hp'] = df['engine'].apply(clean_engine)

    # Age of the car relative to the reference year.
    df['car_age'] = reference_year - df['model_year']

    # Binary accident flag. A vectorized comparison replaces the row-wise
    # lambda; missing values compare unequal and therefore map to 0.
    df['has_accident'] = (
        df['accident'].eq('At least 1 accident or damage reported').astype('int64')
    )

    # Give missing fuel types their own explicit category.
    df['fuel_type'] = df['fuel_type'].fillna('Unknown')

    # Columns fed to the model.
    features = ['brand', 'fuel_type', 'transmission', 'hp', 'car_age', 'milage', 'has_accident']

    # Simple one-hot encoding for this example (use a more advanced encoding
    # in production).
    df_processed = pd.get_dummies(df[features], columns=['brand', 'fuel_type', 'transmission'], drop_first=True)

    return df_processed

# Feature matrix and target taken from the training frame.
X = preprocess_data(train_df)
y = train_df['price']

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 4. Optimización con Optuna

In [None]:
def objective(trial):
    """Optuna objective: validation RMSE of an XGBoost regressor.

    Samples one hyperparameter configuration from ``trial``, fits an
    ``XGBRegressor`` on the module-level ``X_train``/``y_train`` and scores
    it on ``X_val``/``y_val``.

    Args:
        trial: Object providing ``suggest_int`` / ``suggest_float``.

    Returns:
        The root mean squared error on the validation split.
    """
    # Sampled in the same order as the search space is declared; the
    # histogram tree builder is fixed rather than tuned.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        max_depth=trial.suggest_int('max_depth', 3, 9),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        tree_method='hist',
    )

    regressor = xgb.XGBRegressor(**search_space)
    regressor.fit(X_train, y_train)

    val_mse = mean_squared_error(y_val, regressor.predict(X_val))
    return np.sqrt(val_mse)

# Ejecutar estudio (limitado para el ejemplo)
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=20)
# print(study.best_params)

## 5. Entrenamiento Modelo Final (XGBoost Ensemble)

In [None]:
# Hyperparameters for the final model. These are fixed example values; the
# Optuna study that would produce tuned ones is commented out in this notebook.
best_params = {
    'n_estimators': 500,
    'max_depth': 6,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    # Use the same tree builder as the tuning objective so the final model is
    # trained under the conditions the search space was evaluated with.
    'tree_method': 'hist',
}

model = xgb.XGBRegressor(**best_params)
model.fit(X_train, y_train)

# Predictions on both splits: validation for the reported metrics, train for
# the overfitting gap below.
train_preds = model.predict(X_train)
val_preds = model.predict(X_val)

# Validation R2 is used twice (report + gap), so compute it once.
val_r2 = r2_score(y_val, val_preds)

# Header fixed: its accented characters had been corrupted by a bad encoding
# round-trip and printed as garbage.
print("--- Métricas de Regresión ---")
print(f"MAE: {mean_absolute_error(y_val, val_preds):.2f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_val, val_preds)):.2f}")
print(f"R2 Score: {val_r2:.4f}")

# Gap between train and validation R2, scaled by 100 (percentage points);
# a large positive value suggests the model is overfitting.
overfitting = (r2_score(y_train, train_preds) - val_r2) * 100
print(f"Diferencia de Rendimiento (Overfitting): {overfitting:.2f}%")

## 6. Exportación del Modelo

In [None]:
# Persist the fitted model.
joblib.dump(value=model, filename='car_price_model.pkl')

# Persist the training column names alongside the model, as a plain list.
joblib.dump(value=list(X.columns), filename='model_features.pkl')

print("Modelo y columnas guardados correctamente.")