#0

In [None]:
# Importar librerías necesarias
import pandas as pd
import numpy as np
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split  # Corrección aquí
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training and evaluation data.
df_train = pd.read_excel("/content/df_train_with_weights_replaced.xlsx")
df_eval = pd.read_excel("/content/df_eval_with_weights_replaced.xlsx")

# Year that the 'Age' feature is measured against.
REFERENCE_YEAR = 2024


def add_engineered_features(df, reference_year=REFERENCE_YEAR):
    """Add the derived feature columns to ``df`` in place and return it.

    The same transformation is applied to the training and the evaluation
    frame, so both always end up with identical engineered columns.

    Args:
        df: DataFrame holding the columns 'Prod. year', 'Mileage',
            'Engine volume', 'Cylinders', 'Sales Fee' and 'Fuel type'.
            'Fuel type' is used arithmetically, so it must already be numeric
            (presumably an encoded category -- confirm against the data prep).
        reference_year: Year used to turn 'Prod. year' into 'Age'.

    Returns:
        The same DataFrame object, with the new columns appended.

    Raises:
        KeyError: If one of the required source columns is missing.
    """
    # Columns are inserted in the same order as the original script so the
    # layout of the feature matrix is unchanged.
    df['Age'] = reference_year - df['Prod. year']

    # The "+ 1" in every denominator keeps it non-zero for zero-valued inputs.
    df['Mileage_Engine_ratio'] = df['Mileage'] / (df['Engine volume'] + 1)
    df['Mileage_per_Cylinder'] = df['Mileage'] / (df['Cylinders'] + 1)
    df['EngineVolume_per_Age'] = df['Engine volume'] / (df['Age'] + 1)
    df['Age_SalesFee'] = df['Age'] * df['Sales Fee']
    df['Mileage_Age_ratio'] = df['Mileage'] / (df['Age'] + 1)
    df['Age_FuelType_ratio'] = df['Age'] / (df['Fuel type'] + 1)
    # NOTE: same expression as 'Mileage_Engine_ratio', so the two columns are
    # identical; kept so the feature set matches the original.
    df['Mileage_per_EngineVolume'] = df['Mileage'] / (df['Engine volume'] + 1)
    df['log_Mileage'] = np.log1p(df['Mileage'])
    df['log_EngineVolume'] = np.log1p(df['Engine volume'])
    df['Age_squared'] = df['Age'] ** 2
    df['Mileage_Age'] = df['Mileage'] * df['Age']
    df['Cylinders_squared'] = df['Cylinders'] ** 2
    df['Mileage_Age_Log'] = np.log1p(df['Mileage_Age'])
    df['SalesFee_log_Mileage'] = df['log_Mileage'] * df['Sales Fee']
    return df


# Build the engineered features for both datasets with the same code path.
df_train = add_engineered_features(df_train)
df_eval = add_engineered_features(df_eval)

# Split into features (X) and target (y); 'Id' is an identifier, not a feature.
X = df_train.drop(['price', 'Id'], axis=1)
y = df_train['price']

# Hold out 20% of the training rows as a validation set.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and train the XGBoost model.
xgboost_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=6540,
    max_depth=15,
    learning_rate=0.00085,
    subsample=0.87,
    colsample_bytree=0.82,
    alpha=0.0385,
    reg_lambda=4.01,
    gamma=0.29,
    min_child_weight=4,
    random_state=42
)
xgboost_model.fit(X_train, y_train)

# Define and train the CatBoost model.
catboost_model = CatBoostRegressor(
    iterations=6540,
    depth=10,
    learning_rate=0.01,
    l2_leaf_reg=4.01,
    random_seed=42,
    verbose=False
)
catboost_model.fit(X_train, y_train)

# Validation RMSE of each model on its own.
# NOTE: these metrics use the raw predictions, whereas the submission below is
# clipped at zero.
y_val_pred_xgb = xgboost_model.predict(X_val)
rmse_val_xgb = np.sqrt(mean_squared_error(y_val, y_val_pred_xgb))
print("RMSE en validación con XGBoost:", rmse_val_xgb)

y_val_pred_cat = catboost_model.predict(X_val)
rmse_val_cat = np.sqrt(mean_squared_error(y_val, y_val_pred_cat))
print("RMSE en validación con CatBoost:", rmse_val_cat)

# Validation RMSE of the ensemble: unweighted mean of both models' predictions.
y_val_pred_ensemble = (y_val_pred_xgb + y_val_pred_cat) / 2
rmse_val_ensemble = np.sqrt(mean_squared_error(y_val, y_val_pred_ensemble))
print("RMSE en validación con Ensemble:", rmse_val_ensemble)

# Select the evaluation features by the training column list so the matrix has
# exactly the columns, in the same order, that the models were fitted on.
# A column missing from the evaluation file raises a KeyError here.
X_eval = df_eval[X.columns]
eval_pred_xgb = xgboost_model.predict(X_eval)
eval_pred_cat = catboost_model.predict(X_eval)
eval_pred_ensemble = (eval_pred_xgb + eval_pred_cat) / 2

# Clip negative predictions to zero.
eval_pred_ensemble = np.maximum(eval_pred_ensemble, 0)

# Assemble the submission table.
submission_df = pd.DataFrame({
    "Id": df_eval["Id"],
    "Predicted_Price": eval_pred_ensemble
})

# Write the submission as CSV (without the index column).
submission_df.to_csv("submission_ensemble_xgboost_catboost.csv", index=False)

print("Archivo de submission generado: 'submission_ensemble_xgboost_catboost.csv'")
submission_df.head()


#1

In [None]:
# Importar librerías necesarias
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training and evaluation data.
df_train = pd.read_excel("/content/df_train_with_weights_replaced.xlsx")
df_eval = pd.read_excel("/content/df_eval_with_weights_replaced.xlsx")

# Correlation heatmap of the raw training columns (before feature engineering).
correlation_matrix = df_train.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Matriz de Correlación entre Características")
plt.show()


def add_engineered_features(df, reference_year=2024):
    """Add the derived feature columns to ``df`` in place and return it.

    The same transformation is applied to the training and the evaluation
    frame, so both always end up with identical engineered columns.

    Args:
        df: DataFrame holding the columns 'Prod. year', 'Mileage',
            'Engine volume', 'Cylinders', 'Sales Fee' and 'Fuel type'.
            'Fuel type' is used arithmetically, so it must already be numeric
            (presumably an encoded category -- confirm against the data prep).
        reference_year: Year used to turn 'Prod. year' into 'Age'.

    Returns:
        The same DataFrame object, with the new columns appended.

    Raises:
        KeyError: If one of the required source columns is missing.
    """
    # Columns are inserted in the same order as the original script
    # ('Mileage_Engine_ratio' before 'Age') so the feature matrix layout is
    # unchanged. The "+ 1" keeps every denominator non-zero for zero inputs.
    df['Mileage_Engine_ratio'] = df['Mileage'] / (df['Engine volume'] + 1)
    df['Age'] = reference_year - df['Prod. year']
    df['Mileage_per_Cylinder'] = df['Mileage'] / (df['Cylinders'] + 1)
    df['EngineVolume_per_Age'] = df['Engine volume'] / (df['Age'] + 1)
    df['Age_SalesFee'] = df['Age'] * df['Sales Fee']
    df['Mileage_Age_ratio'] = df['Mileage'] / (df['Age'] + 1)
    df['Age_FuelType_ratio'] = df['Age'] / (df['Fuel type'] + 1)
    # NOTE: same expression as 'Mileage_Engine_ratio', so the two columns are
    # identical; kept so the feature set matches the original.
    df['Mileage_per_EngineVolume'] = df['Mileage'] / (df['Engine volume'] + 1)
    df['log_Mileage'] = np.log1p(df['Mileage'])
    df['log_EngineVolume'] = np.log1p(df['Engine volume'])
    df['Age_squared'] = df['Age'] ** 2
    df['Mileage_Age'] = df['Mileage'] * df['Age']
    df['Cylinders_squared'] = df['Cylinders'] ** 2
    df['Mileage_Age_Log'] = np.log1p(df['Mileage_Age'])
    df['SalesFee_log_Mileage'] = df['log_Mileage'] * df['Sales Fee']
    return df


# Build the engineered features for both datasets with the same code path.
df_train = add_engineered_features(df_train)
df_eval = add_engineered_features(df_eval)

# Split the training data into features and target; 'Id' is not a feature.
X_train_filtered = df_train.drop(['price', 'Id'], axis=1)
y_train_filtered = df_train['price']

# Hold out 20% of the training rows as a validation set.
X_train, X_val, y_train, y_val = train_test_split(X_train_filtered, y_train_filtered, test_size=0.2, random_state=42)

# XGBoost configuration for submission v11.
xgboost_model_optimized = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=6540,
    max_depth=15,
    learning_rate=0.00085,
    subsample=0.87,
    colsample_bytree=0.82,
    alpha=0.0385,
    reg_lambda=4.01,
    gamma=0.29,
    min_child_weight=4,
    random_state=42
)

# Fit on the 80% training split only.
xgboost_model_optimized.fit(X_train, y_train)

# RMSE on the rows the model was fitted on.
y_train_pred = xgboost_model_optimized.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print("RMSE en el conjunto de entrenamiento con XGBoost optimizado:", rmse_train)

# RMSE on the held-out validation rows.
y_val_pred = xgboost_model_optimized.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("RMSE en el conjunto de validación con XGBoost optimizado:", rmse_val)

# Select the evaluation features by the training column list so the matrix has
# exactly the columns, in the same order, that the model was fitted on.
# A column missing from the evaluation file raises a KeyError here.
X_eval = df_eval[X_train.columns]
eval_predictions = xgboost_model_optimized.predict(X_eval)

# Clip negative predictions to zero.
eval_predictions = np.maximum(eval_predictions, 0)

# Assemble the submission table.
submission_df = pd.DataFrame({
    "Id": df_eval["Id"],
    "Predicted_Price": eval_predictions
})

# Write the submission as CSV (without the index column).
submission_df.to_csv("submission_xgboost_optimized_v11.csv", index=False)

print("Archivo de submission generado: 'submission_xgboost_optimized_v11.csv'")
submission_df.head()


#2

In [None]:
# Importar librerías necesarias
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training and evaluation data.
df_train = pd.read_excel("/content/df_train_with_weights_replaced.xlsx")
df_eval = pd.read_excel("/content/df_eval_with_weights_replaced.xlsx")

# Correlation heatmap of the raw training columns (before feature engineering).
correlation_matrix = df_train.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Matriz de Correlación entre Características")
plt.show()


def add_engineered_features(df, reference_year=2024):
    """Add the derived feature columns to ``df`` in place and return it.

    The same transformation is applied to the training and the evaluation
    frame, so both always end up with identical engineered columns.

    Args:
        df: DataFrame holding the columns 'Prod. year', 'Mileage',
            'Engine volume', 'Cylinders', 'Sales Fee' and 'Fuel type'.
            'Fuel type' is used arithmetically, so it must already be numeric
            (presumably an encoded category -- confirm against the data prep).
        reference_year: Year used to turn 'Prod. year' into 'Age'.

    Returns:
        The same DataFrame object, with the new columns appended.

    Raises:
        KeyError: If one of the required source columns is missing.
    """
    # Columns are inserted in the same order as the original script
    # ('Mileage_Engine_ratio' before 'Age') so the feature matrix layout is
    # unchanged. The "+ 1" keeps every denominator non-zero for zero inputs.
    df['Mileage_Engine_ratio'] = df['Mileage'] / (df['Engine volume'] + 1)
    df['Age'] = reference_year - df['Prod. year']
    df['Mileage_per_Cylinder'] = df['Mileage'] / (df['Cylinders'] + 1)
    df['EngineVolume_per_Age'] = df['Engine volume'] / (df['Age'] + 1)
    df['Age_SalesFee'] = df['Age'] * df['Sales Fee']
    df['Mileage_Age_ratio'] = df['Mileage'] / (df['Age'] + 1)
    df['Age_FuelType_ratio'] = df['Age'] / (df['Fuel type'] + 1)
    # NOTE: same expression as 'Mileage_Engine_ratio', so the two columns are
    # identical; kept so the feature set matches the original.
    df['Mileage_per_EngineVolume'] = df['Mileage'] / (df['Engine volume'] + 1)
    df['log_Mileage'] = np.log1p(df['Mileage'])
    df['log_EngineVolume'] = np.log1p(df['Engine volume'])
    df['Age_squared'] = df['Age'] ** 2
    df['Mileage_Age'] = df['Mileage'] * df['Age']
    df['Cylinders_squared'] = df['Cylinders'] ** 2
    df['Mileage_Age_Log'] = np.log1p(df['Mileage_Age'])
    df['SalesFee_log_Mileage'] = df['log_Mileage'] * df['Sales Fee']
    return df


# Build the engineered features for both datasets with the same code path.
df_train = add_engineered_features(df_train)
df_eval = add_engineered_features(df_eval)

# Split the training data into features and target; 'Id' is not a feature.
X_train_filtered = df_train.drop(['price', 'Id'], axis=1)
y_train_filtered = df_train['price']

# Hold out 20% of the training rows as a validation set.
X_train, X_val, y_train, y_val = train_test_split(X_train_filtered, y_train_filtered, test_size=0.2, random_state=42)

# XGBoost configuration for submission v10.
xgboost_model_optimized = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=6520,
    max_depth=15,
    learning_rate=0.00088,
    subsample=0.86,
    colsample_bytree=0.81,
    alpha=0.039,
    reg_lambda=4.02,
    gamma=0.295,
    min_child_weight=5,
    random_state=42
)

# Fit on the 80% training split only.
xgboost_model_optimized.fit(X_train, y_train)

# RMSE on the rows the model was fitted on.
y_train_pred = xgboost_model_optimized.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print("RMSE en el conjunto de entrenamiento con XGBoost optimizado:", rmse_train)

# RMSE on the held-out validation rows.
y_val_pred = xgboost_model_optimized.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("RMSE en el conjunto de validación con XGBoost optimizado:", rmse_val)

# Select the evaluation features by the training column list so the matrix has
# exactly the columns, in the same order, that the model was fitted on.
# A column missing from the evaluation file raises a KeyError here.
X_eval = df_eval[X_train.columns]
eval_predictions = xgboost_model_optimized.predict(X_eval)

# Clip negative predictions to zero.
eval_predictions = np.maximum(eval_predictions, 0)

# Assemble the submission table.
submission_df = pd.DataFrame({
    "Id": df_eval["Id"],
    "Predicted_Price": eval_predictions
})

# Write the submission as CSV (without the index column).
submission_df.to_csv("submission_xgboost_optimized_v10.csv", index=False)

print("Archivo de submission generado: 'submission_xgboost_optimized_v10.csv'")
submission_df.head()


#3

In [None]:
# Importar librerías necesarias
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training and evaluation data.
df_train = pd.read_excel("/content/df_train_with_weights_replaced.xlsx")
df_eval = pd.read_excel("/content/df_eval_with_weights_replaced.xlsx")

# Correlation heatmap of the raw training columns (before feature engineering).
correlation_matrix = df_train.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Matriz de Correlación entre Características")
plt.show()


def add_engineered_features(df, reference_year=2024):
    """Add the derived feature columns to ``df`` in place and return it.

    The same transformation is applied to the training and the evaluation
    frame, so both always end up with identical engineered columns.

    Args:
        df: DataFrame holding the columns 'Prod. year', 'Mileage',
            'Engine volume', 'Cylinders', 'Sales Fee' and 'Fuel type'.
            'Fuel type' is used arithmetically, so it must already be numeric
            (presumably an encoded category -- confirm against the data prep).
        reference_year: Year used to turn 'Prod. year' into 'Age'.

    Returns:
        The same DataFrame object, with the new columns appended.

    Raises:
        KeyError: If one of the required source columns is missing.
    """
    # Columns are inserted in the same order as the original script
    # ('Mileage_Engine_ratio' before 'Age') so the feature matrix layout is
    # unchanged. The "+ 1" keeps every denominator non-zero for zero inputs.
    df['Mileage_Engine_ratio'] = df['Mileage'] / (df['Engine volume'] + 1)
    df['Age'] = reference_year - df['Prod. year']
    df['Mileage_per_Cylinder'] = df['Mileage'] / (df['Cylinders'] + 1)
    df['EngineVolume_per_Age'] = df['Engine volume'] / (df['Age'] + 1)
    df['Age_SalesFee'] = df['Age'] * df['Sales Fee']
    df['Mileage_Age_ratio'] = df['Mileage'] / (df['Age'] + 1)
    df['Age_FuelType_ratio'] = df['Age'] / (df['Fuel type'] + 1)
    # NOTE: same expression as 'Mileage_Engine_ratio', so the two columns are
    # identical; kept so the feature set matches the original.
    df['Mileage_per_EngineVolume'] = df['Mileage'] / (df['Engine volume'] + 1)
    df['log_Mileage'] = np.log1p(df['Mileage'])
    df['log_EngineVolume'] = np.log1p(df['Engine volume'])
    df['Age_squared'] = df['Age'] ** 2
    df['Mileage_Age'] = df['Mileage'] * df['Age']
    df['Cylinders_squared'] = df['Cylinders'] ** 2
    df['Mileage_Age_Log'] = np.log1p(df['Mileage_Age'])
    df['SalesFee_log_Mileage'] = df['log_Mileage'] * df['Sales Fee']
    return df


# Build the engineered features for both datasets with the same code path.
df_train = add_engineered_features(df_train)
df_eval = add_engineered_features(df_eval)

# Split the training data into features and target; 'Id' is not a feature.
X_train_filtered = df_train.drop(['price', 'Id'], axis=1)
y_train_filtered = df_train['price']

# Hold out 20% of the training rows as a validation set.
X_train, X_val, y_train, y_val = train_test_split(X_train_filtered, y_train_filtered, test_size=0.2, random_state=42)

# XGBoost configuration for submission v8.
xgboost_model_optimized = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=6500,
    max_depth=15,
    learning_rate=0.0009,
    subsample=0.85,
    colsample_bytree=0.8,
    alpha=0.04,
    reg_lambda=4.0,
    gamma=0.3,
    min_child_weight=5,
    random_state=42
)

# Fit on the 80% training split only.
xgboost_model_optimized.fit(X_train, y_train)

# RMSE on the rows the model was fitted on.
y_train_pred = xgboost_model_optimized.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print("RMSE en el conjunto de entrenamiento con XGBoost optimizado:", rmse_train)

# RMSE on the held-out validation rows.
y_val_pred = xgboost_model_optimized.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("RMSE en el conjunto de validación con XGBoost optimizado:", rmse_val)

# Select the evaluation features by the training column list so the matrix has
# exactly the columns, in the same order, that the model was fitted on.
# A column missing from the evaluation file raises a KeyError here.
X_eval = df_eval[X_train.columns]
eval_predictions = xgboost_model_optimized.predict(X_eval)

# Clip negative predictions to zero.
eval_predictions = np.maximum(eval_predictions, 0)

# Assemble the submission table.
submission_df = pd.DataFrame({
    "Id": df_eval["Id"],
    "Predicted_Price": eval_predictions
})

# Write the submission as CSV (without the index column).
submission_df.to_csv("submission_xgboost_optimized_v8.csv", index=False)

print("Archivo de submission generado: 'submission_xgboost_optimized_v8.csv'")
submission_df.head()


#4

In [None]:
# Importar librerías necesarias
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training and evaluation data.
df_train = pd.read_excel("/content/df_train_with_weights_replaced.xlsx")
df_eval = pd.read_excel("/content/df_eval_with_weights_replaced.xlsx")

# Correlation heatmap of the raw training columns (before feature engineering).
correlation_matrix = df_train.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Matriz de Correlación entre Características")
plt.show()


def add_engineered_features(df, reference_year=2024):
    """Add the derived feature columns to ``df`` in place and return it.

    The same transformation is applied to the training and the evaluation
    frame, so both always end up with identical engineered columns.

    Args:
        df: DataFrame holding the columns 'Prod. year', 'Mileage',
            'Engine volume', 'Cylinders', 'Sales Fee' and 'Fuel type'.
            'Fuel type' is used arithmetically, so it must already be numeric
            (presumably an encoded category -- confirm against the data prep).
        reference_year: Year used to turn 'Prod. year' into 'Age'.

    Returns:
        The same DataFrame object, with the new columns appended.

    Raises:
        KeyError: If one of the required source columns is missing.
    """
    # Columns are inserted in the same order as the original script
    # ('Mileage_Engine_ratio' before 'Age') so the feature matrix layout is
    # unchanged. The "+ 1" keeps every denominator non-zero for zero inputs.
    df['Mileage_Engine_ratio'] = df['Mileage'] / (df['Engine volume'] + 1)
    df['Age'] = reference_year - df['Prod. year']
    df['Mileage_per_Cylinder'] = df['Mileage'] / (df['Cylinders'] + 1)
    df['EngineVolume_per_Age'] = df['Engine volume'] / (df['Age'] + 1)
    df['Age_SalesFee'] = df['Age'] * df['Sales Fee']
    df['Mileage_Age_ratio'] = df['Mileage'] / (df['Age'] + 1)
    df['Age_FuelType_ratio'] = df['Age'] / (df['Fuel type'] + 1)
    # NOTE: same expression as 'Mileage_Engine_ratio', so the two columns are
    # identical; kept so the feature set matches the original.
    df['Mileage_per_EngineVolume'] = df['Mileage'] / (df['Engine volume'] + 1)
    df['log_Mileage'] = np.log1p(df['Mileage'])
    df['log_EngineVolume'] = np.log1p(df['Engine volume'])
    df['Age_squared'] = df['Age'] ** 2
    df['Mileage_Age'] = df['Mileage'] * df['Age']
    df['Cylinders_squared'] = df['Cylinders'] ** 2
    df['Mileage_Age_Log'] = np.log1p(df['Mileage_Age'])
    df['SalesFee_log_Mileage'] = df['log_Mileage'] * df['Sales Fee']
    return df


# Build the engineered features for both datasets with the same code path.
df_train = add_engineered_features(df_train)
df_eval = add_engineered_features(df_eval)

# Split the training data into features and target; 'Id' is not a feature.
X_train_filtered = df_train.drop(['price', 'Id'], axis=1)
y_train_filtered = df_train['price']

# Hold out 20% of the training rows as a validation set.
X_train, X_val, y_train, y_val = train_test_split(X_train_filtered, y_train_filtered, test_size=0.2, random_state=42)

# XGBoost configuration for submission v6.
xgboost_model_optimized = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=6000,
    max_depth=16,
    learning_rate=0.001,
    subsample=0.8,
    colsample_bytree=0.75,
    alpha=0.03,
    reg_lambda=4.2,
    gamma=0.25,
    min_child_weight=7,
    random_state=42
)

# Fit on the 80% training split only.
xgboost_model_optimized.fit(X_train, y_train)

# RMSE on the rows the model was fitted on.
y_train_pred = xgboost_model_optimized.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print("RMSE en el conjunto de entrenamiento con XGBoost optimizado:", rmse_train)

# RMSE on the held-out validation rows.
y_val_pred = xgboost_model_optimized.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("RMSE en el conjunto de validación con XGBoost optimizado:", rmse_val)

# Select the evaluation features by the training column list so the matrix has
# exactly the columns, in the same order, that the model was fitted on.
# A column missing from the evaluation file raises a KeyError here.
X_eval = df_eval[X_train.columns]
eval_predictions = xgboost_model_optimized.predict(X_eval)

# Clip negative predictions to zero.
eval_predictions = np.maximum(eval_predictions, 0)

# Assemble the submission table.
submission_df = pd.DataFrame({
    "Id": df_eval["Id"],
    "Predicted_Price": eval_predictions
})

# Write the submission as CSV (without the index column).
submission_df.to_csv("submission_xgboost_optimized_v6.csv", index=False)

print("Archivo de submission generado: 'submission_xgboost_optimized_v6.csv'")
submission_df.head()
