In [1]:
import pandas as pd
import lightgbm as lgb
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

XGBoost

In [None]:
import pandas as pd
import xgboost as xgb
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Train one XGBoost regressor per city ("sj" and "iq") on the preprocessed
# training data, predict total_cases for the test features, and write the
# submission CSV.

# Create the "resultado" output folder if it does not exist
os.makedirs("resultado", exist_ok=True)

# Load the preprocessed training data, the test features and the submission template
train_sj = pd.read_csv("analisis/train_sj_clean2.csv")
train_iq = pd.read_csv("analisis/train_iq_clean2.csv")
test_features = pd.read_csv("dengue_features_test.csv")
submission_format = pd.read_csv("submission_format.csv")

# Feature columns: every column of the sj training frame except the target and
# the date (no newly engineered variables)
features = [col for col in train_sj.columns if col not in ["total_cases", "week_start_date"]]

# Build the training sets
X_sj, y_sj = train_sj[features], train_sj["total_cases"]
X_iq, y_iq = train_iq[features], train_iq["total_cases"]

# Preprocessing: one imputer AND one scaler per city.
# StandardScaler.fit_transform() re-fits the scaler on every call, so sharing a
# single instance (as this cell previously did) left it holding only the iq
# statistics; the sj test rows were then standardised with iq means/variances
# and no longer matched the scale the sj model was trained on.
# NOTE(review): the two cities use different imputation strategies
# ("most_frequent" vs "median") — presumably intentional; confirm.
imputer_sj = SimpleImputer(strategy="most_frequent")
imputer_iq = SimpleImputer(strategy="median")
scaler_sj = StandardScaler()
scaler_iq = StandardScaler()
# Backward-compatible alias: before the fix, `scaler` ended up fitted on iq.
scaler = scaler_iq

X_sj = scaler_sj.fit_transform(imputer_sj.fit_transform(X_sj))
X_iq = scaler_iq.fit_transform(imputer_iq.fit_transform(X_iq))

# Split the test set by city and apply each city's own fitted preprocessing
test_sj = test_features[test_features["city"] == "sj"][features]
test_iq = test_features[test_features["city"] == "iq"][features]

test_sj = scaler_sj.transform(imputer_sj.transform(test_sj))
test_iq = scaler_iq.transform(imputer_iq.transform(test_iq))

# Hyperparameters of the previous best model.
# NOTE(review): the original comments quoted two different scores for this
# configuration (24.5120, and 24.3077 on DrivenData). Both predate the per-city
# scaler fix above, so re-score before relying on either number.
params_sj = {
    "objective": "reg:squarederror",
    "learning_rate": 0.03,  # keep the learning rate that worked best
    "max_depth": 7,
    "n_estimators": 300,  # back to 300 estimators
    "subsample": 0.85,  # row subsampling to limit overfitting
    "colsample_bytree": 0.85,
    "reg_lambda": 1.0,
    "reg_alpha": 0.5
}
# Same settings for iq, except for shallower trees (max_depth 5 instead of 7)
params_iq = {
    "objective": "reg:squarederror",
    "learning_rate": 0.03,
    "max_depth": 5,
    "n_estimators": 300,
    "subsample": 0.85,
    "colsample_bytree": 0.85,
    "reg_lambda": 1.0,
    "reg_alpha": 0.5
}

# Train one XGBoost model per city with the hyperparameters above
model_xgb_sj = xgb.XGBRegressor(**params_sj)
model_xgb_sj.fit(X_sj, y_sj)

model_xgb_iq = xgb.XGBRegressor(**params_iq)
model_xgb_iq.fit(X_iq, y_iq)

# Predict; case counts are reported as non-negative integers, so round and clip at 0
pred_sj = model_xgb_sj.predict(test_sj).round().astype(int).clip(min=0)
pred_iq = model_xgb_iq.predict(test_iq).round().astype(int).clip(min=0)

# Fill the submission template city by city.
# Assumes submission_format lists each city's rows in the same order as
# dengue_features_test.csv — TODO confirm.
submission = submission_format.copy()
submission.loc[submission["city"] == "sj", "total_cases"] = pred_sj
submission.loc[submission["city"] == "iq", "total_cases"] = pred_iq

# Save the final file
submission_path = "resultado/submission_xgboost_optimized2.csv"
submission.to_csv(submission_path, index=False)

print(f"✅ Archivo guardado en: {submission_path}")


✅ Archivo guardado en: resultado/submission_xgboost_optimized2.csv
