<a href="https://colab.research.google.com/github/DianaOquendoEng/Proyecto-Kaggle-UDEA/blob/main/03-modelo_con_preprocesado_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##MODELO XGBoost con pipeline

In [1]:
#1. IMPORTAR LAS LIBRERIAS
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

from xgboost import XGBClassifier

In [2]:
# 2. CARGAR DATOS CRUDOS
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

print("Train:", train.shape, "| Test:", test.shape)

# renombramos para trabajar más fácil
train = train.rename(columns={"RENDIMIENTO_GLOBAL": "target"})



Train: (692500, 21) | Test: (296786, 20)


In [3]:
# 3. SEPARAR X y TRANSFORMAR Y

X = train.drop(columns=["target"])

# Convertir y (alto, bajo, etc.) a números
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train["target"])

print("Clases transformadas:", label_encoder.classes_)



Clases transformadas: ['alto' 'bajo' 'medio-alto' 'medio-bajo']


In [4]:
# 4. TRAIN / VALIDATION SPLIT
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)




In [5]:
# 5. COLUMNAS NUMÉRICAS Y CATEGÓRICAS

num_cols = X_train.select_dtypes(include=['number']).columns
cat_cols = X_train.select_dtypes(include=['object']).columns

print("Numéricas:", len(num_cols))
print("Categóricas:", len(cat_cols))



Numéricas: 6
Categóricas: 14


In [6]:
# 6. PREPROCESAMIENTO

preprocesamiento = ColumnTransformer([
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]), num_cols),

    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]), cat_cols)
])



In [7]:
# 7. MODELO XGBOOST
modelo = Pipeline([
    ("prep", preprocesamiento),
    ("clf", XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        objective="multi:softmax",   # porque son clases 0,1,2,3
        eval_metric="mlogloss",
        random_state=42
    ))
])


In [8]:
# 8. Entrenar
modelo.fit(X_train, y_train)


# PREDICCIÓN SOBRE TRAIN (para calcular train accuracy)

y_train_pred = modelo.predict(X_train)
print("\nAccuracy entrenamiento:", accuracy_score(y_train, y_train_pred))



Accuracy entrenamiento: 0.4384476534296029


In [9]:
# 9. EVALUACIÓN VALIDACIÓN

y_val_pred = modelo.predict(X_val)

print("\nAccuracy validación:", accuracy_score(y_val, y_val_pred))
print("\nReporte de clasificación (etiquetas numéricas):")
print(classification_report(y_val, y_val_pred))

# Convertir a etiquetas reales
y_val_pred_labels = label_encoder.inverse_transform(y_val_pred)
y_val_labels      = label_encoder.inverse_transform(y_val)

print("\nReporte con etiquetas reales:")
print(classification_report(y_val_labels, y_val_pred_labels))



Accuracy validación: 0.42397833935018053

Reporte de clasificación (etiquetas numéricas):
              precision    recall  f1-score   support

           0       0.53      0.63      0.57     35124
           1       0.44      0.56      0.50     34597
           2       0.32      0.23      0.27     34324
           3       0.33      0.27      0.30     34455

    accuracy                           0.42    138500
   macro avg       0.41      0.42      0.41    138500
weighted avg       0.41      0.42      0.41    138500


Reporte con etiquetas reales:
              precision    recall  f1-score   support

        alto       0.53      0.63      0.57     35124
        bajo       0.44      0.56      0.50     34597
  medio-alto       0.32      0.23      0.27     34324
  medio-bajo       0.33      0.27      0.30     34455

    accuracy                           0.42    138500
   macro avg       0.41      0.42      0.41    138500
weighted avg       0.41      0.42      0.41    138500



In [10]:
# 10. PREDICCIÓN PARA KAGGLE

pred_test = modelo.predict(test)

# Convertir los valores numéricos a las etiquetas reales (alto, bajo, etc.)
pred_test_labels = label_encoder.inverse_transform(pred_test)

submission = pd.DataFrame({
    "ID": test["ID"],
    "RENDIMIENTO_GLOBAL": pred_test_labels
})

submission.to_csv("submission.csv", index=False)
print("\nArchivo submission.csv creado correctamente.")


Archivo submission.csv creado correctamente.
