<a href="https://colab.research.google.com/github/DeisyData/BIT_IA_Bootcamp/blob/main/S7_C3_pipeline_Inicio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importar las bibliotecas necesarias
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Build a synthetic binary-classification dataset: 1000 samples with 10
# features each. The fixed seed makes the generated data reproducible.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_classes=2,
    random_state=42,
)

# Hold out 30% of the samples as a test set; the remaining 70% is used for
# cross-validation and for the final fit. The seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Parte 2. Crear diferentes pipelines con distintas técnicas de preprocesamiento

In [None]:

# One pipeline per preprocessing technique, keyed by the label used when
# reporting results. Every pipeline has the same two steps: a "scaler"
# preprocessing step followed by a "classifier" step holding its own
# DecisionTreeClassifier seeded with random_state=42, so the pipelines
# differ only in the preprocessing applied before the tree.
pipelines = {
    label: Pipeline([
        ("scaler", preprocessor),
        ("classifier", DecisionTreeClassifier(random_state=42)),
    ])
    for label, preprocessor in (
        ("Min-Max Scaling", MinMaxScaler()),
        ("Standard Scaling", StandardScaler()),
        ("L2 Normalization", Normalizer(norm="l2")),
    )
}

# Parte 3. Evaluar cada pipeline usando validación cruzada

In [None]:

# Mean 5-fold cross-validated accuracy of each pipeline, measured on the
# training split only so the test split stays untouched during selection.
scores = {}
for name, pipeline in pipelines.items():
    cv_scores = cross_val_score(
        pipeline,
        X_train,
        y_train,
        cv=5,
        scoring="accuracy",
    )
    mean_accuracy = np.mean(cv_scores)
    scores[name] = mean_accuracy
    print(f"{name} - Exactitud promedio (validación cruzada): {mean_accuracy:.4f}")

# Select the pipeline with the highest mean CV accuracy. On a tie, max()
# keeps the first maximal key in the dict's insertion order.
best_pipeline_name = max(scores, key=lambda label: scores[label])
best_pipeline = pipelines[best_pipeline_name]

# Fit the winner on the whole training split, then score it once on the
# held-out test split.
best_pipeline.fit(X_train, y_train)
test_score = best_pipeline.score(X_test, y_test)

print(f"\nMejor pipeline: {best_pipeline_name}")
print(f"Exactitud en el conjunto de prueba: {test_score:.4f}")
