<a href="https://colab.research.google.com/github/DeisyData/BIT_IA_Bootcamp/blob/main/S7_C3_pipeline_Clase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importar las bibliotecas necesarias
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Build a synthetic binary-classification dataset (1000 samples, 10 features).
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_classes=2,
    random_state=42,
)

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Parte 2. Crear diferentes pipelines con distintas técnicas de preprocesamiento

In [31]:
# Single-entry mapping from a display name to its pipeline:
# min-max scaling followed by a seeded decision tree.
pipeline_1 = {
    "Min-Max Scaling": Pipeline(
        steps=[
            ("scaler", MinMaxScaler()),
            ("classifier", DecisionTreeClassifier(random_state=42)),
        ]
    ),
}

In [34]:
# Take the first (name, pipeline) pair out of the dictionary and display the pipeline.
name_1, pipeline_2 = next(iter(pipeline_1.items()))
pipeline_2

In [40]:
# 5-fold cross-validated accuracy of the min-max pipeline, using the training split only.
cv_scores = cross_val_score(
    pipeline_2,
    X_train,
    y_train,
    scoring="accuracy",
    cv=5,
)
cv_scores

array([0.86428571, 0.92142857, 0.9       , 0.86428571, 0.85714286])

In [44]:
# One pipeline per preprocessing technique. Every pipeline shares the same shape
# (a "scaler" step followed by a seeded decision-tree "classifier"), so only the
# preprocessing step varies between entries.
pipelines = {
    label: Pipeline(
        steps=[
            ("scaler", preprocessor),
            ("classifier", DecisionTreeClassifier(random_state=42)),
        ]
    )
    for label, preprocessor in (
        ("Min-Max Scaling", MinMaxScaler()),
        ("Standard Scaling", StandardScaler()),
        ("L2 Normalization", Normalizer(norm="l2")),
    )
}

In [5]:
# Display the (name, pipeline) pairs stored in the dictionary.
pipelines.items()

dict_items([('Min-Max Scaling', Pipeline(steps=[('scaler', MinMaxScaler()),
                ('classifier', DecisionTreeClassifier(random_state=42))])), ('Standard Scaling', Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier', DecisionTreeClassifier(random_state=42))])), ('L2 Normalization', Pipeline(steps=[('scaler', Normalizer()),
                ('classifier', DecisionTreeClassifier(random_state=42))]))])

# Parte 3. Evaluar cada pipeline usando validación cruzada



[CV_metrics](https://scikit-learn.org/1.5/modules/model_evaluation.html)

[CV_folds](https://scikit-learn.org/1.5/modules/cross_validation.html)

In [57]:
# Metric used for model selection. The printed label is derived from this value,
# so the report cannot name a different metric than the one actually computed
# (previously the label said "Exactitud" while the scores were F1).
cv_metric = "f1"

# Mean cross-validated score for each pipeline, keyed by pipeline name.
scores = {}
for name, pipeline in pipelines.items():
    # 5-fold cross-validation on the training split only.
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring=cv_metric)
    scores[name] = np.mean(cv_scores)
    print(f"{name} - {cv_metric} promedio (validación cruzada): {scores[name]:.10f}")

Min-Max Scaling - Exactitud promedio (validación cruzada): 0.8728893740
Standard Scaling - Exactitud promedio (validación cruzada): 0.8728893740
L2 Normalization - Exactitud promedio (validación cruzada): 0.8337396993


In [None]:
# Select the pipeline with the highest mean cross-validation score.
# When several names share the top score, max() returns the first one it
# encounters, i.e. the earliest key in the dictionary's insertion order.
best_pipeline_name = max(scores, key=lambda label: scores[label])
best_pipeline = pipelines[best_pipeline_name]
best_pipeline

In [49]:
# Refit the selected pipeline on the whole training split, then score it on the
# held-out test split (fit() returns the fitted pipeline, so the calls chain).
test_score = best_pipeline.fit(X_train, y_train).score(X_test, y_test)

print(
    f"\nMejor pipeline: {best_pipeline_name}",
    f"Exactitud en el conjunto de prueba: {test_score:.4f}",
    sep="\n",
)


Mejor pipeline: Min-Max Scaling
Exactitud en el conjunto de prueba: 0.8633


# Parte 4. Prueba con el empate: evaluar Standard Scaling, que empató con Min-Max Scaling en la validación cruzada

In [None]:
# Pick the second entry of the dictionary (by insertion order) and display its pipeline.
name_1 = list(pipelines)[1]
pipeline_3 = pipelines[name_1]
pipeline_3

In [53]:
# Treat the manually chosen pipeline as the "best" one, fit it on the whole
# training split and score it on the held-out test split.
best_pipeline_name, best_pipeline = name_1, pipeline_3
best_pipeline.fit(X_train, y_train)
test_score = best_pipeline.score(X_test, y_test)

print(
    f"\nMejor pipeline: {best_pipeline_name}",
    f"Exactitud en el conjunto de prueba: {test_score:.4f}",
    sep="\n",
)


Mejor pipeline: Standard Scaling
Exactitud en el conjunto de prueba: 0.8633
