<a href="https://colab.research.google.com/github/DeisyData/BIT_IA_Bootcamp/blob/main/S7_C3_pipeline_Clase_nMetrics_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [87]:
# Importar las bibliotecas necesarias
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
import numpy as np

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, classification_report

# Build a synthetic binary-classification dataset (1000 samples, 10 features).
# The optional weights=[0.9, 0.1] argument is intentionally left disabled.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_classes=2,
    random_state=42,
)

# Hold out 30% of the samples as the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Parte 2. Crear diferentes pipelines con distintas técnicas de preprocesamiento

In [67]:
# Mapping with a single named pipeline: min-max scaling followed by a decision tree.
pipeline_1 = {
    "Min-Max Scaling": Pipeline(
        steps=[
            ("scaler", MinMaxScaler()),
            ("classifier", DecisionTreeClassifier(random_state=42)),
        ]
    )
}

In [68]:
# Take the first (and only) name/pipeline pair out of the mapping.
name_1, pipeline_2 = next(iter(pipeline_1.items()))
pipeline_2

In [69]:
# 5-fold cross-validated accuracy of the single pipeline on the training split.
cv_scores = cross_val_score(
    pipeline_2,
    X_train,
    y_train,
    cv=5,
    scoring="accuracy",
)
cv_scores

array([0.86428571, 0.92142857, 0.9       , 0.86428571, 0.85714286])

In [70]:
# One pipeline per preprocessing technique; every pipeline ends in the same
# decision-tree classifier so that only the scaling step differs.
pipelines = {
    label: Pipeline(
        steps=[
            ("scaler", scaler),
            ("classifier", DecisionTreeClassifier(random_state=42)),
        ]
    )
    for label, scaler in (
        ("Min-Max Scaling", MinMaxScaler()),
        ("Standard Scaling", StandardScaler()),
        ("L2 Normalization", Normalizer(norm="l2")),
    )
}

In [71]:
# Display the (name, pipeline) pairs held in the `pipelines` mapping.
pipelines.items()

dict_items([('Min-Max Scaling', Pipeline(steps=[('scaler', MinMaxScaler()),
                ('classifier', DecisionTreeClassifier(random_state=42))])), ('Standard Scaling', Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier', DecisionTreeClassifier(random_state=42))])), ('L2 Normalization', Pipeline(steps=[('scaler', Normalizer()),
                ('classifier', DecisionTreeClassifier(random_state=42))]))])

# Parte 3. Evaluar cada pipeline usando validación cruzada



[CV_metrics](https://scikit-learn.org/1.5/modules/model_evaluation.html)

[CV_folds](https://scikit-learn.org/1.5/modules/cross_validation.html)

In [124]:
from sklearn.model_selection import cross_validate
# Metrics evaluated on every fold, keyed by the label used when reporting.
scoring_metrics = dict(f1='f1', accuracy='accuracy', recall='recall', AUC='roc_auc')

# Mean cross-validated score per metric, keyed by pipeline name.
scores = {}

for name, pipeline in pipelines.items():
    cv_results = cross_validate(
        pipeline,
        X_train,
        y_train,
        cv=5,
        scoring=scoring_metrics,
        return_train_score=False,
    )

    # Average the per-fold test scores of each metric.
    fold_means = {}
    for metric in scoring_metrics:
        fold_means[metric] = np.mean(cv_results[f'test_{metric}'])
    scores[name] = fold_means

    # Report the averaged metrics for this pipeline.
    print(f"{name} - Cross-validation Scores:")
    for metric, value in fold_means.items():
        print(f"   {metric.capitalize()}: {value:.10f}")

Min-Max Scaling - Cross-validation Scores:
   F1: 0.8728893740
   Accuracy: 0.8814285714
   Recall: 0.8590230665
   Auc: 0.8802705114
Standard Scaling - Cross-validation Scores:
   F1: 0.8728893740
   Accuracy: 0.8814285714
   Recall: 0.8590230665
   Auc: 0.8802705114
L2 Normalization - Cross-validation Scores:
   F1: 0.8337396993
   Accuracy: 0.8371428571
   Recall: 0.8563545907
   Auc: 0.8379514531


In [125]:
from sklearn.model_selection import cross_validate
import numpy as np
import pandas as pd

# Metrics evaluated on every fold, keyed by the label used when reporting.
scoring_metrics = dict(f1='f1', accuracy='accuracy', recall='recall', AUC='roc_auc')

# Mean cross-validated score per metric, keyed by pipeline name.
scores = {}

for name, pipeline in pipelines.items():
    cv_results = cross_validate(
        pipeline,
        X_train,
        y_train,
        cv=5,
        scoring=scoring_metrics,
        return_train_score=False,
    )

    # Average the per-fold test scores of each metric.
    fold_means = {}
    for metric in scoring_metrics:
        fold_means[metric] = np.mean(cv_results[f'test_{metric}'])
    scores[name] = fold_means

    print(f"{name} - Puntajes de validación cruzada:")
    for metric, value in fold_means.items():
        print(f"   {metric.capitalize()}: {value:.10f}")

# One row per pipeline, one column per metric.
scores_df = pd.DataFrame(scores).transpose()

# Append the unweighted mean of the metric columns as an overall score.
scores_df = scores_df.assign(average_score=scores_df.mean(axis=1))

# Pick the pipeline with the highest overall score; on a tie idxmax returns
# the first row that reaches the maximum.
best_pipeline_name = scores_df['average_score'].idxmax()
best_pipeline = pipelines[best_pipeline_name]

# Show the full score table.
print("\nPuntajes de todos los pipelines:")
print(scores_df)

# Show the selected pipeline and its overall score.
print(f"\nEl mejor pipeline es: {best_pipeline_name} con un puntaje promedio de {scores_df.at[best_pipeline_name, 'average_score']:.10f}")

Min-Max Scaling - Puntajes de validación cruzada:
   F1: 0.8728893740
   Accuracy: 0.8814285714
   Recall: 0.8590230665
   Auc: 0.8802705114
Standard Scaling - Puntajes de validación cruzada:
   F1: 0.8728893740
   Accuracy: 0.8814285714
   Recall: 0.8590230665
   Auc: 0.8802705114
L2 Normalization - Puntajes de validación cruzada:
   F1: 0.8337396993
   Accuracy: 0.8371428571
   Recall: 0.8563545907
   Auc: 0.8379514531

Puntajes de todos los pipelines:
                        f1  accuracy    recall       AUC  average_score
Min-Max Scaling   0.872889  0.881429  0.859023  0.880271       0.873403
Standard Scaling  0.872889  0.881429  0.859023  0.880271       0.873403
L2 Normalization  0.833740  0.837143  0.856355  0.837951       0.841297

El mejor pipeline es: Min-Max Scaling con un puntaje promedio de 0.8734028808


## 3.1 Predicción y métricas

In [122]:
def metricas(modelo, y_test, y_predict, y_score=None):
    """Print the evaluation metrics of a binary classifier.

    Prints F1, accuracy, recall, ROC AUC and the full classification report.

    Parameters
    ----------
    modelo : str
        Label of the model, shown in the header line.
    y_test : array-like
        Ground-truth labels.
    y_predict : array-like
        Hard class predictions (e.g. the output of ``predict``).
    y_score : array-like, optional
        Continuous scores for the positive class (e.g.
        ``predict_proba(X)[:, 1]`` or ``decision_function(X)``). When given,
        ROC AUC is computed from these scores. When omitted, AUC falls back
        to the hard predictions, which evaluates the ROC curve at a single
        threshold only.

    Returns
    -------
    None
    """
    # ROC AUC is a ranking metric: prefer continuous scores when available.
    auc_input = y_predict if y_score is None else y_score

    print(f'Metricas del modelo {modelo}')
    print("F1 Score:", f1_score(y_test, y_predict))
    print("Accuracy:", accuracy_score(y_test, y_predict))
    print("Recall:", recall_score(y_test, y_predict))
    print("AUC:", roc_auc_score(y_test, auc_input))
    print("Classification Report:\n", classification_report(y_test, y_predict))

In [123]:
# cross_validate only fits clones of each pipeline, so the selected pipeline
# object is still unfitted at this point. Fit it on the full training split
# before predicting; otherwise a fresh "Restart & Run All" fails here.
best_pipeline.fit(X_train, y_train)

# Predict on the held-out test split and report the metrics.
y_predic = best_pipeline.predict(X_test)
metricas(best_pipeline_name, y_test, y_predic)

Metricas del modelo Min-Max Scaling
F1 Score: 0.8690095846645367
Accuracy: 0.8633333333333333
Recall: 0.8242424242424242
AUC: 0.8676767676767677
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.91      0.86       135
           1       0.92      0.82      0.87       165

    accuracy                           0.86       300
   macro avg       0.86      0.87      0.86       300
weighted avg       0.87      0.86      0.86       300



# Parte 4. Prueba con el empate

In [82]:
# Pick the second entry (index 1) of the `pipelines` mapping together with its name.
name_1 = list(pipelines)[1]
pipeline_3 = pipelines[name_1]
pipeline_3

In [76]:
# Treat the tied pipeline as the "best" one, fit it on the whole training
# split and score it on the held-out test split.
best_pipeline_name, best_pipeline = name_1, pipeline_3
test_score = best_pipeline.fit(X_train, y_train).score(X_test, y_test)

print(f"\nMejor pipeline: {best_pipeline_name}")
print(f"Exactitud en el conjunto de prueba: {test_score:.4f}")


Mejor pipeline: Standard Scaling
Exactitud en el conjunto de prueba: 0.8633
