## Pipelines con diferentes tecnologías

#### 1. Scikit-Learn

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Single seed shared by the data split and the classifier so that the whole
# cell yields the same accuracy on every Restart & Run All.
RANDOM_STATE = 42

# Load the iris dataset
data = load_iris()
X, y = data.data, data.target

# Split into train and test sets (30% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),   # Step 1: standardization
    ('pca', PCA(n_components=2)),   # Step 2: dimensionality reduction with PCA
    # Step 3: RandomForest model. Seeded: without random_state the forest is
    # built differently on every run and the reported accuracy fluctuates.
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE)),
])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.96


#### 2. Pandas

In [1]:
import pandas as pd

# Build a small example DataFrame
data = dict(
    age=[25, 32, 47, 51, 62],
    income=[50000, 64000, 120000, 97000, 150000],
    employed=[True, True, False, True, False],
)
df = pd.DataFrame(data)

# Data processing functions
def categorize_age(df):
    """Return a new DataFrame with an ``age_group`` column derived from ``age``.

    Ages are binned with ``pd.cut`` using the edges 0, 30, 50 and 100 and the
    labels 'Young', 'Middle-aged' and 'Senior'. With ``pd.cut``'s default
    right-closed intervals, 30 falls in 'Young' and 50 in 'Middle-aged'.

    The input frame is left untouched: the result is built with
    ``DataFrame.assign`` so the step can be re-run or reused in a ``.pipe``
    chain without side effects on the caller's data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``age`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra categorical column ``age_group``.
    """
    age_group = pd.cut(
        df['age'],
        bins=[0, 30, 50, 100],
        labels=['Young', 'Middle-aged', 'Senior'],
    )
    return df.assign(age_group=age_group)

def income_to_thousands(df):
    """Return a new DataFrame with ``income`` expressed in thousands.

    The input frame is not modified. Overwriting ``income`` in place would make
    the step non-idempotent: calling it again on the same frame would divide
    the already-converted values by 1000 a second time.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``income`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose ``income`` column is the original divided by 1000.
    """
    return df.assign(income=df['income'] / 1000)  # convert to thousands

def flag_unemployed(df):
    """Return a new DataFrame with an ``is_unemployed`` flag column.

    The flag is the element-wise negation (``~``) of the ``employed`` column.
    The input frame is not modified; the result is built with
    ``DataFrame.assign``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``employed`` column (boolean in this notebook).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra column ``is_unemployed``.
    """
    return df.assign(is_unemployed=~df['employed'])  # flag rows that are not employed

# Data processing pipeline: apply each step in order through DataFrame.pipe
df_pipeline = df.pipe(categorize_age)
df_pipeline = df_pipeline.pipe(income_to_thousands)
df_pipeline = df_pipeline.pipe(flag_unemployed)

print(df_pipeline)


   age  income  employed    age_group  is_unemployed
0   25    50.0      True        Young          False
1   32    64.0      True  Middle-aged          False
2   47   120.0     False  Middle-aged           True
3   51    97.0      True       Senior          False
4   62   150.0     False       Senior           True


#### 3. Airflow

In [3]:
from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime

# ETL step callables
def extract():
    """Extract step of the demo ETL: prints a status message."""
    message = "Extrayendo datos..."
    print(message)

def transform():
    """Transform step of the demo ETL: prints a status message."""
    message = "Transformando datos..."
    print(message)

def load():
    """Load step of the demo ETL: prints a status message."""
    message = "Cargando datos..."
    print(message)

# Arguments handed to the DAG as `default_args`
default_args = dict(
    owner='airflow',
    start_date=datetime(2023, 1, 1),
    retries=1,
)

# Daily ETL DAG: extract -> transform -> load.
# catchup=False is set explicitly: the start_date lies far in the past, and on
# Airflow versions where catchup defaults to True the scheduler would otherwise
# create one backfill run for every missed day as soon as the DAG is enabled.
with DAG(
    'etl_pipeline',
    default_args=default_args,
    schedule='@daily',
    catchup=False,
) as dag:

    extract_task = PythonOperator(
        task_id='extract',
        python_callable=extract
    )

    transform_task = PythonOperator(
        task_id='transform',
        python_callable=transform
    )

    load_task = PythonOperator(
        task_id='load',
        python_callable=load
    )

    # Task execution order
    extract_task >> transform_task >> load_task


