## Ejemplo 18

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Synthetic customer data. The order of the draws below matters: every call
# consumes the global NumPy random stream that is seeded here.
np.random.seed(42)
n_samples = 1000

edades = np.random.randint(18, 70, size=n_samples)
salarios = np.random.randint(20000, 100000, size=n_samples)
tipo_producto = np.random.choice(['A', 'B', 'C'], size=n_samples)
interacciones = np.random.randint(1, 20, size=n_samples)

# Label rule: a customer is marked as churned (1) when the salary is below
# 30000 or the number of interactions exceeds 15; otherwise 0.
salario_bajo = salarios < 30000
muchas_interacciones = interacciones > 15
churn = np.where(salario_bajo | muchas_interacciones, 1, 0)

In [3]:
# Assemble the generated arrays into one table; the column order below is the
# order in which the columns appear in the frame.
columnas = dict(
    edad=edades,
    salario=salarios,
    tipo_producto=tipo_producto,
    interacciones=interacciones,
    Churn=churn,
)
df = pd.DataFrame(columnas)

In [4]:
# Simulate missing data: blank out 'salario' for a random 5% of the rows,
# sampled without replacement so each chosen row is distinct.
n_faltantes = int(0.05 * n_samples)
indices_faltantes = np.random.choice(df.index, size=n_faltantes, replace=False)

# 'salario' was built from integer draws. Cast it to float explicitly before
# writing NaN so the assignment does not depend on pandas silently upcasting
# the column (deprecated behaviour for incompatible-dtype assignment).
df['salario'] = df['salario'].astype(float)
df.loc[indices_faltantes, 'salario'] = np.nan
print(df.head())

   edad  salario tipo_producto  interacciones  Churn
0    56  54674.0             B              9      0
1    69  55854.0             A              6      0
2    46  66271.0             A             18      1
3    32  93688.0             A              6      0
4    60  58518.0             B             19      1


In [5]:
# Preprocessing
# Fill missing salaries with the column mean and drop rows with implausible ages.
#
# The filled column is assigned back explicitly. Calling
# fillna(..., inplace=True) on df['salario'] operates on a selection of the
# frame: pandas warns about it and, under Copy-on-Write, it leaves df unchanged,
# so the NaNs would reach the model.
# NOTE(review): the mean is taken over every row, before the train/test split
# further down, so test rows influence the imputed value.
salario_medio = df['salario'].mean()
df['salario'] = df['salario'].fillna(salario_medio)
df = df[df['edad'] < 120]

In [6]:
# Transformation
# Column groups routed to each transformer: numeric columns and categorical columns.
num_features = [
    'edad',
    'salario',
    'interacciones',
]
cat_features = [
    'tipo_producto',
]

In [7]:
# Build one transformer per column group: standard scaling for the numeric
# columns and one-hot encoding (first level dropped) for the categorical ones.
paso_numerico = ('num', StandardScaler(), num_features)
paso_categorico = ('cat', OneHotEncoder(drop='first'), cat_features)
transformers = [paso_numerico, paso_categorico]
preprocessor = ColumnTransformer(transformers=transformers)

In [8]:
# Data mining
# Separate the target column ('Churn') from the predictor columns before
# splitting into training and test sets.
y = df['Churn']
X = df.drop(columns='Churn')

In [9]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Chain the column preprocessing with a logistic-regression classifier so both
# are fitted together and applied together.
pasos = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
]
pipeline = Pipeline(steps=pasos)

In [11]:
# Fit on the training split, predict the held-out split, then print the
# accuracy followed by the per-class precision/recall/F1 report.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

exactitud = accuracy_score(y_test, y_pred)
informe = classification_report(y_test, y_pred)
print(exactitud)
print(informe)

0.805
              precision    recall  f1-score   support

           0       0.85      0.87      0.86       136
           1       0.70      0.67      0.69        64

    accuracy                           0.81       200
   macro avg       0.78      0.77      0.77       200
weighted avg       0.80      0.81      0.80       200

