In [None]:
# Librerías base
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Preprocesamiento
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Modelos
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Evaluación
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_auc_score,
    roc_curve
)

# Búsqueda de hiperparámetros
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint


In [None]:
# Mount Google Drive only when actually running inside Google Colab.
# The import is guarded so that opening the notebook elsewhere does not stop
# at a ModuleNotFoundError before the data path below can even be adjusted.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not on Colab: nothing to mount

if drive is not None:
    drive.mount('/content/drive')

# Read the CSV from its path (a Drive location; edit it when running outside Colab)
path = "/content/drive/MyDrive/datasets/insurance.csv"
df = pd.read_csv(path)

# Quick look at the first rows
df.head()


In [None]:
# Summary of the column data types (info() prints its report itself)
df.info()

# General statistics. A notebook only renders the value of a cell's LAST
# expression, and this call is in the middle of the cell, so its result was
# being computed and silently discarded. Print it explicitly.
print(df.describe())

# Unique values of each object-typed (categorical) column
for col in df.select_dtypes(include=['object']).columns:
    print(f"{col}: {df[col].unique()}")


In [None]:
# Number of missing values in each column
print("Valores nulos por columna:")
print(df.isna().sum())

# One boxplot per float64/int64 column to eyeball outliers
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

for col in numeric_cols:
    # Explicit figure/axes pair instead of the pyplot state machine
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.boxplot(x=col, data=df, color='skyblue', ax=ax)
    ax.set_title(f'Distribución y outliers: {col}')
    fig.tight_layout()
    plt.show()


In [None]:
# Binary label: 1 when charges is strictly above the median, otherwise 0
median_charge = df['charges'].median()
df['target'] = df['charges'].gt(median_charge).astype(int)

# Class balance: proportions first, then a count plot
print(df['target'].value_counts(normalize=True))
ax = sns.countplot(x='target', data=df)
ax.set_title("Distribución de target (gasto alto vs bajo)")
plt.show()


In [None]:
# Separate features (X) from the label (y). 'charges' is left out of X because
# 'target' is computed directly from it.
X = df.drop(columns=['charges', 'target'])
y = df['target']

# Define categorical and numeric columns FROM X, not from df.
# df also holds 'target', an integer column created with .astype(int); taking
# the numeric columns from df let 'target' slip into num_cols even though it
# is not a column of X, and ColumnTransformer raises at fit time when asked
# for a column the input frame does not have. Deriving the lists from X keeps
# them in sync with the features the models actually receive (and makes the
# manual removal of 'charges' unnecessary).
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Numeric pipeline: median imputation, then standardization
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline: most-frequent imputation, then one-hot encoding
# (handle_unknown='ignore' so unseen categories do not raise at transform time)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# ColumnTransformer applying each pipeline to its own group of columns
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols)
])


In [None]:
# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Candidate classifiers, keyed by the name shown in the report
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

# 5-fold cross-validated accuracy on the training split for every candidate.
# Preprocessing lives inside the pipeline, so it is fitted within each fold.
for name, model in models.items():
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    scores = cross_val_score(pipe, X_train, y_train, scoring='accuracy', cv=5)
    print(f"{name}: Accuracy = {scores.mean():.4f} ± {scores.std():.4f}")


In [None]:
# Decision Tree hyperparameters: fixed grid for the exhaustive search
param_grid = {
    'classifier__max_depth': [3, 5, 10, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
}

# Decision Tree hyperparameters: candidate list / distributions for the random search
param_dist = {
    'classifier__max_depth': [None, *range(3, 20)],
    'classifier__min_samples_split': randint(2, 20),
    'classifier__min_samples_leaf': randint(1, 10),
}

# Pipeline tuned by both searches: shared preprocessing + a seeded decision tree
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])

# Exhaustive search over param_grid (5-fold CV, accuracy, all cores)
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("GridSearch - Mejor score:", grid_search.best_score_)
print("GridSearch - Mejores parámetros:", grid_search.best_params_)

# Random search: 30 sampled configurations from param_dist, seeded
random_search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=30,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
random_search.fit(X_train, y_train)

print("RandomSearch - Mejor score:", random_search.best_score_)
print("RandomSearch - Mejores parámetros:", random_search.best_params_)


In [None]:
# Final model: the best estimator found by the randomized search
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Per-class metrics on the held-out test split
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix (plot() returns the display object itself)
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot()
plt.show()

# ROC-AUC, using the predicted probabilities from column index 1
y_probs = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_probs)
roc_auc = roc_auc_score(y_test, y_probs)

# ROC curve with the diagonal as a visual reference
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.2f})")
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax.set_xlabel("FPR")
ax.set_ylabel("TPR")
ax.set_title("ROC Curve")
ax.legend()
plt.show()
