# 4 – Data Drift Analysis (Windows Friendly, No Alibi-Detect)

Este notebook analiza el data drift sin Alibi-Detect: las pruebas de drift usan únicamente SciPy, NumPy y Scikit-Learn, y el modelo se carga y evalúa con H2O. Es totalmente compatible con Windows.

In [None]:
!pip install numpy pandas scikit-learn scipy matplotlib h2o

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ks_2samp, chi2_contingency
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import rbf_kernel
import h2o

# Default size (width, height) applied to every figure drawn in this notebook.
plt.rcParams.update({'figure.figsize': (8, 5)})

## 1. Cargar dataset original

In [None]:
# Location of the raw CSV (relative to this notebook) and name of the label column.
DATA_PATH = '../data/raw/insurance_company_original.csv'
TARGET_COL = 'CARAVAN'

# The file uses ';' as its field separator.
df = pd.read_csv(DATA_PATH, sep=';')
df.head(5)

## 2. Preparar datos y split train/val

In [None]:
# Separate the label from the features; `drop` returns a new frame, so the
# column assignments below do not touch `df`.
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

# Best-effort numeric conversion: columns that parse cleanly become numeric,
# anything else is left exactly as loaded. This replaces the deprecated
# `errors='ignore'` option and the bare `except`, which also hid unrelated
# failures (including KeyboardInterrupt).
for col in X.columns:
    try:
        X[col] = pd.to_numeric(X[col])
    except (ValueError, TypeError):
        # Column holds non-numeric values: keep it unchanged.
        continue

# Stratified 80/20 split with a fixed seed so the validation set is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_val.head()

## 3. Iniciar H2O y cargar modelo

In [None]:
# Initialise H2O; the cells below rely on it for H2OFrame conversion and prediction.
h2o.init()

# Path of the saved model artifact, relative to this notebook.
MODEL_PATH = '../models/h2o_automl_model'
model = h2o.load_model(MODEL_PATH)
# Bare expression so the notebook renders the loaded model as the cell output.
model

## 4. Métricas baseline

In [None]:
# Score the untouched validation split; these numbers are the reference point
# against which the drifted data is compared.
hX_val = h2o.H2OFrame(X_val)
pred = model.predict(hX_val).as_data_frame()

y_pred = pred['predict'].astype(int).values

# The 'p1' column is only used when the prediction frame actually has it;
# without it AUC cannot be computed and is reported as NaN.
if 'p1' in pred.columns:
    y_proba = pred['p1'].values
else:
    y_proba = None

if y_proba is None:
    baseline_auc = np.nan
else:
    baseline_auc = roc_auc_score(y_val, y_proba)

baseline = {
    'accuracy': accuracy_score(y_val, y_pred),
    'f1': f1_score(y_val, y_pred),
    'auc': baseline_auc,
}

baseline

## 5. Generar dataset con drift sintético

In [None]:
# Seeded generator so the synthetic drift (and every metric derived from it)
# is identical on each Restart & Run All.
DRIFT_SEED = 42
rng = np.random.default_rng(DRIFT_SEED)

# Work on a copy so the validation split stays available as the reference sample.
X_drift = X_val.copy()

num_cols = X_drift.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X_drift.select_dtypes(exclude=[np.number]).columns.tolist()

# Numeric drift: one multiplicative shift per column (factor drawn around 1.3)
# plus independent Gaussian noise per row.
for col in num_cols:
    shift = rng.normal(0.3, 0.1)
    noise = rng.normal(0, 0.5, len(X_drift))
    X_drift[col] = X_drift[col] * (1 + shift) + noise

# Categorical drift: overwrite roughly 70% of the rows with the column's most
# frequent value, collapsing the distribution towards the mode.
for col in cat_cols:
    modes = X_drift[col].mode()
    if modes.empty:
        # Column has no non-null values, so there is no mode to inject.
        continue
    mask = rng.random(len(X_drift)) < 0.7
    X_drift.loc[mask, col] = modes[0]

X_drift.head()

## 6. Evaluar desempeño con drift

In [None]:
# Score the drifted copy of the validation split against the same labels.
hX_drift = h2o.H2OFrame(X_drift)
pred2 = model.predict(hX_drift).as_data_frame()

y_pred2 = pred2['predict'].astype(int).values

# The 'p1' column is only used when the prediction frame actually has it;
# without it AUC cannot be computed and is reported as NaN.
if 'p1' in pred2.columns:
    y_proba2 = pred2['p1'].values
else:
    y_proba2 = None

if y_proba2 is None:
    drift_auc = np.nan
else:
    drift_auc = roc_auc_score(y_val, y_proba2)

drift_metrics = {
    'accuracy': accuracy_score(y_val, y_pred2),
    'f1': f1_score(y_val, y_pred2),
    'auc': drift_auc,
}

drift_metrics

## 7. Detección de drift sin Alibi (KS, Chi-Square, MMD)

In [None]:
def detect_drift_ks(X_ref, X_test, num_cols):
    """Run a two-sample Kolmogorov-Smirnov test on each numeric column.

    Args:
        X_ref: Reference frame, indexable by column name.
        X_test: Frame to compare against the reference, indexable by column name.
        num_cols: Iterable of column names to test.

    Returns:
        Dict mapping each column name to the p-value of `ks_2samp` for that
        column, in the order the columns were given.
    """
    return {
        name: ks_2samp(X_ref[name], X_test[name]).pvalue
        for name in num_cols
    }

def detect_drift_chi2(X_ref, X_test, cat_cols):
    """Run a chi-square homogeneity test on each categorical column.

    For every column a 2 x k contingency table is built: one row per sample
    (reference / test) and one column per category, holding the number of
    rows with that category. The test then asks whether both samples share
    the same category distribution.

    Cross-tabulating the two columns against each other would instead test
    whether row-paired values are independent, which flags identical samples
    as maximally "drifted".

    Args:
        X_ref: Reference frame, indexable by column name.
        X_test: Frame to compare against the reference, indexable by column name.
        cat_cols: Iterable of categorical column names to test.

    Returns:
        Dict mapping each column name to the chi-square p-value. The value is
        NaN when either sample has no non-null values for that column, since
        the test is undefined there.
    """
    results = {}
    for col in cat_cols:
        # Per-category counts for each sample; null values are not counted.
        counts = pd.concat(
            [X_ref[col].value_counts(), X_test[col].value_counts()],
            axis=1,
            keys=['ref', 'test'],
        ).fillna(0)

        # Drop categories absent from both samples (possible with categorical
        # dtypes); they would produce zero expected frequencies.
        counts = counts[counts.sum(axis=1) > 0]

        # An empty sample makes the expected frequencies zero: test undefined.
        if (counts.sum(axis=0) == 0).any():
            results[col] = float('nan')
            continue

        _, p_value, _, _ = chi2_contingency(counts.T.to_numpy())
        results[col] = p_value
    return results

def compute_mmd(X_ref, X_test, gamma=1.0):
    """Compute an RBF-kernel MMD statistic between two samples.

    The value is mean(K(ref, ref)) + mean(K(test, test)) - 2 * mean(K(ref, test)),
    where each mean runs over the full kernel matrix (diagonal included).

    Args:
        X_ref: Reference sample, convertible to an array with `np.asarray`.
        X_test: Sample to compare, convertible to an array with `np.asarray`.
        gamma: Value forwarded to `rbf_kernel` as its `gamma` argument.

    Returns:
        The scalar statistic described above.
    """
    ref_arr, test_arr = np.asarray(X_ref), np.asarray(X_test)

    # Mean kernel value within each sample and across the two samples.
    mean_ref_ref = rbf_kernel(ref_arr, ref_arr, gamma=gamma).mean()
    mean_test_test = rbf_kernel(test_arr, test_arr, gamma=gamma).mean()
    mean_cross = rbf_kernel(ref_arr, test_arr, gamma=gamma).mean()

    return mean_ref_ref + mean_test_test - 2 * mean_cross

# Univariate tests per column, plus one MMD value over all numeric columns.
ks_results = detect_drift_ks(X_val, X_drift, num_cols)

# The chi-square detector is only invoked when there are categorical columns.
if len(cat_cols) > 0:
    chi_results = detect_drift_chi2(X_val, X_drift, cat_cols)
else:
    chi_results = {}

mmd_value = compute_mmd(X_val[num_cols], X_drift[num_cols])

(ks_results, chi_results, mmd_value)

## 8. Gráfica de desempeño: baseline vs. drift

In [None]:
# Collect both metric sets in the same order so the bars line up pairwise.
metrics = list(baseline)
before = [baseline[name] for name in metrics]
after = [drift_metrics[name] for name in metrics]

# One group per metric: baseline bar on the left, drift bar on the right.
positions = np.arange(len(metrics))
bar_width = 0.4

fig, ax = plt.subplots()
ax.bar(positions - bar_width / 2, before, bar_width, label='Baseline')
ax.bar(positions + bar_width / 2, after, bar_width, label='Drift')
ax.set_xticks(positions)
ax.set_xticklabels(metrics)
ax.set_ylim(0, 1)
ax.set_title('Performance Before vs After Drift')
ax.legend()
plt.show()