In [0]:
# # 03 - Entrenamiento Modelo con scikit-learn (sin MLlib)

# COMMAND ----------

# Importar librerías Python
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the engineered feature table from Spark.
df_spark = spark.table("datalottery.lotterybets.lottery_bets_dirty_features")

# Collect the whole table to pandas (mind the data size: everything is
# materialized in memory by toPandas()).
df = df_spark.toPandas()

# Inspect the available columns.
print(df.columns)

# Predictor variables (features).
feature_cols = [
    "bets_last_7d", "win_rate_last_30d", "ip_risk", "geo_risk",
    "num_picks", "avg_stake_amount"
]

# Copy the feature slice so the encoding below writes into its own frame
# instead of a view of `df` (avoids SettingWithCopyWarning and keeps `df` raw).
X = df[feature_cols].copy()
y = df["suspicious"]

# Convert string-typed risk columns to integer codes, using the same guarded
# encoding as the verification cell of this notebook. RandomForestClassifier
# cannot fit on raw string values; when the columns are already numeric the
# guard makes this a no-op.
for col in ["ip_risk", "geo_risk"]:
    if X[col].dtype == "object":
        X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# Train/test split (80% / 20%), seeded for reproducibility.
# NOTE(review): the split is not stratified; consider stratify=y if the
# `suspicious` label is heavily imbalanced — TODO confirm class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the Random Forest model.
model = RandomForestClassifier(random_state=42, n_estimators=100)

# Fit on the training partition only.
model.fit(X_train, y_train)

# Predicted probability of the positive class (column 1) on the test set.
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Evaluate with ROC AUC on the held-out partition.
auc = roc_auc_score(y_test, y_pred_prob)
print(f"✅ AUC: {auc:.3f}")


In [0]:
# 📌 Verification: feature table and persisted model
# Imports are local to this cell so it does not depend on names imported by
# another cell. `joblib` in particular was never imported, so joblib.load
# raised NameError, which the broad `except` below reported as a model-loading
# failure.
import joblib
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

df_features = spark.table("datalottery.lotterybets.lottery_bets_dirty_features")
print(f"✅ Features cargadas: {df_features.count()} registros")

# Convert to pandas, selecting only the model inputs and the label.
df_pd = df_features.select(
    "bets_last_7d", "win_rate_last_30d", "ip_risk", "geo_risk",
    "num_picks", "avg_stake_amount", "suspicious"
).toPandas()

# 🔁 Convert categorical (object-typed) columns to integer codes.
# NOTE(review): the encoder is refit here, so the codes match the ones used at
# training time only if both fits saw the same set of distinct values —
# verify against how the persisted model was trained.
for col in ["ip_risk", "geo_risk"]:
    if df_pd[col].dtype == "object":
        le = LabelEncoder()
        df_pd[col] = le.fit_transform(df_pd[col].astype(str))

# Split into features X and label y.
X = df_pd.drop(columns=["suspicious"])
y = df_pd["suspicious"]

# Try to load the persisted model and score it; this is a best-effort check,
# so any failure is reported instead of aborting the notebook.
# SECURITY: joblib.load unpickles arbitrary objects; only load model files
# from a trusted location.
# NOTE(review): the AUC below is computed on the full table; if the loaded
# model was trained on rows of this same table the score is optimistic —
# TODO confirm.
try:
    model = joblib.load("/tmp/ProyectoMLOps_rf_model.joblib")
    y_pred = model.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, y_pred)
    print(f"🎯 AUC del modelo actual: {auc:.3f}")
except Exception as e:
    print("⚠️ No se pudo cargar el modelo. Error:", e)