In [0]:

#notebooks/04_evaluation_and_registry.py
# Databricks notebook source
# MAGIC %md
# MAGIC # Inferência de modelo de fraude

# COMMAND ----------

import os
import mlflow
import mlflow.spark
from mlflow import MlflowClient
from pyspark.sql import functions as F
from pyspark.ml.functions import vector_to_array

# Three-part names of the objects this notebook touches: the registered model
# used for scoring, the feature table it reads, and the predictions table it
# writes.
REGISTERED_MODEL_NAME = "workspace.default.fraud_detection_rf"
INPUT_TABLE = "workspace.default.features_delta"
OUTPUT_TABLE = "workspace.default.predictions_delta"

# Scratch directory for MLflow, located on a UC Volume. Exported through the
# environment so it is in place before any model is loaded.
os.environ["MLFLOW_DFS_TMP"] = "/Volumes/workspace/default/fraud_data/tmp"

# Point the MLflow model registry at Unity Catalog.
mlflow.set_registry_uri("databricks-uc")

# COMMAND ----------

# Read the feature table that will be scored.
df = spark.read.table(INPUT_TABLE)

# Preview the first rows to sanity-check the input.
feature_preview = df.limit(10)
display(feature_preview)

# COMMAND ----------

# Registered model version to score with. Kept as a named constant so that
# switching versions is a one-line change instead of editing the URI string.
MODEL_VERSION = 1

# Load that version of the registered Spark ML model. The scratch directory is
# read back from MLFLOW_DFS_TMP (exported in the configuration cell) so the
# UC Volume path is defined in exactly one place.
model_uri = f"models:/{REGISTERED_MODEL_NAME}/{MODEL_VERSION}"
model = mlflow.spark.load_model(
    model_uri,
    dfs_tmpdir=os.environ["MLFLOW_DFS_TMP"],
)

# To load by alias instead of by version number, create the alias first.
# NOTE: MLflow reserves the alias name "latest" (set_registered_model_alias
# rejects it), so use a different name such as "champion":
# client = MlflowClient()
# client.set_registered_model_alias(
#     name=REGISTERED_MODEL_NAME,
#     alias="champion",
#     version=MODEL_VERSION,
# )
# The model can then be loaded with:
# model = mlflow.spark.load_model(
#     f"models:/{REGISTERED_MODEL_NAME}@champion",
#     dfs_tmpdir=os.environ["MLFLOW_DFS_TMP"],
# )

# COMMAND ----------

# Score every feature row with the loaded model.
predictions = model.transform(df)

# `probability` comes back as an ML vector (VectorUDT); expose it as a plain
# array column so a single class probability can be indexed out of it.
predictions = predictions.withColumn(
    "probability_array",
    vector_to_array(F.col("probability")),
)

# Input columns carried through unchanged to the output table.
passthrough_columns = [
    "transaction_id",
    "user_id",
    "amount",
    "country",
    "channel",
    "merchant_category",
    "transaction_time",
    "is_fraud",
]

# Final projection: passthrough columns, the fraud score (array element 1 is
# taken as the fraud-class probability), the predicted label, and a `date`
# column derived from the transaction timestamp for partitioning.
predictions_df = predictions.select(
    *passthrough_columns,
    F.col("probability_array").getItem(1).alias("fraud_probability"),
    F.col("prediction").alias("fraud_prediction"),
    F.to_date("transaction_time").alias("date"),
)

# Preview the first scored rows.
display(predictions_df.limit(10))

# COMMAND ----------

# Persist the scored rows as a managed Delta table, overwriting any previous
# contents and partitioning the data by `date`.
(
    predictions_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("date")
    .saveAsTable(OUTPUT_TABLE)
)

# COMMAND ----------

# Table maintenance after the write: first compact the files and Z-order them
# by user and score, then vacuum files older than 168 hours (7 days).
optimize_sql = f"OPTIMIZE {OUTPUT_TABLE} ZORDER BY (user_id, fraud_probability)"
vacuum_sql = f"VACUUM {OUTPUT_TABLE} RETAIN 168 HOURS"

spark.sql(optimize_sql)
spark.sql(vacuum_sql)

# COMMAND ----------
# MAGIC %md
# MAGIC ### Pronto: previsões salvas em `workspace.default.predictions_delta`

transaction_id,user_id,account_age_days,total_transactions_user,avg_amount_user,amount,country,bin_country,channel,merchant_category,promo_used,avs_match,cvv_result,three_ds_flag,transaction_time,shipping_distance_km,is_fraud,date,country_mismatch,is_night,shipping_distance_bucket,amount_vs_avg
228,5,497,53,71.23,34.91,PL,PL,web,gaming,0,1,1,1,2024-06-12T08:18:55.000Z,199.08,0,2024-06-12,0,0,long,0.4833171812266371
1073,22,1738,52,71.82,64.8,IT,IT,web,electronics,0,0,0,1,2024-06-12T06:39:16.000Z,14.47,0,2024-06-12,0,1,medium,0.8898654215874761
1706,34,691,45,31.26,64.72,ES,ES,web,fashion,0,1,1,1,2024-06-12T07:45:10.000Z,384.21,0,2024-06-12,0,0,long,2.006199628022318
1810,36,1129,60,26.55,22.63,US,US,app,electronics,0,1,1,1,2024-06-12T07:14:01.000Z,273.77,0,2024-06-12,0,0,long,0.8214156079854809
2039,40,129,51,109.6,102.35,US,US,web,electronics,0,1,0,0,2024-06-12T21:58:31.000Z,104.74,0,2024-06-12,0,0,long,0.9254068716094032
2501,49,782,58,126.93,342.23,TR,TR,web,gaming,0,1,1,1,2024-06-12T03:45:55.000Z,79.53,0,2024-06-12,0,1,medium,2.675134839365278
3429,68,1586,55,245.56,202.11,PL,PL,app,gaming,0,0,1,1,2024-06-12T04:06:44.000Z,29.25,0,2024-06-12,0,1,medium,0.819719338092148
3520,70,1200,50,47.22,108.58,TR,TR,app,travel,0,1,1,1,2024-06-12T22:21:03.000Z,215.89,0,2024-06-12,0,0,long,2.2517627540439653
3768,75,1256,58,6.56,6.18,DE,DE,web,gaming,0,1,1,1,2024-06-12T07:19:24.000Z,349.0,0,2024-06-12,0,0,long,0.8174603174603174
4476,90,1388,56,150.4,237.72,FR,FR,web,gaming,0,1,1,1,2024-06-12T21:13:34.000Z,237.28,0,2024-06-12,0,0,long,1.5701453104359313




transaction_id,user_id,amount,country,channel,merchant_category,transaction_time,is_fraud,fraud_probability,fraud_prediction,date
569,12,80.47,RO,web,electronics,2024-04-22T01:56:35.000Z,0,0.0029404866718085,0.0,2024-04-22
1016,21,396.13,TR,web,electronics,2024-04-22T18:19:37.000Z,0,0.0099856588666963,0.0,2024-04-22
1858,37,230.78,PL,app,fashion,2024-04-22T05:06:00.000Z,0,0.0013662585287661,0.0,2024-04-22
2128,42,68.51,RO,app,grocery,2024-04-22T19:41:45.000Z,0,0.001426215181188,0.0,2024-04-22
2235,44,108.03,NL,app,travel,2024-04-22T18:53:32.000Z,0,0.0014554524136558,0.0,2024-04-22
2829,56,269.53,ES,web,gaming,2024-04-22T15:33:26.000Z,0,0.0027629546608608,0.0,2024-04-22
3281,65,275.36,DE,app,grocery,2024-04-22T00:30:33.000Z,0,0.0043651896686332,0.0,2024-04-22
3472,69,159.1,DE,web,gaming,2024-04-22T13:39:02.000Z,0,0.3262313723516047,0.0,2024-04-22
3702,74,352.79,DE,app,electronics,2024-04-22T14:22:36.000Z,0,0.0047101877960237,0.0,2024-04-22
3914,78,105.52,IT,app,fashion,2024-04-22T18:17:26.000Z,0,0.0016634313625728,0.0,2024-04-22


DataFrame[path: string]