In [0]:
# ============================================
# FEATURE STORE INSTALLATION
# ============================================
# Makes sure `databricks.feature_store` can be imported. When the import fails, the
# `databricks-feature-store` package is installed with pip into the interpreter given
# by `sys.executable`, and `dbutils.library.restartPython()` is called afterwards.
# A failed installation stops the cell with an exception so the notebook does not
# carry on and fail later with a less informative ImportError.

print("verificando feature store")

try:
    from databricks.feature_store import FeatureStoreClient
    print("feature store ja instalado")
except ImportError:
    print("\ninstalando databricks-feature-store")

    import subprocess
    import sys

    # Run pip through the current interpreter so the package lands in this environment.
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "databricks-feature-store"],
        capture_output=True,
        text=True,
    )

    print(result.stdout)

    if result.returncode != 0:
        # Show pip's diagnostics first, then abort: nothing below can work without
        # the package.
        print("\nerro na instalacao")
        print(result.stderr)
        raise RuntimeError(
            "pip install databricks-feature-store failed "
            f"with exit code {result.returncode}"
        )

    print("\nfeature store instalado")
    print("reiniciando python")
    dbutils.library.restartPython()

In [0]:
# ============================================
# DATABRICKS FEATURE STORE
# ============================================

from databricks.feature_store import FeatureStoreClient
from pyspark.sql import functions as F

fs = FeatureStoreClient()

print("feature store client inicializado")
print("objetivo: criar tabela de features reutilizavel")

# ============================================
# LOAD DATA
# ============================================

df_silver = spark.read.table("finance_silver.transacoes_silver")

# ============================================
# BUILD AGGREGATED FEATURES
# ============================================
# One output row per `categoria`: statistics of `valor`, transaction count, number of
# distinct `ano_mes` values, count of rows flagged `alto_valor`, the most frequent
# `dia_semana`, and the percentage of flagged rows.

print("\ncriando features agregadas por categoria")

# 1 for rows where `alto_valor` equals True, 0 for every other row.
high_value_indicator = F.when(F.col("alto_valor") == True, 1).otherwise(0)

category_aggregates = [
    F.avg("valor").alias("valor_medio"),
    F.stddev("valor").alias("valor_desvio_padrao"),
    F.min("valor").alias("valor_minimo"),
    F.max("valor").alias("valor_maximo"),
    F.count("*").alias("total_transacoes"),
    F.countDistinct("ano_mes").alias("meses_ativos"),
    F.sum(high_value_indicator).alias("qtd_alto_valor"),
    F.mode("dia_semana").alias("dia_semana_mais_comum"),
]
per_category = df_silver.groupBy("categoria").agg(*category_aggregates)

# Share of flagged transactions, expressed as a percentage with two decimals.
high_value_share = F.col("qtd_alto_valor") / F.col("total_transacoes")
df_features_categoria = per_category.withColumn(
    "pct_alto_valor",
    F.round(high_value_share * 100, 2),
)

column_total = len(df_features_categoria.columns)
print(f"\nfeatures criadas: {column_total} colunas")
category_total = df_features_categoria.count()
print(f"categorias: {category_total}")

print("\namostra das features:")
display(df_features_categoria)

In [0]:
# ============================================
# REGISTER FEATURES IN THE FEATURE STORE
# ============================================
# Creates the feature table keyed by `categoria`. When creation fails with a message
# containing "already exists", the table contents are overwritten instead. Any other
# failure is printed and re-raised so the cell stops instead of going on to report
# that the feature store step finished.

print("registrando features no feature store")

feature_table_name = "finance_ml_project.categoria_features"

try:
    fs.create_table(
        name=feature_table_name,
        primary_keys=["categoria"],
        df=df_features_categoria,
        description="Features agregadas por categoria para classificacao de transacoes"
    )
except Exception as e:
    # NOTE(review): existence is detected from the error message text, which is
    # brittle; confirm the wording against the feature store client version in use.
    if "already exists" not in str(e).lower():
        print(f"\nerro: {e}")
        raise

    print("\ntabela ja existe, atualizando")
    fs.write_table(
        name=feature_table_name,
        df=df_features_categoria,
        mode="overwrite"
    )
    print(f"tabela atualizada: {feature_table_name}")
else:
    print(f"\ntabela criada: {feature_table_name}")

# ============================================
# VERIFY CREATION
# ============================================
# Prints a short summary of the feature table followed by the intended consumers.

# Every column of the feature frame except one (the key printed below).
feature_total = len(df_features_categoria.columns) - 1

summary_lines = [
    "\ninformacoes da tabela:",
    f"nome: {feature_table_name}",
    "primary key: categoria",
    f"features: {feature_total}",
    "\nuso das features:",
    "estas features podem ser usadas em:",
    "- modelos de ml (join automatico com feature store)",
    "- analises ad-hoc",
    "- dashboards",
    "- outras aplicacoes",
    "\nfeature store concluido",
]

for summary_line in summary_lines:
    print(summary_line)

In [0]:
# ============================================
# DATABRICKS FEATURE STORE
# ============================================

from databricks.feature_store import FeatureStoreClient
from pyspark.sql import functions as F

fs = FeatureStoreClient()

print("feature store client inicializado")

# ============================================
# LOAD DATA
# ============================================

df_silver = spark.read.table("finance_silver.transacoes_silver")

# ============================================
# BUILD AGGREGATED FEATURES
# ============================================

print("\ncriando features agregadas por categoria")


def _aggregate_by_category(transactions):
    """Return one row per `categoria` with aggregate features of its transactions.

    The result carries statistics of `valor` (average, standard deviation, minimum,
    maximum), the row count, the number of distinct `ano_mes` values, the count of
    rows whose `alto_valor` equals True, the most frequent `dia_semana`, and
    `pct_alto_valor`, the flagged share as a percentage rounded to two decimals.
    """
    flagged = F.when(F.col("alto_valor") == True, 1).otherwise(0)
    grouped = transactions.groupBy("categoria")
    summary = grouped.agg(
        F.avg("valor").alias("valor_medio"),
        F.stddev("valor").alias("valor_desvio_padrao"),
        F.min("valor").alias("valor_minimo"),
        F.max("valor").alias("valor_maximo"),
        F.count("*").alias("total_transacoes"),
        F.countDistinct("ano_mes").alias("meses_ativos"),
        F.sum(flagged).alias("qtd_alto_valor"),
        F.mode("dia_semana").alias("dia_semana_mais_comum"),
    )
    flagged_ratio = F.col("qtd_alto_valor") / F.col("total_transacoes")
    return summary.withColumn("pct_alto_valor", F.round(flagged_ratio * 100, 2))


df_features_categoria = _aggregate_by_category(df_silver)

n_columns = len(df_features_categoria.columns)
print(f"\nfeatures criadas: {n_columns} colunas")
n_categories = df_features_categoria.count()
print(f"categorias: {n_categories}")

print("\namostra das features:")
display(df_features_categoria)

# ============================================
# CHECK SCHEMA AND CREATE IT IF NEEDED
# ============================================
# The `workspace` catalog is tried first. If selecting it or creating the schema
# raises, the error is printed as a warning and the same two statements are run
# against the `main` catalog; errors from that second attempt are not caught.

print("\nverificando schema")


def _select_catalog_and_ensure_schema(catalog, schema):
    """Make `catalog` the current catalog and create `schema` if it does not exist."""
    spark.sql(f"USE CATALOG {catalog}")
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")


schema_name = "finance_gold"
catalog_name = "workspace"

try:
    _select_catalog_and_ensure_schema(catalog_name, schema_name)
    print(f"schema {catalog_name}.{schema_name} disponivel")
except Exception as exc:
    print(f"aviso: {exc}")
    # Switch the name before retrying so later code sees the catalog actually targeted.
    catalog_name = "main"
    _select_catalog_and_ensure_schema(catalog_name, schema_name)
    print(f"usando catalog {catalog_name}")

# ============================================
# REGISTER FEATURES IN THE FEATURE STORE
# ============================================
# Three outcomes: the table is created; creation fails with "already exists" in the
# message and the table is overwritten through the feature store client; or creation
# fails for another reason and the frame is saved directly as a Delta table.

print("\nregistrando features no feature store")

feature_table_name = f"{catalog_name}.{schema_name}.categoria_features"

table_definition = {
    "name": feature_table_name,
    "primary_keys": ["categoria"],
    "df": df_features_categoria,
    "description": "features agregadas por categoria para classificacao de transacoes",
}

try:
    fs.create_table(**table_definition)
except Exception as exc:
    table_already_exists = "already exists" in str(exc).lower()
    if not table_already_exists:
        print(f"\nerro: {exc}")
        print("\ntentando abordagem alternativa")

        # Fallback path: write the frame as a Delta table, replacing data and schema.
        delta_writer = (
            df_features_categoria.write
            .format("delta")
            .mode("overwrite")
            .option("overwriteSchema", "true")
        )
        delta_writer.saveAsTable(feature_table_name)

        print(f"tabela criada via delta: {feature_table_name}")
    else:
        print("\ntabela ja existe, atualizando")
        fs.write_table(
            name=feature_table_name,
            df=df_features_categoria,
            mode="overwrite",
        )
        print(f"tabela atualizada: {feature_table_name}")
else:
    print(f"\ntabela criada: {feature_table_name}")

# ============================================
# VERIFY CREATION
# ============================================
# Prints the table name, its key, the number of feature columns, the intended
# consumers, and a snippet showing how to join the features onto the silver table.

# Every column of the feature frame except one (the key printed below).
feature_total = len(df_features_categoria.columns) - 1

print(
    "\ninformacoes da tabela:",
    f"nome: {feature_table_name}",
    "primary key: categoria",
    f"features: {feature_total}",
    sep="\n",
)

print(
    "\nuso das features:",
    "estas features podem ser usadas em:",
    "- modelos de ml",
    "- analises ad-hoc",
    "- dashboards",
    sep="\n",
)

usage_snippet = f"""
df_features = spark.read.table('{feature_table_name}')

df_train = spark.read.table('finance_silver.transacoes_silver')
df_with_features = df_train.join(df_features, on='categoria', how='left')
"""

print("\ncomo usar:")
print(usage_snippet)

print("\nfeature store concluido")