In [0]:
%pip install databricks-feature-engineering
%restart_python 

In [0]:
# Declare the notebook's text widgets. Each widget starts empty and uses its
# own name as the label shown in the widget bar.
for widget_name in (
    "project_catalog_name",
    "project_schema_name",
    "features",
    "name",
    "eol_view",
    "label",
):
    dbutils.widgets.text(widget_name, "", widget_name)



In [0]:
# Read the current value of every widget into a variable of the same name.
# The values are fetched in the order the widgets are listed.
(
    project_catalog_name,
    project_schema_name,
    features,
    name,
    eol_view,
    label,
) = map(
    dbutils.widgets.get,
    (
        "project_catalog_name",
        "project_schema_name",
        "features",
        "name",
        "eol_view",
        "label",
    ),
)


In [0]:
import sys
import json
from typing import List, Dict, Any
from databricks.feature_engineering.entities.feature_lookup import FeatureLookup
from databricks.feature_engineering import FeatureEngineeringClient, FeatureFunction

# Feature Engineering client; used later in this notebook to build the
# training set via fe.create_training_set(...).
fe = FeatureEngineeringClient()

In [0]:
import json

# Decode the `features` widget value (a JSON string) into Python objects.
#
# The isinstance guard keeps this cell safe to re-run: after the first run
# `features` already holds the decoded value, and passing that to json.loads
# again would raise a TypeError and exit the notebook.
try:
    if isinstance(features, str):
        features = json.loads(features)

except json.JSONDecodeError as e:
    print(f"Error parsing features JSON: {e}")
    dbutils.notebook.exit("Failed to parse features JSON")
except Exception as e:
    print(f"Error processing features: {e}")
    dbutils.notebook.exit("Failed to process features")


# Generate FeatureLookup objects.
#
# Each entry of `features` is either a dict or a JSON string that decodes to a
# dict, with the keys:
#   'table'         - feature table name (required)
#   'lookup_key'    - key(s) used for the lookup (required)
#   'features'      - feature names to pull (defaults to an empty list)
#   'timestamp_key' - optional; passed as timestamp_lookup_key when set
#
# An invalid entry is reported and skipped on its own, so one bad entry no
# longer discards every entry that follows it.
feature_lookups = []

for feat in features:
    if isinstance(feat, str):
        try:
            # String entries are expected to be JSON-encoded objects
            feat_dict = json.loads(feat)
        except json.JSONDecodeError as e:
            print(f"Warning: Skipping feature that is not valid JSON: {feat!r} ({e})")
            continue
    elif isinstance(feat, dict):
        feat_dict = feat
    else:
        print(f"Warning: Unexpected feature format: {feat}")
        continue

    # A JSON string may decode to a list/number/etc., which has no .get()
    if not isinstance(feat_dict, dict):
        print(f"Warning: Unexpected feature format: {feat}")
        continue

    # Extract components
    table_name = feat_dict.get('table', '')
    feature_names = feat_dict.get('features', [])
    lookup_key = feat_dict.get('lookup_key')
    timestamp_key = feat_dict.get('timestamp_key', None)

    if not table_name:
        print(f"Warning: Skipping feature without a 'table': {feat_dict}")
        continue

    if not lookup_key:
        print(f"Warning: Skipping feature without a 'lookup_key': {feat_dict}")
        continue

    # Only pass timestamp_lookup_key when one was supplied, so entries without
    # it are constructed with exactly the same arguments as before.
    lookup_kwargs = {
        'table_name': table_name,
        'feature_names': feature_names,
        'lookup_key': lookup_key,
    }
    if timestamp_key:
        lookup_kwargs['timestamp_lookup_key'] = timestamp_key

    feature_lookups.append(FeatureLookup(**lookup_kwargs))

In [0]:
import time

def create_table_with_epoch(df, split, catalog, schema, base_name, seed=42):
    """Randomly split ``df`` in two and save each part as a new table.

    Both table names end with the same Unix-epoch suffix (whole seconds at
    call time): ``<base_name>_train_<epoch>`` and ``<base_name>_eval_<epoch>``.
    Spaces in ``base_name`` are replaced with underscores.

    Args:
        df: DataFrame to split; must provide ``randomSplit`` and
            ``write.saveAsTable``.
        split: Weight of the evaluation part, between 0 and 1 inclusive.
            The training part gets ``1.0 - split``.
        catalog: Catalog in which both tables are created.
        schema: Schema in which both tables are created.
        base_name: Prefix for both table names.
        seed: Seed passed to ``randomSplit``. Defaults to 42, the value that
            was previously hard-coded.

    Returns:
        Tuple ``(train_table_name, eval_table_name)``. The names are NOT
        qualified with catalog or schema.

    Raises:
        ValueError: If ``split`` is outside ``[0, 1]``.
    """
    # Fail before any Spark work instead of passing a negative weight on.
    if not 0.0 <= split <= 1.0:
        raise ValueError(f"split must be between 0 and 1, got {split!r}")

    base_name = base_name.replace(' ', '_')
    epoch = int(time.time())
    train_table_name = f"{base_name}_train_{epoch}"
    eval_table_name = f"{base_name}_eval_{epoch}"

    train_df, val_df = df.randomSplit([1.0 - split, split], seed=seed)
    train_df.write.saveAsTable(f"{catalog}.{schema}.{train_table_name}")
    val_df.write.saveAsTable(f"{catalog}.{schema}.{eval_table_name}")
    return train_table_name, eval_table_name
    

In [0]:
# Load the source view/table named by the `eol_view` widget.
# spark.table() resolves the value as a table identifier, so the widget text is
# never spliced into a SQL statement.
if not eol_view.strip():
    raise ValueError("The 'eol_view' widget must name the view or table to read")
df = spark.table(eol_view)

In [0]:
# Every column of the source frame other than the label is passed as
# exclude_columns when the training set is created below.
exclude_columns = list(
    filter(lambda column_name: column_name != label, df.columns)
)
print(exclude_columns)

In [0]:
# Build the training set from the source frame, the feature lookups and the
# label, leaving out the columns listed in exclude_columns.
training_set = fe.create_training_set(
    df=df,
    feature_lookups=feature_lookups,
    label=label,
    exclude_columns=exclude_columns,
)

# Load the training set as a DataFrame for splitting and saving.
training_df = training_set.load_df()

In [0]:
# Split the training frame (weight 0.2 for the eval part) and save both parts
# as tables in the project catalog/schema.
train_table_name, eval_table_name = create_table_with_epoch(
    df=training_df,
    split=0.2,
    catalog=project_catalog_name,
    schema=project_schema_name,
    base_name=name,
)

In [0]:
# Publish both table names as job task values, train first and then eval.
for task_value_key, task_value in (
    ("train_table_name", train_table_name),
    ("eval_table_name", eval_table_name),
):
    dbutils.jobs.taskValues.set(key=task_value_key, value=task_value)

In [0]:
import json

# End the notebook, returning both table names as a JSON string.
exit_payload = json.dumps({
    'train_table_name': train_table_name,
    'eval_table_name': eval_table_name
})
dbutils.notebook.exit(exit_payload)
