In [0]:
"""
Telco Churn Feature Engineering Notebook

This notebook performs feature engineering for a telco churn prediction pipeline. It reads raw customer data, computes features, and saves both features and labels to specified tables for downstream machine learning tasks.

Workflow:
1. Parameterize input and output table paths, label table name, and primary key columns using Databricks widgets.
2. Read raw input data from the specified table.
3. Generate churn labels and save them to a separate table.
4. Compute customer-level features using a custom feature computation function.
5. Create and write the feature table using Databricks FeatureEngineeringClient.

Parameters:
- input_table_path: Path to the input raw data table.
- output_table_name: Name of the output feature table.
- label_table_name: Name of the output label table.
- primary_keys: Comma-separated list of primary key columns for the feature table.

Outputs:
- Feature table containing engineered features for each customer.
- Label table containing churn labels for each customer.

Dependencies:
- features.compute_features.compute_features_fn: Function to compute features from raw data.
- databricks.feature_engineering.FeatureEngineeringClient: Client for managing feature tables.

"""

In [0]:
# Notebook parameters, exposed as Databricks text widgets so that a job (or a
# user in the notebook UI) can override the defaults below.

# Raw input table the labels and features are derived from.
dbutils.widgets.text(
    "input_table_path",
    "mlops_dbx_talk_dev.churn.telco_churn_inference_raw",
    label="Input Table Name",
)

# Feature table to store the computed features.
dbutils.widgets.text(
    "output_table_name",
    "mlops_dbx_talk_dev.churn.telco_cust_features",
    label="Output Feature Table Name",
)

# Label table to store the churn labels. The caption used to be a copy of the
# feature-table caption, which made the two widgets indistinguishable in the UI.
dbutils.widgets.text(
    "label_table_name",
    "mlops_dbx_talk_dev.churn.telco_cust_labels",
    label="Output Label Table Name",
)

# Primary key columns for the feature table.
dbutils.widgets.text(
    "primary_keys",
    "customer_id",
    label="Primary keys columns for the feature table, comma separated.",
)

In [0]:
import os
import os, sys
from pyspark.sql import functions as F

# Put the parent directory on the import path; presumably that is where the
# project-local `features` package imported below lives (verify repo layout).
sys.path.append('..')

from features.compute_features import compute_features_fn
from databricks.feature_engineering import FeatureEngineeringClient
# notebook_path =  '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get())
# %cd $notebook_path
# %cd ../features

In [0]:
# Resolve every notebook parameter from its widget in one pass. The order of the
# widget names must match the order of the variables on the left-hand side.
# `pk_columns` is the raw widget string (comma separated), not a list.
input_table_path, output_table_name, label_table_name, pk_columns = (
    dbutils.widgets.get(widget_name)
    for widget_name in (
        "input_table_path",
        "output_table_name",
        "label_table_name",
        "primary_keys",
    )
)

In [0]:
# Load the raw input table named by the `input_table_path` widget.
df_raw = spark.read.table(input_table_path)

In [0]:
# Labels are NOT written to the Feature Store; they go to a separate table that
# is used to build training sets.

# Customer identifier, renamed from the raw `customerID` column.
customer_key = F.col("customerID").alias("customer_id")

# Binary churn flag: 1 when the raw `Churn` value is "Yes", 0 for anything else
# (this includes a missing `Churn` value).
churn_flag = (
    F.when(F.col("Churn") == "Yes", F.lit(1))
    .otherwise(F.lit(0))
    .cast("int")
    .alias("churn")
)

# Keep one row per customer.
df_labels = df_raw.select(customer_key, churn_flag).dropDuplicates(["customer_id"])

# Replace the label table with the freshly computed labels.
label_writer = df_labels.write.mode("overwrite")
label_writer.saveAsTable(label_table_name)

# Show the class balance as a quick sanity check.
churn_counts = df_labels.groupBy("churn").count()
display(churn_counts)


In [0]:
# Derive the feature dataframe from the raw data with the project's feature function.
features_df = compute_features_fn(df_raw)

# Preview the first five rows only, to keep the cell output small.
features_preview = features_df.limit(5)
display(features_preview)

In [0]:
fe = FeatureEngineeringClient()

# The "primary_keys" widget holds a comma-separated string. Split it into a list
# of column names so that a multi-column key such as "customer_id,snapshot_date"
# is passed as two columns instead of one column whose name contains a comma.
primary_key_columns = [key.strip() for key in pk_columns.split(",") if key.strip()]
if not primary_key_columns:
    raise ValueError("The 'primary_keys' widget must name at least one column.")

# Create the feature table if it does not exist first.
# Original author's note: this is a no-op if a table with the same name and
# schema already exists (NOTE(review): confirm against the client documentation).
fe.create_table(
    name=output_table_name,
    primary_keys=primary_key_columns,  # no time series column is configured here
    df=features_df,
    description="Telco churn - customer-level features."
)

# Write the computed features dataframe, merging into the existing table.
fe.write_table(
    name=output_table_name,
    df=features_df,
    mode="merge",
)