# Churn Lab: TFDV + Wide & Deep (TensorFlow/Keras)

This notebook walks through a complete workflow:

1. **Load & inspect** a tabular dataset (synthetic churn)
2. **Data Validation with TFDV**
   - statistics → schema → anomalies
3. **Train a Keras model**: **Wide & Deep**, built with the Functional API
4. **Evaluate + export artifacts**


In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

# Anchor every path at the directory the notebook is launched from.
ROOT = Path(".").resolve()

# CSV inputs are read from DATA; everything the notebook writes goes under ART.
DATA, ART = ROOT.joinpath("data"), ROOT.joinpath("artifacts")

# Create the output directory up front; an existing one is reused as-is.
ART.mkdir(exist_ok=True)


## 1) Load the dataset

In [None]:
# Read the training split into a DataFrame and preview its first rows.
df = pd.read_csv(DATA / "train.csv")
df.head()


In [None]:
# Summary statistics for all columns (numeric and non-numeric), transposed so
# each column of `df` is one row; only the first 20 rows are shown.
df.describe(include="all").T.head(20)


## 2) TFDV: statistics → schema → anomalies

In [None]:
import tensorflow_data_validation as tfdv
import tensorflow as tf


In [None]:
# Helper: CSV -> TFRecord for TFDV
def _to_feature(v, kind=None):
    """Wrap one CSV cell in a ``tf.train.Feature``, or return None if it is missing.

    Args:
        v: The cell value. ``None``, a float NaN and the empty string count as
            missing.
        kind: ``"int"``, ``"float"`` or ``"bytes"`` to force the feature type
            (``csv_to_tfrecord`` passes the kind it picked for the whole
            column). When None the type is guessed from ``v`` alone: bools,
            integers and integral floats become int64, other floats become
            float, and anything else is stored as UTF-8 encoded text.

    Returns:
        A single-value ``tf.train.Feature``, or None for a missing value.

    Raises:
        ValueError: If ``kind`` is not None and not one of the three kinds.
    """
    import math
    import numbers

    if v is None or (isinstance(v, str) and v == ""):
        return None
    if isinstance(v, float) and math.isnan(v):
        return None

    if kind is None:
        # numbers.Integral also matches NumPy integer scalars, which are not
        # subclasses of the built-in int.
        if isinstance(v, (bool, numbers.Integral)) or (
            isinstance(v, float) and float(v).is_integer()
        ):
            kind = "int"
        elif isinstance(v, float):
            kind = "float"
        else:
            kind = "bytes"

    if kind == "int":
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(v)]))
    if kind == "float":
        return tf.train.Feature(float_list=tf.train.FloatList(value=[float(v)]))
    if kind == "bytes":
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(v).encode("utf-8")]))
    raise ValueError(f"unknown feature kind: {kind!r}")


def _column_kind(series):
    """Choose one feature kind ("int", "float" or "bytes") for a whole column."""
    import pandas as pd

    present = series.dropna()
    inferred = pd.api.types.infer_dtype(present, skipna=True)
    if inferred in ("boolean", "integer"):
        return "int"
    if inferred == "floating":
        # An integer column with blanks is parsed as float. Keep it int64, but
        # only when *every* value is integral and fits in int64; otherwise the
        # column would mix int64 and float features from row to row.
        as_float = present.astype(float)
        if ((as_float % 1 == 0) & (as_float.abs() < 2.0 ** 63)).all():
            return "int"
        return "float"
    return "bytes"


def csv_to_tfrecord(csv_path: Path, out_path: Path):
    """Convert a CSV file into a TFRecord file of ``tf.train.Example`` records.

    Each CSV row becomes one Example keyed by column name; missing cells are
    simply left out of that Example. The feature type is decided once per
    column, so every Example stores a given column with the same type.

    Args:
        csv_path: CSV file to read (with a header row).
        out_path: Destination TFRecord file; it is overwritten.
    """
    import pandas as pd

    df = pd.read_csv(csv_path)
    columns = list(df.columns)
    kinds = [_column_kind(series) for _, series in df.items()]

    with tf.io.TFRecordWriter(str(out_path)) as w:
        # itertuples keeps each column's own dtype; iterrows would build a
        # Series per row and upcast the values.
        for row in df.itertuples(index=False, name=None):
            feat = {}
            for col, kind, value in zip(columns, kinds, row):
                f = _to_feature(value, kind)
                if f is not None:
                    feat[col] = f
            ex = tf.train.Example(features=tf.train.Features(feature=feat))
            w.write(ex.SerializeToString())


In [None]:
# Everything TFDV reads or writes lives in its own artifacts sub-folder.
tfdv_dir = ART / "tfdv"
tfdv_dir.mkdir(exist_ok=True)

# One TFRecord file per CSV split.
train_tfr = tfdv_dir / "train.tfrecord"
test_tfr = tfdv_dir / "test.tfrecord"
anom_tfr = tfdv_dir / "test_anomalous.tfrecord"

# Convert all three CSVs first ...
for csv_name, tfr_path in (
    ("train.csv", train_tfr),
    ("test.csv", test_tfr),
    ("test_anomalous.csv", anom_tfr),
):
    csv_to_tfrecord(DATA / csv_name, tfr_path)

# ... then compute statistics for each split, in the same order.
train_stats, test_stats, anom_stats = (
    tfdv.generate_statistics_from_tfrecord(data_location=str(tfr_path))
    for tfr_path in (train_tfr, test_tfr, anom_tfr)
)

# Only the training statistics are rendered here.
tfdv.visualize_statistics(train_stats)


In [None]:
# Infer a schema from the training-split statistics and render it.
schema = tfdv.infer_schema(train_stats)
tfdv.display_schema(schema)


In [None]:
# Validate the clean test split: compare its statistics against the schema
# inferred from the training split and display any anomalies found.
anomalies_test = tfdv.validate_statistics(test_stats, schema)
tfdv.display_anomalies(anomalies_test)


In [None]:
# Validate the intentionally corrupted split (test_anomalous.csv) against the
# same training-derived schema and display the anomalies found.
anomalies_bad = tfdv.validate_statistics(anom_stats, schema)
tfdv.display_anomalies(anomalies_bad)


## 3) Train a Wide & Deep Keras model

In [None]:
import tensorflow as tf

# Feature columns, grouped by how build_preprocessing encodes them.
# String categoricals: StringLookup index -> Embedding (deep branch).
CATEGORICAL_STR = ["gender", "internet_service", "contract_type", "payment_method"]
# Integer categoricals: IntegerLookup one-hot (wide branch).
CATEGORICAL_INT = ["senior_citizen", "partner", "dependents", "paperless_billing"]
# Numeric columns: Normalization layer (deep branch).
NUMERIC = ["tenure_months", "monthly_charges", "total_charges"]
# Column handed to make_csv_dataset as the label.
LABEL = "churn"
# Column that make_dataset removes from the feature dict.
ID_COL = "customer_id"

def make_dataset(csv_path: Path, batch_size=64, shuffle=False):
    """Build a batched, single-pass ``tf.data`` pipeline of (features, label) from a CSV.

    Args:
        csv_path: CSV file with a header row; the ``LABEL`` column is split
            off as the label.
        batch_size: Number of rows per batch.
        shuffle: Whether rows are shuffled (buffer of 4096).

    Returns:
        A prefetched dataset yielding ``(feature_dict, label)`` batches, with
        the ``ID_COL`` entry removed from the feature dict.
    """

    def _without_id(features, label):
        # The identifier column is not a model input, so drop it here.
        kept = {}
        for name, tensor in features.items():
            if name != ID_COL:
                kept[name] = tensor
        return kept, label

    reader_options = dict(
        file_pattern=str(csv_path),
        batch_size=batch_size,
        label_name=LABEL,
        num_epochs=1,
        header=True,
        na_value="",
        shuffle=shuffle,
        shuffle_buffer_size=4096,
        ignore_errors=True,
    )
    batches = tf.data.experimental.make_csv_dataset(**reader_options)
    batches = batches.map(_without_id)
    return batches.prefetch(tf.data.AUTOTUNE)

# The training split is loaded twice on purpose: once as a DataFrame (passed to
# build_preprocessing to adapt the preprocessing layers) and once as a
# tf.data pipeline for model.fit.
train_df = pd.read_csv(DATA/"train.csv")
# Only the training pipeline is shuffled; validation and test keep file order.
train_ds = make_dataset(DATA/"train.csv", shuffle=True)
val_ds   = make_dataset(DATA/"val.csv", shuffle=False)
test_ds  = make_dataset(DATA/"test.csv", shuffle=False)


In [None]:
def build_preprocessing(train_df: pd.DataFrame):
    """Create the Keras inputs and the preprocessed wide/deep feature tensors.

    Every preprocessing layer is adapted on ``train_df``.

    Args:
        train_df: Training data containing the columns named in
            ``CATEGORICAL_STR``, ``CATEGORICAL_INT`` and ``NUMERIC``.

    Returns:
        A tuple ``(inputs, wide, deep)``: ``inputs`` maps each column name to
        its ``tf.keras.Input``; ``wide`` is the concatenation of the one-hot
        integer categoricals; ``deep`` is the concatenation of the string
        embeddings followed by the normalized numeric columns.
    """
    layers = tf.keras.layers
    model_inputs = {}
    deep_parts = []
    wide_parts = []

    # String categoricals: vocabulary index, then a learned embedding.
    for name in CATEGORICAL_STR:
        feature_in = tf.keras.Input(shape=(1,), name=name, dtype=tf.string)
        indexer = layers.StringLookup(output_mode="int")
        indexer.adapt(train_df[name].astype(str).values)
        n_tokens = indexer.vocabulary_size()
        token_ids = indexer(feature_in)
        # Embedding width is half the vocabulary size, clamped to [4, 16].
        embed_dim = min(16, max(4, n_tokens // 2))
        embedded = layers.Embedding(n_tokens, output_dim=embed_dim)(token_ids)
        # Flatten away the length-1 axis that comes from the (1,)-shaped input.
        embedded = layers.Reshape((-1,))(embedded)
        model_inputs[name] = feature_in
        deep_parts.append(embedded)

    # Integer categoricals: one-hot vectors for the wide branch.
    for name in CATEGORICAL_INT:
        feature_in = tf.keras.Input(shape=(1,), name=name, dtype=tf.int32)
        one_hot = layers.IntegerLookup(output_mode="one_hot")
        one_hot.adapt(train_df[name].astype(int).values)
        encoded_col = one_hot(feature_in)
        model_inputs[name] = feature_in
        wide_parts.append(encoded_col)

    # Numeric columns: standardize with statistics from the training data.
    for name in NUMERIC:
        feature_in = tf.keras.Input(shape=(1,), name=name, dtype=tf.float32)
        scaler = layers.Normalization()
        scaler.adapt(train_df[name].astype(float).values.reshape(-1, 1))
        scaled = scaler(feature_in)
        model_inputs[name] = feature_in
        deep_parts.append(scaled)

    wide = layers.Concatenate()(wide_parts)
    deep = layers.Concatenate()(deep_parts)
    return model_inputs, wide, deep

# Build the input layers plus the preprocessed wide and deep feature tensors.
inputs, wide, deep = build_preprocessing(train_df)

# Deep tower: two ReLU layers with dropout in between.
x = tf.keras.layers.Dense(64, activation="relu")(deep)
x = tf.keras.layers.Dropout(0.25)(x)
x = tf.keras.layers.Dense(32, activation="relu")(x)
# Join the wide (one-hot) features with the deep tower's output and map them
# to a single sigmoid unit.
combined = tf.keras.layers.Concatenate()([wide, x])
out = tf.keras.layers.Dense(1, activation="sigmoid")(combined)

# `inputs` is a dict keyed by column name, matching the feature dicts that
# make_dataset yields.
model = tf.keras.Model(inputs=inputs, outputs=out)
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.AUC(name="auc"), tf.keras.metrics.Precision(name="precision"), tf.keras.metrics.Recall(name="recall")]
)
model.summary()


In [None]:
# Train for 5 epochs, validating on the validation split after each epoch.
history = model.fit(train_ds, validation_data=val_ds, epochs=5)
# Evaluate on the test split; return_dict=True gives the metrics as a dict.
eval_metrics = model.evaluate(test_ds, return_dict=True)
eval_metrics


## 4) Export artifacts

In [None]:
# Save the trained model under artifacts/model/saved_model.
model_dir = ART / "model"
model_dir.mkdir(exist_ok=True)
# include_optimizer=False: the optimizer state is left out of the export.
model.save(model_dir / "saved_model", include_optimizer=False)
print("Saved to", model_dir/"saved_model")
