# Feature Store — Data Quality + Leakage Checks

Industrial notebook that adds quality gates for feature tables:
- missingness / range checks
- drift monitoring (population stability index-like)
- leakage heuristics (target correlation spikes)

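Uses the offline materialized Parquet generated in notebook 01; if that file is missing, a minimal synthetic feature table is generated instead so the notebook can run standalone.
Outputs are saved when executed.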

In [1]:
import os
import numpy as np
import pandas as pd
from datetime import datetime, timedelta, timezone

# Both storage formats live side by side in the offline registry directory,
# resolved relative to the notebook's working directory.
OFFLINE_DIR = os.path.abspath(os.path.join('..', 'registry', 'offline'))
FEAT_PATH = os.path.join(OFFLINE_DIR, 'features_daily.parquet')
CSV_PATH = os.path.join(OFFLINE_DIR, 'features_daily.csv')
# Create the directory up front so the synthetic fallback can write into it.
os.makedirs(OFFLINE_DIR, exist_ok=True)

def parquet_supported() -> bool:
    """Report whether a Parquet engine can be imported in this environment.

    Tries ``pyarrow`` first and ``fastparquet`` second.

    Returns:
        True as soon as one of the engines imports successfully, False if
        both imports fail.
    """
    for engine in ('pyarrow', 'fastparquet'):
        try:
            __import__(engine)
        except Exception:
            # Any import-time failure counts as "engine unavailable".
            continue
        return True
    return False

# Decide the storage format once: Parquet when an engine is importable, CSV otherwise.
use_parquet = parquet_supported()
feature_file = FEAT_PATH if use_parquet else CSV_PATH

# Self-contained fallback: if the feature file for the active format is missing,
# generate a minimal synthetic feature table and persist it.
if not os.path.exists(feature_file):
    rng = np.random.default_rng(1337)
    n_users = 1500
    days = 45
    start = datetime.now(timezone.utc) - timedelta(days=days)
    # One calendar date per simulated day, shared by every user.
    event_dates = [(start + timedelta(days=offset)).date() for offset in range(days)]

    feature_columns = [
        'user_id', 'event_date', 'events_total', 'c_login', 'c_view',
        'c_support', 'c_purchase', 'purchase_amount', 'days_since_last_activity',
    ]

    rows = []
    for user_id in range(1, n_users + 1):
        # Per-user activity rate; varies slightly with the user id.
        activity_rate = 5 + (user_id % 7) * 0.2
        last = 0
        for dt in event_dates:
            # The draw order below is significant: it fixes the seeded random stream.
            events_total = int(rng.poisson(activity_rate))
            c_login = int(rng.binomial(events_total, 0.2))
            c_view = int(rng.binomial(events_total, 0.6))
            c_support = int(rng.binomial(events_total, 0.05))
            c_purchase = int(rng.binomial(events_total, 0.08))
            purchase_amount = float(c_purchase * rng.lognormal(3.8, 0.6))
            # Counter resets on any active day and grows by one on idle days.
            if events_total > 0:
                last = 0
            else:
                last = last + 1
            rows.append((
                user_id, dt, events_total, c_login, c_view,
                c_support, c_purchase, purchase_amount, last,
            ))

    df_gen = pd.DataFrame(rows, columns=feature_columns)

    # Trailing 7-row purchase sum per user, computed in date order; dropping the
    # user_id level restores the original row index so the assignment aligns.
    ordered = df_gen.sort_values(['user_id', 'event_date'])
    trailing_sum = (
        ordered.groupby('user_id')['purchase_amount']
               .rolling(7, min_periods=1)
               .sum()
    )
    df_gen['purchase_amount_7d'] = trailing_sum.droplevel(0)

    if use_parquet:
        df_gen.to_parquet(FEAT_PATH, index=False)
    else:
        df_gen.to_csv(CSV_PATH, index=False)

# Load the feature table from whichever format is active.
if use_parquet:
    df = pd.read_parquet(FEAT_PATH)
else:
    df = pd.read_csv(CSV_PATH)
df['event_date'] = pd.to_datetime(df['event_date'])
df.head(), df.shape

(   user_id event_date  events_total  c_login  c_view  c_support  c_purchase  \
 0        1 2025-12-26             6        1       4          1           0   
 1        1 2025-12-27             4        0       1          0           1   
 2        1 2025-12-28             4        1       0          0           0   
 3        1 2025-12-29             6        1       3          1           0   
 4        1 2025-12-30             4        1       1          1           0   
 
    purchase_amount  days_since_last_activity  purchase_amount_7d  
 0         0.000000                         0            0.000000  
 1        74.341604                         0           74.341604  
 2         0.000000                         0           74.341604  
 3         0.000000                         0           74.341604  
 4         0.000000                         0           74.341604  ,
 (67500, 10))

## 1) Basic quality report

In [2]:
# Every column except the entity key and the date is treated as a feature.
key_cols = ('user_id', 'event_date')
num_cols = [col for col in df.columns if col not in key_cols]

feature_frame = df[num_cols]
# Share of missing values per feature, worst first.
missing = feature_frame.isna().mean().sort_values(ascending=False)
# Summary statistics per feature (one row per feature).
summary_stats = ['min', 'max', 'mean', 'std']
ranges = feature_frame.describe().loc[summary_stats].T
missing.head(), ranges.head()

(events_total    0.0
 c_login         0.0
 c_view          0.0
 c_support       0.0
 c_purchase      0.0
 dtype: float64,
               min   max      mean       std
 events_total  0.0  18.0  5.587748  2.395759
 c_login       0.0   8.0  1.115348  1.059049
 c_view        0.0  13.0  3.356948  1.845744
 c_support     0.0   4.0  0.279941  0.529186
 c_purchase    0.0   5.0  0.442770  0.664467)

## 2) Simple drift check (train vs recent)
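We split the rows at the 80th percentile of `event_date` (earlier rows = train, later rows = recent) and compare each feature's distribution across the two windows with a coarse PSI-like score, $\mathrm{PSI} = \sum_i (q_i - p_i)\,\ln(q_i / p_i)$, where $p_i$ and $q_i$ are the train and recent shares in bin $i$ and the bins are taken from the train quantiles.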

In [3]:
def psi(a: np.ndarray, b: np.ndarray, bins=10) -> float:
    """Population-stability-index-like drift score between two samples.

    Bin edges are the quantiles of the reference sample ``a``. The two
    outermost bins are open-ended, so values of ``b`` that fall outside the
    range observed in ``a`` are counted in the edge bins instead of being
    silently discarded (out-of-range values are exactly what a drift check
    needs to see).

    Args:
        a: Reference sample (e.g. the training window); array-like of numbers.
        b: Comparison sample (e.g. the recent window); array-like of numbers.
        bins: Number of quantile bins derived from ``a``.

    Returns:
        ``sum((pb - pa) * log(pb / pa))`` over the bins, where ``pa`` and
        ``pb`` are the per-bin shares of ``a`` and ``b`` (each clipped to at
        least 1e-6 to keep the logarithm finite). Returns NaN when ``a`` has
        no non-NaN values, because no bins can be derived.
    """
    ref = np.asarray(a, dtype=float).ravel()
    cur = np.asarray(b, dtype=float).ravel()

    # NaNs would turn every quantile edge into NaN and yield a meaningless
    # score; missingness is reported separately, so exclude them here.
    ref = ref[~np.isnan(ref)]
    cur = cur[~np.isnan(cur)]
    if ref.size == 0:
        return float('nan')

    edges = np.quantile(ref, np.linspace(0, 1, bins + 1))
    # Open-ended outer bins: nothing in either sample can fall outside.
    edges[0], edges[-1] = -np.inf, np.inf

    ref_counts, _ = np.histogram(ref, bins=edges)
    cur_counts, _ = np.histogram(cur, bins=edges)

    # max(1, ...) guards against division by zero for an empty sample.
    pa = ref_counts / max(1, ref_counts.sum())
    pb = cur_counts / max(1, cur_counts.sum())
    # Floor the shares so empty bins do not produce log(0) or division by zero.
    pa = np.clip(pa, 1e-6, 1)
    pb = np.clip(pb, 1e-6, 1)
    return float(np.sum((pb - pa) * np.log(pb / pa)))

# Date threshold at the 80th percentile of event_date: rows on or before it
# form the reference window, rows after it form the recent window.
cut = df['event_date'].quantile(0.8)
on_or_before_cut = df['event_date'] <= cut
after_cut = df['event_date'] > cut
train = df[on_or_before_cut]
recent = df[after_cut]

# One PSI-like score per feature, comparing train against recent with 12 bins.
scores = [
    {'feature': feature, 'psi': psi(train[feature].values, recent[feature].values, bins=12)}
    for feature in num_cols
]
score_table = pd.DataFrame(scores)
score_table.sort_values('psi', ascending=False).head(12)

Unnamed: 0,feature,psi
7,purchase_amount_7d,0.028597
2,c_view,0.000466
0,events_total,0.000375
3,c_support,0.000309
5,purchase_amount,0.000213
1,c_login,0.000205
4,c_purchase,0.00012
6,days_since_last_activity,0.0
