# Training with Materialized Features

This notebook consumes `registry/offline/features_daily.parquet`, builds a simple label (next-day purchase), then trains a baseline model with proper splits and metrics.

Industrial aspects: deterministic splits, leakage control, and basic model reporting.

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Absolute location of the materialized daily feature table, resolved from a
# path relative to the current working directory.
FEAT_PATH = os.path.abspath(
    os.path.join('..', 'registry', 'offline', 'features_daily.parquet')
)
# Stop here with an actionable message if the upstream notebook has not
# produced the file yet.
assert os.path.exists(FEAT_PATH), (
    f'Missing features at {FEAT_PATH}. Run notebook 01 first.'
)

# Read the table and make sure `event_date` is a datetime column, since the
# later cells sort by it and take a date quantile.
df = pd.read_parquet(FEAT_PATH)
df = df.assign(event_date=pd.to_datetime(df['event_date']))
(df.head(), df.shape)

## Label engineering
We predict whether the user will make **a purchase the next day**.

In [None]:
# Order rows chronologically within each user so that a per-user shift(-1)
# reads that user's next observed row.
df = df.sort_values(['user_id', 'event_date']).reset_index(drop=True)

# Calendar day of each row and of the same user's next observed row.
_event_day = pd.to_datetime(df['event_date']).dt.normalize()
_next_event_day = _event_day.groupby(df['user_id']).shift(-1)

# Purchase count on the user's next observed row (0 when there is no next row).
_next_purchases = df.groupby('user_id')['c_purchase'].shift(-1).fillna(0).astype(int)

# The next observed row only counts as "the next day" when it is exactly one
# calendar day later. Without this check, a purchase made after a gap of
# several days would be labeled as a next-day purchase.
_is_next_day = (_next_event_day - _event_day) == pd.Timedelta(days=1)

# Binary target: 1 if the user has a row for the following day with at least
# one purchase, else 0.
# NOTE(review): rows with no data for the following day are labeled 0. For rows
# on the final date in the table the true outcome is unobserved rather than
# negative — consider excluding them before training.
df['purchase_next_day'] = (_is_next_day & (_next_purchases > 0)).astype(int)

# Model inputs: same-day activity counts and purchase aggregates.
features = [
    'events_total', 'c_login', 'c_view', 'c_support', 'c_purchase',
    'purchase_amount', 'purchase_amount_7d', 'days_since_last_activity'
]
X = df[features].astype(float)
y = df['purchase_next_day'].astype(int)

# Class balance of the target, as fractions.
y.value_counts(normalize=True)

## Split (time-aware approximation)
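We hold out the most recent ~20% of rows — those whose `event_date` falls after the 80th-percentile date — as validation, so the model is only evaluated on dates later than its training data. This reduces temporal leakage.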

In [None]:
# Cutoff date: the 0.8 quantile of `event_date` taken over all rows.
cut = df['event_date'].quantile(0.8)

# Rows on or before the cutoff are used for training; later rows for validation.
train_idx = df['event_date'].le(cut)

X_train = X.loc[train_idx]
y_train = y.loc[train_idx]
X_val = X.loc[~train_idx]
y_val = y.loc[~train_idx]

# Baseline model: standardize the features, then fit a class-weighted
# logistic regression.
pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler(with_mean=True)),
        ('clf', LogisticRegression(max_iter=200, class_weight='balanced')),
    ]
)
pipe.fit(X_train, y_train)

# Score of the second class (column 1 of predict_proba) for each validation row.
p_val = pipe.predict_proba(X_val)[:, 1]

# Ranking metrics on the held-out rows: ROC AUC and average precision.
roc, ap = roc_auc_score(y_val, p_val), average_precision_score(y_val, p_val)
(roc, ap)

## Coefficients

In [None]:
# Logistic-regression weights labeled by feature name, sorted ascending.
# The classifier sits after the scaler in the pipeline, so these weights apply
# to the standardized features.
coefs = (
    pd.Series(data=pipe.named_steps['clf'].coef_[0], index=features)
    .sort_values(ascending=True)
)
coefs