# 02_inference — Promotion Recommendation Scoring

This notebook loads the saved artifacts and scores new baskets, using the current `promotions.csv` as the candidate source. It applies the same feature order and the same preprocessors that were saved during training.


In [None]:
# Load artifacts & config
import os, json, joblib, yaml
import numpy as np
import pandas as pd
from pathlib import Path

# Read the training configuration; later cells look up paths, column maps and feature lists in it.
with open('../config/training_config.yaml', 'r', encoding='utf-8') as cfg_file:
    CFG = yaml.safe_load(cfg_file)

# The notebook runs from notebooks/, so relative config paths are anchored at the project root.
PROJECT_ROOT = Path('../').resolve()
for path_name, path_value in list(CFG['paths'].items()):
    if Path(path_value).is_absolute():
        continue
    CFG['paths'][path_name] = str(PROJECT_ROOT / path_value)


def _artifact_path(dir_key, filename):
    """Join ``filename`` onto the directory stored under ``CFG['paths'][dir_key]``."""
    return os.path.join(CFG['paths'][dir_key], filename)


def _read_json(filename):
    """Load a UTF-8 JSON file from ``CFG['paths']['metadata_dir']``."""
    with open(_artifact_path('metadata_dir', filename), 'r', encoding='utf-8') as fh:
        return json.load(fh)


# NOTE(review): joblib.load unpickles the files; only load artifacts from a trusted location.
# Models: the promo-type model (used for Stage-A scoring) and the ranker.
ptype_model = joblib.load(_artifact_path('models_dir', 'ptype_model.pkl'))
ranker_model = joblib.load(_artifact_path('models_dir', 'ranker_model.pkl'))

# Need-state preprocessors, applied downstream as scaler -> PCA -> k-means.
scaler_need = joblib.load(_artifact_path('preprocessors_dir', 'scaler_need.pkl'))
pca_need = joblib.load(_artifact_path('preprocessors_dir', 'pca_need.pkl'))
kmeans_need = joblib.load(_artifact_path('preprocessors_dir', 'kmeans_need.pkl'))

# Metadata: class labels and the feature-column lists for both models.
ptype_classes = _read_json('ptype_classes.json')
ptype_featcols = _read_json('ptype_featcols.json')
ranker_featcols = _read_json('ranker_featcols.json')

print('Artifacts loaded.')


Artifacts loaded.


In [None]:
# Predict need-states for users
def predict_need_states(X_need, artifacts):
    """Assign a need-state cluster to every row of ``X_need``.

    The rows go through the fitted objects stored in ``artifacts`` in this
    order: ``scaler_need.transform`` -> ``pca_need.transform`` ->
    ``kmeans_need.predict``.

    Args:
        X_need: feature matrix accepted by the scaler's ``transform``.
        artifacts: mapping with the keys 'scaler_need', 'pca_need' and
            'kmeans_need'.

    Returns:
        The result of ``kmeans_need.predict`` on the projected rows.
    """
    projected = X_need
    # Both preprocessing stages expose the same transform() interface.
    for stage in ('scaler_need', 'pca_need'):
        projected = artifacts[stage].transform(projected)
    return artifacts['kmeans_need'].predict(projected)

# Bundle the need-state preprocessors loaded in the first cell so that
# predict_need_states() can be called once basket-level features exist.
#
# This cell previously called predict_need_states(X_need, artifacts) and wrote the
# result into df['need_state'], but X_need, artifacts and df are not defined at this
# point of the notebook, so a fresh-kernel "Run All" stopped here with a NameError.
# Basket-level need-state features are built later, after aggregation, by
# transform_need_state().
artifacts = {
    'scaler_need': scaler_need,
    'pca_need': pca_need,
    'kmeans_need': kmeans_need,
}


NameError: name 'X_need' is not defined

In [None]:
# Helpers (align with training)
from datetime import datetime

def add_time_features(df: pd.DataFrame, time_col: str) -> pd.DataFrame:
    """Add calendar features derived from ``df[time_col]`` to ``df`` in place.

    The column is parsed with ``pd.to_datetime(..., errors='coerce')``, so
    unparseable values become NaT. Columns written: 'hour', 'dayofweek',
    'month' and 'is_weekend' (1 when dayofweek >= 5, else 0).

    Returns:
        The same ``df`` object that was passed in.
    """
    parsed = pd.to_datetime(df[time_col], errors='coerce').dt
    for part in ('hour', 'dayofweek', 'month'):
        df[part] = getattr(parsed, part)
    weekend_mask = df['dayofweek'] >= 5
    df['is_weekend'] = weekend_mask.astype(int)
    return df


def transform_need_state(basket_df: pd.DataFrame, need_features: list, scaler, pca, kmeans):
    """Project baskets into need-state space and assign a cluster to each.

    Missing values in ``basket_df[need_features]`` are replaced by 0.0 before
    the rows go through ``scaler.transform``, ``pca.transform`` and
    ``kmeans.predict``.

    Returns:
        DataFrame indexed like ``basket_df`` with one ``ns_pca_<k>`` column per
        projected component (numbered from 1) plus an ``ns_cluster`` column.
    """
    filled = basket_df[need_features].fillna(0.0)
    components = pca.transform(scaler.transform(filled))
    component_names = [f'ns_pca_{k}' for k in range(1, components.shape[1] + 1)]
    ns = pd.DataFrame(components, index=basket_df.index, columns=component_names)
    ns['ns_cluster'] = kmeans.predict(components)
    return ns


def compute_scope_match(basket_row: pd.Series, cand_df: pd.DataFrame) -> pd.Series:
    """Score how much of each candidate's product scope appears in the basket.

    The scope column name is read from ``CFG['column_map_promo']['product_scope']``.
    Each scope value is split on ',' and the score is
    ``len(scope ∩ basket_products) / len(scope)``; missing scope values score 0.0.

    Args:
        basket_row: row (or mapping) that may carry a 'basket_products'
            collection; its items are compared as strings.
        cand_df: candidate promotions.

    Returns:
        Float Series aligned to ``cand_df.index``. It is all zeros when the
        scope column is not configured or not present, or when the basket has
        no products (absent, None/NaN, or empty).
    """
    zeros = pd.Series(0.0, index=cand_df.index)

    prod_scope = CFG['column_map_promo'].get('product_scope')
    if prod_scope is None or prod_scope not in cand_df.columns:
        return zeros

    basket_products = basket_row.get('basket_products', set())
    # A left merge can leave NaN here and callers may pass None; neither has a
    # length, so treat both as "no products" instead of failing on len().
    if not hasattr(basket_products, '__len__') or len(basket_products) == 0:
        return zeros

    # Loop-invariant: build the string view of the basket once, not per candidate.
    basket_tokens = {str(item) for item in basket_products}

    def score(val) -> float:
        if pd.isna(val):
            return 0.0
        # NOTE(review): tokens are not whitespace-stripped ("A, B" yields " B").
        # Kept as-is so the feature stays identical to what training computed;
        # confirm the scope column's formatting before changing this.
        scope_set = set(str(val).split(','))
        # str.split always yields at least one token, so the divisor is >= 1.
        return len(scope_set & basket_tokens) / len(scope_set)

    return cand_df[prod_scope].apply(score).astype(float)


def recall_candidates(basket_row, promos_df, ptype_probs, ptype_classes, cfg):
    """Select the promotions worth ranking for one basket.

    Steps, in order:
      1. Keep promo types whose probability is >= ``cfg['recall']['relevance_threshold']``,
         most probable first, capped at ``cfg['recall']['topk_types']``.
      2. Optional channel filter (``filters.respect_channel``): keep 'online'
         promos for online baskets, 'offline' promos otherwise.
      3. Optional time-window filter (``filters.use_time_window``, default on):
         keep promos whose start/end dates bracket the basket timestamp.
      4. Optional store filter (``filters.respect_store_scope``): keep promos
         whose comma-separated 'store_scope' lists the basket's store, or is missing.
      5. Keep only the promo types selected in step 1.
      6. Add 'min_spend_gap' (how far the basket amount is below the promo's
         minimum spend, floored at 0) and 'scope_match_score'.

    Returns:
        Filtered copy of ``promos_df`` with the two extra columns and a fresh
        0..n-1 index.
    """
    recall_cfg = cfg['recall']
    shortlisted = []
    for class_idx in np.argsort(ptype_probs)[::-1]:
        if ptype_probs[class_idx] >= recall_cfg['relevance_threshold']:
            shortlisted.append(ptype_classes[class_idx])
    allowed_types = set(shortlisted[:recall_cfg['topk_types']])

    filters = cfg['filters']
    promo_cols = cfg['column_map_promo']
    cand = promos_df.copy()

    # Channel: the basket's online flag decides which channel label must match.
    if filters.get('respect_channel', False) and ('channel' in cand.columns):
        wanted_channel = 'online' if bool(basket_row[cfg['column_map_tx']['is_online']]) else 'offline'
        cand = cand[cand['channel'].astype(str).str.lower().eq(wanted_channel)]

    # Validity window around the basket timestamp.
    if filters.get('use_time_window', True):
        start_col = promo_cols.get('start_date')
        end_col = promo_cols.get('end_date')
        ts = basket_row[cfg['column_map_tx']['timestamp']]
        if (start_col in cand.columns) and (end_col in cand.columns):
            active = (cand[start_col] <= ts) & (ts <= cand[end_col])
            cand = cand[active]

    # Store scope: best-effort match against a comma-separated store list.
    if filters.get('respect_store_scope', False) and ('store_scope' in cand.columns):
        sid = str(basket_row[cfg['column_map_tx']['store_id']])

        def store_allowed(rec):
            scope = rec.get('store_scope')
            return True if pd.isna(scope) else sid in str(scope).split(',')

        cand = cand[cand.apply(store_allowed, axis=1)]

    cand = cand[cand[promo_cols['promo_type']].isin(allowed_types)]

    amt = float(basket_row.get('amount_total', 0.0))
    minsp = promo_cols.get('min_spend')
    if minsp and (minsp in cand.columns):
        gap = np.maximum(0.0, cand[minsp].fillna(0) - amt)
    else:
        gap = 0.0
    cand = cand.assign(min_spend_gap=gap)

    cand = cand.assign(scope_match_score=compute_scope_match(basket_row, cand))
    return cand.reset_index(drop=True)


In [None]:
# Debug: Check what promo_type one-hot features we need
_promo_type_feats = [f for f in ranker_featcols if f.startswith('promo_type__')]
print(f"Ranker expects these promo_type features: {_promo_type_feats}")

# The `promos` frame is only loaded by the input-preparation cell further down, so
# referencing it here fails on a fresh kernel. Read just the promo-type column from
# the same snapshot file instead, which keeps this cell runnable top-to-bottom.
_ptype_col = CFG['column_map_promo']['promo_type']
_promo_types = pd.read_csv(CFG['paths']['data_promos'], usecols=[_ptype_col])[_ptype_col].unique()
print(f"\nUnique promo types in candidates: {_promo_types}")


In [None]:
# Inference: load inputs
# Example expects your new baskets in data_tx schema (minimal columns). For demo, we reuse a small sample from data/tx_merge3.csv
baskets = pd.read_csv(CFG['paths']['data_tx']).head(200)
c = CFG['column_map_tx']

# Parse timestamp (unparseable values become NaT)
baskets[c['timestamp']] = pd.to_datetime(baskets[c['timestamp']], errors='coerce')

# Aggregate to basket-level similar to training.
# Amount and quantity: reuse an existing total column when one is present,
# otherwise fall back to the price and qty columns.
has_amount_col = ('amount_total' in baskets.columns) or (c.get('amount_total') in baskets.columns)
if has_amount_col:
    baskets['__amount'] = baskets.get(c.get('amount_total'), baskets.get('amount_total', np.nan))
else:
    baskets['__amount'] = baskets[c['price']] * baskets[c['qty']]

has_qty_col = ('qty_total' in baskets.columns) or (c.get('qty_total') in baskets.columns)
if has_qty_col:
    baskets['__qty'] = baskets.get(c.get('qty_total'), baskets.get('qty_total', np.nan))
else:
    baskets['__qty'] = baskets[c['qty']]

# Descriptive columns take the first value per transaction; amount and quantity are summed.
agg_spec = {
    c[key]: 'first'
    for key in ('user_id', 'store_id', 'timestamp', 'zone', 'province', 'profile', 'is_online')
}
agg_spec.update({'__amount': 'sum', '__qty': 'sum'})
agg = (
    baskets.groupby(c['transaction_id'], as_index=False)
    .agg(agg_spec)
    .rename(columns={'__amount': 'amount_total', '__qty': 'qty_total'})
)

# Basket products set (product ids as strings, one set per transaction)
prod_col = c.get('product_id', 'product_id')
if prod_col not in baskets.columns:
    agg['basket_products'] = [set()]*len(agg)
else:
    def _as_str_set(values):
        """Collect one transaction's product ids as a set of strings."""
        return set(values.astype(str))

    prod_by_tx = (
        baskets.groupby(c['transaction_id'])[prod_col]
        .apply(_as_str_set)
        .rename('basket_products')
    )
    agg = agg.merge(prod_by_tx, left_on=c['transaction_id'], right_index=True, how='left')

# Time features
agg = add_time_features(agg, c['timestamp'])

# Need-state features: two mandatory columns plus whichever optional ones exist
optional_need_features = ['expected_basket_items', 'loyalty_score', 'price_elasticity']
need_features = ['amount_total', 'qty_total'] + [
    name for name in optional_need_features if name in agg.columns
]
ns_df = transform_need_state(agg, need_features, scaler_need, pca_need, kmeans_need)

# Load promos snapshot and parse its validity dates
promos = pd.read_csv(CFG['paths']['data_promos'])
promo_date_cols = [CFG['column_map_promo'].get(key) for key in ('start_date', 'end_date')]
for col in promo_date_cols:
    if col and col in promos.columns:
        promos[col] = pd.to_datetime(promos[col], errors='coerce')

print('Prepared inference inputs.')


Prepared inference inputs.


  promos[col] = pd.to_datetime(promos[col], errors='coerce')
  promos[col] = pd.to_datetime(promos[col], errors='coerce')


In [None]:
# Debug: Check feature alignment
def report_feature_alignment(rank_df, expected_cols):
    """Print how the columns of ``rank_df`` line up with ``expected_cols``.

    Args:
        rank_df: candidate-level feature frame (``rank_infer``).
        expected_cols: feature names the ranker expects, in order.

    Returns:
        ``(missing, extra)``: the expected columns absent from ``rank_df``, and
        the ``rank_df`` columns that are neither expected nor one of the
        identifier/output columns (transaction_id, promo_id, promo_type, score).
    """
    id_cols = {'transaction_id', 'promo_id', 'promo_type', 'score'}
    actual_cols = rank_df.columns.tolist()
    missing = set(expected_cols) - set(actual_cols)
    extra = set(actual_cols) - set(expected_cols) - id_cols

    print(f"Expected ranker features ({len(expected_cols)}): {expected_cols[:5]}... {expected_cols[-5:]}")
    print(f"\nActual rank_infer columns ({len(actual_cols)}): {actual_cols[:5]}... {actual_cols[-5:]}")
    print(f"\nMissing features: {missing}")
    print(f"Extra features: {extra}")
    # 'score' only exists once the ranker has been applied.
    if 'score' in rank_df.columns:
        print(f"\nSample ranking scores: {rank_df['score'].describe()}")
    return missing, extra


# `rank_infer` is created by the ranking cell further down, so on a fresh
# top-to-bottom run it does not exist yet when this cell executes. Guard the call
# instead of raising a NameError; re-run this cell after ranking to see the report.
if 'rank_infer' in globals():
    report_feature_alignment(rank_infer, ranker_featcols)
else:
    print('rank_infer is not available yet; run the ranking cell, then re-run this cell.')


In [None]:
# Stage-A scoring (ptype)
# Build X_ptype in the same column order as training.
# Basket-level columns come from `agg`, need-state columns from `ns_df`.
X_base_cols = [col for col in ptype_featcols if not col.startswith('ns_pca_') and col != 'ns_cluster']
ns_cols = [col for col in ptype_featcols if col in ns_df.columns]
X_ptype_infer = (
    pd.concat([agg.reindex(columns=X_base_cols), ns_df[ns_cols]], axis=1)
    # The concat yields "base columns, then need-state columns", which only matches
    # ptype_featcols when training happened to use that layout. It also drops any
    # need-state column that ns_df lacks. Reindexing to ptype_featcols guarantees the
    # training order and width; columns missing from the inputs become 0 below.
    .reindex(columns=ptype_featcols)
    .fillna(0)
)
assert list(X_ptype_infer.columns) == list(ptype_featcols), 'Stage-A feature order differs from training'

# The model is fed a bare array, so the column order above is the only alignment it gets.
ptype_proba = ptype_model.predict_proba(X_ptype_infer.values)

# Keep mapping for fast access per row: one {promo_type: probability} dict per basket.
# Assumes predict_proba's column order matches ptype_classes — TODO confirm against training.
ptype_probs_by_type = [
    {ptype_classes[j]: float(proba_row[j]) for j in range(len(ptype_classes))}
    for proba_row in ptype_proba
]

agg['ptype_probs'] = list(ptype_proba)
agg['ptype_probs_by_type'] = ptype_probs_by_type
print('Stage-A scoring done.')


Stage-A scoring done.


In [None]:
# Candidate recall and ranking features build
tx_cols = CFG['column_map_tx']
ptype_col = CFG['column_map_promo']['promo_type']
pid_col = CFG['column_map_promo']['promo_id']
sd = CFG['column_map_promo'].get('start_date')
ed = CFG['column_map_promo'].get('end_date')

# Basket-level context columns requested by the ranker config. This list is the same
# for every basket and candidate, so it is resolved once, outside the loops.
ranker_cfg = CFG['features']['ranker']
context_cols = [
    col
    for col in ranker_cfg['time'] + ranker_cfg['store'] + ranker_cfg['channel'] + ranker_cfg['basket']
    if col in agg.columns
]

rank_rows = []
for idx, row in agg.iterrows():
    cand = recall_candidates(row, promos, np.array(row['ptype_probs']), ptype_classes, CFG)
    if cand.empty:
        continue  # nothing survived recall for this basket

    ts = row[tx_cols['timestamp']]
    # The channel label a promo must carry to match this basket's channel.
    expected_channel = 'online' if bool(row[tx_cols['is_online']]) else 'offline'

    # Features that depend only on the basket are computed once per basket.
    basket_feats = {col: row[col] for col in context_cols}
    basket_feats.update({ccol: ns_df.loc[idx, ccol] for ccol in ns_df.columns})

    for _, p in cand.iterrows():
        in_window = (
            sd in p.index and ed in p.index
            and pd.notna(p[sd]) and pd.notna(p[ed])
            and (p[sd] <= ts) and (ts <= p[ed])
        )
        feat = {
            'transaction_id': row[tx_cols['transaction_id']],
            'promo_id': p[pid_col],
            'promo_type': p[ptype_col],
            'ptype_prob': float(row['ptype_probs_by_type'].get(p[ptype_col], 0.0)),
            'scope_match_score': float(p.get('scope_match_score', 0.0)),
            'min_spend_gap': float(p.get('min_spend_gap', 0.0)),
            'channel_match': int(str(p.get('channel', '')).lower() == expected_channel),
            'within_window': int(in_window),
        }
        feat.update(basket_feats)
        # One-hot promo type; the other promo_type__* columns are filled with 0 below.
        feat[f"promo_type__{p[ptype_col]}"] = 1
        rank_rows.append(feat)

rank_infer = pd.DataFrame(rank_rows).fillna(0)

if rank_infer.empty:
    # No basket recalled any promotion. Build an empty, well-formed frame so the
    # cells below still run (np.stack on an empty list would raise here).
    rank_infer = pd.DataFrame(columns=[
        'transaction_id', 'promo_id', 'promo_type', 'ptype_prob', 'scope_match_score',
        'min_spend_gap', 'channel_match', 'within_window', 'score',
    ])
    X_rank_infer = np.empty((0, len(ranker_featcols)))
    scores = np.empty(0)
    print('No candidates were recalled for any basket; nothing to score.')
else:
    # Align column order to ranker_featcols; features not built above are 0.
    X_rank_infer = rank_infer.reindex(columns=ranker_featcols, fill_value=0).to_numpy()
    scores = ranker_model.predict_proba(X_rank_infer)[:,1] if hasattr(ranker_model, 'predict_proba') else ranker_model.predict(X_rank_infer)
    rank_infer['score'] = scores
    print('Built ranking features and scored candidates.')


Built ranking features and scored candidates.


In [None]:
# Format output Top-N per basket with reason codes
TOPN = int(CFG.get('business',{}).get('topN', 3))


def _reason_codes(cand):
    """Boolean flags for one scored candidate, derived from its ranking features."""
    return {
        'meets_min_spend': bool(cand['min_spend_gap'] <= 0),
        'channel_match': bool(cand['channel_match']),
        'within_window': bool(cand['within_window']),
        'category_overlap': bool(cand['scope_match_score'] > 0)
    }


# For every basket keep the TOPN highest-scoring candidates, best first.
rows = [
    {
        'transaction_id': tid,
        'promo_id': r['promo_id'],
        'promo_type': r['promo_type'],
        'score': float(r['score']),
        'ptype_prob': float(r['ptype_prob']),
        'scope_match_score': float(r['scope_match_score']),
        'reason_codes': _reason_codes(r)
    }
    for tid, grp in rank_infer.groupby('transaction_id')
    for r in grp.sort_values('score', ascending=False).head(TOPN).to_dict('records')
]

recs = pd.DataFrame(rows)
print('Recommendations built. Preview:')
recs.head(10)


Recommendations built. Preview:


Unnamed: 0,transaction_id,promo_id,promo_type,score,ptype_prob,scope_match_score,reason_codes
0,TX0000309,PR0027,Flash Sale,5.758817e-09,0.090188,0.0,"{'meets_min_spend': True, 'channel_match': Fal..."
1,TX0000309,PR0056,Flash Sale,5.758817e-09,0.090188,0.0,"{'meets_min_spend': True, 'channel_match': Fal..."
2,TX0000309,PR0070,Flash Sale,5.758817e-09,0.090188,0.0,"{'meets_min_spend': True, 'channel_match': Fal..."
3,TX0000384,PR0001,Product_Coupon,6.321599e-09,0.123566,0.0,"{'meets_min_spend': True, 'channel_match': Fal..."
4,TX0000384,PR0003,Product_Coupon,6.321599e-09,0.123566,0.0,"{'meets_min_spend': True, 'channel_match': Fal..."
5,TX0000384,PR0014,Product_Coupon,6.321599e-09,0.123566,0.0,"{'meets_min_spend': True, 'channel_match': Fal..."
6,TX0000459,PR0066,Buy 1 get 1,9.369385e-09,0.15788,0.0,"{'meets_min_spend': True, 'channel_match': Fal..."
7,TX0000459,PR0073,Buy 1 get 1,9.369385e-09,0.15788,0.0,"{'meets_min_spend': True, 'channel_match': Fal..."
8,TX0000459,PR0095,Buy 1 get 1,9.369385e-09,0.15788,0.0,"{'meets_min_spend': True, 'channel_match': Fal..."
9,TX0000477,PR0003,Product_Coupon,1.046999e-08,0.105884,0.0,"{'meets_min_spend': True, 'channel_match': Fal..."
