<a href="https://colab.research.google.com/github/Boonyaratt/LGBM_recommendation/blob/master/Baseline_models/Baseline_features_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
from datetime import datetime, timedelta
from dateutil import easter
from typing import Dict, Text
import os, kagglehub
import lightgbm as lgb
import heapq
from functools import lru_cache
from collections import defaultdict
from xgboost import XGBRanker
import pathlib
import zipfile
from sklearn.metrics import ndcg_score

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation/runtime warnings
# raised by the libraries used below — consider narrowing it.
warnings.filterwarnings('ignore')
# Seed NumPy's legacy global RNG so np.random.* calls are repeatable.
np.random.seed(42)

In [None]:
# Colab-only step: open the browser upload dialog and keep whatever
# files.upload() returns in `uploaded`.
from google.colab import files
uploaded = files.upload()

# Expected upload: features_all3.zip

Saving features_all3.zip to features_all3.zip


In [None]:
# Create the extraction directory (relative to the current working directory).
# exist_ok=True keeps the cell re-runnable: the shell `mkdir` used before
# failed with "File exists" whenever the directory was already there.
os.makedirs('dataset', exist_ok=True)

mkdir: cannot create directory ‘dataset’: File exists


In [None]:
# Locations of the uploaded archive and of the directory it is unpacked into.
zip_file_path = '/content/features_all3.zip' # Adjust if the archive was uploaded elsewhere
extract_path = '/content/dataset' # Extraction directory inside the Colab session

# Open the archive read-only and unpack every member into extract_path;
# the `with` block closes the archive afterwards.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)


In [None]:
# Load the extracted feature table; the bare `df.head()` renders the first
# rows as the cell output.
# NOTE(review): the preview shows an `Unnamed: 0` column, which looks like a
# saved index written into the CSV — confirm whether it should be dropped.
df = pd.read_csv('/content/dataset/features_all3.csv')
df.head()

Unnamed: 0,household_key,BASKET_ID,PROMO_KEY,day_t,qid,label,feat_min_recency,feat_mean_recency,feat_seen_products,overlap_decay_h3,...,feat_overlap_ratio,feat_jaccard,AGE_19_24,AGE_25_34,AGE_35_44,AGE_45_54,AGE_55_64,AGE_65P,AGE_UNK,age_mid
0,1,31172831466,54850010009|29,282,1_31172831466,0,8,34.0,4,0.182293,...,0.013889,0.011173,0,0,0,0,0,1,0,70
1,1,31172831466,57800000033|29,282,1_31172831466,0,9999,9999.0,0,0.0,...,0.0,0.0,0,0,0,0,0,1,0,70
2,1,31172831466,54300021057|29,282,1_31172831466,0,9999,9999.0,0,0.0,...,0.0,0.0,0,0,0,0,0,1,0,70
3,1,31172831466,51380041013|29,282,1_31172831466,0,9999,9999.0,0,0.0,...,0.0,0.0,0,0,0,0,0,1,0,70
4,1,31172831466,54300016033|29,282,1_31172831466,0,9999,9999.0,0,0.0,...,0.0,0.0,0,0,0,0,0,1,0,70


## Clean

In [None]:
def make_time_based_split(base, test_ratio=0.2, purge_days=7):
    """Split rows into train/validation by `day_t`, leaving a purge gap.

    The validation window starts at the distinct day found at position
    int(n_days * (1 - test_ratio)) of the sorted unique `day_t` values.
    Training rows are those whose day is at most `purge_days` before that
    start, so days strictly inside the gap belong to neither split.

    Parameters
    ----------
    base : DataFrame with `day_t` and `qid` columns.
    test_ratio : fraction of distinct days assigned to validation.
    purge_days : size of the gap subtracted from the validation start day.

    Returns
    -------
    (train_mask, valid_mask, grp_tr, grp_va): two boolean Series aligned with
    `base`, plus per-`qid` row counts of each split in order of first
    appearance (`sort=False`).

    Also prints the day range of both splits and the purge size.
    """
    day_col = base['day_t']
    unique_days = np.sort(day_col.unique())
    start_pos = int(len(unique_days) * (1 - test_ratio))
    valid_start = unique_days[start_pos]
    train_end = valid_start - purge_days

    train_mask = day_col <= train_end
    valid_mask = day_col >= valid_start

    def _rows_per_query(mask):
        # Group sizes keep the order in which each qid first appears.
        return base.loc[mask].groupby('qid', sort=False).size().tolist()

    grp_tr = _rows_per_query(train_mask)
    grp_va = _rows_per_query(valid_mask)

    train_days = base.loc[train_mask, 'day_t']
    valid_days = base.loc[valid_mask, 'day_t']
    print(f"Train {train_days.min()}–{train_days.max()} | "
          f"Valid {valid_days.min()}–{valid_days.max()} | "
          f"Purge={purge_days}d")
    return train_mask, valid_mask, grp_tr, grp_va

# Build the train/validation masks and per-query group sizes used by the cells below.
train_mask, valid_mask, grp_tr, grp_va = make_time_based_split(df, test_ratio=0.2, purge_days=7)

Train 224–607 | Valid 614–711 | Purge=7d


In [None]:
def positive_valid_mask(df, valid_mask, qid_col='qid', label_col='label'):
    """Narrow `valid_mask` to queries that have at least one positive label.

    Label sums are computed per `qid_col` over the rows selected by
    `valid_mask`; a row is kept only if it is in `valid_mask` and its query's
    sum is greater than zero.

    Returns a boolean Series aligned with `df`.
    """
    label_totals = df.loc[valid_mask].groupby(qid_col)[label_col].sum()
    positive_qids = label_totals.index[label_totals > 0]
    in_positive_query = df[qid_col].isin(positive_qids)
    return valid_mask & in_positive_query

# Validation rows restricted to queries that contain at least one positive label.
valid_pos_mask = positive_valid_mask(df, valid_mask)
# Row counts per query for that subset, in order of first appearance of each qid.
grp_va_pos = df.loc[valid_pos_mask].groupby('qid', sort=False).size().tolist()
# Compare how many validation queries have a positive against all validation queries.
print("valid_pos queries:", len(grp_va_pos), " / all-valid queries:", len(grp_va))

valid_pos queries: 187  / all-valid queries: 9026


In [None]:
# 1) Age-segmented promo rate (Laplace smoothing), estimated from training rows only.
age_buckets = ['AGE_19_24','AGE_25_34','AGE_35_44','AGE_45_54','AGE_55_64','AGE_65P','AGE_UNK']
def to_age_bucket(row):
    """Return the first column of `age_buckets` whose value in `row` is 1, else 'AGE_UNK'.

    Row-wise reference implementation; the column below is built with an
    equivalent vectorised expression because `apply(axis=1)` is slow on
    large frames.
    """
    for c in age_buckets:
        if c in row and row[c] == 1:
            return c
    return 'AGE_UNK'

# Vectorised equivalent of df[age_buckets].apply(to_age_bucket, axis=1):
# idxmax returns the first flagged column per row, and rows without any flag
# (where idxmax would otherwise report the first column) become 'AGE_UNK'.
age_flags = df[age_buckets].eq(1)
df['age_bucket'] = age_flags.idxmax(axis=1).where(age_flags.any(axis=1), 'AGE_UNK')

# Smoothed positive rate per (promo, age bucket): (positives + 1) / (rows + 2).
train_df = df.loc[train_mask]
g = train_df.groupby(['PROMO_KEY','age_bucket'])
agepromo_pos_rate = ((g['label'].sum() + 1) / (g['label'].count() + 2)).rename('agepromo_pos_rate').reset_index()

# Drop a column left behind by a previous run first: merging onto an existing
# 'agepromo_pos_rate' would create _x/_y suffixed columns and break the next line.
df = (
    df.drop(columns=['agepromo_pos_rate'], errors='ignore')
      .merge(agepromo_pos_rate, on=['PROMO_KEY','age_bucket'], how='left')
)
# (promo, age bucket) pairs unseen in training fall back to feat_promo_pos_rate, then 0.
# NOTE(review): feat_promo_pos_rate comes from the input file; whether it was
# computed from training rows only is not visible here — confirm it does not leak.
df['agepromo_pos_rate'] = df['agepromo_pos_rate'].fillna(df['feat_promo_pos_rate']).fillna(0.0)

In [None]:
def make_scored_frame_from_series(df, mask, score_series, rng=None):
    """Build a (qid, label, score) frame for the rows selected by `mask`.

    Scores are taken from `score_series` at the masked rows, coerced to
    numeric (unparseable or missing values become 0.0), and perturbed with
    Gaussian noise of standard deviation 1e-6 so that exact ties are broken.

    Parameters
    ----------
    df : DataFrame with `qid` and `label` columns.
    mask : boolean selector aligned with `df` and `score_series`.
    score_series : Series of raw scores.
    rng : optional np.random.RandomState; a fresh RandomState(42) is used
        when omitted.
    """
    noise_source = np.random.RandomState(42) if rng is None else rng
    clean_scores = pd.to_numeric(score_series.loc[mask], errors='coerce').fillna(0.0)
    tie_breaker = noise_source.normal(0, 1e-6, size=len(clean_scores))

    scored = df.loc[mask, ['qid','label']].copy()
    scored['score'] = clean_scores.values + tie_breaker
    return scored

def hitrate_at_k(dfv, k=10):
    """Mean over queries of the maximum label among each query's top-k rows.

    Rows are ranked per `qid` by descending `score`. `dfv` needs `qid`,
    `score` and `label` columns.
    """
    ranked = dfv.sort_values(['qid','score'], ascending=[True, False])
    top_rows = ranked.groupby('qid').head(k)
    best_label_per_query = top_rows.groupby('qid')['label'].max()
    return best_label_per_query.mean()

def recall_precision_at_k(dfv, k=10):
    """Return (recall@k, precision@k) averaged over queries.

    Recall per query is hits-in-top-k divided by the query's total label sum;
    queries whose total is 0 produce NaN and are skipped by the mean.
    Precision per query is the mean label of the rows actually present in its
    top-k (so queries with fewer than k rows divide by their own row count),
    averaged over all queries.
    """
    ranked = dfv.sort_values(['qid','score'], ascending=[True, False])
    top_labels = ranked.groupby('qid').head(k).groupby('qid')['label']

    positives_per_query = dfv.groupby('qid')['label'].sum().replace(0, np.nan)
    recall_per_query = top_labels.sum() / positives_per_query
    precision_per_query = top_labels.mean()

    return float(recall_per_query.mean()), float(precision_per_query.mean())

def mrr_at_k(dfv, k=10):
    """Mean reciprocal rank of the first label equal to 1 within each query's top-k.

    A query with no label equal to 1 in its top-k contributes 0.0.
    """
    def _reciprocal_rank(group):
        top_labels = group.sort_values('score', ascending=False)['label'].to_numpy()[:k]
        hit_positions = np.flatnonzero(top_labels == 1)
        if hit_positions.size == 0:
            return 0.0
        # Positions are 0-based; ranks are 1-based.
        return 1.0 / (int(hit_positions[0]) + 1)

    per_query = dfv.groupby('qid', group_keys=False).apply(_reciprocal_rank)
    return float(per_query.mean())

def map_at_k(dfv, k=10):
    """Mean average precision at k over queries.

    For each `qid`, rows are ranked by descending `score` and

        AP@k = sum_{i<=k} P@i * rel_i / min(total_relevant, k)

    where `total_relevant` is the label sum over the *whole* query. The
    previous version counted relevant rows only after truncating to the
    top-k, which made the `min(npos, k)` a no-op and normalised by the number
    of hits, overstating AP whenever relevant rows were ranked below k.

    Queries without any relevant row (or with an empty top-k) contribute 0.0.
    `dfv` needs `qid`, `score` and `label` columns; labels are expected to be
    0/1.
    """
    def _ap(g):
        ranked = g.sort_values('score', ascending=False)['label'].to_numpy()
        total_relevant = ranked.sum()
        top = ranked[:k]
        if total_relevant == 0 or len(top) == 0:
            return 0.0
        hits_so_far = top.cumsum()
        # Precision at each rank, kept only at ranks holding a relevant row.
        precisions = (hits_so_far / np.arange(1, len(top) + 1)) * top
        return float(precisions.sum() / min(total_relevant, k))
    return float(dfv.groupby('qid', group_keys=False).apply(_ap).mean())

def ndcg_at_k_from_valid_frame(dfv, k=10):
    """Mean NDCG@k over the queries that have at least one positive label.

    Queries whose labels sum to 0 are skipped (they do not enter the mean).
    A query with a single candidate row and a positive label is scored 1.0
    directly: its only possible ranking is the ideal one, and scikit-learn's
    `ndcg_score` rejects single-document inputs in recent releases.

    Returns 0.0 when no query qualifies. `dfv` needs `qid`, `label` and
    `score` columns.
    """
    ndcgs = []
    for _, g in dfv.groupby('qid'):
        y_true = g['label'].to_numpy()
        if y_true.sum() == 0:
            continue
        if len(y_true) == 1:
            # Trivial ranking; avoid calling ndcg_score on a one-document query.
            ndcgs.append(1.0)
            continue
        y_pred = g['score'].to_numpy()
        ndcgs.append(ndcg_score(y_true.reshape(1, -1), y_pred.reshape(1, -1), k=k))
    return float(np.mean(ndcgs)) if ndcgs else 0.0

def report_at_ks(vf, name, ks=(1,3,5,10)):
    """Print one line of ranking metrics per cutoff in `ks` for the scored frame `vf`.

    Each line shows precision, recall, hit rate, MRR, MAP and NDCG at that
    cutoff, each formatted to four decimals, under a header carrying `name`.
    Returns nothing.
    """
    print(f"\n==== {name} ====")
    for cutoff in ks:
        recall, precision = recall_precision_at_k(vf, cutoff)
        hit_rate = hitrate_at_k(vf, cutoff)
        reciprocal_rank = mrr_at_k(vf, cutoff)
        avg_precision = map_at_k(vf, cutoff)
        ndcg = ndcg_at_k_from_valid_frame(vf, k=cutoff)
        print(f"K={cutoff}  P={precision:.4f}  R={recall:.4f}  HR={hit_rate:.4f}  MRR={reciprocal_rank:.4f}  MAP={avg_precision:.4f}  NDCG={ndcg:.4f}")



In [None]:
# Build promo_pos_rate_train from training rows only, using map instead of
# merge so that no _x/_y suffixed columns can appear.
train_df = df.loc[train_mask].copy()

# Per-promo positive count and row count on the training rows.
promo_only = (
    train_df.groupby('PROMO_KEY')['label']
    .agg(sum='sum', count='count')
)
# Laplace-smoothed rate: (positives + 1) / (rows + 2).
promo_only_rate = (promo_only['sum'] + 1) / (promo_only['count'] + 2)

# Write the column straight onto df (safe to re-run); promos absent from
# training map to NaN.
df['promo_pos_rate_train'] = df['PROMO_KEY'].map(promo_only_rate)

# Global fallback computed from training rows.
global_pos_rate_train = float((train_df['label'].sum() + 1) / (train_df['label'].count() + 2))

# Fallback chain: agepromo → promo-only (train) → global (train) → 0
# NOTE(review): where agepromo_pos_rate is first created it is already filled
# with feat_promo_pos_rate and then 0.0, so it has no NaN left when this runs
# and the chain below currently changes nothing except the dtype cast.
# Decide which of the two fallbacks is intended.
df['agepromo_pos_rate'] = (
    df['agepromo_pos_rate']
    .fillna(df['promo_pos_rate_train'])
    .fillna(global_pos_rate_train)
    .fillna(0.0)
).astype(float)

In [None]:
def fit_norm(s):
    """Return (mean, std) of `s` for later standardisation.

    The std is replaced by 1.0 unless it is strictly greater than 1e-8, which
    also covers a NaN std, so dividing by it is always safe.
    """
    center = s.mean()
    spread = s.std()
    if not spread > 1e-8:
        spread = 1.0
    return (center, spread)

def apply_norm(s, stats):
    """Standardise `s` with a (mean, std) pair such as the one returned by fit_norm."""
    center, spread = stats
    shifted = s - center
    return shifted / spread

# Build the raw components of the heuristic score.
# Recency: exponential decay with a 30-unit scale; negative values are clipped
# to 0 and missing values become 9999, which decays to ~0.
rec_exp = np.exp(-df['feat_min_recency'].clip(lower=0).fillna(9999.0)/30.0)
# Affinity: overlap count plus half-weighted department and brand affinity.
affinity = (
    df['feat_overlap_cnt'].fillna(0.0) +
    0.5*df['feat_user_dept_aff'].fillna(0.0) +
    0.5*df['feat_user_brand_aff'].fillna(0.0)
)
# Popularity: the promo positive rate shipped in the feature file.
popularity = df['feat_promo_pos_rate'].fillna(0.0)
# Demographic popularity: the age-segmented promo rate built earlier.
demographic_pop = df['agepromo_pos_rate'].fillna(0.0)
# Context: weekend flag plus a small day-of-week term.
context_raw = df['feat_is_weekend'].fillna(0.0) + 0.1*df['feat_dayofweek'].fillna(0.0)

# Fit normalisation statistics on training rows only.
stats = {
    'rec_exp': fit_norm(rec_exp[train_mask]),
    'affinity': fit_norm(affinity[train_mask]),
    'popularity': fit_norm(popularity[train_mask]),
    'demographic_pop': fit_norm(demographic_pop[train_mask]),
    'context': fit_norm(context_raw[train_mask]),
}

# Apply the training statistics to every row of df.
rec_exp_n = apply_norm(rec_exp, stats['rec_exp'])
affinity_n = apply_norm(affinity, stats['affinity'])
popularity_n = apply_norm(popularity, stats['popularity'])
demographic_pop_n = apply_norm(demographic_pop, stats['demographic_pop'])
context_n = apply_norm(context_raw, stats['context'])

In [None]:
# Hand-set component weights for the heuristic baseline.
w = {'demo':0.35, 'aff':0.25, 'rec':0.20, 'pop':0.15, 'ctx':0.05}  # weights sum to 1
# Weighted sum of the train-normalised components, one score per row of df.
heuristic_score = (
    w['demo']*demographic_pop_n +
    w['aff']*affinity_n +
    w['rec']*rec_exp_n +
    w['pop']*popularity_n +
    w['ctx']*context_n
)

In [None]:
# Disabled loop over several baselines, kept for reference.
# NOTE(review): it refers to `baselines` and `rng`, neither of which is defined
# in the cells shown here — define them before re-enabling, or delete the block.
# for name, score in baselines.items():
#     print(f"\n--- {name} ---")
#     vf_pos = make_scored_frame_from_series(df, valid_pos_mask, score, rng)
#     vf_all = make_scored_frame_from_series(df, valid_mask, score, rng)
#     report_at_ks(vf_pos, f'{name} | pos-only')
#     report_at_ks(vf_all, f'{name} | all-valid')

# # Report prevalence to help interpret the all-valid numbers
# pos_any = df.loc[valid_mask].groupby('qid')['label'].sum() > 0
# print("\nprevalence (all-valid):", float(pos_any.mean()))

# Score the heuristic on (a) validation queries with at least one positive and
# (b) all validation queries; each frame gets its own RandomState(42) for the
# tie-breaking jitter.
# Reading the all-valid report: recall and NDCG skip queries without positives,
# while P/HR/MRR/MAP average over every query, so the columns use different
# denominators.
vf_h_pos = make_scored_frame_from_series(df, valid_pos_mask, heuristic_score, rng=np.random.RandomState(42))
vf_h_all = make_scored_frame_from_series(df, valid_mask, heuristic_score, rng=np.random.RandomState(42))
report_at_ks(vf_h_pos, 'Heuristic+ | pos-only')
report_at_ks(vf_h_all, 'Heuristic+ | all-valid')


==== Heuristic+ | pos-only ====
K=1  P=0.1979  R=0.1234  HR=0.1979  MRR=0.1979  MAP=0.1979  NDCG=0.1979
K=3  P=0.1765  R=0.3182  HR=0.4492  MRR=0.3066  MAP=0.3079  NDCG=0.2728
K=5  P=0.1401  R=0.3970  HR=0.5241  MRR=0.3240  MAP=0.3224  NDCG=0.3040
K=10  P=0.1193  R=0.6424  HR=0.7487  MRR=0.3539  MAP=0.3382  NDCG=0.3926

==== Heuristic+ | all-valid ====
K=1  P=0.0042  R=0.1288  HR=0.0042  MRR=0.0042  MAP=0.0042  NDCG=0.2032
K=3  P=0.0037  R=0.3182  HR=0.0093  MRR=0.0064  MAP=0.0064  NDCG=0.2748
K=5  P=0.0029  R=0.3970  HR=0.0109  MRR=0.0068  MAP=0.0067  NDCG=0.3060
K=10  P=0.0025  R=0.6371  HR=0.0154  MRR=0.0074  MAP=0.0070  NDCG=0.3929
