<a href="https://colab.research.google.com/github/Boonyaratt/LGBM_recommendation/blob/master/Notebooks/Model3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
from datetime import datetime, timedelta
from dateutil import easter
from typing import Dict, Text
import os, kagglehub
import lightgbm as lgb
import heapq
from functools import lru_cache
from collections import defaultdict
from xgboost import XGBRanker
import pathlib
import zipfile
import itertools
import random
from sklearn.metrics import ndcg_score
random.seed(42); np.random.seed(42)
warnings.filterwarnings('ignore')
np.random.seed(42)

In [2]:
from google.colab import files
uploaded = files.upload()

# upload f

Saving features_forward_best_all3.zip to features_forward_best_all3.zip


In [4]:
!mkdir dataset

mkdir: cannot create directory ‘dataset’: File exists


In [8]:
# Define paths
zip_file_path = '/content/features_forward_best_all3.zip' # Adjust path to your zip file
extract_path = '/content/dataset' # Desired extraction directory in Colab

# Create a ZipFile object and extract
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)


In [9]:
features_all = pd.read_csv('/content/dataset/features_forward_best_all3.csv')
features_all.head()

Unnamed: 0,household_key,BASKET_ID,PROMO_KEY,day_t,qid,label,feat_hist_size,promo_offer_ct,feat_mean_recency,feat_min_recency,BRAND_Private
0,1,31172831466,54850010009|29,282,1_31172831466,0,288,12,34.0,8,0
1,1,31172831466,57800000033|29,282,1_31172831466,0,288,11,9999.0,9999,0
2,1,31172831466,54300021057|29,282,1_31172831466,0,288,11,9999.0,9999,0
3,1,31172831466,51380041013|29,282,1_31172831466,0,288,13,9999.0,9999,0
4,1,31172831466,54300016033|29,282,1_31172831466,0,288,13,9999.0,9999,0


##Clean

In [10]:
df = features_all.copy()

In [11]:
id_cols = [c for c in ['household_key','BASKET_ID','PROMO_KEY','day_t','qid','label'] if c in df.columns]
use_cols = [c for c in df.columns if c not in id_cols]

In [12]:
def cast_bool_uint8(X):
    X = X.copy()
    for c in X.columns:
        if X[c].dtype == bool:
            X[c] = X[c].astype(np.uint8)
    return X

In [13]:
def make_time_series_cv_splits(base, n_splits=5, purge_days=7):
    days = np.sort(base['day_t'].unique())
    cut_idx = np.linspace(0, len(days), n_splits+1, dtype=int)
    folds = []
    for i in range(n_splits):
        va_start = days[cut_idx[i]]
        va_end   = days[cut_idx[i+1]-1] if cut_idx[i+1]-1 < len(days) else days[-1]
        tr_end   = va_start - purge_days
        tr_m = base['day_t'] <= tr_end
        va_m = (base['day_t'] >= va_start) & (base['day_t'] <= va_end)
        if tr_m.sum() == 0 or va_m.sum() == 0:
            continue
        grp_tr = base.loc[tr_m].groupby('qid', sort=False).size().tolist()
        grp_va = base.loc[va_m].groupby('qid', sort=False).size().tolist()
        folds.append((tr_m, va_m, grp_tr, grp_va))
    return folds

def make_time_based_split(base, test_ratio=0.2, purge_days=7):
    days = np.sort(base['day_t'].unique())
    cutoff = days[int(len(days)*(1 - test_ratio))]
    tr_end = cutoff - purge_days
    train_mask = base['day_t'] <= tr_end
    valid_mask = base['day_t'] >= cutoff
    grp_tr = base.loc[train_mask].groupby('qid', sort=False).size().tolist()
    grp_va = base.loc[valid_mask].groupby('qid', sort=False).size().tolist()
    return train_mask, valid_mask, grp_tr, grp_va

def positive_valid_mask(df, base_mask, qid_col='qid', label_col='label'):
    pos_per_q = df.loc[base_mask].groupby(qid_col)[label_col].sum()
    return base_mask & df[qid_col].isin(pos_per_q.index[pos_per_q > 0])

def make_scored_frame(model, Xv, base, valid_mask):
    vf = base.loc[valid_mask, ['qid','label']].copy()
    vf['score'] = model.predict(Xv)
    return vf

def recall_precision_at_k(vf, k=10):
    got = vf.sort_values(['qid','score'], ascending=[True, False]).groupby('qid').head(k)
    pos_per_q = vf.groupby('qid')['label'].sum()
    hit_per_q = got.groupby('qid')['label'].sum()
    recall = (hit_per_q / pos_per_q.replace(0, np.nan)).mean()
    precision = got.groupby('qid')['label'].mean().mean()
    return float(recall), float(precision)

def mrr_at_k(df, k=10):
    def _mrr(g):
        labels = g.sort_values('score', ascending=False)['label'].to_numpy()[:k]
        for i, lab in enumerate(labels, 1):
            if lab == 1: return 1.0/i
        return 0.0
    return float(df.groupby('qid', group_keys=False).apply(_mrr).mean())

def map_at_k(df, k=10):
    def _ap(g):
        labels = g.sort_values('score', ascending=False)['label'].to_numpy()[:k]
        npos = labels.sum()
        if npos == 0: return 0.0
        c = labels.cumsum()
        precisions = (c / np.arange(1, len(labels)+1)) * labels
        return float(precisions.sum() / min(npos, k))
    return float(df.groupby('qid', group_keys=False).apply(_ap).mean())
from sklearn.metrics import ndcg_score

def ndcg_at_k_from_valid_frame(vf, k=10):
    ndcgs = []
    for _, g in vf.groupby('qid', sort=False):
        y_true = g['label'].to_numpy()
        if y_true.sum() == 0:
            continue
        y_pred = g['score'].to_numpy()
        ndcgs.append(ndcg_score(y_true.reshape(1,-1), y_pred.reshape(1,-1), k=k))
    return float(np.mean(ndcgs)) if ndcgs else 0.0

def hitrate_at_k(df, k=10):
    top = df.sort_values(['qid','score'], ascending=[True, False]).groupby('qid').head(k)
    return (top.groupby('qid')['label'].max()).mean()

def report_at_ks(vf, name='set', ks=(1,3,5,10)):
    print(f"\n==== {name} ====")
    for K in ks:
        r, p = recall_precision_at_k(vf, K)
        hr = hitrate_at_k(vf, K)
        mrr = mrr_at_k(vf, K)
        mAP = map_at_k(vf, K)
        nd = ndcg_at_k_from_valid_frame(vf, k=K)
        print(f"K={K}  P={p:.4f}  R={r:.4f}  HR={hr:.4f}  MRR={mrr:.4f}  MAP={mAP:.4f}  NDCG={nd:.4f}")



In [14]:
def compute_te(train_df, group_cols, target_col='label', m=50, prior=None, out_name='te'):
    g = train_df.groupby(group_cols)[target_col].agg(mean='mean', count='count').reset_index()
    if prior is None:
        prior = float(train_df[target_col].mean())
    g[out_name] = (g['count']*g['mean'] + m*prior) / (g['count'] + m)
    return g[[*(group_cols if isinstance(group_cols, (list, tuple)) else [group_cols]), out_name]]

def add_fold_te(base_df, train_mask):
    # ลบ TE เดิมถ้ามี แล้วคำนวณใหม่จากฝั่ง train เท่านั้น (กัน leakage)
    df2 = base_df.copy()
    for c in ['promo_te','hh_te','hh_promo_te','age_te','age_promo_te']:
        if c in df2.columns:
            df2.drop(columns=c, inplace=True)

    need_promo = all(c in df2.columns for c in ['PROMO_KEY'])
    need_hh    = all(c in df2.columns for c in ['household_key'])
    tr = df2.loc[train_mask, [c for c in ['household_key','PROMO_KEY','AGE_BAND','label'] if c in df2.columns]].copy()
    if tr.shape[0] == 0:
        return df2

    global_mean = float(tr['label'].mean())
    if need_promo:
        df2 = df2.merge(compute_te(tr, 'PROMO_KEY', m=100, prior=global_mean, out_name='promo_te'), on='PROMO_KEY', how='left')
        df2['promo_te'] = df2['promo_te'].fillna(global_mean).astype(np.float32)
    if need_hh:
        df2 = df2.merge(compute_te(tr, 'household_key', m=200, prior=global_mean, out_name='hh_te'), on='household_key', how='left')
        df2['hh_te'] = df2['hh_te'].fillna(global_mean).astype(np.float32)
    if need_promo and need_hh:
        df2 = df2.merge(compute_te(tr, ['household_key','PROMO_KEY'], m=200, prior=global_mean, out_name='hh_promo_te'),
                        on=['household_key','PROMO_KEY'], how='left')
        df2['hh_promo_te'] = df2['hh_promo_te'].fillna(df2.get('hh_te', global_mean)).astype(np.float32)
    if 'AGE_BAND' in df2.columns:
        age_te = compute_te(tr, 'AGE_BAND', m=100, prior=global_mean, out_name='age_te')
        df2 = df2.merge(age_te, on='AGE_BAND', how='left')
        df2['age_te'] = df2['age_te'].fillna(global_mean).astype(np.float32)
        if need_promo:
            age_promo_te = compute_te(tr, ['AGE_BAND','PROMO_KEY'], m=200, prior=global_mean, out_name='age_promo_te')
            df2 = df2.merge(age_promo_te, on=['AGE_BAND','PROMO_KEY'], how='left')
            df2['age_promo_te'] = df2['age_promo_te'].fillna(df2['age_te']).astype(np.float32)
    return df2

def downsample_train_by_qid(df_fold, train_mask, R=5, hardness_cols=('hh_promo_te','promo_te','feat_promo_pos_rate'), rng_seed=42):
    rng = np.random.RandomState(rng_seed)
    tr = df_fold.loc[train_mask].copy()
    if hardness_cols:
        cols_present = [c for c in hardness_cols if c in tr.columns]
        if cols_present:
            z = []
            for c in cols_present:
                s = pd.to_numeric(tr[c], errors='coerce').fillna(0.0)
                z.append(s.rank(pct=True).astype(float))
            tr['_hard'] = np.vstack(z).mean(axis=0)
        else:
            tr['_hard'] = 0.0
    else:
        tr['_hard'] = 0.0

    keep_idx = []
    for qid, g in tr.groupby('qid', sort=False):
        pos = g.index[g['label']==1].to_numpy()
        neg = g.index[g['label']==0].to_numpy()
        if len(pos)==0:
            continue
        n_keep = min(len(neg), R*len(pos))
        if n_keep > 0 and g['_hard'].notnull().any():
            neg_keep = g.loc[neg].sort_values('_hard', ascending=False).index.to_numpy()[:n_keep]
        else:
            neg_keep = rng.choice(neg, size=n_keep, replace=False) if n_keep>0 else np.array([], int)
        keep_idx.extend(pos.tolist() + neg_keep.tolist())

    keep_mask = pd.Series(False, index=df_fold.index)
    if keep_idx: keep_mask.loc[keep_idx] = True
    g = df_fold.loc[keep_mask].groupby('qid')['label'].agg(n='size', pos='sum')
    valid_qids = g[(g['n']>=2) & (g['pos']>=1) & (g['pos']<g['n'])].index
    keep_mask &= df_fold['qid'].isin(valid_qids)
    return keep_mask

In [15]:
def blended_ndcg(vf):
    nd5 = ndcg_at_k_from_valid_frame(vf, 5)
    nd10 = ndcg_at_k_from_valid_frame(vf, 10)
    return 0.8*nd5 + 0.2*nd10

def eval_params_cv(base_df, folds, use_cols, params):
    scores = []
    for tr_m, va_m, grp_tr, grp_va in folds:
        df_f = add_fold_te(base_df, tr_m)
        keep_tr = downsample_train_by_qid(df_f, tr_m, R=params['R'])
        tr_final = tr_m & keep_tr

        X_tr = cast_bool_uint8(df_f.loc[tr_final, use_cols].select_dtypes(include=['number','bool']))
        y_tr = df_f.loc[tr_final, 'label'].astype(int).values
        grp_tr_f = df_f.loc[tr_final].groupby('qid', sort=False).size().tolist()

        va_pos_m = positive_valid_mask(df_f, va_m)
        X_va = cast_bool_uint8(df_f.loc[va_pos_m, use_cols].select_dtypes(include=['number','bool']))
        y_va = df_f.loc[va_pos_m, 'label'].astype(int).values
        grp_va_f = df_f.loc[va_pos_m].groupby('qid', sort=False).size().tolist()

        ranker = lgb.LGBMRanker(
            objective='lambdarank', metric='ndcg', boosting_type='gbdt',
            n_estimators=4000, learning_rate=params['lr'],
            num_leaves=params['leaves'], min_data_in_leaf=params['min_leaf'],
            feature_fraction=params['ff'], bagging_fraction=params['bf'], bagging_freq=1,
            lambda_l2=params['l2'], max_bin=params['max_bin'],
            lambdarank_truncation_level=params['trunc'], random_state=42, verbose=-1
        )
        ranker.fit(
            X_tr, y_tr, group=grp_tr_f,
            eval_set=[(X_va, y_va)], eval_group=[grp_va_f],
            eval_at=[params['trunc'], 10], callbacks=[lgb.early_stopping(300, first_metric_only=True)]
        )
        vf = make_scored_frame(ranker, X_va, df_f, va_pos_m)
        scores.append(0.8*ndcg_at_k_from_valid_frame(vf, 5) + 0.2*ndcg_at_k_from_valid_frame(vf, 10))
    return float(np.mean(scores)) if scores else 0.0

In [16]:
cv_folds = make_time_series_cv_splits(df, n_splits=5, purge_days=7)

In [17]:
grid = {
    'R':        [3, 5, 8],
    'trunc':    [3, 5, 8],
    'leaves':   [31, 63, 95],
    'min_leaf': [20, 50, 100],
    'lr':       [0.03, 0.05, 0.08],
    'ff':       [0.7, 0.85, 0.95],
    'bf':       [0.7, 0.85, 0.95],
    'l2':       [0.0, 1.0, 2.0, 5.0],
    'max_bin':  [255, 511],
}
cands = list(itertools.product(*(grid[k] for k in grid)))
random.shuffle(cands)
trials = min(48, len(cands))

best_score, best_pars = -1.0, None
for t in range(trials):
    pars = {k: v for k, v in zip(grid.keys(), cands[t])}
    s = eval_params_cv(df, cv_folds, use_cols, pars)
    tag = "NEWBEST" if s > best_score else ""
    if s > best_score:
        best_score, best_pars = s, pars
    print(f"[{t+1}/{trials}] CV blended={s:.4f} {tag} {pars}")
print("Best CV params:", best_pars, "CV blended:", best_score)

Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[419]	valid_0's ndcg@3: 0.250083	valid_0's ndcg@10: 0.360481
Evaluated only: ndcg@3
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[2]	valid_0's ndcg@3: 0.234743	valid_0's ndcg@10: 0.372582
Evaluated only: ndcg@3
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[6]	valid_0's ndcg@3: 0.233686	valid_0's ndcg@10: 0.360317
Evaluated only: ndcg@3
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[33]	valid_0's ndcg@3: 0.226345	valid_0's ndcg@10: 0.357323
Evaluated only: ndcg@3
[1/48] CV blended=0.2964 NEWBEST {'R': 5, 'trunc': 3, 'leaves': 63, 'min_leaf': 100, 'lr': 0.03, 'ff': 0.95, 'bf': 0.7, 'l2': 2.0, 'max_bin': 255}
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[31]	valid_0's ndcg@8: 0.38908	valid_

In [18]:
df_final = add_fold_te(df, train_mask=(df['day_t']>=df['day_t'].min()))  # train_mask=all rows
# - downsample ทั้งชุดเพื่อเน้น hard negatives (opt.)
keep_all = downsample_train_by_qid(df_final, df_final['day_t']>=df_final['day_t'].min(), R=5)
train_all_mask = keep_all

X_all = cast_bool_uint8(df_final.loc[train_all_mask, use_cols].select_dtypes(include=['number','bool']))
y_all = df_final.loc[train_all_mask, 'label'].astype(int).values
grp_all = df_final.loc[train_all_mask].groupby('qid', sort=False).size().tolist()

final_params = dict(best_pars) if best_pars is not None else {
    'lr':0.05,'leaves':63,'min_leaf':50,'ff':0.9,'bf':0.9,'l2':2.0,'max_bin':511
}
final_ranker = lgb.LGBMRanker(
    objective='lambdarank', metric='ndcg', boosting_type='gbdt',
    n_estimators=6000, learning_rate=final_params['lr'],
    num_leaves=final_params['leaves'], min_data_in_leaf=final_params['min_leaf'],
    feature_fraction=final_params['ff'], bagging_fraction=final_params['bf'], bagging_freq=1,
    lambda_l2=final_params['l2'], max_bin=final_params['max_bin'],
    lambdarank_truncation_level=5, random_state=42, verbose=-1
)
# ไม่มี valid-set เดี่ยว → เทรนจนจบ (จะไม่มี early stopping)
final_ranker.fit(X_all, y_all, group=grp_all)

print("Final model trained on all data with params:", final_params)

Final model trained on all data with params: {'R': 8, 'trunc': 5, 'leaves': 31, 'min_leaf': 100, 'lr': 0.08, 'ff': 0.7, 'bf': 0.85, 'l2': 1.0, 'max_bin': 255}


In [19]:
# 1) สร้างฟังก์ชัน time-based split (ถ้ายังไม่มี)
def make_time_based_split(base, test_ratio=0.2, purge_days=7):
    days = np.sort(base['day_t'].unique())
    cutoff = days[int(len(days)*(1 - test_ratio))]
    tr_end = cutoff - purge_days
    train_mask = base['day_t'] <= tr_end
    valid_mask = base['day_t'] >= cutoff
    grp_tr = base.loc[train_mask].groupby('qid', sort=False).size().tolist()
    grp_va = base.loc[valid_mask].groupby('qid', sort=False).size().tolist()
    return train_mask, valid_mask, grp_tr, grp_va

# 2) สร้าง train/valid สำหรับวัดผล และคำนวณ TE เฉพาะฝั่ง train
train_mask, valid_mask, grp_tr, grp_va = make_time_based_split(df, test_ratio=0.2, purge_days=7)
df_hold = add_fold_te(df, train_mask)

# 3) downsample เฉพาะฝั่ง train (hard negatives)
keep_tr = downsample_train_by_qid(df_hold, train_mask, R=5)
tr_final = train_mask & keep_tr

# 4) เตรียมเมทริกซ์เทรน/วาลิเดต
X_tr_h = cast_bool_uint8(df_hold.loc[tr_final, use_cols].select_dtypes(include=['number','bool']))
y_tr_h = df_hold.loc[tr_final, 'label'].astype(int).values
grp_tr_h = df_hold.loc[tr_final].groupby('qid', sort=False).size().tolist()

X_va_h = cast_bool_uint8(df_hold.loc[valid_mask, use_cols].select_dtypes(include=['number','bool']))
y_va_h = df_hold.loc[valid_mask, 'label'].astype(int).values
grp_va_h = grp_va  # จาก split

# 5) เทรนโมเดลสำหรับ valid (ใช้พารามิเตอร์ที่ได้จาก CV)
pars = dict(best_pars)
ranker_valid = lgb.LGBMRanker(
    objective='lambdarank', metric='ndcg', boosting_type='gbdt',
    n_estimators=4000, learning_rate=pars['lr'],
    num_leaves=pars['leaves'], min_data_in_leaf=pars['min_leaf'],
    feature_fraction=pars['ff'], bagging_fraction=pars['bf'], bagging_freq=1,
    lambda_l2=pars['l2'], max_bin=pars['max_bin'],
    lambdarank_truncation_level=5, random_state=42, verbose=-1
)
ranker_valid.fit(
    X_tr_h, y_tr_h, group=grp_tr_h,
    eval_set=[(X_va_h, y_va_h)], eval_group=[grp_va_h],
    eval_at=[5,10], callbacks=[lgb.early_stopping(300, first_metric_only=True)]
)

# 6) วัดผลบน valid (pos-only และ all-queries)
valid_pos_mask = positive_valid_mask(df_hold, valid_mask)
vf_all = make_scored_frame(ranker_valid, X_va_h, df_hold, valid_mask)
vf_pos = make_scored_frame(ranker_valid, X_va_h.loc[valid_pos_mask], df_hold, valid_pos_mask)

report_at_ks(vf_pos, 'valid pos-only')
report_at_ks(vf_all, 'valid all-queries')

nd5  = ndcg_at_k_from_valid_frame(vf_pos, 5)
nd10 = ndcg_at_k_from_valid_frame(vf_pos, 10)
print(f"Blended (0.8*NDCG@5+0.2*NDCG@10) = {0.8*nd5 + 0.2*nd10:.4f}")

Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[9]	valid_0's ndcg@5: 0.984897	valid_0's ndcg@10: 0.986614
Evaluated only: ndcg@5

==== valid pos-only ====
K=1  P=0.1604  R=0.1095  HR=0.1604  MRR=0.1658  MAP=0.1658  NDCG=0.1591
K=3  P=0.1355  R=0.2335  HR=0.3583  MRR=0.2469  MAP=0.2473  NDCG=0.2111
K=5  P=0.1348  R=0.3754  HR=0.5187  MRR=0.2851  MAP=0.2819  NDCG=0.2722
K=10  P=0.1182  R=0.6008  HR=0.7219  MRR=0.3089  MAP=0.2916  NDCG=0.3522

==== valid all-queries ====
K=1  P=0.0033  R=0.1095  HR=0.0033  MRR=0.0034  MAP=0.0034  NDCG=0.1591
K=3  P=0.0028  R=0.2335  HR=0.0074  MRR=0.0051  MAP=0.0051  NDCG=0.2111
K=5  P=0.0028  R=0.3754  HR=0.0107  MRR=0.0059  MAP=0.0058  NDCG=0.2722
K=10  P=0.0024  R=0.6008  HR=0.0150  MRR=0.0064  MAP=0.0060  NDCG=0.3522
Blended (0.8*NDCG@5+0.2*NDCG@10) = 0.2882


In [21]:
df.columns

Index(['household_key', 'BASKET_ID', 'PROMO_KEY', 'day_t', 'qid', 'label',
       'feat_hist_size', 'promo_offer_ct', 'feat_mean_recency',
       'feat_min_recency', 'BRAND_Private'],
      dtype='object')