<a href="https://colab.research.google.com/github/Boonyaratt/LGBM_recommendation/blob/master/LGBMmini_Baseline_features_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
from datetime import datetime, timedelta
from dateutil import easter
from typing import Dict, Text
import os, kagglehub
import lightgbm as lgb
import heapq
from functools import lru_cache
from collections import defaultdict
from xgboost import XGBRanker
import pathlib
import zipfile
from sklearn.metrics import ndcg_score

# Silence every Python warning so cell output stays compact.
# NOTE(review): this also hides deprecation warnings from pandas / LightGBM.
warnings.filterwarnings('ignore')
# Seed NumPy's global RNG so anything drawing from it is repeatable across runs.
np.random.seed(42)

In [None]:
# Colab-only step: prompt for a file upload from the local machine.
# The recorded output below shows features_all3.zip being saved under that name.
from google.colab import files
uploaded = files.upload()

# Expected upload: features_all3.zip (it is extracted in a later cell).

Saving features_all3.zip to features_all3.zip


In [None]:
!mkdir dataset

In [None]:
# Location of the uploaded archive and the directory its contents are unpacked into.
zip_file_path = '/content/features_all3.zip'
extract_path = '/content/dataset'

# Unpack every member of the archive; the context manager closes the file afterwards.
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extract_path)


In [None]:
# Load the extracted feature table. Judging by the preview below, each row is one
# candidate PROMO_KEY for a query `qid` with a 0/1 `label`.
# NOTE(review): the "Unnamed: 0" column in the preview suggests the CSV was written
# with its index; confirm whether it should be dropped or read via index_col=0.
df = pd.read_csv('/content/dataset/features_all3.csv')
df.head()

Unnamed: 0,household_key,BASKET_ID,PROMO_KEY,day_t,qid,label,feat_min_recency,feat_mean_recency,feat_seen_products,overlap_decay_h3,...,feat_overlap_ratio,feat_jaccard,AGE_19_24,AGE_25_34,AGE_35_44,AGE_45_54,AGE_55_64,AGE_65P,AGE_UNK,age_mid
0,1,31172831466,54850010009|29,282,1_31172831466,0,8,34.0,4,0.182293,...,0.013889,0.011173,0,0,0,0,0,1,0,70
1,1,31172831466,57800000033|29,282,1_31172831466,0,9999,9999.0,0,0.0,...,0.0,0.0,0,0,0,0,0,1,0,70
2,1,31172831466,54300021057|29,282,1_31172831466,0,9999,9999.0,0,0.0,...,0.0,0.0,0,0,0,0,0,1,0,70
3,1,31172831466,51380041013|29,282,1_31172831466,0,9999,9999.0,0,0.0,...,0.0,0.0,0,0,0,0,0,1,0,70
4,1,31172831466,54300016033|29,282,1_31172831466,0,9999,9999.0,0,0.0,...,0.0,0.0,0,0,0,0,0,1,0,70


## Clean

In [None]:
def make_time_based_split(base, test_ratio=0.2, purge_days=7):
    """Chronological train/validation split on ``day_t`` with a purge gap.

    The cutoff day is the distinct ``day_t`` value at position
    ``int(n_days * (1 - test_ratio))`` of the sorted distinct days. Rows with
    ``day_t >= cutoff`` form the validation set; rows with
    ``day_t <= cutoff - purge_days`` form the training set. Rows in between
    belong to neither.

    Args:
        base: DataFrame with at least ``day_t`` and ``qid`` columns.
        test_ratio: Fraction of distinct days reserved for validation.
        purge_days: Number of days removed between train end and the cutoff.

    Returns:
        ``(train_mask, valid_mask, grp_tr, grp_va)``: two boolean Series aligned
        with ``base``, and the per-``qid`` row counts (in order of first
        appearance) for the train and validation rows respectively.
    """
    day_col = base['day_t']
    unique_days = np.sort(day_col.unique())
    cutoff = unique_days[int(len(unique_days) * (1 - test_ratio))]
    tr_end = cutoff - purge_days

    train_mask = day_col <= tr_end
    valid_mask = day_col >= cutoff

    def _group_sizes(mask):
        # Row count per qid, keeping the order in which qids first appear.
        return base.loc[mask].groupby('qid', sort=False).size().tolist()

    grp_tr = _group_sizes(train_mask)
    grp_va = _group_sizes(valid_mask)

    tr_days = day_col[train_mask]
    va_days = day_col[valid_mask]
    print(f"Train {tr_days.min()}–{tr_days.max()} | "
          f"Valid {va_days.min()}–{va_days.max()} | "
          f"Purge={purge_days}d")
    return train_mask, valid_mask, grp_tr, grp_va

# Validate on roughly the last 20% of distinct days, leaving a 7-day purge gap before the cutoff.
train_mask, valid_mask, grp_tr, grp_va = make_time_based_split(df, test_ratio=0.2, purge_days=7)

Train 224–607 | Valid 614–711 | Purge=7d


In [None]:
def positive_valid_mask(df, valid_mask, qid_col='qid', label_col='label'):
    """Restrict a validation mask to queries that contain at least one positive.

    Args:
        df: Frame holding the query id and label columns.
        valid_mask: Boolean Series aligned with ``df`` selecting validation rows.
        qid_col: Name of the query-id column.
        label_col: Name of the label column (summed per query).

    Returns:
        Boolean Series: True for validation rows whose query has a label sum
        greater than zero within the validation rows.
    """
    label_totals = df.loc[valid_mask].groupby(qid_col)[label_col].sum()
    qids_with_positive = label_totals[label_totals > 0].index
    belongs_to_positive_query = df[qid_col].isin(qids_with_positive)
    return valid_mask & belongs_to_positive_query

# Validation rows restricted to queries that have at least one positive label.
valid_pos_mask = positive_valid_mask(df, valid_mask)
# Per-qid row counts for that subset, in order of first appearance.
grp_va_pos = df.loc[valid_pos_mask].groupby('qid', sort=False).size().tolist()
print("valid_pos queries:", len(grp_va_pos), " / all-valid queries:", len(grp_va))

valid_pos queries: 187  / all-valid queries: 9026


In [None]:
# 1) Age-segmented promo positive rate (Laplace-smoothed), estimated from train rows only.
age_buckets = ['AGE_19_24','AGE_25_34','AGE_35_44','AGE_45_54','AGE_55_64','AGE_65P','AGE_UNK']

def to_age_bucket(row):
    """Return the first age one-hot column equal to 1 in `row`, or 'AGE_UNK' if none is."""
    for c in age_buckets:
        if c in row and row[c] == 1:
            return c
    return 'AGE_UNK'

# Vectorised equivalent of df[age_buckets].apply(to_age_bucket, axis=1):
# argmax picks the first flag equal to 1, and rows with no flag set fall back to 'AGE_UNK'.
_age_flags = df[age_buckets].to_numpy() == 1
df['age_bucket'] = np.where(
    _age_flags.any(axis=1),
    np.asarray(age_buckets, dtype=object)[_age_flags.argmax(axis=1)],
    'AGE_UNK',
)

train_df = df.loc[train_mask]
g = train_df.groupby(['PROMO_KEY','age_bucket'])
# Laplace smoothing: (positives + 1) / (rows + 2) per (promo, age bucket) pair.
agepromo_pos_rate = ((g['label'].sum() + 1) / (g['label'].count() + 2)).rename('agepromo_pos_rate').reset_index()

# Drop the column left by a previous run first; otherwise re-executing this cell
# would produce agepromo_pos_rate_x / _y and the lookup of the plain name would fail.
_n_rows_before = len(df)
df = (
    df.drop(columns=['agepromo_pos_rate'], errors='ignore')
      .merge(agepromo_pos_rate, on=['PROMO_KEY','age_bucket'], how='left')
)
# The boolean masks built earlier are positional, so the merge must not add or drop rows.
assert len(df) == _n_rows_before, "merge changed the number of rows"

# (promo, age bucket) pairs unseen in train are deliberately left as NaN here.
# The train-only fallback chain in the promo_pos_rate_train cell further down
# (promo-only train rate -> global train rate -> 0) fills them. Filling with 0.0
# at this point would turn that chain into a no-op.

In [None]:
def make_scored_frame_from_series(df, mask, score_series, rng=None):
    """Build a ``qid`` / ``label`` / ``score`` frame for the rows selected by ``mask``.

    Scores are coerced to numeric (non-numeric and missing values become 0.0).
    Gaussian noise with standard deviation 1e-6 is added to each score.

    Args:
        df: Source frame containing ``qid`` and ``label``.
        mask: Boolean selector applied to both ``df`` and ``score_series``.
        score_series: Series of raw scores.
        rng: Optional ``np.random.RandomState``; defaults to ``RandomState(42)``.

    Returns:
        A copy of the selected ``qid``/``label`` rows with an added ``score`` column.
    """
    generator = np.random.RandomState(42) if rng is None else rng

    selected_scores = score_series.loc[mask]
    numeric_scores = pd.to_numeric(selected_scores, errors='coerce').fillna(0.0)
    jitter = generator.normal(0, 1e-6, size=len(numeric_scores))

    scored = df.loc[mask, ['qid', 'label']].copy()
    scored['score'] = numeric_scores.values + jitter
    return scored

def hitrate_at_k(dfv, k=10):
    """Mean over queries of the maximum label among each query's k highest-scored rows."""
    ranked = dfv.sort_values(by=['qid', 'score'], ascending=[True, False])
    top_k = ranked.groupby('qid').head(k)
    best_label_per_query = top_k.groupby('qid')['label'].max()
    return best_label_per_query.mean()

def recall_precision_at_k(dfv, k=10):
    """Return ``(recall@k, precision@k)`` averaged over queries.

    Recall per query is positives in the top k divided by the query's total
    positives; queries without positives become NaN and are skipped by the mean.
    Precision per query is the mean label of the top-k rows and is averaged
    over every query.
    """
    ranked = dfv.sort_values(['qid', 'score'], ascending=[True, False])
    top_k_labels = ranked.groupby('qid').head(k).groupby('qid')['label']

    # Zero totals are turned into NaN so those queries drop out of the recall mean.
    total_positives = dfv.groupby('qid')['label'].sum().replace(0, np.nan)

    recall = top_k_labels.sum().div(total_positives).mean()
    precision = top_k_labels.mean().mean()
    return float(recall), float(precision)

def mrr_at_k(dfv, k=10):
    """Mean reciprocal rank of the first label equal to 1 within each query's top k.

    Queries with no such label in their top k contribute 0.0.
    """
    def _reciprocal_rank(group):
        top_labels = group.sort_values('score', ascending=False)['label'].to_numpy()[:k]
        hit_positions = np.flatnonzero(top_labels == 1)
        # Positions are 0-based, ranks are 1-based.
        return 1.0 / (hit_positions[0] + 1) if hit_positions.size else 0.0

    per_query = dfv.groupby('qid', group_keys=False).apply(_reciprocal_rank)
    return float(per_query.mean())

def map_at_k(dfv, k=10):
    """Mean average precision at k over queries.

    For each query the rows are ranked by descending ``score``. AP@k is the sum
    of precision@i over the ranks i <= k that hold a positive, divided by
    ``min(total positives in the query, k)``. Queries without any positive
    contribute 0.0.

    The total is counted over the whole query, not just the top k. Counting
    after truncation would make the denominator equal the hits already
    retrieved, so missed relevant items would go unpenalised.

    Args:
        dfv: Frame with ``qid``, ``score`` and 0/1 ``label`` columns.
        k: Rank cutoff.

    Returns:
        Mean AP@k across queries as a float.
    """
    def _ap(g):
        ranked = g.sort_values('score', ascending=False)['label'].to_numpy()
        total_pos = ranked.sum()  # positives in the whole query, before truncation
        if total_pos == 0:
            return 0.0
        top = ranked[:k]
        hits_so_far = top.cumsum()
        # precision@i, kept only at ranks where the item itself is positive
        precisions = (hits_so_far / np.arange(1, len(top) + 1)) * top
        return float(precisions.sum() / min(total_pos, k))
    return float(dfv.groupby('qid', group_keys=False).apply(_ap).mean())

def ndcg_at_k_from_valid_frame(dfv, k=10):
    """Mean NDCG@k over the queries that contain at least one positive label.

    Queries whose labels sum to zero are skipped. A query with exactly one
    candidate (which must then be positive) is scored 1.0 directly, because
    recent scikit-learn versions raise ``ValueError`` when ``ndcg_score``
    receives a single document.

    Args:
        dfv: Frame with ``qid``, ``label`` and ``score`` columns.
        k: Rank cutoff passed to ``ndcg_score``.

    Returns:
        Mean NDCG@k as a float, or 0.0 when no query has a positive.
    """
    ndcgs = []
    for qid, g in dfv.groupby('qid'):
        y_true = g['label'].to_numpy()
        if y_true.sum() == 0:
            continue
        if len(y_true) == 1:
            # A single relevant candidate is trivially ranked perfectly.
            ndcgs.append(1.0)
            continue
        y_pred = g['score'].to_numpy()
        ndcgs.append(ndcg_score(y_true.reshape(1,-1), y_pred.reshape(1,-1), k=k))
    return float(np.mean(ndcgs)) if ndcgs else 0.0

def report_at_ks(vf, name, ks=(1,3,5,10)):
    """Print precision, recall, hit rate, MRR, MAP and NDCG for each cutoff in ``ks``.

    Args:
        vf: Scored frame with ``qid``, ``label`` and ``score`` columns.
        name: Title printed above the metric rows.
        ks: Iterable of rank cutoffs; one output line is printed per cutoff.
    """
    print(f"\n==== {name} ====")
    for cutoff in ks:
        recall, precision = recall_precision_at_k(vf, cutoff)
        fields = (
            f"P={precision:.4f}",
            f"R={recall:.4f}",
            f"HR={hitrate_at_k(vf, cutoff):.4f}",
            f"MRR={mrr_at_k(vf, cutoff):.4f}",
            f"MAP={map_at_k(vf, cutoff):.4f}",
            f"NDCG={ndcg_at_k_from_valid_frame(vf, k=cutoff):.4f}",
        )
        # Fields are separated by two spaces.
        print(f"K={cutoff}  " + "  ".join(fields))



In [None]:
# Build promo_pos_rate_train from training rows only. Series.map is used instead of
# merge so that no _x/_y suffixed columns can appear.
train_df = df.loc[train_mask].copy()

# Per-promo label sum and row count on the training rows.
promo_only = train_df.groupby('PROMO_KEY')['label'].agg(['sum', 'count'])
# Laplace-smoothed positive rate: (positives + 1) / (rows + 2).
promo_only_rate = promo_only['sum'].add(1).div(promo_only['count'].add(2))

# Write the column straight onto df (safe to re-run).
df['promo_pos_rate_train'] = df['PROMO_KEY'].map(promo_only_rate)

# Global fallback rate, also from training rows only.
_train_labels = train_df['label']
global_pos_rate_train = float((_train_labels.sum() + 1) / (_train_labels.count() + 2))

# Fallback chain: agepromo -> promo-only (train) -> global (train) -> 0
_agepromo_filled = df['agepromo_pos_rate']
for _fallback in (df['promo_pos_rate_train'], global_pos_rate_train, 0.0):
    _agepromo_filled = _agepromo_filled.fillna(_fallback)
df['agepromo_pos_rate'] = _agepromo_filled.astype(float)

In [None]:
# Baseline 1: LGBMRanker-mini (in-family)
# (lightgbm is already imported as `lgb` in the imports cell at the top.)

cand_feats = [
    'age_mid','AGE_19_24','AGE_25_34','AGE_35_44','AGE_45_54','AGE_55_64','AGE_65P','AGE_UNK',
    'feat_min_recency','seen_in_3d','seen_in_7d','short_overlap_7d',
    'feat_overlap_cnt','feat_jaccard','feat_user_dept_aff','feat_user_brand_aff',
    'feat_promo_pos_rate','feat_promo_offer_ct_log1p',
    'feat_is_weekend','feat_dayofweek',
    # one-hots (only those actually present in df)
] + [c for c in df.columns if c.startswith('DEPARTMENT_')] + [c for c in df.columns if c.startswith('BRAND_')]

# Keep only candidate features that exist in this feature file.
feat_cols = [c for c in cand_feats if c in df.columns]
X_tr = df.loc[train_mask, feat_cols].apply(pd.to_numeric, errors='coerce').fillna(0.0)
y_tr = df.loc[train_mask, 'label'].astype(int)
X_va = df.loc[valid_mask, feat_cols].apply(pd.to_numeric, errors='coerce').fillna(0.0)
y_va = df.loc[valid_mask, 'label'].astype(int)

# sanity checks
assert sum(grp_tr) == len(X_tr) and sum(grp_va) == len(X_va)
# The group sizes come from groupby('qid', sort=False).size(), which matches the row
# layout only when every qid occupies one contiguous run of rows. Verify that here,
# because the ranker interprets `group` positionally.
for _mask in (train_mask, valid_mask):
    _qids = df.loc[_mask, 'qid']
    assert (_qids != _qids.shift()).sum() == _qids.nunique(), \
        "rows of each qid must be contiguous for the group sizes to be valid"

lgb_ranker = lgb.LGBMRanker(
    objective='lambdarank',
    metric='ndcg',          # NDCG, with the cutoff K given by eval_at
    eval_at=[10],
    n_estimators=200,
    learning_rate=0.08,
    max_depth=4,
    num_leaves=31,          # NOTE(review): a depth-4 tree has at most 16 leaves, so this cap is never reached
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,         # bagging_fraction is ignored unless bagging_freq > 0
    random_state=42,
    n_jobs=-1,
    verbosity=-1           # silence the model's own logging
)

lgb_ranker.fit(
    X_tr, y_tr,
    group=grp_tr,
    eval_set=[(X_va, y_va)],
    eval_group=[grp_va],
    callbacks=[
        lgb.early_stopping(50, verbose=False),  # stop after 50 rounds without validation improvement
        lgb.log_evaluation(period=0)            # do not print per-iteration logs
    ]
)


In [None]:
# Score every validation row; the index is kept so the scores can be re-selected by mask.
va_scores = pd.Series(lgb_ranker.predict(X_va), index=X_va.index)
# Two evaluation views, each with a freshly seeded RNG for the tie-breaking noise:
#   pos-only  -> validation queries with at least one positive
#   all-valid -> every validation query
vf_pos = make_scored_frame_from_series(df, valid_pos_mask, va_scores, rng=np.random.RandomState(42))
vf_all = make_scored_frame_from_series(df, valid_mask, va_scores, rng=np.random.RandomState(42))
# In the all-valid report, recall and NDCG skip queries without positives, while
# P / HR / MRR / MAP average over every query, so the two groups of numbers differ in scale.
report_at_ks(vf_pos, 'LGBMRanker-mini | pos-only')
report_at_ks(vf_all, 'LGBMRanker-mini | all-valid')


==== LGBMRanker-mini | pos-only ====
K=1  P=0.2620  R=0.1605  HR=0.2620  MRR=0.2620  MAP=0.2620  NDCG=0.2620
K=3  P=0.1818  R=0.3303  HR=0.4866  MRR=0.3565  MAP=0.3529  NDCG=0.2999
K=5  P=0.1594  R=0.4687  HR=0.6310  MRR=0.3878  MAP=0.3766  NDCG=0.3540
K=10  P=0.1289  R=0.6910  HR=0.8075  MRR=0.4107  MAP=0.3671  NDCG=0.4356

==== LGBMRanker-mini | all-valid ====
K=1  P=0.0054  R=0.1564  HR=0.0054  MRR=0.0054  MAP=0.0054  NDCG=0.2620
K=3  P=0.0038  R=0.3303  HR=0.0099  MRR=0.0073  MAP=0.0072  NDCG=0.2995
K=5  P=0.0033  R=0.4705  HR=0.0131  MRR=0.0080  MAP=0.0078  NDCG=0.3544
K=10  P=0.0027  R=0.6857  HR=0.0166  MRR=0.0085  MAP=0.0076  NDCG=0.4335
