In [1]:
import pandas as pd
import os
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import GroupKFold


import lightgbm as lgb
import matplotlib.pyplot as plt
from tqdm import tqdm


In [2]:
DATA_PATH = r"C:\Users\Adesh Mishra\OneDrive\Desktop\Amex\data"  # adjust if needed

def load_all(path=DATA_PATH):
    files = {
        "train":   "train_data.parquet",
        "test":    "test_data.parquet",
        "event":   "add_event.parquet",
        "trans":   "add_trans.parquet",
        "meta":    "offer_metadata.parquet",
    }
    dfs = {k: pd.read_parquet(os.path.join(path, fn)) for k,fn in files.items()}
    return dfs

dfs = load_all()
for name, df in dfs.items():
    print(f"{name:6s}", "shape=", df.shape)


train  shape= (770164, 372)
test   shape= (369301, 371)
event  shape= (21457473, 5)
trans  shape= (6339465, 9)
meta   shape= (4164, 12)


In [3]:
train = dfs["train"]
events = dfs["event"]

# target distribution
print("Train y distribution:\n", train.y.value_counts(normalize=True))

# click-through rate in events
print("Event click rate:", events.id7.notna().mean())


Train y distribution:
 y
0    0.951892
1    0.048108
Name: proportion, dtype: float64
Event click rate: 0.018599114630133754


In [4]:
def parse_timestamps(df, col="id4"):
    df = df.copy()
    df["ts"] = pd.to_datetime(df[col])
    return df

def make_rfm(trans, ref_date):
    t = trans.copy()
    t["ts"] = pd.to_datetime(t.f370)
    agg = t.groupby("id2").agg(
        recency   = ("ts", lambda x: (ref_date - x.max()).days),
        frequency = ("f367", "count"),
        monetary  = ("f367", "sum")
    ).reset_index()
    return agg

def make_event_feats(events):
    e = events.copy()
    e["is_click"] = e.id7.notnull().astype(int)
    agg = e.groupby("id2").agg(
        impressions = ("id3", "count"),
        clicks      = ("is_click", "sum")
    ).reset_index()
    return agg

def encode_meta(meta):
    return pd.get_dummies(meta, columns=["f375"], prefix="freq")


In [5]:
# 1) parse timestamps
train_fe = parse_timestamps(dfs["train"])
test_fe  = parse_timestamps(dfs["test"])
ref_date = train_fe.ts.min()

# 2) build & downcast side-tables
rfm  = make_rfm(dfs["trans"], ref_date)
evt  = make_event_feats(dfs["event"])
# compact metadata encoding
from sklearn.preprocessing import LabelEncoder
menc = dfs["meta"][["id3","f375"]].copy()
le = LabelEncoder()
menc["freq_enc"] = le.fit_transform(menc["f375"])
menc = menc[["id3","freq_enc"]]

# downcast types to save memory
rfm["id2"] = rfm["id2"].astype("int32")
evt["id2"] = evt["id2"].astype("int32")
menc["id3"]= menc["id3"].astype("int32")
for col in ["recency","frequency","monetary"]:
    rfm[col] = pd.to_numeric(rfm[col], downcast="float")
for col in ["impressions","clicks"]:
    evt[col] = pd.to_numeric(evt[col], downcast="unsigned")

# 3) set indices for fast lookup
rfm_idx  = rfm.set_index("id2")
evt_idx  = evt.set_index("id2")
menc_idx = menc.set_index("id3")

# 4) map features one-by-one onto train_fe/test_fe
train_df = train_fe.copy()
test_df  = test_fe.copy()

for col in ["recency","frequency","monetary"]:
    train_df[col] = train_df["id2"].map(  rfm_idx[col])
    test_df[col]  = test_df["id2"].map(  rfm_idx[col])

for col in ["impressions","clicks"]:
    train_df[col] = train_df["id2"].map(  evt_idx[col])
    test_df[col]  = test_df["id2"].map(  evt_idx[col])

train_df["freq_enc"] = train_df["id3"].map(menc_idx["freq_enc"])
test_df["freq_enc"]  = test_df["id3"].map(menc_idx["freq_enc"])

# 5) fill any NaNs
train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)

print("Train:", train_df.shape, "Test:", test_df.shape)


  train_df.fillna(0, inplace=True)
  test_df.fillna(0, inplace=True)


Train: (770164, 379) Test: (369301, 378)


In [6]:
def apk(actual, predicted, k=7):
    if len(predicted) > k:
        predicted = predicted[:k]
    score = num_hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1
            score += num_hits/(i+1)
    return score/min(len(actual), k) if actual else 0.0

def mapk(all_actuals, all_preds, k=7):
    return np.mean([apk(a, p, k) for a, p in zip(all_actuals, all_preds)])


In [11]:
from lightgbm.callback import early_stopping, log_evaluation
from sklearn.model_selection import GroupKFold
import numpy as np
import lightgbm as lgb

# 0) Ensure y is numeric
train_df["y"] = train_df["y"].astype(np.int8)

# 1) Define FEATURES exactly as before
exclude = {"id1", "id2", "id3", "ts", "y", "fold"}
FEATURES = [
    c for c in train_df.columns
    if c not in exclude
    and np.issubdtype(train_df[c].dtype, np.number)
]
print(f"Using {len(FEATURES)} features")

# 2) Assign folds
train_df["fold"] = -1
gkf = GroupKFold(n_splits=5)
for fold, (_, val_idx) in enumerate(gkf.split(train_df, groups=train_df.id2)):
    train_df.loc[val_idx, "fold"] = fold

models, scores = [], []

# 3) Train each fold as a ranking task
for fold in range(5):
    print(f"\n--- Fold {fold} ---")
    trn = train_df[train_df.fold != fold]
    val = train_df[train_df.fold == fold]

    # group sizes per user
    group_trn = trn.groupby("id2").size().to_numpy()
    group_val = val.groupby("id2").size().to_numpy()

    dtr = lgb.Dataset(trn[FEATURES], label=trn.y, group=group_trn)
    dvl = lgb.Dataset(val[FEATURES], label=val.y, group=group_val)

    params = {
        "objective":     "lambdarank",
        "metric":        "map",
        "map_eval_at":   [7],
        "learning_rate": 0.05,
        "num_leaves":    64,
        "max_depth":     8,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq":     5,
        # GPU flags
        "device":         "gpu",
        "gpu_platform_id":0,
        "gpu_device_id":  0,
    }

    model = lgb.train(
        params,
        dtr,
        valid_sets=[dvl],
        num_boost_round=2000,
        callbacks=[
            early_stopping(stopping_rounds=100),
            log_evaluation(period=100),
        ]
    )
    models.append(model)

    # 4) Evaluate MAP@7 on val
    val["pred"] = model.predict(val[FEATURES])
    actuals, preds = [], []
    for _,grp in val.groupby("id2"):
        actuals.append(grp.loc[grp.y==1,"id3"].tolist())
        preds.append(grp.sort_values("pred", ascending=False)["id3"].tolist())
    score = mapk(actuals, preds)
    print(f"Fold {fold} MAP@7 = {score:.5f}")
    scores.append(score)

print(f"\nAverage MAP@7: {np.mean(scores):.5f}")


Using 10 features

--- Fold 0 ---
[LightGBM] [Info] Total groups: 37240, total data: 616131
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 616131, number of used features: 0
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Total groups: 9310, total data: 154033
Training until validation scores don't improve for 100 rounds
[100]	valid_0's map@7: 0.881114
Early stopping, best iteration is:
[4]	valid_0's map@7: 0.881114


  val["pred"] = model.predict(val[FEATURES])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val["pred"] = model.predict(val[FEATURES])


Fold 0 MAP@7 = 0.01702

--- Fold 1 ---
[LightGBM] [Info] Total groups: 37240, total data: 616131
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 616131, number of used features: 0
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Total groups: 9310, total data: 154033
Training until validation scores don't improve for 100 rounds
[100]	valid_0's map@7: 0.888176
Early stopping, best iteration is:
[26]	valid_0's map@7: 0.888176


  val["pred"] = model.predict(val[FEATURES])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val["pred"] = model.predict(val[FEATURES])


Fold 1 MAP@7 = 0.01555

--- Fold 2 ---
[LightGBM] [Info] Total groups: 37240, total data: 616131
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 616131, number of used features: 0
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Total groups: 9310, total data: 154033
Training until validation scores don't improve for 100 rounds
[100]	valid_0's map@7: 0.885794
Early stopping, best iteration is:
[58]	valid_0's map@7: 0.885794


  val["pred"] = model.predict(val[FEATURES])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val["pred"] = model.predict(val[FEATURES])


Fold 2 MAP@7 = 0.01688

--- Fold 3 ---
[LightGBM] [Info] Total groups: 37240, total data: 616131
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 616131, number of used features: 0
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Total groups: 9310, total data: 154033
Training until validation scores don't improve for 100 rounds
[100]	valid_0's map@7: 0.882421
Early stopping, best iteration is:
[1]	valid_0's map@7: 0.882421


  val["pred"] = model.predict(val[FEATURES])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val["pred"] = model.predict(val[FEATURES])


Fold 3 MAP@7 = 0.01749

--- Fold 4 ---
[LightGBM] [Info] Total groups: 37240, total data: 616132
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 616132, number of used features: 0
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Total groups: 9310, total data: 154032
Training until validation scores don't improve for 100 rounds
[100]	valid_0's map@7: 0.881577
Early stopping, best iteration is:
[17]	valid_0's map@7: 0.881577


  val["pred"] = model.predict(val[FEATURES])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val["pred"] = model.predict(val[FEATURES])


Fold 4 MAP@7 = 0.01652

Average MAP@7: 0.01669
