In [1]:

import pandas as pd, numpy as np
from pathlib import Path
from datetime import timedelta
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import average_precision_score
import warnings, gc, os, sys, math, random, time, json

In [2]:
train=pd.read_parquet(r"C:\Users\Adesh Mishra\OneDrive\Desktop\Amex\data\train_data.parquet")
test=pd.read_parquet(r"C:\Users\Adesh Mishra\OneDrive\Desktop\Amex\data\test_data.parquet")
trans=pd.read_parquet(r"C:\Users\Adesh Mishra\OneDrive\Desktop\Amex\data\add_trans.parquet")
event=pd.read_parquet(r"C:\Users\Adesh Mishra\OneDrive\Desktop\Amex\data\add_event.parquet")
metadata=pd.read_parquet(r"C:\Users\Adesh Mishra\OneDrive\Desktop\Amex\data\offer_metadata.parquet")

In [3]:
# 1.  Core dtype hygiene
# ---------------------------------------------------------------
ID_COLS = ["id2", "id3"]
for df in [train, test, event, trans, metadata]:
    for c in ID_COLS:
        if c in df.columns:
            df[c] = df[c].astype("string")        # unify dtype

train["y"] = pd.to_numeric(train["y"], errors="coerce").fillna(0).astype("int8")

DT_FIX = [
    (train,"id5"), (test,"id5"), (train,"id4"), (test,"id4"),
    (event,"id4"), (trans,"f370"), (metadata,"id12"), (metadata,"id13")
]
for df,col in DT_FIX:
    df[col] = pd.to_datetime(df[col])

In [4]:
# 2.  Aggregate features
# ---------------------------------------------------------------
cutoff = train["id5"].max()

# 2‑a  Event history  (30‑day window)
evt_agg = (event[event["id4"] >= cutoff - timedelta(days=30)]
           .groupby(["id2","id3"], sort=False)
           .agg(evt_imps=("id4","count"),
                evt_last=("id4","max"))
           .reset_index())
evt_agg["evt_recency_days"] = (cutoff - evt_agg["evt_last"]).dt.days.astype("int16")
evt_agg[ID_COLS] = evt_agg[ID_COLS].astype("string")


In [5]:
# 2‑b  Transaction history  (90‑day window)
trans_agg = (trans[trans["f370"] >= cutoff - timedelta(days=90)]
             .assign(amt=lambda d: d["f367"].astype("float32"))
             .groupby("id2", sort=False)
             .agg(spend_90d=("amt","sum"),
                  txn_cnt_90d=("amt","size"),
                  spend_avg=("amt","mean"))
             .reset_index())
trans_agg["id2"] = trans_agg["id2"].astype("string")

In [6]:
# 2‑c  Offer metadata
meta_agg = (metadata[["id3","f376","f375","id13"]]
            .assign(days_to_expiry=lambda d: (d["id13"]-cutoff).dt.days.clip(lower=-1))
            .drop(columns="id13"))
meta_agg["id3"] = meta_agg["id3"].astype("string")

In [7]:
# 3.  Join with suffixes  (no column collisions)
# ---------------------------------------------------------------
def enrich(df):
    df = df.merge(evt_agg , on=["id2","id3"], how="left", suffixes=("", "_evt"))
    df = df.merge(trans_agg, on="id2",        how="left", suffixes=("", "_txn"))
    df = df.merge(meta_agg , on="id3",        how="left", suffixes=("", "_meta"))
    return df

train = enrich(train)
test  = enrich(test)

In [8]:
# fill missing numerics with 0
NUM_FILL = train.select_dtypes(include="number").columns.difference(["y"])
train[NUM_FILL] = train[NUM_FILL].fillna(0)
test [NUM_FILL] = test [NUM_FILL].fillna(0)

In [9]:
# 4.  Datetime → “days since …”  (except key id5)
# ---------------------------------------------------------------
PROTECT_DTS = {"id5","id4"}
def strip_dt(df, ref):
    for c in df.select_dtypes("datetime64[ns]").columns.difference(PROTECT_DTS):
        df[f"days_since_{c}"] = (ref - df[c]).dt.days.astype("float32")
        df.drop(columns=c, inplace=True)

ref_date = train["id5"].max()
strip_dt(train, ref_date)
strip_dt(test , ref_date)

In [10]:
# 5.  Categorical handling
# ---------------------------------------------------------------
manual_cat = ["f368","f369","id6","id9","id10","id11","f374"]
cat_cols = [c for c in train.columns
            if ((train[c].dtype == "object") or (train[c].dtype.name=="string") or (c in manual_cat))
            and c != "y" and c in test.columns]

for c in cat_cols:
    le = LabelEncoder().fit(
        pd.concat([train[c], test[c]]).fillna("NA").astype(str)
    )
    train[c] = le.transform(train[c].fillna("NA").astype(str)).astype("int32")
    test[c]  = le.transform(test[c].fillna("NA").astype(str)).astype("int32")

   



In [11]:
# ---------------------------------------------------------------
# 6.  Train / valid split (last calendar day)
# ---------------------------------------------------------------
last_day  = train["id5"].max().normalize()
val_start = last_day
trn_idx   = train["id5"] < val_start
val_idx   = train["id5"] >= val_start
if trn_idx.sum() == 0:                                   # widen if needed
    val_start -= timedelta(days=1)
    trn_idx = train["id5"] < val_start
    val_idx = train["id5"] >= val_start



FEATS = [c for c in train.columns if c not in {"y", "id1", "id4", "id5"}]

# --- NEW: keep only cats that are real features -----------------
cat_cols = [c for c in cat_cols if c in FEATS] 


def make_group(df): return df.groupby(["id2","id5"], sort=False).size().values
X_tr, y_tr = train.loc[trn_idx, FEATS], train.loc[trn_idx,"y"]
X_va, y_va = train.loc[val_idx, FEATS], train.loc[val_idx,"y"]
g_tr, g_va = make_group(train.loc[trn_idx]), make_group(train.loc[val_idx])

In [12]:
# 7.  LightGBM ranker
# ---------------------------------------------------------------
params = dict(objective="lambdarank", metric="map", eval_at=[7],
              learning_rate=0.045, num_leaves=191, min_data_in_leaf=40,
              feature_fraction=0.9, bagging_fraction=0.8,
              lambda_l1=1.0, lambda_l2=0.1, n_estimators=6000)

ranker = lgb.LGBMRanker(**params)
ranker.fit(X_tr, y_tr, group=g_tr,
           eval_set=[(X_va, y_va)], eval_group=[g_va],
           categorical_feature=cat_cols,
           eval_metric="map",
           callbacks=[lgb.early_stopping(400), lgb.log_evaluation(200)])

print("Best iter :", ranker.best_iteration_, 
      "  Valid MAP@7 :", ranker.best_score_["valid_0"]["map@7"])




[LightGBM] [Info] Total groups: 36500, total data: 548319
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.310588 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 386293
[LightGBM] [Info] Number of data points in the train set: 548319, number of used features: 329
[LightGBM] [Info] Total groups: 15968, total data: 221845
Training until validation scores don't improve for 400 rounds
[200]	valid_0's map@7: 0.937943
[400]	valid_0's map@7: 0.938348
[600]	valid_0's map@7: 0.938372
[800]	valid_0's map@7: 0.938517
[1000]	valid_0's map@7: 0.938517
Early stopping, best iteration is:
[632]	valid_0's map@7: 0.938557
Best iter : 632   Valid MAP@7 : 0.9385569825166205


In [14]:
import numpy as np
import pandas as pd

def map_at_k(df, k=7,
             user_col="id2", day_col="id5",
             label_col="y", score_col="pred"):
    """
    Parameters
    ----------
    df : pd.DataFrame
        Must contain the four columns: user_col, day_col, label_col, score_col
    k  : int
        Evaluate MAP@k  (default 7)
    Returns
    -------
    float : MAP@k
    """

    # 1) sort by user, day, and predicted score DESC
    df = (df
          .sort_values([user_col, day_col, score_col],
                       ascending=[True,    True,  False])
          .reset_index(drop=True))

    # 2) rank inside each (user,day) and keep top‑k
    df["rank"] = df.groupby([user_col, day_col]).cumcount()
    df = df[df["rank"] < k]

    # 3) cumulative hits and precision @ each position
    df["hit"] = df[label_col]
    df["cum_hit"] = df.groupby([user_col, day_col])["hit"].cumsum()
    df["precision"] = df["cum_hit"] / (df["rank"] + 1)

    # 4) AP for each bucket = sum(precision * hit) / (#positives or k)
    ap = (df["precision"] * df["hit"]).groupby(
          [df[user_col], df[day_col]]).sum()

    denom = (df.groupby([user_col, day_col])["hit"]
               .sum().clip(lower=1))            # avoid divide‑by‑zero
    ap = ap / denom

    return ap.mean()


In [15]:
# build a frame with the 4 mandatory columns
val_pred = pd.DataFrame({
    "id2" : train.loc[val_idx, "id2"].values,
    "id5" : train.loc[val_idx, "id5"].values,
    "y"   : y_va.values,
    "pred": ranker.predict(X_va, num_iteration=ranker.best_iteration_)
})

val_map7 = map_at_k(val_pred, k=7)
print(f"Validation MAP@7 = {val_map7:.5f}")




Validation MAP@7 = 0.05256


In [16]:
# Drop bad features
DROP_FEATS = ["id1", "id4", "id5", "evt_last"]
FEATS = [c for c in train.columns if c not in DROP_FEATS + ["y"]]

# Extra engineered features
train['spend_per_txn'] = train['spend_90d'] / (train['txn_cnt_90d'] + 1e-5)
test['spend_per_txn']  = test['spend_90d'] / (test['txn_cnt_90d'] + 1e-5)

train['evt_per_txn'] = train['evt_imps'] / (train['txn_cnt_90d'] + 1e-5)
test['evt_per_txn']  = test['evt_imps'] / (test['txn_cnt_90d'] + 1e-5)

# Split
last_day = train["id5"].max().normalize()
val_start = last_day
trn_idx = train["id5"] < val_start
val_idx = train["id5"] >= val_start

# Group
g_tr = make_group(train.loc[trn_idx])
g_va = make_group(train.loc[val_idx])

# Fit model
ranker = lgb.LGBMRanker(**params)
ranker.fit(
    train.loc[trn_idx, FEATS], train.loc[trn_idx, "y"],
    group=g_tr,
    eval_set=[(train.loc[val_idx, FEATS], train.loc[val_idx, "y"])],
    eval_group=[g_va],
    categorical_feature=cat_cols,
    eval_metric="map",
    callbacks=[lgb.early_stopping(300)]
)

# Validation prediction
val_pred = pd.DataFrame({
    "id2": train.loc[val_idx, "id2"].values,
    "id5": train.loc[val_idx, "id5"].values,
    "y": train.loc[val_idx, "y"].values,
    "pred": ranker.predict(train.loc[val_idx, FEATS], num_iteration=ranker.best_iteration_)
})

print("MAP@7 on validation:", map_at_k(val_pred, k=7))




[LightGBM] [Info] Total groups: 36500, total data: 548319
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.796116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 386293
[LightGBM] [Info] Number of data points in the train set: 548319, number of used features: 329
[LightGBM] [Info] Total groups: 15968, total data: 221845
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[632]	valid_0's map@7: 0.938557




MAP@7 on validation: 0.05256380071651239
