In [1]:

import pandas as pd, numpy as np
from pathlib import Path
from datetime import timedelta
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import average_precision_score
import warnings, gc, os, sys, math, random, time, json

In [2]:
train=pd.read_parquet(r"C:\Users\Adesh Mishra\OneDrive\Desktop\Amex\data\train_data.parquet")
test=pd.read_parquet(r"C:\Users\Adesh Mishra\OneDrive\Desktop\Amex\data\test_data.parquet")
trans=pd.read_parquet(r"C:\Users\Adesh Mishra\OneDrive\Desktop\Amex\data\add_trans.parquet")
event=pd.read_parquet(r"C:\Users\Adesh Mishra\OneDrive\Desktop\Amex\data\add_event.parquet")
metadata=pd.read_parquet(r"C:\Users\Adesh Mishra\OneDrive\Desktop\Amex\data\offer_metadata.parquet")

In [17]:
# Convert string labels (like "00001") to binary 0/1: 1 if any '1' appears, else 0
train['y'] = train['y'].apply(lambda x: 1 if '1' in str(x) else 0)


In [18]:
features = [col for col in train.columns if col.startswith('f')]

for col in features:
    train[col] = pd.to_numeric(train[col], errors='coerce')
    test[col] = pd.to_numeric(test[col], errors='coerce')

train[features] = train[features].fillna(-9999)
test[features] = test[features].fillna(-9999)


In [19]:
# User click rate
user_click_rate = train.groupby('id2')['y'].mean().rename('user_click_rate')
train = train.merge(user_click_rate, on='id2', how='left')
test = test.merge(user_click_rate, on='id2', how='left')

# Offer click rate
offer_click_rate = train.groupby('id3')['y'].mean().rename('offer_click_rate')
train = train.merge(offer_click_rate, on='id3', how='left')
test = test.merge(offer_click_rate, on='id3', how='left')

# User-offer click rate
user_offer_click_rate = train.groupby(['id2', 'id3'])['y'].mean().rename('user_offer_click_rate')
train = train.merge(user_offer_click_rate, on=['id2', 'id3'], how='left')
test = test.merge(user_offer_click_rate, on=['id2', 'id3'], how='left')

for col in ['user_click_rate', 'offer_click_rate', 'user_offer_click_rate']:
    train[col] = train[col].fillna(0)
    test[col] = test[col].fillna(0)


In [20]:
# User's total offers seen
user_offer_count = train.groupby('id2').size().rename('user_offer_count')
train = train.merge(user_offer_count, on='id2', how='left')
test = test.merge(user_offer_count, on='id2', how='left').fillna(0)

# Offer's total impressions
offer_count = train.groupby('id3').size().rename('offer_count')
train = train.merge(offer_count, on='id3', how='left')
test = test.merge(offer_count, on='id3', how='left').fillna(0)


In [22]:
# Before merging
train['id3'] = train['id3'].astype(str)
test['id3'] = test['id3'].astype(str)
metadata['id3'] = metadata['id3'].astype(str)




In [23]:
# Merge all columns except for the offer id
meta_cols = [col for col in metadata.columns if col != 'id3']
train = train.merge(metadata, on='id3', how='left')
test = test.merge(metadata, on='id3', how='left')


In [25]:
train['id2'] = train['id2'].astype(str)
event['id2'] = event['id2'].astype(str)


In [26]:
# Only run if 'id4' in event/train/test is a datetime of impression
if 'id4' in event.columns and 'id4' in train.columns:
    event['id4'] = pd.to_datetime(event['id4'], errors='coerce')
    train['date'] = pd.to_datetime(train['id4'], errors='coerce')
    last_event = event.groupby('id2')['id4'].max().rename('last_event_time')
    train = train.merge(last_event, on='id2', how='left')
    train['days_since_last_event'] = (train['date'] - train['last_event_time']).dt.days
    train['days_since_last_event'] = train['days_since_last_event'].fillna(999)
    test = test.merge(last_event, on='id2', how='left')
    test['date'] = pd.to_datetime(test['id4'], errors='coerce')
    test['days_since_last_event'] = (test['date'] - test['last_event_time']).dt.days
    test['days_since_last_event'] = test['days_since_last_event'].fillna(999)
    features.append('days_since_last_event')


In [27]:
extra_features = [
    'user_click_rate', 'offer_click_rate', 'user_offer_click_rate',
    'user_offer_count', 'offer_count'
]

# Add all metadata columns except the ID (and any columns you've already included)
meta_features = [col for col in metadata.columns if col != 'id3']

all_features = features + extra_features + meta_features
# Remove any duplicates
all_features = list(dict.fromkeys(all_features))

# Ensure all features are numeric and fill NA
for col in all_features:
    train[col] = pd.to_numeric(train[col], errors='coerce')
    test[col] = pd.to_numeric(test[col], errors='coerce')

train[all_features] = train[all_features].fillna(-9999)
test[all_features] = test[all_features].fillna(-9999)


In [28]:
def apk(actual, predicted, k=7):
    if len(predicted) > k:
        predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    if not actual:
        return 0.0
    return score / min(len(actual), k)

def mapk(df, k=7):
    map_scores = []
    for user_id, group in df.groupby('id2'):
        actual = group.loc[group['y'] == 1, 'id3'].tolist()
        predicted = group.sort_values('pred', ascending=False)['id3'].tolist()
        map_scores.append(apk(actual, predicted, k))
    return np.mean(map_scores)


In [29]:
X_train = train[all_features]
y_train = train['y']
from sklearn.model_selection import GroupKFold
cv = GroupKFold(n_splits=5)
val_scores = []

for train_idx, val_idx in cv.split(X_train, y_train, groups=train['id2']):
    tr_X, tr_y = X_train.iloc[train_idx], y_train.iloc[train_idx]
    val_X, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx]
    val_id2 = train.iloc[val_idx]['id2']
    val_id3 = train.iloc[val_idx]['id3']
    
    model = lgb.LGBMClassifier(
        n_estimators=300,
        learning_rate=0.03,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        class_weight='balanced'  # helps with imbalance
    )
    model.fit(tr_X, tr_y)
    val_pred = model.predict_proba(val_X)[:, 1]
    val_df = pd.DataFrame({
        'id2': val_id2,
        'id3': val_id3,
        'y': val_y,
        'pred': val_pred
    })
    score = mapk(val_df, k=7)
    val_scores.append(score)
    print(f"Fold MAP@7: {score:.4f}")

print(f"Mean MAP@7: {np.mean(val_scores):.4f}")


[LightGBM] [Info] Number of positive: 29723, number of negative: 586408
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.639015 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48631
[LightGBM] [Info] Number of data points in the train set: 616131, number of used features: 363
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Fold MAP@7: 0.1090
[LightGBM] [Info] Number of positive: 30191, number of negative: 585940
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.472781 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 48608
[LightGBM] [Info] Number of data points in the train set: 616131, number of used features: 325
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.50000

In [30]:
test_pred = model.predict_proba(test[all_features])[:, 1]
test['pred'] = test_pred
test['rank'] = test.groupby('id2')['pred'].rank(method='first', ascending=False)
submission = test[test['rank'] <= 7].sort_values(['id2', 'rank'])
submission[['id2', 'id3']].to_csv('submission.csv', index=False)


  test['rank'] = test.groupby('id2')['pred'].rank(method='first', ascending=False)
