In [8]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold

In [9]:
train=pd.read_parquet(r"C:\Users\Adesh Mishra\OneDrive\Desktop\Amex\data\train_data.parquet")
test=pd.read_parquet(r"C:\Users\Adesh Mishra\OneDrive\Desktop\Amex\data\test_data.parquet")
trans=pd.read_parquet(r"C:\Users\Adesh Mishra\OneDrive\Desktop\Amex\data\add_trans.parquet")
event=pd.read_parquet(r"C:\Users\Adesh Mishra\OneDrive\Desktop\Amex\data\add_event.parquet")
metadata=pd.read_parquet(r"C:\Users\Adesh Mishra\OneDrive\Desktop\Amex\data\offer_metadata.parquet")

In [10]:
train.head()

Unnamed: 0,id1,id2,id3,id4,id5,y,f1,f2,f3,f4,...,f357,f358,f359,f360,f361,f362,f363,f364,f365,f366
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,1.0,,,,...,,-9999.0,0.0,,28.0,0.0,0.0,337.0,0.0,0.0
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,1.0,,,,...,,,0.0,,87.0,0.0,0.0,1010.0,2.0,0.0019801980198019
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,1.0,,,,...,,,0.0,,23.0,0.0,0.0,1010.0,2.0,0.0019801980198019
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,1.0,,,,...,,-9999.0,0.0,,277.0,1.0,0.003610108303249,337.0,0.0,0.0
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,1.0,,,,...,,-9999.0,0.0,,359.0,0.0,0.0,337.0,0.0,0.0


In [11]:
# 1. LABEL FIX
train['y'] = train['y'].apply(lambda x: 1 if '1' in str(x) else 0)

# 2. ID TYPE CONSISTENCY
for df in [train, test, trans, event, metadata]:
    for col in ['id2', 'id3']:
        if col in df.columns:
            df[col] = df[col].astype(str)

In [12]:
# 3. PARSE DATES EARLY (once)
for df in [train, test, trans, event]:
    if 'id4' in df.columns:
        df['id4'] = pd.to_datetime(df['id4'], errors='coerce')

In [13]:
# 4. TRANSACTION RECENCY
if 'id4' in trans.columns:
    last_trans = trans.groupby('id2')['id4'].max().rename('last_trans_time')
    train = train.merge(last_trans, on='id2', how='left')
    test = test.merge(last_trans, on='id2', how='left')
    train['days_since_last_trans'] = (train['id4'] - train['last_trans_time']).dt.days
    test['days_since_last_trans'] = (test['id4'] - test['last_trans_time']).dt.days
    train['days_since_last_trans'] = train['days_since_last_trans'].fillna(999)
    test['days_since_last_trans'] = test['days_since_last_trans'].fillna(999)

In [14]:
train.head()

Unnamed: 0,id1,id2,id3,id4,id5,y,f1,f2,f3,f4,...,f357,f358,f359,f360,f361,f362,f363,f364,f365,f366
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,1.0,,,,...,,-9999.0,0.0,,28.0,0.0,0.0,337.0,0.0,0.0
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,1.0,,,,...,,,0.0,,87.0,0.0,0.0,1010.0,2.0,0.0019801980198019
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,1.0,,,,...,,,0.0,,23.0,0.0,0.0,1010.0,2.0,0.0019801980198019
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,1.0,,,,...,,-9999.0,0.0,,277.0,1.0,0.003610108303249,337.0,0.0,0.0
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,1.0,,,,...,,-9999.0,0.0,,359.0,0.0,0.0,337.0,0.0,0.0


In [15]:
from bisect import bisect_left
# 5. FAST EVENT COUNTS (30 days)
all_df = pd.concat([train[['id2', 'id4']], test[['id2', 'id4']]], keys=['train', 'test']).reset_index(level=0).rename(columns={'level_0': 'set'})
event_dict = event.groupby('id2')['id4'].apply(lambda x: sorted(x)).to_dict()

def fast_count(user, curr_date, window_days=30):
    dates = event_dict.get(user, [])
    left = bisect_left(dates, curr_date - pd.Timedelta(days=window_days))
    right = bisect_left(dates, curr_date)
    return right - left

all_df['events_last30'] = [
    fast_count(row['id2'], row['id4'], 30)
    for _, row in all_df.iterrows()
]
train['events_last30'] = all_df[all_df['set'] == 'train']['events_last30'].values
test['events_last30'] = all_df[all_df['set'] == 'test']['events_last30'].values

In [16]:
train.head()

Unnamed: 0,id1,id2,id3,id4,id5,y,f1,f2,f3,f4,...,f358,f359,f360,f361,f362,f363,f364,f365,f366,events_last30
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,1.0,,,,...,-9999.0,0.0,,28.0,0.0,0.0,337.0,0.0,0.0,0
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,1.0,,,,...,,0.0,,87.0,0.0,0.0,1010.0,2.0,0.0019801980198019,0
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,1.0,,,,...,,0.0,,23.0,0.0,0.0,1010.0,2.0,0.0019801980198019,0
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,1.0,,,,...,-9999.0,0.0,,277.0,1.0,0.003610108303249,337.0,0.0,0.0,0
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,1.0,,,,...,-9999.0,0.0,,359.0,0.0,0.0,337.0,0.0,0.0,0


In [17]:
test.head()

Unnamed: 0,id1,id2,id3,id4,id5,f1,f2,f3,f4,f5,...,f358,f359,f360,f361,f362,f363,f364,f365,f366,events_last30
46756,1362907_91950_16-23_2023-11-04 18:56:26.000794,1362907,91950,2023-11-04 18:56:26.000794,2023-11-04,,,,,,...,0.0465999838670646,0.0,,1.0,0.0,0.0,56.0,0.0,0.0,0
57819,1082599_88356_16-23_2023-11-04 06:08:53.373,1082599,88356,2023-11-04 06:08:53.373000,2023-11-04,,9.0,,,,...,0.0603093514970909,0.0,,195.0,13.0,0.0666666666666666,,,,0
15390,1888466_958700_16-23_2023-11-05 10:07:28.000725,1888466,958700,2023-11-05 10:07:28.000725,2023-11-05,,,,,22.0,...,0.0734836702954899,0.0,,155.0,67.0,0.432258064516129,1142.0,436.0,0.3817863397548161,0
145730,1888971_795739_16-23_2023-11-04 12:25:28.244,1888971,795739,2023-11-04 12:25:28.244000,2023-11-04,,,,,,...,0.040572039549215,0.0,,,,,,,,0
146085,1256369_82296_16-23_2023-11-05 06:45:26.657,1256369,82296,2023-11-05 06:45:26.657000,2023-11-05,,,,,,...,0.038243539079214,0.0,,29.0,2.0,0.0689655172413793,361.0,3.0,0.0083102493074792,0


In [18]:
# 6. USER/OFFER CLICK STATS
features = [col for col in train.columns if col.startswith('f')]
for col in features:
    train[col] = pd.to_numeric(train[col], errors='coerce')
    test[col] = pd.to_numeric(test[col], errors='coerce')
train[features] = train[features].fillna(-9999)
test[features] = test[features].fillna(-9999)

In [19]:
train.head()

Unnamed: 0,id1,id2,id3,id4,id5,y,f1,f2,f3,f4,...,f358,f359,f360,f361,f362,f363,f364,f365,f366,events_last30
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,1.0,-9999.0,-9999.0,-9999.0,...,-9999.0,0.0,-9999.0,28.0,0.0,0.0,337.0,0.0,0.0,0
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,1.0,-9999.0,-9999.0,-9999.0,...,-9999.0,0.0,-9999.0,87.0,0.0,0.0,1010.0,2.0,0.00198,0
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,1.0,-9999.0,-9999.0,-9999.0,...,-9999.0,0.0,-9999.0,23.0,0.0,0.0,1010.0,2.0,0.00198,0
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,1.0,-9999.0,-9999.0,-9999.0,...,-9999.0,0.0,-9999.0,277.0,1.0,0.00361,337.0,0.0,0.0,0
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,1.0,-9999.0,-9999.0,-9999.0,...,-9999.0,0.0,-9999.0,359.0,0.0,0.0,337.0,0.0,0.0,0


In [20]:
# User click rate
user_click_rate = train.groupby('id2')['y'].mean().rename('user_click_rate')
train = train.merge(user_click_rate, on='id2', how='left')
test = test.merge(user_click_rate, on='id2', how='left')

In [21]:
train.head()

Unnamed: 0,id1,id2,id3,id4,id5,y,f1,f2,f3,f4,...,f359,f360,f361,f362,f363,f364,f365,f366,events_last30,user_click_rate
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,1.0,-9999.0,-9999.0,-9999.0,...,0.0,-9999.0,28.0,0.0,0.0,337.0,0.0,0.0,0,0.023256
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,1.0,-9999.0,-9999.0,-9999.0,...,0.0,-9999.0,87.0,0.0,0.0,1010.0,2.0,0.00198,0,0.023256
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,1.0,-9999.0,-9999.0,-9999.0,...,0.0,-9999.0,23.0,0.0,0.0,1010.0,2.0,0.00198,0,0.023256
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,1.0,-9999.0,-9999.0,-9999.0,...,0.0,-9999.0,277.0,1.0,0.00361,337.0,0.0,0.0,0,0.023256
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,1.0,-9999.0,-9999.0,-9999.0,...,0.0,-9999.0,359.0,0.0,0.0,337.0,0.0,0.0,0,0.023256


In [22]:
# Offer click rate
offer_click_rate = train.groupby('id3')['y'].mean().rename('offer_click_rate')
train = train.merge(offer_click_rate, on='id3', how='left')
test = test.merge(offer_click_rate, on='id3', how='left')

In [23]:
train.head()

Unnamed: 0,id1,id2,id3,id4,id5,y,f1,f2,f3,f4,...,f360,f361,f362,f363,f364,f365,f366,events_last30,user_click_rate,offer_click_rate
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,1.0,-9999.0,-9999.0,-9999.0,...,-9999.0,28.0,0.0,0.0,337.0,0.0,0.0,0,0.023256,0.128987
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,1.0,-9999.0,-9999.0,-9999.0,...,-9999.0,87.0,0.0,0.0,1010.0,2.0,0.00198,0,0.023256,0.090245
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,1.0,-9999.0,-9999.0,-9999.0,...,-9999.0,23.0,0.0,0.0,1010.0,2.0,0.00198,0,0.023256,0.081144
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,1.0,-9999.0,-9999.0,-9999.0,...,-9999.0,277.0,1.0,0.00361,337.0,0.0,0.0,0,0.023256,0.083107
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,1.0,-9999.0,-9999.0,-9999.0,...,-9999.0,359.0,0.0,0.0,337.0,0.0,0.0,0,0.023256,0.076727


In [24]:
test.head()

Unnamed: 0,id1,id2,id3,id4,id5,f1,f2,f3,f4,f5,...,f360,f361,f362,f363,f364,f365,f366,events_last30,user_click_rate,offer_click_rate
0,1362907_91950_16-23_2023-11-04 18:56:26.000794,1362907,91950,2023-11-04 18:56:26.000794,2023-11-04,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,1.0,0.0,0.0,56.0,0.0,0.0,0,0.0,0.012239
1,1082599_88356_16-23_2023-11-04 06:08:53.373,1082599,88356,2023-11-04 06:08:53.373000,2023-11-04,-9999.0,9.0,-9999.0,-9999.0,-9999.0,...,-9999.0,195.0,13.0,0.066667,-9999.0,-9999.0,-9999.0,0,0.0,0.025253
2,1888466_958700_16-23_2023-11-05 10:07:28.000725,1888466,958700,2023-11-05 10:07:28.000725,2023-11-05,-9999.0,-9999.0,-9999.0,-9999.0,22.0,...,-9999.0,155.0,67.0,0.432258,1142.0,436.0,0.381786,0,,0.125674
3,1888971_795739_16-23_2023-11-04 12:25:28.244,1888971,795739,2023-11-04 12:25:28.244000,2023-11-04,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,0,,0.009915
4,1256369_82296_16-23_2023-11-05 06:45:26.657,1256369,82296,2023-11-05 06:45:26.657000,2023-11-05,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,29.0,2.0,0.068966,361.0,3.0,0.00831,0,,0.008314


In [25]:
# User-offer click rate
user_offer_click_rate = train.groupby(['id2', 'id3'])['y'].mean().rename('user_offer_click_rate')
train = train.merge(user_offer_click_rate, on=['id2', 'id3'], how='left')
test = test.merge(user_offer_click_rate, on=['id2', 'id3'], how='left')
for col in ['user_click_rate', 'offer_click_rate', 'user_offer_click_rate']:
    train[col] = train[col].fillna(0)
    test[col] = test[col].fillna(0)

In [26]:
train.head()

Unnamed: 0,id1,id2,id3,id4,id5,y,f1,f2,f3,f4,...,f361,f362,f363,f364,f365,f366,events_last30,user_click_rate,offer_click_rate,user_offer_click_rate
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,1.0,-9999.0,-9999.0,-9999.0,...,28.0,0.0,0.0,337.0,0.0,0.0,0,0.023256,0.128987,0.0
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,1.0,-9999.0,-9999.0,-9999.0,...,87.0,0.0,0.0,1010.0,2.0,0.00198,0,0.023256,0.090245,0.0
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,1.0,-9999.0,-9999.0,-9999.0,...,23.0,0.0,0.0,1010.0,2.0,0.00198,0,0.023256,0.081144,0.0
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,1.0,-9999.0,-9999.0,-9999.0,...,277.0,1.0,0.00361,337.0,0.0,0.0,0,0.023256,0.083107,0.0
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,1.0,-9999.0,-9999.0,-9999.0,...,359.0,0.0,0.0,337.0,0.0,0.0,0,0.023256,0.076727,0.0


In [27]:
# Offer's total impressions
offer_count = train.groupby('id3').size().rename('offer_count')
train = train.merge(offer_count, on='id3', how='left')
test = test.merge(offer_count, on='id3', how='left')
test['offer_count'] = test['offer_count'].fillna(0)

In [28]:
train.head()

Unnamed: 0,id1,id2,id3,id4,id5,y,f1,f2,f3,f4,...,f362,f363,f364,f365,f366,events_last30,user_click_rate,offer_click_rate,user_offer_click_rate,offer_count
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,1.0,-9999.0,-9999.0,-9999.0,...,0.0,0.0,337.0,0.0,0.0,0,0.023256,0.128987,0.0,2132
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,1.0,-9999.0,-9999.0,-9999.0,...,0.0,0.0,1010.0,2.0,0.00198,0,0.023256,0.090245,0.0,2327
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,1.0,-9999.0,-9999.0,-9999.0,...,0.0,0.0,1010.0,2.0,0.00198,0,0.023256,0.081144,0.0,2132
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,1.0,-9999.0,-9999.0,-9999.0,...,1.0,0.00361,337.0,0.0,0.0,0,0.023256,0.083107,0.0,2214
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,1.0,-9999.0,-9999.0,-9999.0,...,0.0,0.0,337.0,0.0,0.0,0,0.023256,0.076727,0.0,2359


In [29]:
print('Columns BEFORE merge:', train.columns)
cols_to_add = [col for col in metadata.columns if col != 'id3' and col not in train.columns]
print('cols_to_add:', cols_to_add)


Columns BEFORE merge: Index(['id1', 'id2', 'id3', 'id4', 'id5', 'y', 'f1', 'f2', 'f3', 'f4',
       ...
       'f362', 'f363', 'f364', 'f365', 'f366', 'events_last30',
       'user_click_rate', 'offer_click_rate', 'user_offer_click_rate',
       'offer_count'],
      dtype='object', length=377)
cols_to_add: ['id9', 'f375', 'f376', 'f377', 'id10', 'id11', 'f378', 'f374', 'id8', 'id12', 'id13']


In [30]:
# Only add columns from metadata that don't exist in train/test, except for the merge key
cols_to_add = [col for col in metadata.columns if col != 'id3' and col not in train.columns]
if cols_to_add:
    train = train.merge(metadata[['id3'] + cols_to_add], on='id3', how='left')
    test = test.merge(metadata[['id3'] + cols_to_add], on='id3', how='left')

# If cols_to_add is empty, SKIP the merge
print('Columns AFTER merge:', train.columns)



Columns AFTER merge: Index(['id1', 'id2', 'id3', 'id4', 'id5', 'y', 'f1', 'f2', 'f3', 'f4',
       ...
       'f375', 'f376', 'f377', 'id10', 'id11', 'f378', 'f374', 'id8', 'id12',
       'id13'],
      dtype='object', length=388)


In [31]:
from sklearn.preprocessing import LabelEncoder
# 8. CATEGORICAL ENCODING FOR METADATA
cat_cols = ['id9', 'id10', 'id11', 'f374']
for col in cat_cols:
    if col in train.columns:
        le = LabelEncoder()
        data = pd.concat([train[col], test[col]], axis=0).astype(str)
        le.fit(data)
        train[col + '_le'] = le.transform(train[col].astype(str))
        test[col + '_le'] = le.transform(test[col].astype(str))

In [32]:
# 9. USER-OFFER SEEN COUNT
train['user_offer_seen_count'] = train.groupby(['id2','id3']).cumcount()
test['user_offer_seen_count'] = 0

In [33]:
feature_list = [
    'user_click_rate', 'offer_click_rate', 'user_offer_click_rate',
    'user_offer_count', 'offer_count', 'days_since_last_trans', 'events_last30',
    'user_offer_seen_count'
]

print("In TRAIN:")
for feat in feature_list:
    print(f"{feat}: {'YES' if feat in train.columns else 'NO'}")

print("\nIn TEST:")
for feat in feature_list:
    print(f"{feat}: {'YES' if feat in test.columns else 'NO'}")


In TRAIN:
user_click_rate: YES
offer_click_rate: YES
user_offer_click_rate: YES
user_offer_count: NO
offer_count: YES
days_since_last_trans: NO
events_last30: YES
user_offer_seen_count: YES

In TEST:
user_click_rate: YES
offer_click_rate: YES
user_offer_click_rate: YES
user_offer_count: NO
offer_count: YES
days_since_last_trans: NO
events_last30: YES
user_offer_seen_count: YES


In [34]:
# 10. FINAL FEATURE LIST
extra_features = [
    'user_click_rate', 'offer_click_rate', 'user_offer_click_rate',
     'offer_count',  'events_last30',
    'user_offer_seen_count'
]
meta_features = [col for col in metadata.columns if col != 'id3']
cat_le_features = [col + '_le' for col in cat_cols if col + '_le' in train.columns]
all_features = features + extra_features + meta_features + cat_le_features
all_features = list(dict.fromkeys(all_features))  # remove duplicates

In [35]:
# Ensure all features are numeric
for col in all_features:
    train[col] = pd.to_numeric(train[col], errors='coerce')
    test[col] = pd.to_numeric(test[col], errors='coerce')
train[all_features] = train[all_features].fillna(-9999)
test[all_features] = test[all_features].fillna(-9999)

In [36]:
train.head()

Unnamed: 0,id1,id2,id3,id4,id5,y,f1,f2,f3,f4,...,f378,f374,id8,id12,id13,id9_le,id10_le,id11_le,f374_le,user_offer_seen_count
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,1.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,57310000.0,-9999.0,-9999.0,480,0,0,78,0
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,1.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,59210000.0,-9999.0,-9999.0,448,0,0,54,0
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,1.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,72310000.0,-9999.0,-9999.0,487,0,0,8,0
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,1.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,56510500.0,-9999.0,-9999.0,466,0,0,35,0
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,1.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,59991300.0,-9999.0,-9999.0,467,0,0,69,0


In [37]:
test.head()

Unnamed: 0,id1,id2,id3,id4,id5,f1,f2,f3,f4,f5,...,f378,f374,id8,id12,id13,id9_le,id10_le,id11_le,f374_le,user_offer_seen_count
0,1362907_91950_16-23_2023-11-04 18:56:26.000794,1362907,91950,2023-11-04 18:56:26.000794,2023-11-04,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,56619906.0,-9999.0,-9999.0,434,1,0,82,0
1,1082599_88356_16-23_2023-11-04 06:08:53.373,1082599,88356,2023-11-04 06:08:53.373000,2023-11-04,-9999.0,9.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,59440109.0,-9999.0,-9999.0,394,0,0,50,0
2,1888466_958700_16-23_2023-11-05 10:07:28.000725,1888466,958700,2023-11-05 10:07:28.000725,2023-11-05,-9999.0,-9999.0,-9999.0,-9999.0,22.0,...,-9999.0,-9999.0,59990000.0,-9999.0,-9999.0,298,0,0,69,0
3,1888971_795739_16-23_2023-11-04 12:25:28.244,1888971,795739,2023-11-04 12:25:28.244000,2023-11-04,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,56990300.0,-9999.0,-9999.0,486,0,0,64,0
4,1256369_82296_16-23_2023-11-05 06:45:26.657,1256369,82296,2023-11-05 06:45:26.657000,2023-11-05,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,...,-9999.0,-9999.0,56990300.0,-9999.0,-9999.0,33,0,0,64,0


In [38]:
# 11. MAP@7 METRIC
def apk(actual, predicted, k=7):
    if len(predicted) > k:
        predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    if not actual:
        return 0.0
    return score / min(len(actual), k)
def mapk(df, k=7):
    map_scores = []
    for user_id, group in df.groupby('id2'):
        actual = group.loc[group['y'] == 1, 'id3'].tolist()
        predicted = group.sort_values('pred', ascending=False)['id3'].tolist()
        map_scores.append(apk(actual, predicted, k))
    return np.mean(map_scores)

In [39]:
# 12. MODELING (CV)
X_train = train[all_features]
y_train = train['y']
cv = GroupKFold(n_splits=5)
val_scores = []
for train_idx, val_idx in cv.split(X_train, y_train, groups=train['id2']):
    tr_X, tr_y = X_train.iloc[train_idx], y_train.iloc[train_idx]
    val_X, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx]
    val_id2 = train.iloc[val_idx]['id2']
    val_id3 = train.iloc[val_idx]['id3']
    model = lgb.LGBMClassifier(
        n_estimators=300,
        learning_rate=0.03,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        class_weight='balanced'
    )
    model.fit(tr_X, tr_y)
    val_pred = model.predict_proba(val_X)[:, 1]
    val_df = pd.DataFrame({
        'id2': val_id2,
        'id3': val_id3,
        'y': val_y,
        'pred': val_pred
    })
    score = mapk(val_df, k=7)
    val_scores.append(score)
    print(f"Fold MAP@7: {score:.4f}")
print(f"Mean MAP@7: {np.mean(val_scores):.4f}")

[LightGBM] [Info] Number of positive: 29723, number of negative: 586408
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.836701 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48719
[LightGBM] [Info] Number of data points in the train set: 616131, number of used features: 367
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Fold MAP@7: 0.1089
[LightGBM] [Info] Number of positive: 30191, number of negative: 585940
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.676446 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48698
[LightGBM] [Info] Number of data points in the train set: 616131, number of used featu

In [40]:
import joblib

# Save
joblib.dump(model, 'model.pkl')

['model.pkl']

In [41]:
import joblib
model_loaded = joblib.load('model.pkl')

In [42]:
# Ensure 'pred' column exists
test['pred'] = model_loaded.predict_proba(test[all_features])[:, 1]

# Select the columns you want, no rank filtering
submission_file = test[['id1', 'id2', 'id3', 'id5', 'pred']]

# Save to CSV
submission_file.to_csv('submission.csv', index=False)

print(submission_file.head())
print(f"Submission file saved with {len(submission_file)} rows.")


  test['pred'] = model_loaded.predict_proba(test[all_features])[:, 1]


                                               id1      id2     id3  \
0   1362907_91950_16-23_2023-11-04 18:56:26.000794  1362907   91950   
1      1082599_88356_16-23_2023-11-04 06:08:53.373  1082599   88356   
2  1888466_958700_16-23_2023-11-05 10:07:28.000725  1888466  958700   
3     1888971_795739_16-23_2023-11-04 12:25:28.244  1888971  795739   
4      1256369_82296_16-23_2023-11-05 06:45:26.657  1256369   82296   

          id5      pred  
0  2023-11-04  0.000066  
1  2023-11-04  0.000075  
2  2023-11-05  0.000109  
3  2023-11-04  0.000071  
4  2023-11-05  0.000074  
Submission file saved with 369301 rows.
