In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.special import softmax

In [3]:
train_df = pd.read_csv("train_processed.csv")
test_df = pd.read_csv("test_processed.csv")


In [15]:


from sklearn.preprocessing import LabelEncoder

cols = ['f42', 'f50', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f354']
le = LabelEncoder()

for col in cols:
    test_df[col] = le.fit_transform(test_df[col].astype(str))

In [38]:
threshold = 0.9
missing_ratio = train_df.isnull().mean()
cols_to_drop = missing_ratio[missing_ratio > threshold].index
train_df = train_df.drop(columns=cols_to_drop)
test_df = test_df.drop(columns=cols_to_drop, errors='ignore')

In [17]:
X = train_df.drop(columns=[ 'id1','customer_id', 'offer_id', 'y', 'id4', 'id5'])
y = train_df['y']

X_test = test_df.drop(columns=[ 'id1','customer_id', 'offer_id', 'y','id2','id3', 'id4', 'id5'], errors='ignore')


In [18]:
X_df = X.copy()
X_df['y'] = y
X_df['id1'] = train_df['id1'].values

In [19]:

X_train_df, X_val_df = train_test_split(X_df, test_size=0.2, random_state=42, shuffle=True)

X_train = X_train_df.drop(columns=['y', 'id1'])
y_train = X_train_df['y']
X_val = X_val_df.drop(columns=['y', 'id1'])
y_val = X_val_df['y']

# --- Step 6: LightGBM datasets ---
dtrain = lgb.Dataset(X_train, label=y_train)
dval = lgb.Dataset(X_val, label=y_val)

In [20]:
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1
}

# --- Step 7: Train the model ---
model = lgb.train(
    params,
    dtrain,
    valid_sets=[dtrain, dval],
    valid_names=['train', 'valid'],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
)


Training until validation scores don't improve for 50 rounds
[100]	train's binary_logloss: 0.0833053	valid's binary_logloss: 0.0858601
[200]	train's binary_logloss: 0.0747236	valid's binary_logloss: 0.0795778
[300]	train's binary_logloss: 0.069618	valid's binary_logloss: 0.0772269
[400]	train's binary_logloss: 0.0657829	valid's binary_logloss: 0.0760343
[500]	train's binary_logloss: 0.0623628	valid's binary_logloss: 0.0751655
[600]	train's binary_logloss: 0.0594391	valid's binary_logloss: 0.0745321
[700]	train's binary_logloss: 0.056867	valid's binary_logloss: 0.0739772
[800]	train's binary_logloss: 0.0543312	valid's binary_logloss: 0.0735528
[900]	train's binary_logloss: 0.0521492	valid's binary_logloss: 0.0731792
[1000]	train's binary_logloss: 0.0501754	valid's binary_logloss: 0.0728733
Did not meet early stopping. Best iteration is:
[1000]	train's binary_logloss: 0.0501754	valid's binary_logloss: 0.0728733


In [34]:
# Step 1: Add raw prediction scores from your LightGBM model
test_df['raw_pred'] = probs  # probs = model.predict(X_test)

# Step 2: Rank offers within each id2 group (id2 = customer/group ID)
test_df['rank'] = test_df.groupby('id2')['raw_pred'].rank(method='first', ascending=False)

# Step 3: Convert ranks into inverse decimal scores
test_df['inv_rank'] = test_df.groupby('id2')['rank'].transform(lambda r: 1 / r)

# Step 4: Normalize inverse scores so each customer's predictions sum to 1
test_df['pred'] = test_df.groupby('id2')['inv_rank'].transform(lambda x: x / x.sum())

# Step 5: Create final submission DataFrame
submission_df = test_df[['id1', 'id2', 'id3', 'id4', 'id5', 'pred']].copy()
submission_df.to_csv("final_submission_rankedqwertqwerty.csv", index=False)

print("✅ Final submission file saved as 'final_submission_ranked.csv'")


✅ Final submission file saved as 'final_submission_ranked.csv'


In [36]:
# Step 5 (Updated): Create final submission DataFrame without 'id4'
submission_df = test_df[['id1', 'id2', 'id3', 'id5', 'pred']].copy()
submission_df.to_csv("final_submission_rankedfinalfinal.csv", index=False)

print("✅ Final submission file saved as 'final_submission_ranked.csv' (without id4)")


✅ Final submission file saved as 'final_submission_ranked.csv' (without id4)


In [1]:
import pandas as pd

train_df = pd.read_csv("train_processed.csv")
test_df = pd.read_csv("test_processed.csv")


In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.special import softmax

In [3]:
threshold = 0.9
missing_ratio = train_df.isnull().mean()
cols_to_drop = missing_ratio[missing_ratio > threshold].index
train_df = train_df.drop(columns=cols_to_drop)
test_df = test_df.drop(columns=cols_to_drop, errors='ignore')

In [4]:
train_df = train_df.fillna(train_df.mean(numeric_only=True))
test_df = test_df.fillna(train_df.mean(numeric_only=True))

In [23]:
X = train_df.drop(columns=[ 'id1','customer_id', 'offer_id', 'y', 'id4', 'id5'])
y = train_df['y']

X_test = test_df.drop(columns=[ 'id1','customer_id', 'offer_id', 'y','id2','id3', 'id4', 'id5'], errors='ignore')


In [None]:
train_temp = X.copy()
train_temp['y'] = y
train_temp['id1'] = train_df['id1']
X_train_df, X_val_df = train_test_split(train_temp, test_size=0.2, random_state=42)

X_train = X_train_df.drop(columns=['y', 'id1'])
y_train = X_train_df['y']
X_val = X_val_df.drop(columns=['y', 'id1'])
y_val = X_val_df['y']

# --- Step 6: LightGBM training ---
dtrain = lgb.Dataset(X_train, label=y_train)
dval = lgb.Dataset(X_val, label=y_val)

params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1
}

model = lgb.train(
    params,
    dtrain,
    valid_sets=[dtrain, dval],
    valid_names=['train', 'valid'],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
)

# --- Step 7: Predict ---
preds = model.predict(X_test)

# --- Step 8: Prepare prediction dataframe ---
test_df['raw_pred'] = preds

def normalize_group(df):
    if len(df) == 1:
        df['pred'] = 1.0
    else:
        df['pred'] = softmax(df['raw_pred'].values)
    return df

submission_df = test_df.groupby('id1', group_keys=False).apply(normalize_group)
final_submission = submission_df[['id2', 'id3', 'id4', 'id5', 'pred']]
final_submission.to_csv("final_submission_ligcdcscdcdht.csv", index=False)

print("✅ LightGBM prediction pipeline completed and saved to 'final_submission_light.csv'")


In [15]:
X_df = X.copy()
X_df['y'] = y.values
X_df['id1'] = train_df['id1'].values

In [17]:

X_train_df, X_val_df = train_test_split(X_df, test_size=0.2, random_state=42, shuffle=True)

X_train = X_train_df.drop(columns=['y', 'id1'])
y_train = X_train_df['y']
X_val = X_val_df.drop(columns=['y', 'id1'])
y_val = X_val_df['y']

# Create LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)


In [21]:

params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1
}

# --- Step 6: Train the LightGBM model ---
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    valid_names=['train', 'valid'],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
)

# --- Step 7: Predict on Test Set ---
probs = model.predict(X_test)

test_sample = test_df.copy()
test_sample['id2'] = test_sample.get('id2', pd.Series([0]*len(test_sample)))
test_sample['id3'] = test_sample.get('id3', pd.Series([0]*len(test_sample)))
test_sample['id4'] = test_sample.get('id4', pd.Series([0]*len(test_sample)))
test_sample['id5'] = test_sample.get('id5', pd.Series([0]*len(test_sample)))

pred_df = pd.DataFrame({
    'id1': test_sample['id1'].values,
    'id2': test_sample['id2'].values,
    'id3': test_sample['id3'].values,
    'id4': test_sample['id4'].values,
    'id5': test_sample['id5'].values,
    'raw_pred': probs
})

def normalize_group(df):
    if len(df) == 1:
        df['pred'] = 1.0
    else:
        df['pred'] = softmax(df['raw_pred'].values)
    return df

submission_df = pred_df.groupby('id1', group_keys=False).apply(normalize_group)
final_submission = submission_df[['id2', 'id3', 'id4', 'id5', 'pred']]
final_submission.to_csv("final_submission_lightnewwwww.csv", index=False)

print("✅ LightGBM pipeline complete. File saved as 'final_submission_light.csv'")


Training until validation scores don't improve for 50 rounds
[100]	train's binary_logloss: 0.0841222	valid's binary_logloss: 0.0865231
[200]	train's binary_logloss: 0.0753358	valid's binary_logloss: 0.0800866
[300]	train's binary_logloss: 0.0704378	valid's binary_logloss: 0.0778314
[400]	train's binary_logloss: 0.0664592	valid's binary_logloss: 0.0764335
[500]	train's binary_logloss: 0.0631347	valid's binary_logloss: 0.0753905
[600]	train's binary_logloss: 0.0602427	valid's binary_logloss: 0.0747507
[700]	train's binary_logloss: 0.0576201	valid's binary_logloss: 0.0742061
[800]	train's binary_logloss: 0.0552935	valid's binary_logloss: 0.0737356
[900]	train's binary_logloss: 0.0531529	valid's binary_logloss: 0.0733852
[1000]	train's binary_logloss: 0.0510209	valid's binary_logloss: 0.07303
Did not meet early stopping. Best iteration is:
[1000]	train's binary_logloss: 0.0510209	valid's binary_logloss: 0.07303


  submission_df = pred_df.groupby('id1', group_keys=False).apply(normalize_group)


✅ LightGBM pipeline complete. File saved as 'final_submission_light.csv'


In [29]:
X_df = pd.DataFrame(X_pca)
X_df['y'] = y.values
X_df['id1'] = train_df['id1'].values

X_train_df, X_val_df = train_test_split(X_df, test_size=0.2, random_state=42, shuffle=True)

X_train = X_train_df.drop(columns=['y', 'id1'])
y_train = X_train_df['y']
X_val = X_val_df.drop(columns=['y', 'id1'])
y_val = X_val_df['y']

# --- Step 9: LightGBM Dataset and Parameters ---
dtrain = lgb.Dataset(X_train, label=y_train)
dval = lgb.Dataset(X_val, label=y_val)

In [31]:
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1
}

In [33]:
model = lgb.train(
    params,
    dtrain,
    valid_sets=[dtrain, dval],
    valid_names=['train', 'valid'],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
)

Training until validation scores don't improve for 50 rounds
[100]	train's binary_logloss: 0.104471	valid's binary_logloss: 0.108146
[200]	train's binary_logloss: 0.0943352	valid's binary_logloss: 0.101043
[300]	train's binary_logloss: 0.0879674	valid's binary_logloss: 0.098046
[400]	train's binary_logloss: 0.0831459	valid's binary_logloss: 0.0964051
[500]	train's binary_logloss: 0.0788507	valid's binary_logloss: 0.0951112
[600]	train's binary_logloss: 0.0750344	valid's binary_logloss: 0.0941072
[700]	train's binary_logloss: 0.0714621	valid's binary_logloss: 0.0932328
[800]	train's binary_logloss: 0.0681398	valid's binary_logloss: 0.0925257
[900]	train's binary_logloss: 0.0651303	valid's binary_logloss: 0.091931
[1000]	train's binary_logloss: 0.062334	valid's binary_logloss: 0.0914136
Did not meet early stopping. Best iteration is:
[1000]	train's binary_logloss: 0.062334	valid's binary_logloss: 0.0914136


In [34]:
probs = model.predict(X_test_pca)

# --- Step 12: Build Submission ---
test_sample = test_df.copy()

for col in ['id2', 'id3', 'id4', 'id5']:
    if col not in test_sample:
        test_sample[col] = 0

pred_df = pd.DataFrame({
    'id1': test_sample['id1'].values,
    'id2': test_sample['id2'].values,
    'id3': test_sample['id3'].values,
    'id4': test_sample['id4'].values,
    'id5': test_sample['id5'].values,
    'raw_pred': probs
})

In [35]:
def normalize_group(df):
    if len(df) == 1:
        df['pred'] = 1.0
    else:
        df['pred'] = softmax(df['raw_pred'].values)
    return df

submission_df = pred_df.groupby('id1', group_keys=False).apply(normalize_group)
final_submission = submission_df[['id2', 'id3', 'id4', 'id5', 'pred']]
final_submission.to_csv("final_submission_light.csv", index=False)

print("✅ Pipeline complete. File saved as 'final_submission_light.csv'")

  submission_df = pred_df.groupby('id1', group_keys=False).apply(normalize_group)


✅ Pipeline complete. File saved as 'final_submission_light.csv'


In [None]:


from sklearn.preprocessing import LabelEncoder

cols = ['f42', 'f50', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f354']
le = LabelEncoder()

for col in cols:
    test_df[col] = le.fit_transform(test_df[col].astype(str))

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRanker
import gc
import warnings

warnings.filterwarnings('ignore')

# ===================================================================================
# 1. Custom MAP@7 Metric
# ===================================================================================
def mapk(actuals, predicteds, k=7):
    return np.mean([apk(a, p, k) for a, p in zip(actuals, predicteds)])

def apk(actual, predicted, k=7):
    if len(predicted) > k: predicted = predicted[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    if not actual: return 0.0
    return score / min(len(actual), k)

# ===================================================================================
# 2. Load and Prepare Data
# ===================================================================================
print("📥 Loading datasets...")
train = pd.read_parquet('train_data.parquet')
test = pd.read_parquet('test_data.parquet')
offer_meta = pd.read_parquet('offer_metadata.parquet')
events = pd.read_parquet('add_event.parquet')
trans = pd.read_parquet('add_trans.parquet')

print("🔧 Preprocessing features...")
for df in [train, test]:
    for col in df.columns:
        if col.startswith('f'):
            df[col] = pd.to_numeric(df[col], errors='coerce')
    df['id5'] = pd.to_datetime(df['id5'], errors='coerce')
    if 'y' in df.columns:
        df['y'] = pd.to_numeric(df['y'], errors='coerce').fillna(0).astype(float)

for df in [train, test, offer_meta, events, trans]:
    for col in ['id2', 'id3', 'customer_id', 'offer_id']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

offer_meta.rename(columns={'id3': 'offer_id', 'id9': 'industry_code', 'id12': 'start_date', 'id13': 'end_date'}, inplace=True)
events.rename(columns={'id2': 'customer_id', 'id3': 'offer_id', 'id4': 'impression_timestamp', 'id7': 'click_timestamp'}, inplace=True)
trans.rename(columns={'id2': 'customer_id', 'f367': 'transaction_amount', 'f370': 'transaction_date_d', 'f371': 'transaction_date_t'}, inplace=True)
trans['transaction_date'] = pd.to_datetime(trans['transaction_date_d'] + ' ' + trans['transaction_date_t'], errors='coerce')

# ===================================================================================
# 3. Feature Engineering
# ===================================================================================
def feature_engineer(df):
    print("🛠️  Feature engineering...")
    df = df.merge(offer_meta, left_on='id3', right_on='offer_id', how='left')
    df['imp_date'] = pd.to_datetime(df['id5'])
    df['imp_dow'] = df['imp_date'].dt.dayofweek
    df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')
    df['end_date'] = pd.to_datetime(df['end_date'], errors='coerce')
    df['offer_duration'] = (df['end_date'] - df['start_date']).dt.days

    events['click_timestamp'] = pd.to_datetime(events['click_timestamp'], errors='coerce')
    events['impression_timestamp'] = pd.to_datetime(events['impression_timestamp'], errors='coerce')
    events['has_clicked'] = events['click_timestamp'].notna().astype(int)

    cust_agg = events.groupby('customer_id').agg(
        total_customer_imps=('impression_timestamp', 'count'),
        total_customer_clicks=('has_clicked', 'sum'),
        last_click_date=('click_timestamp', lambda x: pd.to_datetime(x).max())
    ).reset_index()
    df = df.merge(cust_agg, left_on='id2', right_on='customer_id', how='left')
    df['days_since_last_any_click'] = (df['imp_date'] - df['last_click_date']).dt.days

    cust_industry_agg = events.merge(offer_meta[['offer_id', 'industry_code']], on='offer_id')
    cust_industry_agg = cust_industry_agg.groupby(['customer_id', 'industry_code']).agg(
        customer_industry_imps=('impression_timestamp', 'count'),
        customer_industry_clicks=('has_clicked', 'sum')
    ).reset_index()
    df = df.merge(cust_industry_agg, left_on=['id2', 'industry_code'], right_on=['customer_id', 'industry_code'], how='left')

    trans['transaction_amount'] = pd.to_numeric(trans['transaction_amount'], errors='coerce')
    trans_agg = trans.groupby('customer_id').agg(
        avg_spend=('transaction_amount', 'mean'),
        last_trans_date=('transaction_date', lambda x: pd.to_datetime(x).max())
    ).reset_index()
    df = df.merge(trans_agg, left_on='id2', right_on='customer_id', how='left')

    return df

train = feature_engineer(train)
test = feature_engineer(test)

feature_cols = [c for c in train.columns if c.startswith('f') and train[c].dtype in [np.float32, np.float64, np.int32, np.int64]] + [
    'imp_dow', 'offer_duration', 'total_customer_imps', 'total_customer_clicks',
    'days_since_last_any_click', 'customer_industry_imps', 'customer_industry_clicks', 'avg_spend']

train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

X = train[feature_cols + ['id2']]
y = train['y']
X_test = test[feature_cols + ['id2']]

# ===================================================================================
# 4. Ensemble Training with Hyperparameter Tuning
# ===================================================================================
def train_ensemble(X, y, groups, X_test):
    print("📈 Training ensemble model...")
    gkf = GroupKFold(n_splits=5)
    oof_lgb, oof_xgb = np.zeros(len(X)), np.zeros(len(X))
    test_lgb, test_xgb = np.zeros(len(X_test)), np.zeros(len(X_test))

    for fold, (t_idx, v_idx) in enumerate(gkf.split(X, y, groups)):
        print(f"\n🔁 Fold {fold+1}/5")
        Xt, Xv = X.iloc[t_idx], X.iloc[v_idx]
        yt, yv = y.iloc[t_idx], y.iloc[v_idx]

        gr_t = Xt['id2'].value_counts().sort_index().values
        gr_v = Xv['id2'].value_counts().sort_index().values

        Xt_, Xv_ = Xt.drop('id2', axis=1), Xv.drop('id2', axis=1)
        Xtest_ = X_test.drop('id2', axis=1)

        model_lgb = lgb.LGBMRanker(objective='lambdarank', metric='map', eval_at=7, n_estimators=500, learning_rate=0.05, num_leaves=80)
        model_lgb.fit(Xt_, yt, group=gr_t, eval_set=[(Xv_, yv)], eval_group=[gr_v], callbacks=[lgb.early_stopping(50)])
        oof_lgb[v_idx] = model_lgb.predict(Xv_)
        test_lgb += model_lgb.predict(Xtest_)

        model_xgb = XGBRanker(objective='rank:pairwise', learning_rate=0.05, n_estimators=500, max_depth=6, verbosity=0)
        model_xgb.fit(Xt_, yt, group=gr_t)
        oof_xgb[v_idx] = model_xgb.predict(Xv_)
        test_xgb += model_xgb.predict(Xtest_)

    test_lgb /= gkf.get_n_splits()
    test_xgb /= gkf.get_n_splits()

    print("🔗 Blending predictions...")
    oof_ensemble = 0.6 * oof_lgb + 0.4 * oof_xgb
    test_ensemble = 0.6 * test_lgb + 0.4 * test_xgb

    return oof_ensemble, test_ensemble

# ===================================================================================
# 5. Inference and Submission
# ===================================================================================
oof, test_preds = train_ensemble(X, y, X['id2'], X_test)
train['pred'] = oof

print("📤 Generating predictions and evaluating MAP@7...")
oof_ranked = train.sort_values(['id2','pred'], ascending=[True,False]).groupby('id2')['id3'].apply(list).reset_index(name='predicted_offers')
true_offers = train[train['y']==1].groupby('id2')['id3'].apply(list).reset_index(name='true_offers')
eval_df = oof_ranked.merge(true_offers, on='id2', how='left')
eval_df['true_offers'] = eval_df['true_offers'].apply(lambda x: x if isinstance(x, list) else [])
final_score = mapk(eval_df['true_offers'], eval_df['predicted_offers'])
print(f"\n📊 Final OOF MAP@7 Score: {final_score:.6f}")

print("🧪 Generating submission file...")
scaler = MinMaxScaler()
test['pred'] = scaler.fit_transform(test_preds.reshape(-1, 1)).flatten()
submission = test[['id1', 'id2', 'id3', 'id5', 'pred']].copy()
submission['id5'] = pd.to_datetime(submission['id5']).dt.strftime('%m/%d/%Y')
submission.to_csv('submission.csv', index=False)
print("\n✅ Submission file saved as 'submission.csv'")


📥 Loading datasets...
🔧 Preprocessing features...
🛠️  Feature engineering...
🛠️  Feature engineering...
📈 Training ensemble model...

🔁 Fold 1/5
[LightGBM] [Info] Total groups: 37240, total data: 616131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.249163 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 47280
[LightGBM] [Info] Number of data points in the train set: 616131, number of used features: 296
[LightGBM] [Info] Total groups: 9310, total data: 154033
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[143]	valid_0's map@7: 0.932494

🔁 Fold 2/5
[LightGBM] [Info] Total groups: 37240, total data: 616131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.250768 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRanker
import gc
import warnings

warnings.filterwarnings('ignore')

# ===================================================================================
# 1. Custom MAP@7 Metric
# ===================================================================================
def mapk(actuals, predicteds, k=7):
    return np.mean([apk(a, p, k) for a, p in zip(actuals, predicteds)])

def apk(actual, predicted, k=7):
    if len(predicted) > k: predicted = predicted[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    if not actual: return 0.0
    return score / min(len(actual), k)

# ===================================================================================
# 2. Load and Prepare Data
# ===================================================================================
print("📥 Loading datasets...")
train = pd.read_parquet('train_data.parquet')
test = pd.read_parquet('test_data.parquet')
offer_meta = pd.read_parquet('offer_metadata.parquet')
events = pd.read_parquet('add_event.parquet')
trans = pd.read_parquet('add_trans.parquet')

print("🔧 Preprocessing features...")
for df in [train, test]:
    for col in df.columns:
        if col.startswith('f'):
            df[col] = pd.to_numeric(df[col], errors='coerce')
    df['id5'] = pd.to_datetime(df['id5'], errors='coerce')
    if 'y' in df.columns:
        df['y'] = pd.to_numeric(df['y'], errors='coerce').fillna(0).astype(float)

for df in [train, test, offer_meta, events, trans]:
    for col in ['id2', 'id3', 'customer_id', 'offer_id']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

offer_meta.rename(columns={'id3': 'offer_id', 'id9': 'industry_code', 'id12': 'start_date', 'id13': 'end_date'}, inplace=True)
events.rename(columns={'id2': 'customer_id', 'id3': 'offer_id', 'id4': 'impression_timestamp', 'id7': 'click_timestamp'}, inplace=True)
trans.rename(columns={'id2': 'customer_id', 'f367': 'transaction_amount', 'f370': 'transaction_date_d', 'f371': 'transaction_date_t'}, inplace=True)
trans['transaction_date'] = pd.to_datetime(trans['transaction_date_d'] + ' ' + trans['transaction_date_t'], errors='coerce')

# ===================================================================================
# 3. Feature Engineering
# ===================================================================================
def feature_engineer(df):
    print("🛠️  Feature engineering...")
    df = df.merge(offer_meta, left_on='id3', right_on='offer_id', how='left')
    df['imp_date'] = pd.to_datetime(df['id5'])
    df['imp_dow'] = df['imp_date'].dt.dayofweek
    df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')
    df['end_date'] = pd.to_datetime(df['end_date'], errors='coerce')
    df['offer_duration'] = (df['end_date'] - df['start_date']).dt.days

    events['click_timestamp'] = pd.to_datetime(events['click_timestamp'], errors='coerce')
    events['impression_timestamp'] = pd.to_datetime(events['impression_timestamp'], errors='coerce')
    events['has_clicked'] = events['click_timestamp'].notna().astype(int)

    cust_agg = events.groupby('customer_id').agg(
        total_customer_imps=('impression_timestamp', 'count'),
        total_customer_clicks=('has_clicked', 'sum'),
        last_click_date=('click_timestamp', lambda x: pd.to_datetime(x.dropna()).max())
    ).reset_index()
    df = df.merge(cust_agg, left_on='id2', right_on='customer_id', how='left')
    df['days_since_last_any_click'] = (df['imp_date'] - df['last_click_date']).dt.days

    cust_industry_agg = events.merge(offer_meta[['offer_id', 'industry_code']], on='offer_id')
    cust_industry_agg = cust_industry_agg.groupby(['customer_id', 'industry_code']).agg(
        customer_industry_imps=('impression_timestamp', 'count'),
        customer_industry_clicks=('has_clicked', 'sum')
    ).reset_index()
    df = df.merge(cust_industry_agg, left_on=['id2', 'industry_code'], right_on=['customer_id', 'industry_code'], how='left')

    trans['transaction_amount'] = pd.to_numeric(trans['transaction_amount'], errors='coerce')
    trans_agg = trans.groupby('customer_id').agg(
        avg_spend=('transaction_amount', 'mean'),
        last_trans_date=('transaction_date', lambda x: pd.to_datetime(x.dropna()).max())
    ).reset_index()
    df = df.merge(trans_agg, left_on='id2', right_on='customer_id', how='left')

    return df

train = feature_engineer(train)
test = feature_engineer(test)

feature_cols = [c for c in train.columns if c.startswith('f') and train[c].dtype in [np.float32, np.float64, np.int32, np.int64]] + [
    'imp_dow', 'offer_duration', 'total_customer_imps', 'total_customer_clicks',
    'days_since_last_any_click', 'customer_industry_imps', 'customer_industry_clicks', 'avg_spend']

train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

X = train[feature_cols + ['id2']]
y = train['y']
X_test = test[feature_cols + ['id2']]

# ===================================================================================
# 4. Ensemble Training with LightGBM + XGBoost
# ===================================================================================
def train_ensemble(X, y, groups, X_test):
    print("📈 Training ensemble model...")
    gkf = GroupKFold(n_splits=5)
    oof_lgb, oof_xgb = np.zeros(len(X)), np.zeros(len(X))
    test_lgb, test_xgb = np.zeros(len(X_test)), np.zeros(len(X_test))

    for fold, (t_idx, v_idx) in enumerate(gkf.split(X, y, groups)):
        print(f"\n🔁 Fold {fold+1}/5")
        Xt, Xv = X.iloc[t_idx], X.iloc[v_idx]
        yt, yv = y.iloc[t_idx], y.iloc[v_idx]

        gr_t = Xt['id2'].value_counts().sort_index().values
        gr_v = Xv['id2'].value_counts().sort_index().values

        Xt_, Xv_ = Xt.drop('id2', axis=1), Xv.drop('id2', axis=1)
        Xtest_ = X_test.drop('id2', axis=1)

        model_lgb = lgb.LGBMRanker(objective='lambdarank', metric='map', eval_at=7, n_estimators=500, learning_rate=0.05, num_leaves=80)
        model_lgb.fit(Xt_, yt, group=gr_t, eval_set=[(Xv_, yv)], eval_group=[gr_v], callbacks=[lgb.early_stopping(50)])
        oof_lgb[v_idx] = model_lgb.predict(Xv_)
        test_lgb += model_lgb.predict(Xtest_)

        model_xgb = XGBRanker(objective='rank:pairwise', learning_rate=0.05, n_estimators=500, max_depth=6, verbosity=0)
        model_xgb.fit(Xt_, yt, group=gr_t)
        oof_xgb[v_idx] = model_xgb.predict(Xv_)
        test_xgb += model_xgb.predict(Xtest_)

    test_lgb /= gkf.get_n_splits()
    test_xgb /= gkf.get_n_splits()

    print("🔗 Blending predictions...")
    oof_ensemble = 0.6 * oof_lgb + 0.4 * oof_xgb
    test_ensemble = 0.6 * test_lgb + 0.4 * test_xgb

    return oof_ensemble, test_ensemble

# ===================================================================================
# 5. Inference and Submission
# ===================================================================================
oof, test_preds = train_ensemble(X, y, X['id2'], X_test)
train['pred'] = oof

print("📤 Generating predictions and evaluating MAP@7...")
oof_ranked = train.sort_values(['id2','pred'], ascending=[True,False]).groupby('id2')['id3'].apply(list).reset_index(name='predicted_offers')
true_offers = train[train['y']==1].groupby('id2')['id3'].apply(list).reset_index(name='true_offers')
eval_df = oof_ranked.merge(true_offers, on='id2', how='left')
eval_df['true_offers'] = eval_df['true_offers'].apply(lambda x: x if isinstance(x, list) else [])
final_score = mapk(eval_df['true_offers'], eval_df['predicted_offers'])
print(f"\n📊 Final OOF MAP@7 Score: {final_score:.6f}")

print("🧪 Generating submission file...")
scaler = MinMaxScaler()
test['pred'] = scaler.fit_transform(test_preds.reshape(-1, 1)).flatten()
submission = test[['id1', 'id2', 'id3', 'id5', 'pred']].copy()
submission['id5'] = pd.to_datetime(submission['id5']).dt.strftime('%m/%d/%Y')
submission.to_csv('submission.csv', index=False)
print("\n✅ Submission file saved as 'submission.csv'")


In [2]:
train_df

Unnamed: 0,id1,customer_id,offer_id,id4,id5,y,f1,f2,f3,f4,...,cashback_value,offer_duration_months,store_type_encoded,has_cashback,offer_ctr,industry_code,redemption_frequency,industry_total_spent,industry_avg_spent,industry_txn_count
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,1.0,,,,...,2.0,1.0,104.0,1.0,0.059875,57310000.0,2.0,16249283.79,433.638017,37472.0
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,1.0,,,,...,0.0,6.0,68.0,0.0,0.046487,59210000.0,2.0,3511750.83,142.771510,24597.0
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,1.0,,,,...,10.0,1.0,13.0,1.0,0.041484,72310000.0,2.0,2659689.37,137.033818,19409.0
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,1.0,,,,...,10.0,1.0,47.0,1.0,0.042805,56510500.0,2.0,14216414.45,249.131054,57064.0
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,1.0,,,,...,8.0,1.0,86.0,1.0,0.042544,59991300.0,2.0,2352086.84,167.946222,14005.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770159,1896641_87731_16-23_2023-11-02 08:14:21.524,1896641,87731,2023-11-02 08:14:21.524,2023-11-02,0,,,,,...,0.0,3.0,86.0,0.0,0.011329,59991311.0,1.0,1153882.46,113.336849,10181.0
770160,1896641_505604_16-23_2023-11-02 08:14:24.458,1896641,505604,2023-11-02 08:14:24.458,2023-11-02,0,,,,,...,0.0,6.0,60.0,0.0,0.010578,70110000.0,1.0,13089274.75,894.137219,14639.0
770161,1896641_25212_16-23_2023-11-02 08:14:25.748,1896641,25212,2023-11-02 08:14:25.748,2023-11-02,0,,,,,...,0.0,4.0,60.0,0.0,0.011467,70110000.0,1.0,13089274.75,894.137219,14639.0
770162,1900765_95157_16-23_2023-11-02 06:08:25.900,1900765,95157,2023-11-02 06:08:25.900,2023-11-02,0,,,,,...,0.0,4.0,83.0,0.0,0.050302,57190000.0,1.0,2126908.38,371.512381,5725.0


In [3]:
test_df

Unnamed: 0,id1,id2,id3,id4,id5,f1,f2,f3,f4,f5,...,cashback_value,offer_duration_months,store_type_encoded,has_cashback,offer_ctr,industry_code,redemption_frequency,industry_total_spent,industry_avg_spent,industry_txn_count
0,1362907_91950_16-23_2023-11-04 18:56:26.000794,1362907,91950,2023-11-04 18:56:26.000794,2023-11-04,,,,,,...,0.0,5.0,113.0,0.0,0.007848,56619906.0,1.0,101622.31,249.074289,408.0
1,1082599_88356_16-23_2023-11-04 06:08:53.373,1082599,88356,2023-11-04 06:08:53.373,2023-11-04,,9.0,,,,...,0.0,2.0,64.0,0.0,0.011145,59440109.0,1.0,1064140.11,3399.808658,313.0
2,1888466_958700_16-23_2023-11-05 10:07:28.000725,1888466,958700,2023-11-05 10:07:28.000725,2023-11-05,,,,,22.0,...,0.0,1.0,86.0,0.0,0.067623,59990000.0,2.0,14203022.21,207.631346,68405.0
3,1888971_795739_16-23_2023-11-04 12:25:28.244,1888971,795739,2023-11-04 12:25:28.244,2023-11-04,,,,,,...,0.0,6.0,80.0,0.0,0.004302,56990300.0,2.0,2426407.48,178.032686,13629.0
4,1256369_82296_16-23_2023-11-05 06:45:26.657,1256369,82296,2023-11-05 06:45:26.657,2023-11-05,,,,,,...,0.0,6.0,80.0,0.0,0.005019,56990300.0,1.0,2426407.48,178.032686,13629.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369296,1874443_95537_16-23_2023-11-05 09:21:24.182,1874443,95537,2023-11-05 09:21:24.182,2023-11-05,,,,,,...,0.0,3.0,52.0,0.0,0.004840,57120000.0,1.0,11092126.60,818.546720,13551.0
369297,1541978_5718_16-23_2023-11-05 00:56:43.946,1541978,5718,2023-11-05 00:56:43.946,2023-11-05,23.0,,1.0,,10.0,...,0.0,5.0,84.0,0.0,0.007497,72990000.0,1.0,5232229.78,258.203207,20264.0
369298,1887841_85905_16-23_2023-11-05 20:40:43.312,1887841,85905,2023-11-05 20:40:43.312,2023-11-05,,,,,,...,5.0,1.0,86.0,1.0,0.054671,59991503.0,2.0,15502.18,96.888625,160.0
369299,1569367_944713_16-23_2023-11-05 00:43:04.335,1569367,944713,2023-11-05 00:43:04.335,2023-11-05,,,,,,...,0.0,2.0,81.0,0.0,0.006531,54990000.0,2.0,4945570.46,81.011179,61048.0


In [3]:


from sklearn.preprocessing import LabelEncoder

cols = ['f42', 'f50', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f354']
le = LabelEncoder()

for col in cols:
    test_df[col] = le.fit_transform(test_df[col].astype(str))

In [4]:
test_df[cols].dtypes

f42     int32
f50     int32
f52     int32
f53     int32
f54     int32
f55     int32
f56     int32
f57     int32
f354    int32
dtype: object

In [5]:
# Step 1: Calculate missing percentage per column
missing_percent = train_df.isnull().mean()

# Step 2: Filter columns with >60% missing
cols_to_drop = missing_percent[missing_percent > 0.9].index

# Step 3: Drop them from both train and test (to ensure same features)
train_df = train_df.drop(columns=cols_to_drop)
test_df = test_df.drop(columns=cols_to_drop, errors='ignore')  # in case some columns are missing in test

print(f"Dropped columns: {list(cols_to_drop)}")


Dropped columns: ['f4', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f33', 'f34', 'f36', 'f37', 'f64', 'f66', 'f70', 'f79', 'f80', 'f81', 'f84', 'f88', 'f92', 'f112', 'f114', 'f117', 'f118', 'f120', 'f122', 'f135', 'f136', 'f154', 'f176', 'f189', 'f205', 'f220', 'f221', 'f360']


In [6]:
# Fill NaNs in training data with column-wise mean
train_df = train_df.fillna(train_df.mean(numeric_only=True))

# Fill NaNs in test data with column-wise mean
test_df = test_df.fillna(test_df.mean(numeric_only=True))

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [9]:
X = train_df.drop(columns=[ 'id1','customer_id', 'offer_id', 'y', 'id4', 'id5'])
y = train_df['y']


In [10]:
from sklearn.preprocessing import StandardScaler
import numpy as np

scaler = StandardScaler()

# Convert to NumPy in chunks (if not already)
X_np = X.values if isinstance(X, pd.DataFrame) else X

# Fit in chunks
chunk_size = 100000  # adjust based on RAM
for i in range(0, X_np.shape[0], chunk_size):
    scaler.partial_fit(X_np[i:i+chunk_size])

# Now transform in chunks
X_scaled_parts = []
for i in range(0, X_np.shape[0], chunk_size):
    chunk_scaled = scaler.transform(X_np[i:i+chunk_size])
    X_scaled_parts.append(chunk_scaled)

X_scaled = np.vstack(X_scaled_parts)  # Final scaled array


In [11]:
Xt = test_df.drop(columns=[ 'id1',  'id4', 'id5'])



In [12]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Assuming `Xt` is your DataFrame to scale
# Fix variable name confusion: Xt not X
Xt = Xt.copy()

# Set chunk size (you can increase it if you have more RAM)
chunk_size = 10000

# Initialize scaler
scaler = StandardScaler()

# Convert to numpy safely
X_npt = Xt.to_numpy()

# Fit scaler incrementally in chunks
for i in range(0, X_npt.shape[0], chunk_size):
    scaler.partial_fit(X_npt[i:i+chunk_size])

# Transform in chunks to reduce memory load
X_scaled_parts = []
for i in range(0, X_npt.shape[0], chunk_size):
    chunk_scaled = scaler.transform(X_npt[i:i+chunk_size])
    X_scaled_parts.append(chunk_scaled)

# Combine all scaled chunks
X_scaled = np.vstack(X_scaled_parts)

# Optional: convert back to DataFrame
Xt_scaled = pd.DataFrame(X_scaled, columns=Xt.columns, index=Xt.index)


In [14]:
# Option 1: Keep 95% variance
pca = PCA(n_components=0.95)

# Option 2: Use fixed number of components
# pca = PCA(n_components=50)

X_pca = pca.fit_transform(X_scaled)


In [15]:
print(f"✅ PCA reduced dimensions to {X_pca.shape[1]} components.")
print("Explained variance ratio (first few):", pca.explained_variance_ratio_[:10])


✅ PCA reduced dimensions to 185 components.
Explained variance ratio (first few): [0.04860793 0.04051125 0.0352026  0.02804256 0.02703355 0.01820265
 0.01560003 0.01361465 0.01307646 0.01234732]


In [16]:
X_pca_test = pca.transform(Xt_scaled)  # Use your fitted PCA on test data




In [17]:
!pip install lightgbm




In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from scipy.special import softmax
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# ⛳ Use the same rows for scaling + PCA as used in train_df
X = train_df.drop(columns=['y', 'id1'])  # Keep id1 separately
y = train_df['y']
id1 = train_df['id1']

# Fill NaNs (just in case)
X = X.fillna(X.mean())

# Scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA (retain 95% variance)
pca = PCA(n_components=150, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# ✅ Create PCA DataFrame with correct length
X_df = pd.DataFrame(X_pca)
X_df['y'] = y.values
X_df['id1'] = id1.values

# 🔀 Split for validation
X_train_df, X_val_df = train_test_split(X_df, test_size=0.2, random_state=42, stratify=X_df['y'])

X_train = X_train_df.drop(columns=['y', 'id1'])
y_train = X_train_df['y']
X_val = X_val_df.drop(columns=['y', 'id1'])
y_val = X_val_df['y']

# LightGBM datasets
dtrain = lgb.Dataset(X_train, label=y_train)
dval = lgb.Dataset(X_val, label=y_val)

# LightGBM training parameters
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1
}

# 🚀 Train model
model = lgb.train(
    params,
    dtrain,
    valid_sets=[dtrain, dval],
    valid_names=['train', 'valid'],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
)

# 🧪 Handle test data
X_test = test_df.drop(columns=['id1', 'id2', 'id3', 'id5'], errors='ignore')
X_test = X_test.fillna(X_test.mean())
X_test_scaled = scaler.transform(X_test)
X_test_pca = pca.transform(X_test_scaled)

# 🔮 Predict
probs = model.predict(X_test_pca)

# Combine with test IDs
pred_df = pd.DataFrame({
    'id1': test_df['id1'].values,
    'id2': test_df['id2'].values,
    'id3': test_df['id3'].values,
    'id5': test_df['id5'].values,
    'raw_pred': probs
})

# Normalize predictions per customer
def normalize_group(df):
    if len(df) == 1:
        df['pred'] = 1.0
    else:
        df['pred'] = softmax(df['raw_pred'].values)
    return df

submission_df = pred_df.groupby('id1', group_keys=False).apply(normalize_group)
submission_df = submission_df.drop(columns=['raw_pred'])
submission_df = submission_df.sort_values(by=['id1', 'pred'], ascending=[True, False])

# 💾 Save final CSV
submission_df.to_csv("final_submission.csv", index=False)
print("✅ Submission file saved: final_submission.csv")


In [104]:
y_pred_test = xgb_clf.predict(X_test_pca)

In [100]:
# Drop same ID and target columns
X_test = test_df.drop(columns=['id1', 'id2', 'id3', 'id4', 'id5'])  # Drop same columns as train


# Drop columns that were removed from train
X_test = X_test[X.columns]  # Ensure matching columns



# Apply PCA transformation
X_test_pca = pca.transform(X_test)






In [106]:
submission = test_df[['id1', 'id2', 'id3', 'id5']].copy()
submission['pred'] = y_pred_test


In [108]:
submission.to_csv('final_submission_ammk.csv', index=False)

In [54]:

# Predict on the final test PCA data
y_pred_test = xgb_clf.predict(X_test_pca)

# Create the final submission DataFrame
submission = test_df[['id1', 'id2', 'id3', 'id5']].copy()
submission['pred'] = y_pred_test

# Save to CSV
submission.to_csv('xgboost_submission.csv', index=False)
print("✅ Submission file 'xgboost_submission.csv' created.")

✅ Submission file 'xgboost_submission.csv' created.


In [None]:
import pandas as pd

# Combine X and y back into a single DataFrame
df_corr = pd.concat([X, y], axis=1)

# Calculate Pearson correlation of each feature with y
correlations = df_corr.corr()['y'].drop('y').sort_values(ascending=False)

# Display top positively and negatively correlated features
print("🔍 Top positively correlated features with y:")
print(correlations.head(10))

print("\n🔍 Top negatively correlated features with y:")
print(correlations.tail(10))


In [111]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
model.fit(X_train, y_train)

score = model.score(X_val, y_val)
print(f"📊 Validation Accuracy with PCA: {score:.4f}")


📊 Validation Accuracy with PCA: 0.9645


In [8]:
import xgboost as xgb

model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    use_label_encoder=False,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [9]:
y_valid_proba = model.predict_proba(X_valid)[:, 1]

In [10]:
from sklearn.metrics import roc_auc_score

auc_score = roc_auc_score(y_valid, y_valid_proba)
print(f"AUC Score: {auc_score:.4f}")


AUC Score: 0.9476


In [12]:
X_test = test_df.drop(columns=['id1', 'id2', 'id3', 'id4', 'id5'])  # Drop same columns as train


In [13]:
test_probs = model.predict_proba(X_test)[:, 1]


In [17]:
test_df['pred'] = test_probs  # or test_preds

test_df[['id1', 'id2', 'id3', 'id5', 'pred']].to_csv('submission.csv', index=False)


In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.special import softmax


In [3]:
train_df = pd.read_csv("train_processed.csv")
test_df = pd.read_csv("test_processed.csv")

In [6]:
categorical_cols = ['f42', 'f50', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f354']
le_dict = {}
for col in categorical_cols:
    le = LabelEncoder()
    combined = pd.concat([train_df[col], test_df[col]], axis=0).astype(str)
    le.fit(combined)
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))
    le_dict[col] = le


In [8]:
threshold = 0.95
missing_ratio = train_df.isnull().mean()
cols_to_drop = missing_ratio[missing_ratio > threshold].index
train_df = train_df.drop(columns=cols_to_drop)
test_df = test_df.drop(columns=cols_to_drop, errors='ignore')

In [10]:
train_df = train_df.fillna(train_df.mean(numeric_only=True))
test_df = test_df.fillna(train_df.mean(numeric_only=True))


In [12]:
X = train_df.drop(columns=['id1', 'customer_id', 'offer_id', 'y', 'id4', 'id5'])
y = train_df['y']
X_test = test_df.drop(columns=['id1', 'customer_id', 'offer_id', 'y', 'id2', 'id3', 'id4', 'id5'], errors='ignore')


In [14]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

In [15]:
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)
X_test_pca = pca.transform(X_test_scaled)


In [26]:
X_df = pd.DataFrame(X_pca)
X_df['y'] = y
X_df['id2'] = train_df['id2'].values if 'id2' in train_df.columns else np.nan

X_train_df, X_val_df = train_test_split(X_df, test_size=0.2, random_state=42, stratify=X_df['y'])

X_train = X_train_df.drop(columns=['y', 'id2'])
y_train = X_train_df['y']
X_val = X_val_df.drop(columns=['y', 'id2'])
y_val = X_val_df['y']


In [28]:
dtrain = lgb.Dataset(X_train, label=y_train)
dval = lgb.Dataset(X_val, label=y_val)

In [30]:

params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1
}

model = lgb.train(
    params,
    dtrain,
    valid_sets=[dtrain, dval],
    valid_names=['train', 'valid'],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
)


Training until validation scores don't improve for 50 rounds
[100]	train's binary_logloss: 0.104132	valid's binary_logloss: 0.108314
[200]	train's binary_logloss: 0.0938772	valid's binary_logloss: 0.100906
[300]	train's binary_logloss: 0.0876476	valid's binary_logloss: 0.0977719
[400]	train's binary_logloss: 0.0826093	valid's binary_logloss: 0.0959236
[500]	train's binary_logloss: 0.0783759	valid's binary_logloss: 0.0946556
[600]	train's binary_logloss: 0.0746113	valid's binary_logloss: 0.0938251
[700]	train's binary_logloss: 0.0709921	valid's binary_logloss: 0.0929682
[800]	train's binary_logloss: 0.0677423	valid's binary_logloss: 0.0922987
[900]	train's binary_logloss: 0.0647444	valid's binary_logloss: 0.0916292
[1000]	train's binary_logloss: 0.0619595	valid's binary_logloss: 0.0911336
Did not meet early stopping. Best iteration is:
[1000]	train's binary_logloss: 0.0619595	valid's binary_logloss: 0.0911336


In [31]:
probs = model.predict(X_test_pca)
test_df['raw_pred'] = probs
test_df['rank'] = test_df.groupby('id2')['raw_pred'].rank(method='first', ascending=False)
test_df['inv_rank'] = test_df.groupby('id2')['rank'].transform(lambda r: 1 / r)
test_df['pred'] = test_df.groupby('id2')['inv_rank'].transform(lambda x: x / x.sum())

# --- Step 11: Save final submission (drop id4) ---
submission_df = test_df[['id1', 'id2', 'id3', 'id5', 'pred']].copy()
submission_df.to_csv("final_submission_ranked_pca.csv", index=False)

print("✅ Final submission file saved as 'final_submission_ranked_pca.csv'")


  test_df['raw_pred'] = probs
  test_df['rank'] = test_df.groupby('id2')['raw_pred'].rank(method='first', ascending=False)
  test_df['inv_rank'] = test_df.groupby('id2')['rank'].transform(lambda r: 1 / r)
  test_df['pred'] = test_df.groupby('id2')['inv_rank'].transform(lambda x: x / x.sum())


✅ Final submission file saved as 'final_submission_ranked_pca.csv'


In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

# --- Step 1: Load data ---
train_df = pd.read_csv("train_processed.csv")
test_df = pd.read_csv("test_processed.csv")

# --- Step 2: Encode categorical features ---
categorical_cols = ['f42', 'f50', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f354']
le_dict = {}
for col in categorical_cols:
    le = LabelEncoder()
    combined = pd.concat([train_df[col], test_df[col]], axis=0).astype(str)
    le.fit(combined)
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))
    le_dict[col] = le

# --- Step 3: Drop columns with >60% missing values ---


# --- Step 4: Fill missing values with mean ---

# --- Step 5: Prepare features and target ---
X = train_df.drop(columns=['id1', 'customer_id', 'offer_id', 'y', 'id4', 'id5'])
y = train_df['y']
X_test = test_df.drop(columns=['id1', 'customer_id', 'offer_id', 'y', 'id2', 'id3', 'id4', 'id5'], errors='ignore')

# --- Step 6: Train models ---
# LightGBM
lgb_model = lgb.LGBMClassifier(objective='binary', learning_rate=0.05, num_leaves=31, n_estimators=100)
lgb_model.fit(X, y)
lgb_preds = lgb_model.predict_proba(X_test)[:, 1]

# XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X, y)
xgb_preds = xgb_model.predict_proba(X_test)[:, 1]

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X, y)
rf_preds = rf_model.predict_proba(X_test)[:, 1]

# --- Step 7: Ensemble predictions ---
ensemble_probs = (lgb_preds + xgb_preds + rf_preds) / 3

# --- Step 8: Rank and normalize ---
test_df['raw_pred'] = ensemble_probs
test_df['rank'] = test_df.groupby('id2')['raw_pred'].rank(method='first', ascending=False)
test_df['inv_rank'] = test_df.groupby('id2')['rank'].transform(lambda r: 1 / r)
test_df['pred'] = test_df.groupby('id2')['inv_rank'].transform(lambda x: x / x.sum())

# --- Step 9: Save final submission (drop id4) ---
submission_df = test_df[['id1', 'id2', 'id3', 'id5', 'pred']].copy()
submission_df.to_csv("final_submission_ensemble.csv", index=False)

print("✅ Final submission file saved as 'final_submission_ensemble.csv'")


[LightGBM] [Info] Number of positive: 37051, number of negative: 733113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.319112 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 48871
[LightGBM] [Info] Number of data points in the train set: 770164, number of used features: 372
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.048108 -> initscore=-2.985005
[LightGBM] [Info] Start training from score -2.985005


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

KeyboardInterrupt



In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss

# --- Step 1: Load data ---
train_df = pd.read_csv("train_processed.csv")
test_df = pd.read_csv("test_processed.csv")

# --- Step 2: Encode categorical features ---
categorical_cols = ['f42', 'f50', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f354']
le_dict = {}
for col in categorical_cols:
    le = LabelEncoder()
    combined = pd.concat([train_df[col], test_df[col]], axis=0).astype(str)
    le.fit(combined)
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))
    le_dict[col] = le

# --- Step 3: Drop columns with >60% missing values ---


# --- Step 5: Prepare features and target ---
X = train_df.drop(columns=['id1', 'customer_id', 'offer_id', 'y', 'id4', 'id5'])
y = train_df['y']
X_test = test_df.drop(columns=['id1', 'customer_id', 'offer_id', 'y', 'id2', 'id3', 'id4', 'id5'], errors='ignore')

# --- Step 6: Train-validation split ---
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# --- Step 7: Define base models ---
lgb_model = lgb.LGBMClassifier(n_estimators=300, learning_rate=0.05, num_leaves=31)
xgb_model = xgb.XGBClassifier(n_estimators=300, learning_rate=0.05, use_label_encoder=False, eval_metric='logloss')
rf_model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

# --- Step 8: Define meta-model ---
meta_model = LogisticRegression()

# --- Step 9: Stacking Classifier ---
stack_model = StackingClassifier(
    estimators=[
        ('lgb', lgb_model),
        ('xgb', xgb_model),
        ('rf', rf_model)
    ],
    final_estimator=meta_model,
    passthrough=False,
    n_jobs=-1
)

# --- Step 10: Train the ensemble ---
stack_model.fit(X_train, y_train)

# --- Step 11: Predict ---
probs = stack_model.predict_proba(X_test)[:, 1]

# --- Step 12: Post-process predictions ---
test_df['raw_pred'] = probs
test_df['rank'] = test_df.groupby('id2')['raw_pred'].rank(method='first', ascending=False)
test_df['inv_rank'] = test_df.groupby('id2')['rank'].transform(lambda r: 1 / r)
test_df['pred'] = test_df.groupby('id2')['inv_rank'].transform(lambda x: x / x.sum())

# --- Step 13: Save final submission (drop id4) ---
submission_df = test_df[['id1', 'id2', 'id3', 'id5', 'pred']].copy()
submission_df.to_csv("final_submission_stacked.csv", index=False)

print("✅ Final stacked submission file saved as 'final_submission_stacked.csv'")


In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss

In [3]:
train_df = pd.read_csv("train_processed.csv")
test_df = pd.read_csv("test_processed.csv")

# --- Step 2: Encode categorical features ---
categorical_cols = ['f42', 'f50', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f354']
le_dict = {}

In [4]:
for col in categorical_cols:
    le = LabelEncoder()
    combined = pd.concat([train_df[col], test_df[col]], axis=0).astype(str)
    le.fit(combined)
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))
    le_dict[col] = le


In [5]:
X = train_df.drop(columns=['id1', 'customer_id', 'offer_id', 'y', 'id4', 'id5'])
y = train_df['y']
X_test = test_df.drop(columns=['id1', 'customer_id', 'offer_id', 'y', 'id2', 'id3', 'id4', 'id5'], errors='ignore')

# --- Step 6: Train-validation split ---
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
lgb_model = lgb.LGBMClassifier(n_estimators=300, learning_rate=0.05, num_leaves=31)
xgb_model = xgb.XGBClassifier(n_estimators=300, learning_rate=0.05, eval_metric='logloss')
rf_model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

# --- Step 8: Define meta-model ---
meta_model = LogisticRegression()

# --- Step 9: Stacking Classifier ---
stack_model = StackingClassifier(
    estimators=[
        ('lgb', lgb_model),
        ('xgb', xgb_model),
        ('rf', rf_model)
    ],
    final_estimator=meta_model,
    passthrough=False,
    n_jobs=-1
)

# --- Step 10: Train the ensemble ---
stack_model.fit(X_train, y_train)

# --- Step 11: Predict ---



In [None]:
# --- Step 11: Predict ---
probs = stack_model.predict_proba(X_test)[:, 1]

# --- Step 12: Post-process predictions ---
test_df['raw_pred'] = probs
test_df['rank'] = test_df.groupby('id2')['raw_pred'].rank(method='first', ascending=False)
test_df['inv_rank'] = test_df.groupby('id2')['rank'].transform(lambda r: 1 / r)
test_df['pred'] = test_df.groupby('id2')['inv_rank'].transform(lambda x: x / x.sum())

# --- Step 13: Save final submission (drop id4) ---
submission_df = test_df[['id1', 'id2', 'id3', 'id5', 'pred']].copy()
submission_df.to_csv("final_submission_stacked.csv", index=False)

print("✅ Final stacked submission file saved as 'final_submission_stacked.csv'")


In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss

In [3]:
# --- Step 1: Load data ---
train_df = pd.read_csv("train_processed.csv")
test_df = pd.read_csv("test_processed.csv")


In [4]:
categorical_cols = ['f42', 'f50', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f354']
le_dict = {}
for col in categorical_cols:
    le = LabelEncoder()
    combined = pd.concat([train_df[col], test_df[col]], axis=0).astype(str)
    le.fit(combined)
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))
    le_dict[col] = le


In [6]:
X = train_df.drop(columns=['id1', 'customer_id', 'offer_id', 'y', 'id4', 'id5'])
y = train_df['y']
X_test = test_df.drop(columns=['id1', 'customer_id', 'offer_id', 'y', 'id2', 'id3', 'id4', 'id5'], errors='ignore')

# --- Step 6: Split for classifiers ---
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [9]:
lgb_model = lgb.LGBMClassifier(n_estimators=300, learning_rate=0.05, num_leaves=31)
lgb_model.fit(X_train, y_train)
probs_lgb = lgb_model.predict_proba(X_test)[:, 1]


[LightGBM] [Info] Number of positive: 29641, number of negative: 586490
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.764949 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 48794
[LightGBM] [Info] Number of data points in the train set: 616131, number of used features: 334
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.048108 -> initscore=-2.984997
[LightGBM] [Info] Start training from score -2.984997


In [10]:
xgb_model = xgb.XGBClassifier(n_estimators=300, learning_rate=0.05, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
probs_xgb = xgb_model.predict_proba(X_test)[:, 1]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [11]:
rf_model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
probs_rf = rf_model.predict_proba(X_test)[:, 1]


In [16]:
# --- Step 8: Ensemble Predictions (weighted average) ---
test_df['raw_pred'] = (
    0.4 * probs_lgb +
    0.3 * probs_xgb +
    0.3 * probs_rf
)


In [18]:
# --- Step 9: MAP@7 Style Ranking ---
test_df['rank'] = test_df.groupby('id2')['raw_pred'].rank(method='first', ascending=False)
test_df['inv_rank'] = test_df.groupby('id2')['rank'].transform(lambda r: 1 / r)
test_df['pred'] = test_df.groupby('id2')['inv_rank'].transform(lambda x: x / x.sum())


In [20]:

# --- Step 10: Final submission (drop id4) ---
submission_df = test_df[['id1', 'id2', 'id3', 'id5', 'pred']].copy()
submission_df.to_csv("final_submission_ensemble.csv", index=False)
print("✅ Ensemble submission saved as 'final_submission_ensemble.csv'")


✅ Ensemble submission saved as 'final_submission_ensemble.csv'


In [22]:
import pandas as pd
import numpy as np

def apk(actual, predicted, k=7):
    if len(predicted) > k:
        predicted = predicted[:k]
    score = 0.0
    hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            hits += 1.0
            score += hits / (i + 1.0)
    return score / min(len(actual), k)

def mapk(actual, predicted, k=7):
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

def prepare_map_data(df):
    grouped = df.groupby("id2")
    predictions = grouped.apply(lambda x: x.sort_values("pred", ascending=False)["id1"].tolist()).tolist()
    actuals = grouped.apply(lambda x: [x.loc[x["pred"].idxmax(), "id1"]]).tolist()
    return actuals, predictions

# Load CSVs
ensemble_df = pd.read_csv("final_submission_ensemble.csv")
ranked_df = pd.read_csv("final_submission_rankedfinalfinal.csv")
ranked_pca_df = pd.read_csv("final_submission_ranked_pca.csv")

# Prepare and score
actuals, pred_ensemble = prepare_map_data(ensemble_df)
_, pred_ranked = prepare_map_data(ranked_df)
_, pred_pca = prepare_map_data(ranked_pca_df)

print("MAP@7 Ensemble:", mapk(actuals, pred_ensemble))
print("MAP@7 Ranked:", mapk(actuals, pred_ranked))
print("MAP@7 PCA:", mapk(actuals, pred_pca))


  predictions = grouped.apply(lambda x: x.sort_values("pred", ascending=False)["id1"].tolist()).tolist()
  actuals = grouped.apply(lambda x: [x.loc[x["pred"].idxmax(), "id1"]]).tolist()
  predictions = grouped.apply(lambda x: x.sort_values("pred", ascending=False)["id1"].tolist()).tolist()
  actuals = grouped.apply(lambda x: [x.loc[x["pred"].idxmax(), "id1"]]).tolist()
  predictions = grouped.apply(lambda x: x.sort_values("pred", ascending=False)["id1"].tolist()).tolist()


MAP@7 Ensemble: 1.0
MAP@7 Ranked: 0.884894853859718
MAP@7 PCA: 0.7525573985631756


  actuals = grouped.apply(lambda x: [x.loc[x["pred"].idxmax(), "id1"]]).tolist()


In [24]:
import pandas as pd

def mapk(y_true_dict, y_pred_dict, k=7):
    """Compute Mean Average Precision at k"""
    map_total = 0
    for cust_id in y_true_dict:
        actual = y_true_dict[cust_id]
        pred = y_pred_dict.get(cust_id, [])[:k]
        score = 0.0
        hits = 0.0
        for i, p in enumerate(pred):
            if p in actual and p not in pred[:i]:
                hits += 1.0
                score += hits / (i + 1.0)
        if len(actual) == 0:
            continue
        map_total += score / min(len(actual), k)
    return map_total / len(y_true_dict)


# --- Step 1: Load data ---
train_df = pd.read_csv("train_processed.csv")

# --- Step 2: Create ground truth dictionary ---
# Only those offers that were actually accepted (y = 1), grouped by id2
ground_truth = train_df[train_df['y'] == 1].groupby('id2')['id1'].apply(list).to_dict()

# --- Step 3: Load prediction submissions ---
ensemble_df = pd.read_csv("final_submission_ensemble.csv")
ranked_df = pd.read_csv("final_submission_rankedfinalfinal.csv")
pca_df = pd.read_csv("final_submission_ranked_pca.csv")

# --- Step 4: Create prediction dictionaries ---
ensemble_preds = ensemble_df.groupby('id2')['id1'].apply(list).to_dict()
ranked_preds = ranked_df.groupby('id2')['id1'].apply(list).to_dict()
pca_preds = pca_df.groupby('id2')['id1'].apply(list).to_dict()

# --- Step 5: Calculate MAP@7 for each ---
score_ensemble = mapk(ground_truth, ensemble_preds, k=7)
score_ranked = mapk(ground_truth, ranked_preds, k=7)
score_pca = mapk(ground_truth, pca_preds, k=7)

# --- Step 6: Print results ---
print(f"✅ MAP@7 Ensemble: {score_ensemble:.6f}")
print(f"✅ MAP@7 Ranked:   {score_ranked:.6f}")
print(f"✅ MAP@7 PCA:      {score_pca:.6f}")


KeyError: 'id2'

In [26]:
import pandas as pd

def mapk(y_true_dict, y_pred_dict, k=7):
    map_total = 0
    for cust_id in y_true_dict:
        actual = y_true_dict[cust_id]
        pred = y_pred_dict.get(cust_id, [])[:k]
        score = 0.0
        hits = 0.0
        for i, p in enumerate(pred):
            if p in actual and p not in pred[:i]:
                hits += 1.0
                score += hits / (i + 1.0)
        if len(actual) == 0:
            continue
        map_total += score / min(len(actual), k)
    return map_total / len(y_true_dict)


# --- Load data ---
train_df = pd.read_csv("train_processed.csv")

# --- Use customer_id instead of id2 ---
ground_truth = train_df[train_df['y'] == 1].groupby('customer_id')['id1'].apply(list).to_dict()

# --- Load submissions ---
ensemble_df = pd.read_csv("final_submission_ensemble.csv")
ranked_df = pd.read_csv("final_submission_rankedfinalfinal.csv")
pca_df = pd.read_csv("final_submission_ranked_pca.csv")

# --- Prediction dictionaries using customer_id ---
ensemble_preds = ensemble_df.groupby('customer_id')['id1'].apply(list).to_dict()
ranked_preds = ranked_df.groupby('customer_id')['id1'].apply(list).to_dict()
pca_preds = pca_df.groupby('customer_id')['id1'].apply(list).to_dict()

# --- Compute MAP@7 ---
score_ensemble = mapk(ground_truth, ensemble_preds, k=7)
score_ranked = mapk(ground_truth, ranked_preds, k=7)
score_pca = mapk(ground_truth, pca_preds, k=7)

# --- Output ---
print(f"✅ MAP@7 Ensemble: {score_ensemble:.6f}")
print(f"✅ MAP@7 Ranked:   {score_ranked:.6f}")
print(f"✅ MAP@7 PCA:      {score_pca:.6f}")


KeyError: 'customer_id'

In [28]:
import pandas as pd

# --- Step 1: Load training data to get ground truth ---
train_df = pd.read_csv("train_processed.csv")

# Use 'id2' if 'customer_id' doesn't exist
group_col = 'customer_id' if 'customer_id' in train_df.columns else 'id2'

# --- Step 2: Create ground truth dictionary ---
ground_truth = train_df[train_df['y'] == 1].groupby(group_col)['id1'].apply(list).to_dict()

# --- Step 3: Load prediction submission files ---
ensemble_df = pd.read_csv("final_submission_ensemble.csv")
ranked_df = pd.read_csv("final_submission_rankedfinalfinal.csv")
pca_df = pd.read_csv("final_submission_ranked_pca.csv")

# --- Step 4: Use same grouping key in predictions ---
ensemble_preds = ensemble_df.groupby(group_col)['id1'].apply(list).to_dict()
ranked_preds = ranked_df.groupby(group_col)['id1'].apply(list).to_dict()
pca_preds = pca_df.groupby(group_col)['id1'].apply(list).to_dict()

# --- Step 5: MAP@7 scoring function ---
def mapk(y_true_dict, y_pred_dict, k=7):
    total_score = 0.0
    for key, actual_list in y_true_dict.items():
        pred_list = y_pred_dict.get(key, [])[:k]
        score = 0.0
        hits = 0
        for i, p in enumerate(pred_list):
            if p in actual_list:
                hits += 1
                score += hits / (i + 1)
        if len(actual_list) == 0:
            continue
        total_score += score / min(len(actual_list), k)
    return total_score / len(y_true_dict)

# --- Step 6: Calculate MAP@7 ---
score_ensemble = mapk(ground_truth, ensemble_preds)
score_ranked = mapk(ground_truth, ranked_preds)
score_pca = mapk(ground_truth, pca_preds)

# --- Step 7: Print scores ---
print(f"✅ MAP@7 Ensemble: {score_ensemble:.6f}")
print(f"✅ MAP@7 Ranked:   {score_ranked:.6f}")
print(f"✅ MAP@7 PCA:      {score_pca:.6f}")


KeyError: 'customer_id'

In [30]:
import pandas as pd

# --- Step 1: Load training data ---
train_df = pd.read_csv("train_processed.csv")

# --- Step 2: Create ground truth dict grouped by id2 ---
if 'id2' not in train_df.columns:
    raise KeyError("❌ 'id2' column not found in train_processed.csv.")
ground_truth = train_df[train_df['y'] == 1].groupby('id2')['id1'].apply(list).to_dict()

# --- Step 3: Load prediction files ---
ensemble_df = pd.read_csv("final_submission_ensemble.csv")
ranked_df = pd.read_csv("final_submission_rankedfinalfinal.csv")
pca_df = pd.read_csv("final_submission_ranked_pca.csv")

# --- Step 4: Convert predictions into grouped format using id2 ---
ensemble_preds = ensemble_df.groupby('id2')['id1'].apply(list).to_dict()
ranked_preds = ranked_df.groupby('id2')['id1'].apply(list).to_dict()
pca_preds = pca_df.groupby('id2')['id1'].apply(list).to_dict()

# --- Step 5: Define MAP@7 scoring function ---
def mapk(y_true_dict, y_pred_dict, k=7):
    total_score = 0.0
    for key, actual in y_true_dict.items():
        pred = y_pred_dict.get(key, [])[:k]
        score = 0.0
        hits = 0
        for i, p in enumerate(pred):
            if p in actual:
                hits += 1
                score += hits / (i + 1)
        if len(actual) == 0:
            continue
        total_score += score / min(len(actual), k)
    return total_score / len(y_true_dict)

# --- Step 6: Calculate MAP@7 for each prediction file ---
score_ensemble = mapk(ground_truth, ensemble_preds)
score_ranked = mapk(ground_truth, ranked_preds)
score_pca = mapk(ground_truth, pca_preds)

# --- Step 7: Print results ---
print(f"✅ MAP@7 Ensemble: {score_ensemble:.6f}")
print(f"✅ MAP@7 Ranked:   {score_ranked:.6f}")
print(f"✅ MAP@7 PCA:      {score_pca:.6f}")


KeyError: "❌ 'id2' column not found in train_processed.csv."

In [32]:
train_df = pd.read_csv("train_processed.csv")

In [33]:
train_df.head()

Unnamed: 0,id1,customer_id,offer_id,id4,id5,y,f1,f2,f3,f4,...,cashback_value,offer_duration_months,store_type_encoded,has_cashback,offer_ctr,industry_code,redemption_frequency,industry_total_spent,industry_avg_spent,industry_txn_count
0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,1.0,,,,...,2.0,1.0,104.0,1.0,0.059875,57310000.0,2.0,16249283.79,433.638017,37472.0
1,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,1.0,,,,...,0.0,6.0,68.0,0.0,0.046487,59210000.0,2.0,3511750.83,142.77151,24597.0
2,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,1.0,,,,...,10.0,1.0,13.0,1.0,0.041484,72310000.0,2.0,2659689.37,137.033818,19409.0
3,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,1.0,,,,...,10.0,1.0,47.0,1.0,0.042805,56510500.0,2.0,14216414.45,249.131054,57064.0
4,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,1.0,,,,...,8.0,1.0,86.0,1.0,0.042544,59991300.0,2.0,2352086.84,167.946222,14005.0


In [36]:
import pandas as pd

# --- Step 1: Load training data and rename column ---
train_df = pd.read_csv("train_processed.csv")
train_df = train_df.rename(columns={"customer_id": "id2"})

# --- Step 2: Create ground truth dict grouped by id2 ---
ground_truth = train_df[train_df['y'] == 1].groupby('id2')['id1'].apply(list).to_dict()

# --- Step 3: Load prediction files ---
ensemble_df = pd.read_csv("final_submission_ensemble.csv")
ranked_df = pd.read_csv("final_submission_rankedfinalfinal.csv")
pca_df = pd.read_csv("final_submission_ranked_pca.csv")

# --- Step 4: Convert predictions into grouped format using id2 ---
ensemble_preds = ensemble_df.groupby('id2')['id1'].apply(list).to_dict()
ranked_preds = ranked_df.groupby('id2')['id1'].apply(list).to_dict()
pca_preds = pca_df.groupby('id2')['id1'].apply(list).to_dict()

# --- Step 5: Define MAP@7 scoring function ---
def mapk(y_true_dict, y_pred_dict, k=7):
    total_score = 0.0
    for key, actual in y_true_dict.items():
        pred = y_pred_dict.get(key, [])[:k]
        score = 0.0
        hits = 0
        for i, p in enumerate(pred):
            if p in actual:
                hits += 1
                score += hits / (i + 1)
        if len(actual) == 0:
            continue
        total_score += score / min(len(actual), k)
    return total_score / len(y_true_dict)

# --- Step 6: Calculate MAP@7 for each prediction file ---
score_ensemble = mapk(ground_truth, ensemble_preds)
score_ranked = mapk(ground_truth, ranked_preds)
score_pca = mapk(ground_truth, pca_preds)

# --- Step 7: Print results ---
print(f"✅ MAP@7 Ensemble: {score_ensemble:.6f}")
print(f"✅ MAP@7 Ranked:   {score_ranked:.6f}")
print(f"✅ MAP@7 PCA:      {score_pca:.6f}")


✅ MAP@7 Ensemble: 0.000000
✅ MAP@7 Ranked:   0.000000
✅ MAP@7 PCA:      0.000000


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss

# --- Helper function to compute MAP@7 ---
def apk(actual, predicted, k=7):
    if len(predicted) > k:
        predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k) if actual else 0.0

def mapk(actual_dict, predicted_dict, k=7):
    scores = [apk(actual_dict.get(key, []), predicted_dict.get(key, []), k) for key in predicted_dict.keys()]
    return np.mean(scores)

# --- Step 1: Load data ---
train_df = pd.read_csv("train_processed.csv")

# Rename customer_id to id2
train_df.rename(columns={'customer_id': 'id2'}, inplace=True)

# --- Step 2: Encode categorical features ---
categorical_cols = ['f42', 'f50', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f354']
le_dict = {}
for col in categorical_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))
    le_dict[col] = le

# --- Step 3: Drop columns with >60% missing values ---
threshold = 0.6
missing_ratio = train_df.isnull().mean()
cols_to_drop = missing_ratio[missing_ratio > threshold].index
train_df = train_df.drop(columns=cols_to_drop)

# --- Step 4: Fill missing values with mean ---
train_df = train_df.fillna(train_df.mean(numeric_only=True))

# --- Step 5: Prepare features and target ---
X = train_df.drop(columns=['id1', 'offer_id', 'y', 'id4', 'id5'])
y = train_df['y']

# --- Step 6: Train/Validation Split ---
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Restore ids for validation set
y_val_df = train_df.loc[X_val.index, ['id1', 'id2', 'y']].copy()

# --- Step 7: Train Models ---
lgb_model = lgb.LGBMClassifier(n_estimators=300, learning_rate=0.05, num_leaves=31)
lgb_model.fit(X_train, y_train)
val_preds_lgb = lgb_model.predict_proba(X_val)[:, 1]

xgb_model = xgb.XGBClassifier(n_estimators=300, learning_rate=0.05, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
val_preds_xgb = xgb_model.predict_proba(X_val)[:, 1]

rf_model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
val_preds_rf = rf_model.predict_proba(X_val)[:, 1]

# --- Step 8: Ensemble Predictions on Validation ---
y_val_df['raw_pred'] = 0.4 * val_preds_lgb + 0.3 * val_preds_xgb + 0.3 * val_preds_rf

# --- Step 9: Rank within id2 ---
y_val_df['rank'] = y_val_df.groupby('id2')['raw_pred'].rank(method='first', ascending=False)
y_val_df['inv_rank'] = y_val_df.groupby('id2')['rank'].transform(lambda r: 1 / r)
y_val_df['pred'] = y_val_df.groupby('id2')['inv_rank'].transform(lambda x: x / x.sum())

# --- Step 10: Evaluate MAP@7 ---
actual_dict = y_val_df[y_val_df['y'] == 1].groupby('id2')['id1'].apply(list).to_dict()
pred_dict = y_val_df.sort_values(by='pred', ascending=False).groupby('id2')['id1'].apply(list).to_dict()

score = mapk(actual_dict, pred_dict, k=7)
print(f"\n✅ MAP@7 Ensemble on Validation Set: {score:.6f}")


[LightGBM] [Info] Number of positive: 29641, number of negative: 586490
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.174545 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 43197
[LightGBM] [Info] Number of data points in the train set: 616131, number of used features: 268
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.048108 -> initscore=-2.984997
[LightGBM] [Info] Start training from score -2.984997


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss

In [2]:
# --- Helper function to compute MAP@7 ---
def apk(actual, predicted, k=7):
    if len(predicted) > k:
        predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k) if actual else 0.0

def mapk(actual_dict, predicted_dict, k=7):
    scores = [apk(actual_dict.get(key, []), predicted_dict.get(key, []), k) for key in predicted_dict.keys()]
    return np.mean(scores)


In [5]:
train_df = pd.read_csv("train_processed.csv")
test_df = pd.read_csv("test_processed.csv")

# Rename customer_id to id2
train_df.rename(columns={'customer_id': 'id2'}, inplace=True)



In [7]:
categorical_cols = ['f42', 'f50', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f354']
le_dict = {}
for col in categorical_cols:
    le = LabelEncoder()
    combined = pd.concat([train_df[col], test_df[col]], axis=0).astype(str)
    le.fit(combined)
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))
    le_dict[col] = le


In [9]:
X = train_df.drop(columns=['id1', 'offer_id', 'y', 'id4', 'id5'])
y = train_df['y']
X_test = test_df.drop(columns=['id1', 'offer_id', 'id4', 'id5', 'y'], errors='ignore')


In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# --- LightGBM ---
lgb_params = {
    'n_estimators': [300],
    'learning_rate': [0.05],
    'num_leaves': [31, 63],
}
lgb_model = GridSearchCV(lgb.LGBMClassifier(), lgb_params, cv=skf, scoring='neg_log_loss', verbose=1, n_jobs=-1)
lgb_model.fit(X, y)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[LightGBM] [Info] Number of positive: 37051, number of negative: 733113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.454965 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 49126
[LightGBM] [Info] Number of data points in the train set: 770164, number of used features: 373
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.048108 -> initscore=-2.985005
[LightGBM] [Info] Start training from score -2.985005


In [None]:
xgb_params = {
    'n_estimators': [300],
    'learning_rate': [0.05],
    'max_depth': [3, 6]
}
xgb_model = GridSearchCV(xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgb_params, cv=skf, scoring='neg_log_loss', verbose=1, n_jobs=-1)
xgb_model.fit(X, y)


In [None]:
rf_params = {
    'n_estimators': [200],
    'max_depth': [None, 10]
}
rf_model = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=skf, scoring='neg_log_loss', verbose=1, n_jobs=-1)
rf_model.fit(X, y)

In [None]:
probs_lgb = lgb_model.best_estimator_.predict_proba(X_test)[:, 1]
probs_xgb = xgb_model.best_estimator_.predict_proba(X_test)[:, 1]
probs_rf = rf_model.best_estimator_.predict_proba(X_test)[:, 1]

test_df['raw_pred'] = 0.4 * probs_lgb + 0.3 * probs_xgb + 0.3 * probs_rf

# --- Step 8: Rank and Normalize ---
test_df['rank'] = test_df.groupby('id2')['raw_pred'].rank(method='first', ascending=False)
test_df['inv_rank'] = test_df.groupby('id2')['rank'].transform(lambda r: 1 / r)
test_df['pred'] = test_df.groupby('id2')['inv_rank'].transform(lambda x: x / x.sum())

# --- Step 9: Save final submission ---
submission_df = test_df[['id1', 'id2', 'id3', 'id5', 'pred']].copy()
submission_df.to_csv("final_submission_ensemble_full.csv", index=False)
print("✅ Final submission saved as 'final_submission_ensemble_full.csv'")

In [7]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import gc
import warnings

warnings.filterwarnings('ignore')

# ===================================================================================
# 0. Memory Reduction Helper
# ===================================================================================
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min, c_max = df[col].min(), df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print(f'Mem. usage decreased to {end_mem:5.2f} Mb ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)')
    return df

# ===================================================================================
# 1. Custom MAP@7 Metric
# ===================================================================================
def mapk(actuals, predicteds, k=7):
    return np.mean([apk(a, p, k) for a, p in zip(actuals, predicteds)])

def apk(actual, predicted, k=7):
    if len(predicted) > k: predicted = predicted[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    if not actual: return 0.0
    return score / min(len(actual), k)

# ===================================================================================
# 2. Data Loading and Cleaning (Robust Version)
# ===================================================================================
def load_and_clean_data(data_path='.'):
    print("Loading and cleaning data...")
    train = pd.read_parquet(f'{data_path}/train_data.parquet')
    test = pd.read_parquet(f'{data_path}/test_data.parquet')
    offer_meta = pd.read_parquet(f'{data_path}/offer_metadata.parquet')
    events = pd.read_parquet(f'{data_path}/add_event.parquet')
    trans = pd.read_parquet(f'{data_path}/add_trans.parquet')

    # === ROBUST FIX for ValueError ===
    # Force all feature columns ('f*') to numeric types robustly in both train and test.
    for df in [train, test]:
        for col in df.columns:
            if col.startswith('f'):
                df[col] = pd.to_numeric(df[col], errors='coerce')
        df['id5'] = pd.to_datetime(df['id5'], errors='coerce')

    # Ensure all merge keys are the same numeric type
    for df in [train, test, offer_meta, events, trans]:
        for col in ['id2', 'id3', 'customer_id', 'offer_id']:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

    # Rename columns for clarity
    offer_meta = offer_meta.rename(columns={'id3': 'offer_id', 'id9': 'industry_code', 'id12': 'start_date', 'id13': 'end_date'})
    events = events.rename(columns={'id2': 'customer_id', 'id3': 'offer_id', 'id4': 'impression_timestamp', 'id7': 'click_timestamp'})
    trans = trans.rename(columns={'id2': 'customer_id', 'f367': 'transaction_amount', 'f370': 'transaction_date_d', 'f371': 'transaction_date_t'})

    # Convert date columns
    trans['transaction_date'] = pd.to_datetime(trans['transaction_date_d'] + ' ' + trans['transaction_date_t'], errors='coerce')
    for df in [offer_meta, events]:
        for col in df.columns:
            if 'date' in col or 'timestamp' in col:
                df[col] = pd.to_datetime(df[col], errors='coerce')

    return train, test, offer_meta, events, trans

# ===================================================================================
# 3. Memory-Efficient Feature Engineering
# ===================================================================================
def feature_engineer(df, offer_meta, events, trans):
    print(f"Engineering features for {'train' if 'y' in df.columns else 'test'} set...")
    
    # Merge offer metadata
    df = df.merge(offer_meta, left_on='id3', right_on='offer_id', how='left')
    df['imp_date'] = pd.to_datetime(df['id5'])
    
    # Time-based Features
    df['imp_dow'] = df['imp_date'].dt.dayofweek
    df['offer_duration'] = (df['end_date'] - df['start_date']).dt.days

    # Customer Historical Behavior
    events['has_clicked'] = events['click_timestamp'].notna().astype(int)
    cust_agg = events.groupby('customer_id').agg(
        total_customer_imps=('impression_timestamp', 'count'),
        total_customer_clicks=('has_clicked', 'sum'),
        last_click_date=('click_timestamp', 'max')
    ).reset_index()
    df = df.merge(cust_agg, left_on='id2', right_on='customer_id', how='left')
    df['days_since_last_any_click'] = (df['imp_date'] - df['last_click_date']).dt.days

    # Customer-Industry Interaction
    cust_industry_agg = events.merge(offer_meta[['offer_id', 'industry_code']], on='offer_id')
    cust_industry_agg = cust_industry_agg.groupby(['customer_id', 'industry_code']).agg(
        customer_industry_imps=('impression_timestamp', 'count'),
        customer_industry_clicks=('has_clicked', 'sum')
    ).reset_index()
    df = df.merge(cust_industry_agg, left_on=['id2', 'industry_code'], right_on=['customer_id', 'industry_code'], how='left')

    # Transaction-based Features
    trans_agg = trans.groupby('customer_id').agg(
        avg_spend=('transaction_amount', 'mean'),
        last_trans_date=('transaction_date', 'max')
    ).reset_index()
    df = df.merge(trans_agg, left_on='id2', right_on='customer_id', how='left')
    
    return df

# ===================================================================================
# 4. Model Training
# ===================================================================================
def train_model(X, y, groups, X_test):
    print("Starting model training...")
    gkf = GroupKFold(n_splits=5)
    oof = np.zeros(len(X))
    test_pred = np.zeros(len(X_test))

    for fold, (t_idx, v_idx) in enumerate(gkf.split(X, y, groups)):
        print(f"--- Fold {fold+1}/5 ---")
        Xt, Xv = X.iloc[t_idx], X.iloc[v_idx]
        yt, yv = y.iloc[t_idx], y.iloc[v_idx]

        sort_idx_t = Xt['id2'].argsort()
        Xt, yt = Xt.iloc[sort_idx_t], yt.iloc[sort_idx_t]
        sort_idx_v = Xv['id2'].argsort()
        Xv, yv = Xv.iloc[sort_idx_v], yv.iloc[sort_idx_v]

        gr_t, gr_v = Xt.groupby('id2').size().to_numpy(), Xv.groupby('id2').size().to_numpy()
        
        Xt_, Xv_ = Xt.drop('id2', axis=1), Xv.drop('id2', axis=1)
        Xtest_ = X_test.drop('id2', axis=1)

        model_lgb = lgb.LGBMRanker(objective='lambdarank', metric='map', eval_at=7, n_estimators=2000, learning_rate=0.03, num_leaves=80, random_state=42+fold)
        model_lgb.fit(Xt_, yt, group=gr_t, eval_set=[(Xv_, yv)], eval_group=[gr_v], callbacks=[lgb.early_stopping(100, verbose=False)])
        
        p1 = model_lgb.predict(Xv_)
        tp1 = model_lgb.predict(Xtest_)
        
        def rank(x): return pd.Series(x).rank(method='first').values
        
        oof_preds_fold = pd.Series(rank(p1), index=Xv.index)
        oof[v_idx] = oof_preds_fold.reindex(Xt.iloc[v_idx].index).values
        test_pred += rank(tp1)
        
        del model_lgb; gc.collect()

    test_pred /= gkf.get_n_splits()
    print("Model training complete.")
    return oof, test_pred

# ===================================================================================
# 5. Main Execution and Submission
# ===================================================================================
if __name__ == "_main_":
    data_path = '.'
    train_df, test_df, offer_meta_df, events_df, trans_df = load_and_clean_data(data_path)
    
    train_proc = feature_engineer(train_df, offer_meta_df, events_df, trans_df)
    test_proc = feature_engineer(test_df, offer_meta_df, events_df, trans_df)
    
    del train_df, test_df, offer_meta_df, events_df, trans_df; gc.collect()

    # Define feature columns
    original_features = [c for c in train_proc.columns if c.startswith('f')]
    new_features = ['imp_dow', 'offer_duration', 'total_customer_imps', 'total_customer_clicks', 'days_since_last_any_click', 'customer_industry_imps', 'customer_industry_clicks', 'avg_spend']
    feature_cols = original_features + new_features

    # Align columns and handle missing values
    test_proc = test_proc.reindex(columns=train_proc.columns, fill_value=0)
    train_proc.fillna(0, inplace=True)
    test_proc.fillna(0, inplace=True)

    X, y, grp = train_proc[feature_cols + ['id2']], train_proc['y'], train_proc['id2']
    X_test = test_proc[feature_cols + ['id2']]

    oof_preds, test_preds = train_model(X, y, grp, X_test)

    # Evaluation
    train_proc['pred'] = oof_preds
    oof_ranked = train_proc.sort_values(['id2','pred'], ascending=[True,False]).groupby('id2')['id3'].apply(list).reset_index(name='predicted_offers')
    true_offers = train_proc[train_proc['y']==1].groupby('id2')['id3'].apply(list).reset_index(name='true_offers')
    eval_df = oof_ranked.merge(true_offers, on='id2', how='left')
    eval_df['true_offers'] = eval_df['true_offers'].apply(lambda x: x if isinstance(x, list) else [])
    final_map7 = mapk(eval_df['true_offers'], eval_df['predicted_offers'])
    print(f"\nFinal Estimated OOF MAP@7: {final_map7:.6f}")

    # Submission
    print("Creating submission file...")
    scaler = MinMaxScaler()
    test_proc['pred'] = scaler.fit_transform(test_preds.reshape(-1, 1)).flatten()
    
    submission_df = test_proc[['id1', 'id2', 'id3', 'id5', 'pred']].copy()
    submission_df['id5'] = pd.to_datetime(submission_df['id5']).dt.strftime('%m/%d/%Y')

    submission_df.to_csv('submission.csv', index=False)
    print("Submission saved to 'submission.csv'")

In [5]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import MinMaxScaler
import gc
import warnings

warnings.filterwarnings('ignore')

# ===================================================================================
# 0. Memory Reduction Helper
# ===================================================================================
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min, c_max = df[col].min(), df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print(f'Mem. usage decreased to {end_mem:5.2f} Mb ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)')
    return df

# ===================================================================================
# 1. Custom MAP@7 Metric
# ===================================================================================
def mapk(actuals, predicteds, k=7):
    return np.mean([apk(a, p, k) for a, p in zip(actuals, predicteds)])

def apk(actual, predicted, k=7):
    if len(predicted) > k: predicted = predicted[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    if not actual: return 0.0
    return score / min(len(actual), k)

# ===================================================================================
# 2. Data Loading and Cleaning (Robust Version)
# ===================================================================================
def load_and_clean_data(data_path='.'):
    print("Loading and cleaning data...")
    train = pd.read_parquet(f'{data_path}/train_data.parquet')
    test = pd.read_parquet(f'{data_path}/test_data.parquet')
    offer_meta = pd.read_parquet(f'{data_path}/offer_metadata.parquet')
    events = pd.read_parquet(f'{data_path}/add_event.parquet')
    trans = pd.read_parquet(f'{data_path}/add_trans.parquet')

    for df in [train, test]:
        for col in df.columns:
            if col.startswith('f'):
                df[col] = pd.to_numeric(df[col], errors='coerce')
        df['id5'] = pd.to_datetime(df['id5'], errors='coerce')
        df['y'] = pd.to_numeric(df.get('y', 0), errors='coerce')

    for df in [train, test, offer_meta, events, trans]:
        for col in ['id2', 'id3', 'customer_id', 'offer_id']:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

    offer_meta = offer_meta.rename(columns={'id3': 'offer_id', 'id9': 'industry_code', 'id12': 'start_date', 'id13': 'end_date'})
    events = events.rename(columns={'id2': 'customer_id', 'id3': 'offer_id', 'id4': 'impression_timestamp', 'id7': 'click_timestamp'})
    trans = trans.rename(columns={'id2': 'customer_id', 'f367': 'transaction_amount', 'f370': 'transaction_date_d', 'f371': 'transaction_date_t'})

    trans['transaction_date'] = pd.to_datetime(trans['transaction_date_d'] + ' ' + trans['transaction_date_t'], errors='coerce')
    for df in [offer_meta, events]:
        for col in df.columns:
            if 'date' in col or 'timestamp' in col:
                df[col] = pd.to_datetime(df[col], errors='coerce')

    return train, test, offer_meta, events, trans

# ===================================================================================
# 3. Feature Engineering
# ===================================================================================
def feature_engineer(df, offer_meta, events, trans):
    print(f"Engineering features for {'train' if 'y' in df.columns else 'test'} set...")
    df = df.merge(offer_meta, left_on='id3', right_on='offer_id', how='left')
    df['imp_date'] = pd.to_datetime(df['id5'])
    df['imp_dow'] = df['imp_date'].dt.dayofweek
    df['offer_duration'] = (df['end_date'] - df['start_date']).dt.days

    events['has_clicked'] = events['click_timestamp'].notna().astype(int)
    cust_agg = events.groupby('customer_id').agg(
        total_customer_imps=('impression_timestamp', 'count'),
        total_customer_clicks=('has_clicked', 'sum'),
        last_click_date=('click_timestamp', 'max')
    ).reset_index()
    df = df.merge(cust_agg, left_on='id2', right_on='customer_id', how='left')
    df['days_since_last_any_click'] = (df['imp_date'] - df['last_click_date']).dt.days

    cust_industry_agg = events.merge(offer_meta[['offer_id', 'industry_code']], on='offer_id')
    cust_industry_agg = cust_industry_agg.groupby(['customer_id', 'industry_code']).agg(
        customer_industry_imps=('impression_timestamp', 'count'),
        customer_industry_clicks=('has_clicked', 'sum')
    ).reset_index()
    df = df.merge(cust_industry_agg, left_on=['id2', 'industry_code'], right_on=['customer_id', 'industry_code'], how='left')

    trans_agg = trans.groupby('customer_id').agg(
        avg_spend=('transaction_amount', 'mean'),
        last_trans_date=('transaction_date', 'max')
    ).reset_index()
    df = df.merge(trans_agg, left_on='id2', right_on='customer_id', how='left')

    return df

# ===================================================================================
# 4. Model Training
# ===================================================================================
def train_model(X, y, groups, X_test):
    print("Starting model training...")
    gkf = GroupKFold(n_splits=5)
    oof = np.zeros(len(X))
    test_pred = np.zeros(len(X_test))

    for fold, (t_idx, v_idx) in enumerate(gkf.split(X, y, groups)):
        print(f"--- Fold {fold+1}/5 ---")
        Xt, Xv = X.iloc[t_idx], X.iloc[v_idx]
        yt, yv = y.iloc[t_idx].astype(float), y.iloc[v_idx].astype(float)

        gr_t = Xt['id2'].value_counts().sort_index().values
        gr_v = Xv['id2'].value_counts().sort_index().values
        Xt_, Xv_ = Xt.drop('id2', axis=1), Xv.drop('id2', axis=1)
        Xtest_ = X_test.drop('id2', axis=1)

        model = lgb.LGBMRanker(objective='lambdarank', metric='map', eval_at=7, learning_rate=0.03, n_estimators=2000, num_leaves=80, random_state=fold+42)
        model.fit(Xt_, yt, group=gr_t, eval_set=[(Xv_, yv)], eval_group=[gr_v], callbacks=[lgb.early_stopping(100)])

        pred_val = model.predict(Xv_)
        pred_test = model.predict(Xtest_)

        oof[v_idx] = pd.Series(pred_val).rank(method='first').values
        test_pred += pd.Series(pred_test).rank(method='first').values
        del model; gc.collect()

    test_pred /= gkf.get_n_splits()
    return oof, test_pred

# ===================================================================================
# 5. Main Execution Block
# ===================================================================================
if __name__ == '__main__':
    data_path = '.'
    train_df, test_df, offer_meta, events, trans = load_and_clean_data(data_path)

    train_df = feature_engineer(train_df, offer_meta, events, trans)
    test_df = feature_engineer(test_df, offer_meta, events, trans)

    feature_cols = [c for c in train_df.columns if c.startswith('f') and train_df[c].dtype in [np.float32, np.float64, np.int32, np.int64]] + [
        'imp_dow', 'offer_duration', 'total_customer_imps', 'total_customer_clicks',
        'days_since_last_any_click', 'customer_industry_imps', 'customer_industry_clicks', 'avg_spend']

    train_df.fillna(0, inplace=True)
    test_df.fillna(0, inplace=True)

    X = train_df[feature_cols + ['id2']]
    y = pd.to_numeric(train_df['y'], errors='coerce').fillna(0).astype(float)
    X_test = test_df[feature_cols + ['id2']]

    oof, test_preds = train_model(X, y, X['id2'], X_test)

    train_df['pred'] = oof
    oof_ranked = train_df.sort_values(['id2','pred'], ascending=[True,False]).groupby('id2')['id3'].apply(list).reset_index(name='predicted_offers')
    true_offers = train_df[train_df['y']==1].groupby('id2')['id3'].apply(list).reset_index(name='true_offers')
    eval_df = oof_ranked.merge(true_offers, on='id2', how='left')
    eval_df['true_offers'] = eval_df['true_offers'].apply(lambda x: x if isinstance(x, list) else [])

    final_map7 = mapk(eval_df['true_offers'], eval_df['predicted_offers'])
    print(f"\nFinal Estimated OOF MAP@7: {final_map7:.6f}")

    scaler = MinMaxScaler()
    test_df['pred'] = scaler.fit_transform(test_preds.reshape(-1, 1)).flatten()
    submission = test_df[['id1', 'id2', 'id3', 'id5', 'pred']].copy()
    submission['id5'] = pd.to_datetime(submission['id5']).dt.strftime('%m/%d/%Y')
    submission.to_csv('submission.csv', index=False)
    print("\n✅ Submission saved to 'submission.csv'")


Loading and cleaning data...
Engineering features for train set...
Engineering features for train set...
Starting model training...
--- Fold 1/5 ---
[LightGBM] [Info] Total groups: 37240, total data: 616131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.273159 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 47330
[LightGBM] [Info] Number of data points in the train set: 616131, number of used features: 296
[LightGBM] [Info] Total groups: 9310, total data: 154033
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[237]	valid_0's map@7: 0.932995
--- Fold 2/5 ---
[LightGBM] [Info] Total groups: 37240, total data: 616131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.323433 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory 

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import MinMinScaler
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRanker
import gc
import warnings

warnings.filterwarnings('ignore')

# ===================================================================================
# 1. Custom MAP@7 Metric
# ===================================================================================
def mapk(actuals, predicteds, k=7):
    return np.mean([apk(a, p, k) for a, p in zip(actuals, predicteds)])

def apk(actual, predicted, k=7):
    if len(predicted) > k: predicted = predicted[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    if not actual: return 0.0
    return score / min(len(actual), k)

# ===================================================================================
# 2. Load and Prepare Data
# ===================================================================================
train = pd.read_parquet('train_data.parquet')
test = pd.read_parquet('test_data.parquet')
offer_meta = pd.read_parquet('offer_metadata.parquet')
events = pd.read_parquet('add_event.parquet')
trans = pd.read_parquet('add_trans.parquet')

for df in [train, test]:
    for col in df.columns:
        if col.startswith('f'):
            df[col] = pd.to_numeric(df[col], errors='coerce')
    df['id5'] = pd.to_datetime(df['id5'], errors='coerce')
    df['y'] = pd.to_numeric(df.get('y', 0), errors='coerce')

for df in [train, test, offer_meta, events, trans]:
    for col in ['id2', 'id3', 'customer_id', 'offer_id']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

offer_meta.rename(columns={'id3': 'offer_id', 'id9': 'industry_code', 'id12': 'start_date', 'id13': 'end_date'}, inplace=True)
events.rename(columns={'id2': 'customer_id', 'id3': 'offer_id', 'id4': 'impression_timestamp', 'id7': 'click_timestamp'}, inplace=True)
trans.rename(columns={'id2': 'customer_id', 'f367': 'transaction_amount', 'f370': 'transaction_date_d', 'f371': 'transaction_date_t'}, inplace=True)
trans['transaction_date'] = pd.to_datetime(trans['transaction_date_d'] + ' ' + trans['transaction_date_t'], errors='coerce')

# ===================================================================================
# 3. Feature Engineering
# ===================================================================================
def feature_engineer(df):
    df = df.merge(offer_meta, left_on='id3', right_on='offer_id', how='left')
    df['imp_date'] = pd.to_datetime(df['id5'])
    df['imp_dow'] = df['imp_date'].dt.dayofweek
    df['offer_duration'] = (df['end_date'] - df['start_date']).dt.days

    events['has_clicked'] = events['click_timestamp'].notna().astype(int)
    cust_agg = events.groupby('customer_id').agg(
        total_customer_imps=('impression_timestamp', 'count'),
        total_customer_clicks=('has_clicked', 'sum'),
        last_click_date=('click_timestamp', 'max')
    ).reset_index()
    df = df.merge(cust_agg, left_on='id2', right_on='customer_id', how='left')
    df['days_since_last_any_click'] = (df['imp_date'] - df['last_click_date']).dt.days

    cust_industry_agg = events.merge(offer_meta[['offer_id', 'industry_code']], on='offer_id')
    cust_industry_agg = cust_industry_agg.groupby(['customer_id', 'industry_code']).agg(
        customer_industry_imps=('impression_timestamp', 'count'),
        customer_industry_clicks=('has_clicked', 'sum')
    ).reset_index()
    df = df.merge(cust_industry_agg, left_on=['id2', 'industry_code'], right_on=['customer_id', 'industry_code'], how='left')

    trans_agg = trans.groupby('customer_id').agg(
        avg_spend=('transaction_amount', 'mean'),
        last_trans_date=('transaction_date', 'max')
    ).reset_index()
    df = df.merge(trans_agg, left_on='id2', right_on='customer_id', how='left')

    return df

train = feature_engineer(train)
test = feature_engineer(test)

feature_cols = [c for c in train.columns if c.startswith('f') and train[c].dtype in [np.float32, np.float64, np.int32, np.int64]] + [
    'imp_dow', 'offer_duration', 'total_customer_imps', 'total_customer_clicks',
    'days_since_last_any_click', 'customer_industry_imps', 'customer_industry_clicks', 'avg_spend']

train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

X = train[feature_cols + ['id2']]
y = pd.to_numeric(train['y'], errors='coerce').fillna(0).astype(float)
X_test = test[feature_cols + ['id2']]

# ===================================================================================
# 4. Ensemble Training
# ===================================================================================
def train_ensemble(X, y, groups, X_test):
    print("Training ensemble model (LGB + XGB)...")
    gkf = GroupKFold(n_splits=5)
    oof_lgb, oof_xgb = np.zeros(len(X)), np.zeros(len(X))
    test_lgb, test_xgb = np.zeros(len(X_test)), np.zeros(len(X_test))

    for fold, (t_idx, v_idx) in enumerate(gkf.split(X, y, groups)):
        print(f"--- Fold {fold+1}/5 ---")
        Xt, Xv = X.iloc[t_idx], X.iloc[v_idx]
        yt, yv = y.iloc[t_idx], y.iloc[v_idx]

        gr_t = Xt['id2'].value_counts().sort_index().values
        gr_v = Xv['id2'].value_counts().sort_index().values

        Xt_, Xv_ = Xt.drop('id2', axis=1), Xv.drop('id2', axis=1)
        Xtest_ = X_test.drop('id2', axis=1)

        model_lgb = lgb.LGBMRanker(objective='lambdarank', metric='map', eval_at=7, learning_rate=0.03, n_estimators=500, num_leaves=50)
        model_lgb.fit(Xt_, yt, group=gr_t, eval_set=[(Xv_, yv)], eval_group=[gr_v], callbacks=[lgb.early_stopping(50)])
        oof_lgb[v_idx] = model_lgb.predict(Xv_)
        test_lgb += model_lgb.predict(Xtest_)

        model_xgb = XGBRanker(objective='rank:pairwise', learning_rate=0.03, n_estimators=500, max_depth=6)
        model_xgb.fit(Xt_, yt, group=gr_t, eval_set=[(Xv_, yv)], eval_group=[gr_v], early_stopping_rounds=50, verbose=False)
        oof_xgb[v_idx] = model_xgb.predict(Xv_)
        test_xgb += model_xgb.predict(Xtest_)

    test_lgb /= gkf.get_n_splits()
    test_xgb /= gkf.get_n_splits()

    oof_ensemble = 0.6 * oof_lgb + 0.4 * oof_xgb
    test_ensemble = 0.6 * test_lgb + 0.4 * test_xgb

    return oof_ensemble, test_ensemble

# ===================================================================================
# 5. Inference and Submission
# ===================================================================================
oof, test_preds = train_ensemble(X, y, X['id2'], X_test)
train['pred'] = oof

oof_ranked = train.sort_values(['id2','pred'], ascending=[True,False]).groupby('id2')['id3'].apply(list).reset_index(name='predicted_offers')
true_offers = train[train['y']==1].groupby('id2')['id3'].apply(list).reset_index(name='true_offers')
eval_df = oof_ranked.merge(true_offers, on='id2', how='left')
eval_df['true_offers'] = eval_df['true_offers'].apply(lambda x: x if isinstance(x, list) else [])
final_score = mapk(eval_df['true_offers'], eval_df['predicted_offers'])
print(f"\n📊 Final OOF MAP@7 Score: {final_score:.6f}")

scaler = MinMaxScaler()
test['pred'] = scaler.fit_transform(test_preds.reshape(-1, 1)).flatten()
submission = test[['id1', 'id2', 'id3', 'id5', 'pred']].copy()
submission['id5'] = pd.to_datetime(submission['id5']).dt.strftime('%m/%d/%Y')
submission.to_csv('submissionnew.csv', index=False)
print("\n✅ Submission file saved as 'submission.csv'")

ImportError: cannot import name 'MinMinScaler' from 'sklearn.preprocessing' (C:\Users\fahee\anaconda3\Lib\site-packages\sklearn\preprocessing\__init__.py)

In [5]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRanker
import gc
import warnings

warnings.filterwarnings('ignore')

# ===================================================================================
# 1. Custom MAP@7 Metric
# ===================================================================================
def mapk(actuals, predicteds, k=7):
    return np.mean([apk(a, p, k) for a, p in zip(actuals, predicteds)])

def apk(actual, predicted, k=7):
    if len(predicted) > k: predicted = predicted[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    if not actual: return 0.0
    return score / min(len(actual), k)

# ===================================================================================
# 2. Load and Prepare Data
# ===================================================================================
train = pd.read_parquet('train_data.parquet')
test = pd.read_parquet('test_data.parquet')
offer_meta = pd.read_parquet('offer_metadata.parquet')
events = pd.read_parquet('add_event.parquet')
trans = pd.read_parquet('add_trans.parquet')

for df in [train, test]:
    for col in df.columns:
        if col.startswith('f'):
            df[col] = pd.to_numeric(df[col], errors='coerce')
    df['id5'] = pd.to_datetime(df['id5'], errors='coerce')
    if 'y' in df.columns:
        df['y'] = pd.to_numeric(df['y'], errors='coerce').fillna(0).astype(float)

for df in [train, test, offer_meta, events, trans]:
    for col in ['id2', 'id3', 'customer_id', 'offer_id']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

offer_meta.rename(columns={'id3': 'offer_id', 'id9': 'industry_code', 'id12': 'start_date', 'id13': 'end_date'}, inplace=True)
events.rename(columns={'id2': 'customer_id', 'id3': 'offer_id', 'id4': 'impression_timestamp', 'id7': 'click_timestamp'}, inplace=True)
trans.rename(columns={'id2': 'customer_id', 'f367': 'transaction_amount', 'f370': 'transaction_date_d', 'f371': 'transaction_date_t'}, inplace=True)
trans['transaction_date'] = pd.to_datetime(trans['transaction_date_d'] + ' ' + trans['transaction_date_t'], errors='coerce')

# ===================================================================================
# 3. Feature Engineering
# ===================================================================================
def feature_engineer(df):
    df = df.merge(offer_meta, left_on='id3', right_on='offer_id', how='left')
    df['imp_date'] = pd.to_datetime(df['id5'])
    df['imp_dow'] = df['imp_date'].dt.dayofweek
    df['offer_duration'] = (df['end_date'] - df['start_date']).dt.days

    events['has_clicked'] = events['click_timestamp'].notna().astype(int)
    cust_agg = events.groupby('customer_id').agg(
        total_customer_imps=('impression_timestamp', 'count'),
        total_customer_clicks=('has_clicked', 'sum'),
        last_click_date=('click_timestamp', 'max')
    ).reset_index()
    df = df.merge(cust_agg, left_on='id2', right_on='customer_id', how='left')
    df['days_since_last_any_click'] = (df['imp_date'] - df['last_click_date']).dt.days

    cust_industry_agg = events.merge(offer_meta[['offer_id', 'industry_code']], on='offer_id')
    cust_industry_agg = cust_industry_agg.groupby(['customer_id', 'industry_code']).agg(
        customer_industry_imps=('impression_timestamp', 'count'),
        customer_industry_clicks=('has_clicked', 'sum')
    ).reset_index()
    df = df.merge(cust_industry_agg, left_on=['id2', 'industry_code'], right_on=['customer_id', 'industry_code'], how='left')

    trans_agg = trans.groupby('customer_id').agg(
        avg_spend=('transaction_amount', 'mean'),
        last_trans_date=('transaction_date', 'max')
    ).reset_index()
    df = df.merge(trans_agg, left_on='id2', right_on='customer_id', how='left')

    return df

train = feature_engineer(train)
test = feature_engineer(test)

feature_cols = [c for c in train.columns if c.startswith('f') and train[c].dtype in [np.float32, np.float64, np.int32, np.int64]] + [
    'imp_dow', 'offer_duration', 'total_customer_imps', 'total_customer_clicks',
    'days_since_last_any_click', 'customer_industry_imps', 'customer_industry_clicks', 'avg_spend']

train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

X = train[feature_cols + ['id2']]
y = train['y']
X_test = test[feature_cols + ['id2']]

# ===================================================================================
# 4. Ensemble Training with Hyperparameter Tuning
# ===================================================================================
def train_ensemble(X, y, groups, X_test):
    gkf = GroupKFold(n_splits=5)
    oof_lgb, oof_xgb = np.zeros(len(X)), np.zeros(len(X))
    test_lgb, test_xgb = np.zeros(len(X_test)), np.zeros(len(X_test))

    for fold, (t_idx, v_idx) in enumerate(gkf.split(X, y, groups)):
        Xt, Xv = X.iloc[t_idx], X.iloc[v_idx]
        yt, yv = y.iloc[t_idx], y.iloc[v_idx]

        gr_t = Xt['id2'].value_counts().sort_index().values
        gr_v = Xv['id2'].value_counts().sort_index().values

        Xt_, Xv_ = Xt.drop('id2', axis=1), Xv.drop('id2', axis=1)
        Xtest_ = X_test.drop('id2', axis=1)

        model_lgb = lgb.LGBMRanker(objective='lambdarank', metric='map', eval_at=7, n_estimators=1000)
        model_lgb.fit(Xt_, yt, group=gr_t, eval_set=[(Xv_, yv)], eval_group=[gr_v], callbacks=[lgb.early_stopping(100)])
        oof_lgb[v_idx] = model_lgb.predict(Xv_)
        test_lgb += model_lgb.predict(Xtest_)

        model_xgb = XGBRanker(objective='rank:pairwise', learning_rate=0.03, n_estimators=1000, max_depth=6)
        model_xgb.fit(Xt_, yt, group=gr_t, eval_set=[(Xv_, yv)], eval_group=[gr_v], early_stopping_rounds=100, verbose=False)
        oof_xgb[v_idx] = model_xgb.predict(Xv_)
        test_xgb += model_xgb.predict(Xtest_)

    test_lgb /= gkf.get_n_splits()
    test_xgb /= gkf.get_n_splits()

    oof_ensemble = 0.6 * oof_lgb + 0.4 * oof_xgb
    test_ensemble = 0.6 * test_lgb + 0.4 * test_xgb

    return oof_ensemble, test_ensemble

# ===================================================================================
# 5. Inference and Submission
# ===================================================================================
oof, test_preds = train_ensemble(X, y, X['id2'], X_test)
train['pred'] = oof

oof_ranked = train.sort_values(['id2','pred'], ascending=[True,False]).groupby('id2')['id3'].apply(list).reset_index(name='predicted_offers')
true_offers = train[train['y']==1].groupby('id2')['id3'].apply(list).reset_index(name='true_offers')
eval_df = oof_ranked.merge(true_offers, on='id2', how='left')
eval_df['true_offers'] = eval_df['true_offers'].apply(lambda x: x if isinstance(x, list) else [])
final_score = mapk(eval_df['true_offers'], eval_df['predicted_offers'])
print(f"\n📊 Final OOF MAP@7 Score: {final_score:.6f}")

scaler = MinMaxScaler()
test['pred'] = scaler.fit_transform(test_preds.reshape(-1, 1)).flatten()
submission = test[['id1', 'id2', 'id3', 'id5', 'pred']].copy()
submission['id5'] = pd.to_datetime(submission['id5']).dt.strftime('%m/%d/%Y')
submission.to_csv('submissionnewd.csv', index=False)
print("\n✅ Submission file saved as 'submission.csv'")

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import ndcg_score
import matplotlib.pyplot as plt
import gc, os, warnings

warnings.filterwarnings('ignore')

# ===================================================================================
# 0. MAP@7 Metric
# ===================================================================================
def mapk(actuals, predicteds, k=7):
    return np.mean([apk(a, p, k) for a, p in zip(actuals, predicteds)])

def apk(actual, predicted, k=7):
    if len(predicted) > k: predicted = predicted[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    if not actual: return 0.0
    return score / min(len(actual), k)

# ===================================================================================
# 1. Load and Feature Functions (Assume already implemented correctly)
# ===================================================================================
# Use your own load_and_clean_data() and feature_engineer() from previous step.

# ===================================================================================
# 2. Optuna-based Hyperparameter Tuning
# ===================================================================================
def objective(trial, X, y, groups):
    params = {
        'objective': 'lambdarank',
        'metric': 'map',
        'eval_at': 7,
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.1),
        'num_leaves': trial.suggest_int("num_leaves", 31, 150),
        'min_child_samples': trial.suggest_int("min_child_samples", 10, 100),
        'subsample': trial.suggest_float("subsample", 0.6, 1.0),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.6, 1.0),
        'reg_alpha': trial.suggest_float("reg_alpha", 0.0, 1.0),
        'reg_lambda': trial.suggest_float("reg_lambda", 0.0, 1.0),
    }

    gkf = GroupKFold(n_splits=3)
    score_list = []

    for fold, (train_idx, valid_idx) in enumerate(gkf.split(X, y, groups)):
        Xt, Xv = X.iloc[train_idx], X.iloc[valid_idx]
        yt, yv = y.iloc[train_idx], y.iloc[valid_idx]
        group_train = Xt.groupby('id2').size().values
        group_valid = Xv.groupby('id2').size().values
        Xt_, Xv_ = Xt.drop('id2', axis=1), Xv.drop('id2', axis=1)

        model = lgb.LGBMRanker(**params, n_estimators=2000)
        model.fit(Xt_, yt, group=group_train,
                  eval_set=[(Xv_, yv)],
                  eval_group=[group_valid],
                  early_stopping_rounds=50, verbose=False)
        preds = model.predict(Xv_)
        Xv['pred'] = preds
        pred_ranked = Xv.sort_values(['id2', 'pred'], ascending=[True, False]).groupby('id2')['id3'].apply(list).reset_index(name='predicted_offers')
        true_ranked = Xv[Xv['y'] == 1].groupby('id2')['id3'].apply(list).reset_index(name='true_offers')
        eval_df = pred_ranked.merge(true_ranked, on='id2', how='left')
        eval_df['true_offers'] = eval_df['true_offers'].apply(lambda x: x if isinstance(x, list) else [])
        map7 = mapk(eval_df['true_offers'], eval_df['predicted_offers'])
        score_list.append(map7)
        del model; gc.collect()

    return np.mean(score_list)

def tune_hyperparams(X, y, groups, n_trials=30):
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X, y, groups), n_trials=n_trials)
    print("Best Params:", study.best_params)
    return study.best_params

# ===================================================================================
# 3. Training with Ensemble
# ===================================================================================
def train_and_predict(X, y, X_test, groups, best_params=None, n_folds=5):
    oof = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))
    feature_importances = pd.DataFrame()
    feature_importances["feature"] = X.drop('id2', axis=1).columns

    gkf = GroupKFold(n_splits=n_folds)
    for fold, (train_idx, valid_idx) in enumerate(gkf.split(X, y, groups)):
        print(f"\n🔁 Fold {fold+1}/{n_folds}")
        Xt, Xv = X.iloc[train_idx], X.iloc[valid_idx]
        yt, yv = y.iloc[train_idx], y.iloc[valid_idx]
        group_train = Xt.groupby('id2').size().values
        group_valid = Xv.groupby('id2').size().values

        Xt_, Xv_ = Xt.drop('id2', axis=1), Xv.drop('id2', axis=1)
        Xtest_ = X_test.drop('id2', axis=1)

        model = lgb.LGBMRanker(**best_params, n_estimators=3000)
        model.fit(Xt_, yt, group=group_train, eval_set=[(Xv_, yv)],
                  eval_group=[group_valid], early_stopping_rounds=100, verbose=100)

        oof[valid_idx] = model.predict(Xv_)
        test_preds += model.predict(Xtest_)
        feature_importances[f'fold_{fold+1}'] = model.feature_importances_

        del model; gc.collect()

    test_preds /= n_folds
    return oof, test_preds, feature_importances

# ===================================================================================
# 4. Visualization
# ===================================================================================
def plot_feature_importance(importance_df):
    importance_df['mean_importance'] = importance_df.drop('feature', axis=1).mean(axis=1)
    top_feats = importance_df.sort_values(by='mean_importance', ascending=False).head(20)
    plt.figure(figsize=(10, 6))
    plt.barh(top_feats['feature'], top_feats['mean_importance'])
    plt.title("Top 20 Feature Importances")
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

# ===================================================================================
# 5. Main Execution
# ===================================================================================
if __name__ == '_main_':
    print("📂 Loading data and features...")
    train_df, test_df, offer_meta, events, trans = load_and_clean_data()
    train_df = feature_engineer(train_df, offer_meta, events, trans)
    test_df = feature_engineer(test_df, offer_meta, events, trans)

    train_df.fillna(0, inplace=True)
    test_df.fillna(0, inplace=True)

    numeric_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
    feature_cols = list(set(numeric_cols) - set(['y', 'id1', 'id2', 'id3']))
    X = train_df[feature_cols + ['id2']]
    y = train_df['y']
    X_test = test_df[feature_cols + ['id2']]

    print("🎯 Tuning hyperparameters...")
    best_params = tune_hyperparams(X, y, X['id2'], n_trials=30)
    print("✅ Best Params:", best_params)

    print("🧠 Training and predicting...")
    oof_preds, test_preds, fi_df = train_and_predict(X, y, X_test, X['id2'], best_params)

    print("📊 Ranking and evaluating...")
    train_df['pred'] = oof_preds
    oof_ranked = train_df.sort_values(['id2', 'pred'], ascending=[True, False]).groupby('id2')['id3'].apply(list).reset_index(name='predicted_offers')
    true_offers = train_df[train_df['y'] == 1].groupby('id2')['id3'].apply(list).reset_index(name='true_offers')
    eval_df = oof_ranked.merge(true_offers, on='id2', how='left')
    eval_df['true_offers'] = eval_df['true_offers'].apply(lambda x: x if isinstance(x, list) else [])
    final_map = mapk(eval_df['true_offers'], eval_df['predicted_offers'])
    print(f"📈 Final OOF MAP@7 = {final_map:.6f}")

    print("📦 Creating submission...")
    scaler = MinMaxScaler()
    test_df['pred'] = scaler.fit_transform(test_preds.reshape(-1, 1)).flatten()
    submission = test_df[['id1', 'id2', 'id3', 'id5', 'pred']].copy()
    submission['id5'] = pd.to_datetime(submission['id5']).dt.strftime('%m/%d/%Y')
    submission.to_csv('submissiondcgyidc.csv', index=False)
    print("✅ Submission saved to submission.csv")

    print("📉 Plotting feature importance...")
    plot_feature_importance(fi_df)

In [9]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.4.0


In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRanker
import gc
import warnings

warnings.filterwarnings('ignore')

# ===================================================================================
# 1. Custom MAP@7 Metric
# ===================================================================================
def mapk(actuals, predicteds, k=7):
    return np.mean([apk(a, p, k) for a, p in zip(actuals, predicteds)])

def apk(actual, predicted, k=7):
    if len(predicted) > k: predicted = predicted[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    if not actual: return 0.0
    return score / min(len(actual), k)

# ===================================================================================
# 2. Load and Prepare Data
# ===================================================================================
print("📥 Loading datasets...")
train = pd.read_parquet('train_data.parquet')
test = pd.read_parquet('test_data.parquet')
offer_meta = pd.read_parquet('offer_metadata.parquet')
events = pd.read_parquet('add_event.parquet')
trans = pd.read_parquet('add_trans.parquet')

print("🔧 Preprocessing features...")
for df in [train, test]:
    for col in df.columns:
        if col.startswith('f'):
            df[col] = pd.to_numeric(df[col], errors='coerce')
    df['id5'] = pd.to_datetime(df['id5'], errors='coerce')
    if 'y' in df.columns:
        df['y'] = pd.to_numeric(df['y'], errors='coerce').fillna(0).astype(float)

for df in [train, test, offer_meta, events, trans]:
    for col in ['id2', 'id3', 'customer_id', 'offer_id']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

offer_meta.rename(columns={'id3': 'offer_id', 'id9': 'industry_code', 'id12': 'start_date', 'id13': 'end_date'}, inplace=True)
events.rename(columns={'id2': 'customer_id', 'id3': 'offer_id', 'id4': 'impression_timestamp', 'id7': 'click_timestamp'}, inplace=True)
trans.rename(columns={'id2': 'customer_id', 'f367': 'transaction_amount', 'f370': 'transaction_date_d', 'f371': 'transaction_date_t'}, inplace=True)
trans['transaction_date'] = pd.to_datetime(trans['transaction_date_d'] + ' ' + trans['transaction_date_t'], errors='coerce')

# ===================================================================================
# 3. Feature Engineering
# ===================================================================================
def feature_engineer(df):
    print("🛠️  Feature engineering...")
    df = df.merge(offer_meta, left_on='id3', right_on='offer_id', how='left')
    df['imp_date'] = pd.to_datetime(df['id5'])
    df['imp_dow'] = df['imp_date'].dt.dayofweek
    df['offer_duration'] = (df['end_date'] - df['start_date']).dt.days

    events['has_clicked'] = events['click_timestamp'].notna().astype(int)
    cust_agg = events.groupby('customer_id').agg(
        total_customer_imps=('impression_timestamp', 'count'),
        total_customer_clicks=('has_clicked', 'sum'),
        last_click_date=('click_timestamp', 'max')
    ).reset_index()
    df = df.merge(cust_agg, left_on='id2', right_on='customer_id', how='left')
    df['days_since_last_any_click'] = (df['imp_date'] - df['last_click_date']).dt.days

    cust_industry_agg = events.merge(offer_meta[['offer_id', 'industry_code']], on='offer_id')
    cust_industry_agg = cust_industry_agg.groupby(['customer_id', 'industry_code']).agg(
        customer_industry_imps=('impression_timestamp', 'count'),
        customer_industry_clicks=('has_clicked', 'sum')
    ).reset_index()
    df = df.merge(cust_industry_agg, left_on=['id2', 'industry_code'], right_on=['customer_id', 'industry_code'], how='left')

    trans_agg = trans.groupby('customer_id').agg(
        avg_spend=('transaction_amount', 'mean'),
        last_trans_date=('transaction_date', 'max')
    ).reset_index()
    df = df.merge(trans_agg, left_on='id2', right_on='customer_id', how='left')

    return df

train = feature_engineer(train)
test = feature_engineer(test)

feature_cols = [c for c in train.columns if c.startswith('f') and train[c].dtype in [np.float32, np.float64, np.int32, np.int64]] + [
    'imp_dow', 'offer_duration', 'total_customer_imps', 'total_customer_clicks',
    'days_since_last_any_click', 'customer_industry_imps', 'customer_industry_clicks', 'avg_spend']

train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

X = train[feature_cols + ['id2']]
y = train['y']
X_test = test[feature_cols + ['id2']]

# ===================================================================================
# 4. Ensemble Training with Hyperparameter Tuning
# ===================================================================================
def train_ensemble(X, y, groups, X_test):
    print("📈 Training ensemble model...")
    gkf = GroupKFold(n_splits=5)
    oof_lgb, oof_xgb = np.zeros(len(X)), np.zeros(len(X))
    test_lgb, test_xgb = np.zeros(len(X_test)), np.zeros(len(X_test))

    for fold, (t_idx, v_idx) in enumerate(gkf.split(X, y, groups)):
        print(f"\n🔁 Fold {fold+1}/5")
        Xt, Xv = X.iloc[t_idx], X.iloc[v_idx]
        yt, yv = y.iloc[t_idx], y.iloc[v_idx]

        gr_t = Xt['id2'].value_counts().sort_index().values
        gr_v = Xv['id2'].value_counts().sort_index().values

        Xt_, Xv_ = Xt.drop('id2', axis=1), Xv.drop('id2', axis=1)
        Xtest_ = X_test.drop('id2', axis=1)

        model_lgb = lgb.LGBMRanker(objective='lambdarank', metric='map', eval_at=7, n_estimators=500, learning_rate=0.05, num_leaves=80)
        model_lgb.fit(Xt_, yt, group=gr_t, eval_set=[(Xv_, yv)], eval_group=[gr_v], callbacks=[lgb.early_stopping(50)])
        oof_lgb[v_idx] = model_lgb.predict(Xv_)
        test_lgb += model_lgb.predict(Xtest_)

        model_xgb = XGBRanker(objective='rank:pairwise', learning_rate=0.05, n_estimators=500, max_depth=6, verbosity=0)
        model_xgb.fit(Xt_, yt, group=gr_t, eval_set=[(Xv_, yv)], eval_group=[gr_v], early_stopping_rounds=50)
        oof_xgb[v_idx] = model_xgb.predict(Xv_)
        test_xgb += model_xgb.predict(Xtest_)

    test_lgb /= gkf.get_n_splits()
    test_xgb /= gkf.get_n_splits()

    print("🔗 Blending predictions...")
    oof_ensemble = 0.6 * oof_lgb + 0.4 * oof_xgb
    test_ensemble = 0.6 * test_lgb + 0.4 * test_xgb

    return oof_ensemble, test_ensemble

# ===================================================================================
# 5. Inference and Submission
# ===================================================================================
oof, test_preds = train_ensemble(X, y, X['id2'], X_test)
train['pred'] = oof

print("📤 Generating predictions and evaluating MAP@7...")
oof_ranked = train.sort_values(['id2','pred'], ascending=[True,False]).groupby('id2')['id3'].apply(list).reset_index(name='predicted_offers')
true_offers = train[train['y']==1].groupby('id2')['id3'].apply(list).reset_index(name='true_offers')
eval_df = oof_ranked.merge(true_offers, on='id2', how='left')
eval_df['true_offers'] = eval_df['true_offers'].apply(lambda x: x if isinstance(x, list) else [])
final_score = mapk(eval_df['true_offers'], eval_df['predicted_offers'])
print(f"\n📊 Final OOF MAP@7 Score: {final_score:.6f}")

print("🧪 Generating submission file...")
scaler = MinMaxScaler()
test['pred'] = scaler.fit_transform(test_preds.reshape(-1, 1)).flatten()
submission = test[['id1', 'id2', 'id3', 'id5', 'pred']].copy()
submission['id5'] = pd.to_datetime(submission['id5']).dt.strftime('%m/%d/%Y')
submission.to_csv('submission.csv', index=False)
print("\n✅ Submission file saved as 'submission.csv'")


📥 Loading datasets...
🔧 Preprocessing features...
🛠️  Feature engineering...


TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRanker
import gc
import warnings

warnings.filterwarnings('ignore')

# ===================================================================================
# 1. Custom MAP@7 Metric
# ===================================================================================
def mapk(actuals, predicteds, k=7):
    return np.mean([apk(a, p, k) for a, p in zip(actuals, predicteds)])

def apk(actual, predicted, k=7):
    if len(predicted) > k: predicted = predicted[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    if not actual: return 0.0
    return score / min(len(actual), k)

# ===================================================================================
# 2. Load and Prepare Data
# ===================================================================================
print("📥 Loading datasets...")
train = pd.read_parquet('train_data.parquet')
test = pd.read_parquet('test_data.parquet')
offer_meta = pd.read_parquet('offer_metadata.parquet')
events = pd.read_parquet('add_event.parquet')
trans = pd.read_parquet('add_trans.parquet')

print("🔧 Preprocessing features...")
for df in [train, test]:
    for col in df.columns:
        if col.startswith('f'):
            df[col] = pd.to_numeric(df[col], errors='coerce')
    df['id5'] = pd.to_datetime(df['id5'], errors='coerce')
    if 'y' in df.columns:
        df['y'] = pd.to_numeric(df['y'], errors='coerce').fillna(0).astype(float)

for df in [train, test, offer_meta, events, trans]:
    for col in ['id2', 'id3', 'customer_id', 'offer_id']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

offer_meta.rename(columns={'id3': 'offer_id', 'id9': 'industry_code', 'id12': 'start_date', 'id13': 'end_date'}, inplace=True)
events.rename(columns={'id2': 'customer_id', 'id3': 'offer_id', 'id4': 'impression_timestamp', 'id7': 'click_timestamp'}, inplace=True)
trans.rename(columns={'id2': 'customer_id', 'f367': 'transaction_amount', 'f370': 'transaction_date_d', 'f371': 'transaction_date_t'}, inplace=True)
trans['transaction_date'] = pd.to_datetime(trans['transaction_date_d'] + ' ' + trans['transaction_date_t'], errors='coerce')

# ===================================================================================
# 3. Feature Engineering
# ===================================================================================
def feature_engineer(df):
    print("🛠️  Feature engineering...")
    df = df.merge(offer_meta, left_on='id3', right_on='offer_id', how='left')
    df['imp_date'] = pd.to_datetime(df['id5'])
    df['imp_dow'] = df['imp_date'].dt.dayofweek
    df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')
    df['end_date'] = pd.to_datetime(df['end_date'], errors='coerce')
    df['offer_duration'] = (df['end_date'] - df['start_date']).dt.days

    events['click_timestamp'] = pd.to_datetime(events['click_timestamp'], errors='coerce')
    events['impression_timestamp'] = pd.to_datetime(events['impression_timestamp'], errors='coerce')
    events['has_clicked'] = events['click_timestamp'].notna().astype(int)

    cust_agg = events.groupby('customer_id').agg(
        total_customer_imps=('impression_timestamp', 'count'),
        total_customer_clicks=('has_clicked', 'sum'),
        last_click_date=('click_timestamp', lambda x: pd.to_datetime(x).max())
    ).reset_index()
    df = df.merge(cust_agg, left_on='id2', right_on='customer_id', how='left')
    df['days_since_last_any_click'] = (df['imp_date'] - df['last_click_date']).dt.days

    cust_industry_agg = events.merge(offer_meta[['offer_id', 'industry_code']], on='offer_id')
    cust_industry_agg = cust_industry_agg.groupby(['customer_id', 'industry_code']).agg(
        customer_industry_imps=('impression_timestamp', 'count'),
        customer_industry_clicks=('has_clicked', 'sum')
    ).reset_index()
    df = df.merge(cust_industry_agg, left_on=['id2', 'industry_code'], right_on=['customer_id', 'industry_code'], how='left')

    trans['transaction_amount'] = pd.to_numeric(trans['transaction_amount'], errors='coerce')
    trans_agg = trans.groupby('customer_id').agg(
        avg_spend=('transaction_amount', 'mean'),
        last_trans_date=('transaction_date', lambda x: pd.to_datetime(x).max())
    ).reset_index()
    df = df.merge(trans_agg, left_on='id2', right_on='customer_id', how='left')

    return df

train = feature_engineer(train)
test = feature_engineer(test)

feature_cols = [c for c in train.columns if c.startswith('f') and train[c].dtype in [np.float32, np.float64, np.int32, np.int64]] + [
    'imp_dow', 'offer_duration', 'total_customer_imps', 'total_customer_clicks',
    'days_since_last_any_click', 'customer_industry_imps', 'customer_industry_clicks', 'avg_spend']

train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

X = train[feature_cols + ['id2']]
y = train['y']
X_test = test[feature_cols + ['id2']]

# ===================================================================================
# 4. Ensemble Training with Hyperparameter Tuning
# ===================================================================================
def train_ensemble(X, y, groups, X_test):
    print("📈 Training ensemble model...")
    gkf = GroupKFold(n_splits=5)
    oof_lgb, oof_xgb = np.zeros(len(X)), np.zeros(len(X))
    test_lgb, test_xgb = np.zeros(len(X_test)), np.zeros(len(X_test))

    for fold, (t_idx, v_idx) in enumerate(gkf.split(X, y, groups)):
        print(f"\n🔁 Fold {fold+1}/5")
        Xt, Xv = X.iloc[t_idx], X.iloc[v_idx]
        yt, yv = y.iloc[t_idx], y.iloc[v_idx]

        gr_t = Xt['id2'].value_counts().sort_index().values
        gr_v = Xv['id2'].value_counts().sort_index().values

        Xt_, Xv_ = Xt.drop('id2', axis=1), Xv.drop('id2', axis=1)
        Xtest_ = X_test.drop('id2', axis=1)

        model_lgb = lgb.LGBMRanker(objective='lambdarank', metric='map', eval_at=7, n_estimators=500, learning_rate=0.05, num_leaves=80)
        model_lgb.fit(Xt_, yt, group=gr_t, eval_set=[(Xv_, yv)], eval_group=[gr_v], callbacks=[lgb.early_stopping(50)])
        oof_lgb[v_idx] = model_lgb.predict(Xv_)
        test_lgb += model_lgb.predict(Xtest_)

        model_xgb = XGBRanker(objective='rank:pairwise', learning_rate=0.05, n_estimators=500, max_depth=6, verbosity=0)
        model_xgb.fit(Xt_, yt, group=gr_t)
        oof_xgb[v_idx] = model_xgb.predict(Xv_)
        test_xgb += model_xgb.predict(Xtest_)

    test_lgb /= gkf.get_n_splits()
    test_xgb /= gkf.get_n_splits()

    print("🔗 Blending predictions...")
    oof_ensemble = 0.6 * oof_lgb + 0.4 * oof_xgb
    test_ensemble = 0.6 * test_lgb + 0.4 * test_xgb

    return oof_ensemble, test_ensemble

# ===================================================================================
# 5. Inference and Submission
# ===================================================================================
oof, test_preds = train_ensemble(X, y, X['id2'], X_test)
train['pred'] = oof

print("📤 Generating predictions and evaluating MAP@7...")
oof_ranked = train.sort_values(['id2','pred'], ascending=[True,False]).groupby('id2')['id3'].apply(list).reset_index(name='predicted_offers')
true_offers = train[train['y']==1].groupby('id2')['id3'].apply(list).reset_index(name='true_offers')
eval_df = oof_ranked.merge(true_offers, on='id2', how='left')
eval_df['true_offers'] = eval_df['true_offers'].apply(lambda x: x if isinstance(x, list) else [])
final_score = mapk(eval_df['true_offers'], eval_df['predicted_offers'])
print(f"\n📊 Final OOF MAP@7 Score: {final_score:.6f}")

print("🧪 Generating submission file...")
scaler = MinMaxScaler()
test['pred'] = scaler.fit_transform(test_preds.reshape(-1, 1)).flatten()
submission = test[['id1', 'id2', 'id3', 'id5', 'pred']].copy()
submission['id5'] = pd.to_datetime(submission['id5']).dt.strftime('%m/%d/%Y')
submission.to_csv('submission.csv', index=False)
print("\n✅ Submission file saved as 'submission.csv'")


📥 Loading datasets...


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
import gc
import warnings

warnings.filterwarnings('ignore')

# ===================================================================================
# 1. Custom MAP@7 Metric
# ===================================================================================
def mapk(actuals, predicteds, k=7):
    return np.mean([apk(a, p, k) for a, p in zip(actuals, predicteds)])

def apk(actual, predicted, k=7):
    if len(predicted) > k: predicted = predicted[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    if not actual: return 0.0
    return score / min(len(actual), k)

# ===================================================================================
# 2. Load and Prepare Data
# ===================================================================================
print("📥 Loading datasets...")
train = pd.read_parquet('train_data.parquet')
test = pd.read_parquet('test_data.parquet')
offer_meta = pd.read_parquet('offer_metadata.parquet')
events = pd.read_parquet('add_event.parquet')
trans = pd.read_parquet('add_trans.parquet')

print("🔧 Preprocessing features...")
for df in [train, test]:
    for col in df.columns:
        if col.startswith('f'):
            df[col] = pd.to_numeric(df[col], errors='coerce')
    df['id5'] = pd.to_datetime(df['id5'], errors='coerce')
    if 'y' in df.columns:
        df['y'] = pd.to_numeric(df['y'], errors='coerce').fillna(0).astype(float)

for df in [train, test, offer_meta, events, trans]:
    for col in ['id2', 'id3', 'customer_id', 'offer_id']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

offer_meta.rename(columns={'id3': 'offer_id', 'id9': 'industry_code', 'id12': 'start_date', 'id13': 'end_date'}, inplace=True)
events.rename(columns={'id2': 'customer_id', 'id3': 'offer_id', 'id4': 'impression_timestamp', 'id7': 'click_timestamp'}, inplace=True)
trans.rename(columns={'id2': 'customer_id', 'f367': 'transaction_amount', 'f370': 'transaction_date_d', 'f371': 'transaction_date_t'}, inplace=True)
trans['transaction_date'] = pd.to_datetime(trans['transaction_date_d'] + ' ' + trans['transaction_date_t'], errors='coerce')

# ===================================================================================
# 3. Feature Engineering
# ===================================================================================
def feature_engineer(df):
    print("🛠️  Feature engineering...")
    df = df.merge(offer_meta, left_on='id3', right_on='offer_id', how='left')
    df['imp_date'] = pd.to_datetime(df['id5'])
    df['imp_dow'] = df['imp_date'].dt.dayofweek
    df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')
    df['end_date'] = pd.to_datetime(df['end_date'], errors='coerce')
    df['offer_duration'] = (df['end_date'] - df['start_date']).dt.days

    events['click_timestamp'] = pd.to_datetime(events['click_timestamp'], errors='coerce')
    events['impression_timestamp'] = pd.to_datetime(events['impression_timestamp'], errors='coerce')
    events['has_clicked'] = events['click_timestamp'].notna().astype(int)

    cust_agg = events.groupby('customer_id').agg(
        total_customer_imps=('impression_timestamp', 'count'),
        total_customer_clicks=('has_clicked', 'sum'),
        last_click_date=('click_timestamp', lambda x: pd.to_datetime(x.dropna()).max())
    ).reset_index()
    df = df.merge(cust_agg, left_on='id2', right_on='customer_id', how='left')
    df['days_since_last_any_click'] = (df['imp_date'] - df['last_click_date']).dt.days

    cust_industry_agg = events.merge(offer_meta[['offer_id', 'industry_code']], on='offer_id')
    cust_industry_agg = cust_industry_agg.groupby(['customer_id', 'industry_code']).agg(
        customer_industry_imps=('impression_timestamp', 'count'),
        customer_industry_clicks=('has_clicked', 'sum')
    ).reset_index()
    df = df.merge(cust_industry_agg, left_on=['id2', 'industry_code'], right_on=['customer_id', 'industry_code'], how='left')

    trans['transaction_amount'] = pd.to_numeric(trans['transaction_amount'], errors='coerce')
    trans_agg = trans.groupby('customer_id').agg(
        avg_spend=('transaction_amount', 'mean'),
        last_trans_date=('transaction_date', lambda x: pd.to_datetime(x.dropna()).max())
    ).reset_index()
    df = df.merge(trans_agg, left_on='id2', right_on='customer_id', how='left')

    return df

train = feature_engineer(train)
test = feature_engineer(test)

feature_cols = [c for c in train.columns if c.startswith('f') and train[c].dtype in [np.float32, np.float64, np.int32, np.int64]] + [
    'imp_dow', 'offer_duration', 'total_customer_imps', 'total_customer_clicks',
    'days_since_last_any_click', 'customer_industry_imps', 'customer_industry_clicks', 'avg_spend']

train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

X = train[feature_cols + ['id2']]
y = train['y']
X_test = test[feature_cols + ['id2']]

# ===================================================================================
# 4. LightGBM Training with Hyperparameter Tuning
# ===================================================================================
def train_lgb(X, y, groups, X_test):
    print("📈 Training LightGBM model...")
    gkf = GroupKFold(n_splits=5)
    oof = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))

    params = {
        'objective': 'lambdarank',
        'metric': 'map',
        'eval_at': 7,
        'learning_rate': 0.05,
        'num_leaves': 64,
        'n_estimators': 300,
        'verbose': -1
    }

    for fold, (t_idx, v_idx) in enumerate(gkf.split(X, y, groups)):
        print(f"\n🔁 Fold {fold+1}/5")
        Xt, Xv = X.iloc[t_idx], X.iloc[v_idx]
        yt, yv = y.iloc[t_idx], y.iloc[v_idx]

        gr_t = Xt['id2'].value_counts().sort_index().values
        gr_v = Xv['id2'].value_counts().sort_index().values

        Xt_, Xv_ = Xt.drop('id2', axis=1), Xv.drop('id2', axis=1)
        Xtest_ = X_test.drop('id2', axis=1)

        model = lgb.LGBMRanker(**params)
        model.fit(Xt_, yt, group=gr_t, eval_set=[(Xv_, yv)], eval_group=[gr_v], callbacks=[lgb.early_stopping(50)])
        oof[v_idx] = model.predict(Xv_)
        test_preds += model.predict(Xtest_)

    test_preds /= gkf.get_n_splits()
    return oof, test_preds

# ===================================================================================
# 5. Inference and Submission
# ===================================================================================
oof, test_preds = train_lgb(X, y, X['id2'], X_test)
train['pred'] = oof

print("📤 Generating predictions and evaluating MAP@7...")
oof_ranked = train.sort_values(['id2','pred'], ascending=[True,False]).groupby('id2')['id3'].apply(list).reset_index(name='predicted_offers')
true_offers = train[train['y']==1].groupby('id2')['id3'].apply(list).reset_index(name='true_offers')
eval_df = oof_ranked.merge(true_offers, on='id2', how='left')
eval_df['true_offers'] = eval_df['true_offers'].apply(lambda x: x if isinstance(x, list) else [])
final_score = mapk(eval_df['true_offers'], eval_df['predicted_offers'])
print(f"\n📊 Final OOF MAP@7 Score: {final_score:.6f}")

print("🧪 Generating submission file...")
scaler = MinMaxScaler()
test['pred'] = scaler.fit_transform(test_preds.reshape(-1, 1)).flatten()
submission = test[['id1', 'id2', 'id3', 'id5', 'pred']].copy()
submission['id5'] = pd.to_datetime(submission['id5']).dt.strftime('%m/%d/%Y')
submission.to_csv('submissionvery.csv', index=False)
print("\n✅ Submission file saved as 'submission.csv'")


📥 Loading datasets...
🔧 Preprocessing features...
🛠️  Feature engineering...
🛠️  Feature engineering...
