In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import re
import warnings
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupKFold
from sklearn.metrics import average_precision_score
import matplotlib.pyplot as plt

In [2]:
warnings.filterwarnings('ignore')


In [3]:
# Read the pre-engineered feature tables (train first, then test) from the
# current working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("Featured_train.csv", "Featured_test.csv")
)

In [4]:
# Identifier columns and the label are excluded from the model inputs;
# every other column of `train` is treated as a feature.
id_columns = ['id1', 'id2', 'id3', 'id5']
target_col = 'y'
feature_cols = [
    name for name in train.columns
    if name != target_col and name not in id_columns
]

# Work on copies so later edits to X / X_test never write through to the
# frames that were loaded from disk.
X, X_test = train[feature_cols].copy(), test[feature_cols].copy()
y = train[target_col].copy()

# id2 is the grouping key used below for per-group ranks, GroupKFold and the
# LightGBM query groups.
groups = train['id2'].copy()


In [5]:
# Collapse every run of non-word characters in a feature name into a single
# underscore (presumably because LightGBM refuses feature names containing
# special JSON characters). Both frames get the same mapping so their columns
# stay aligned.
X.columns = [re.sub(r'\W+', '_', col) for col in X.columns]
X_test.columns = [re.sub(r'\W+', '_', col) for col in X_test.columns]

# Two distinct raw names can sanitise to the same string; fail here with a
# clear message instead of with an obscure error further down the pipeline.
colliding_names = X.columns[X.columns.duplicated()].tolist()
if colliding_names:
    raise ValueError(f"Sanitised feature names are not unique: {colliding_names}")

# Impute with statistics learned from the TRAINING data only and reuse them
# for the test set, so a missing value is replaced by the same number at
# training and at inference time (the test set previously used its own
# medians, which shifts the imputed value between the two frames).
# Columns that are entirely NaN in train have a NaN median and stay NaN.
train_medians = X.median()
X = X.fillna(train_medians)
X_test = X_test.fillna(train_medians)

In [None]:
def _add_group_rank_features(frame, keys):
    """Return `frame` with a `<col>_rank` column appended for every column.

    Each new column holds the percentile rank (`rank(pct=True)`) of the
    original column computed separately within each group of `keys`.
    All rank columns are computed in one grouped call and attached with a
    single concat; inserting them one at a time fragments the frame's
    internal blocks and becomes slow when there are many features.

    Parameters
    ----------
    frame : pd.DataFrame
        Feature table; its existing columns are kept first, in order.
    keys : pd.Series
        Group label per row, aligned to `frame` by index.

    Returns
    -------
    pd.DataFrame
        New frame: original columns followed by their `_rank` counterparts.
    """
    rank_features = frame.groupby(keys).rank(pct=True).add_suffix('_rank')
    return pd.concat([frame, rank_features], axis=1)


# Rank every feature within its id2 group, for train and test separately.
X = _add_group_rank_features(X, groups)
X_test = _add_group_rank_features(X_test, test['id2'])

# Ranks are NaN wherever the underlying value (or the group key) is NaN.
# NOTE(review): only the test frame gets zero-filled here; X keeps its NaNs.
# Confirm that asymmetry is intended.
X_test = X_test.fillna(0)

In [None]:
# Group-aware cross-validation: all rows sharing an id2 land in the same fold,
# so a validation fold never contains a group that was also trained on.
n_splits = 5
kf = GroupKFold(n_splits=n_splits)

# Materialise the (train positions, validation positions) pairs once so the
# identical split can be reused by every later cell.
folds = [
    (fit_rows, holdout_rows)
    for fit_rows, holdout_rows in kf.split(X, y, groups=groups)
]

In [None]:
# LightGBM configuration for the learning-to-rank model.
lgb_params = {
    # Task: LambdaRank objective, validated with NDCG truncated at position 7
    # (same cut-off as the MAP@7 computed on the out-of-fold predictions).
    "objective": "lambdarank",
    "metric": "ndcg",
    "ndcg_eval_at": [7],
    # Boosting schedule and tree shape.
    "learning_rate": 0.02,
    "num_leaves": 85,
    "max_depth": 12,
    "min_data_in_leaf": 30,
    # Column / row subsampling; rows are re-drawn every `bagging_freq` rounds.
    "feature_fraction": 0.85,
    "bagging_fraction": 0.85,
    "bagging_freq": 5,
    # L1 / L2 regularisation.
    "lambda_l1": 1,
    "lambda_l2": 1,
    # Upper bound on boosting rounds; the training loop adds early stopping.
    "n_estimators": 1000,
    "random_state": 42,
    "verbosity": -1,
}


In [None]:
def map_at_7(y_true, y_score, user_ids, k=7):
    """Mean of per-user average precision over each user's top-k scored rows.

    Parameters
    ----------
    y_true : array-like
        Relevance label per row.
    y_score : array-like
        Model score per row (higher means ranked earlier).
    user_ids : array-like
        Grouping key per row; the metric is computed per distinct value.
    k : int, default 7
        Number of highest-scored rows kept for each user.

    Returns
    -------
    float
        Mean of `average_precision_score` over the users that have at least
        one positive label among their top-k rows. NaN when no user qualifies
        (mean of an empty list).

    Notes
    -----
    NOTE(review): users whose top-k rows contain no positive are skipped
    rather than counted as 0 -- confirm this matches the target metric.
    """
    frame = pd.DataFrame({'y': y_true, 'score': y_score, 'user': user_ids})
    per_user_ap = []
    for _, user_rows in frame.groupby('user'):
        top_k = user_rows.sort_values('score', ascending=False).head(k)
        # Nothing relevant in the top-k: this user contributes no term.
        if not top_k['y'].sum() > 0:
            continue
        per_user_ap.append(average_precision_score(top_k['y'], top_k['score']))
    return np.mean(per_user_ap)


In [None]:
# Zero-filled accumulators: one out-of-fold slot per training row and one
# fold-averaged slot per test row.
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

In [None]:
# Start from fresh accumulators so that re-running this cell on its own does
# not add a second set of fold predictions onto test_preds (it is built up
# with += below).
oof_preds = np.zeros(X.shape[0])
test_preds = np.zeros(X_test.shape[0])

for fold, (train_idx, val_idx) in enumerate(folds):
    print(f"\nFold {fold+1}")

    # LightGBM reads `group` as consecutive run lengths: the first group[0]
    # rows form query 0, the next group[1] rows form query 1, and so on.
    # The rows must therefore be contiguous per id2 and appear in the same
    # order as the size vector. Sort each fold's positions by id2 first
    # (stable, so the original row order inside a group is kept); without
    # this the sizes only line up if the file happens to be sorted by id2.
    train_idx = train_idx[np.argsort(groups.iloc[train_idx].to_numpy(), kind='stable')]
    val_idx = val_idx[np.argsort(groups.iloc[val_idx].to_numpy(), kind='stable')]

    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    groups_train = groups.iloc[train_idx]
    groups_val = groups.iloc[val_idx]

    # Rows are now contiguous per id2, so sizes taken in order of first
    # appearance match the row layout exactly.
    train_group_sizes = groups_train.groupby(groups_train, sort=False).size().to_numpy()
    val_group_sizes = groups_val.groupby(groups_val, sort=False).size().to_numpy()

    lgb_train = lgb.Dataset(X_train, y_train, group=train_group_sizes)
    lgb_val = lgb.Dataset(X_val, y_val, group=val_group_sizes)

    # Stop once the validation metric has not improved for 100 rounds;
    # log the metric every 100 rounds.
    model = lgb.train(
        lgb_params,
        lgb_train,
        valid_sets=[lgb_val],
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)]
    )

    # val_idx was reordered together with X_val, so each prediction is written
    # back to the position of the row it was computed for.
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    oof_preds[val_idx] = val_pred

    # Average the test predictions of the per-fold models.
    test_preds += model.predict(X_test, num_iteration=model.best_iteration) / n_splits


In [30]:
# Score the out-of-fold predictions per id2 group; each row was predicted by
# the one fold model that did not train on it.
map7_score = map_at_7(y_true=y, y_score=oof_preds, user_ids=groups)
print(f"\n📈 Overall MAP@7: {map7_score:.5f}")


📈 Overall MAP@7: 0.72095


In [None]:
def _min_max_within_group(scores):
    """Rescale one group's scores to [0, 1]; constant groups become 0.5."""
    lowest, highest = scores.min(), scores.max()
    if highest > lowest:
        return (scores - lowest) / (highest - lowest)
    return 0.5


# Identifier columns plus the fold-averaged model score.
submission = test[['id1', 'id2', 'id3', 'id5']].assign(pred=test_preds)

# Min-max scaling is strictly increasing inside a group, so the ordering of
# rows within each id2 is unchanged; only the score range is normalised.
submission['pred'] = submission.groupby('id2')['pred'].transform(_min_max_within_group)
submission.to_csv("submission_map7_optimized.csv", index=False)