In [None]:
import warnings
from itertools import combinations
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from lightgbm import early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and data-quality
# warnings from any library are hidden as well.
warnings.simplefilter('ignore')

In [None]:
# Read the four raw tables.
train = pd.read_csv("data/train_dataset.csv")
user = pd.read_csv("data/user_features.csv")
ad = pd.read_csv("data/ad_features.csv")
test = pd.read_csv("data/test_dataset.csv")


def _attach_side_tables(events):
    """Left-join the user table (on user_id), then the ad table (on ad_id), onto `events`."""
    with_user = events.merge(user, on='user_id', how='left')
    return with_user.merge(ad, on='ad_id', how='left')


# Build the wide train and test tables with the same two joins.
train = _attach_side_tables(train)
test = _attach_side_tables(test)

# Stack train and test so later feature steps run on both at once.
all_data = pd.concat([train, test], ignore_index=True)

In [None]:
# Feature engineering
def add_features(df, interaction_cols=None):
    """Return a copy of ``df`` extended with time and pairwise-interaction features.

    The input frame is left untouched; all new columns are written to a copy,
    so calling this function repeatedly on the same input gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``exposure_time`` and every column listed in
        ``interaction_cols``.
    interaction_cols : sequence of str, optional
        Columns to combine pairwise with ``+``, ``-`` and ``*``. Defaults to
        ``activity_score``, ``product_price``, ``advertiser_score`` and
        ``historical_ctr``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``exposure_time`` is parsed to datetime and the
        columns ``hour``, ``weekday``, ``is_weekend``, ``hour_sin``,
        ``hour_cos`` and ``{a}_{b}_sum`` / ``_diff`` / ``_prod`` for every
        column pair ``(a, b)`` have been added.
    """
    if interaction_cols is None:
        interaction_cols = ['activity_score', 'product_price', 'advertiser_score', 'historical_ctr']

    # Work on a copy so the caller's DataFrame is not modified as a side effect.
    df = df.copy()

    # Time features. errors='coerce' turns unparseable timestamps into NaT,
    # which yields NaN for hour/weekday and 0 for is_weekend on those rows.
    df['exposure_time'] = pd.to_datetime(df['exposure_time'], errors='coerce')
    df['hour'] = df['exposure_time'].dt.hour
    df['weekday'] = df['exposure_time'].dt.dayofweek
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)

    # Cyclical encoding of the hour so that 23:00 and 00:00 end up close together.
    hour_angle = 2 * np.pi * df['hour'] / 24
    df['hour_sin'] = np.sin(hour_angle)
    df['hour_cos'] = np.cos(hour_angle)

    # Pairwise interactions; combinations() yields each unordered pair once,
    # in the same order as a nested i < j loop.
    for a, b in combinations(interaction_cols, 2):
        df[f'{a}_{b}_sum'] = df[a] + df[b]
        df[f'{a}_{b}_diff'] = df[a] - df[b]
        df[f'{a}_{b}_prod'] = df[a] * df[b]
    return df

# Run the feature-engineering step on the combined train + test frame.
all_data = all_data.pipe(add_features)

In [None]:
# Categorical encoding: integer codes plus a frequency feature per column.
# NOTE(review): this cell overwrites the columns it reads, so running it a
# second time re-encodes the integer codes as strings — restart the kernel
# instead of re-running it.
cat_cols = ['occupation', 'region', 'device', 'category', 'material_type']
for col in cat_cols:
    # Values are stringified first, so a missing value becomes an ordinary
    # string label and receives a code of its own.
    encoder = LabelEncoder()
    codes = encoder.fit_transform(all_data[col].astype(str))
    all_data[col] = codes

    # Frequency feature: number of rows in all_data sharing the same code.
    code_freq = all_data[col].value_counts()
    all_data[f'{col}_count'] = all_data[col].map(code_freq)

In [None]:
# Split the combined frame back into train / test: rows with an is_click
# label are training rows, rows without one are test rows.
has_label = all_data['is_click'].notnull()
train_set = all_data[has_label].reset_index(drop=True)
test_set = all_data[~has_label].reset_index(drop=True)

# Everything except the identifiers, the timestamp and the target is a feature
# (column order is kept as-is).
drop_cols = ['user_id', 'ad_id', 'exposure_time', 'is_click']
features = train_set.columns[~train_set.columns.isin(drop_cols)].tolist()
len(features)

In [None]:
# Model inputs: the feature matrix and the click target.
X = train_set.loc[:, features]
y = train_set.loc[:, 'is_click']

In [None]:
# K-fold cross-validation.
# Fold counts tried so far: k=5, k=10, k=15.
k = 15
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# The hyper-parameters are identical for every fold, so build the dict once.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 31,
    'max_depth': 7,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'seed': 42,
    'verbosity': -1,
    'bagging_freq': 5,
    'lambda_l1': 1.0,
    'lambda_l2': 1.0
}

auc_scores, models = [], []

for fold, (fit_rows, holdout_rows) in enumerate(skf.split(X, y), start=1):
    print(f"\nFold {fold}")

    # Slice this fold's fitting and hold-out partitions.
    X_tr, y_tr = X.iloc[fit_rows], y.iloc[fit_rows]
    X_val, y_val = X.iloc[holdout_rows], y.iloc[holdout_rows]

    dtrain = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_cols)
    dval = lgb.Dataset(X_val, label=y_val, categorical_feature=cat_cols)

    # Fresh callbacks per fold: early stopping watches the hold-out AUC,
    # and the metric is logged every 100 rounds.
    fold_callbacks = [
        early_stopping(stopping_rounds=150),
        log_evaluation(period=100),
    ]
    model = lgb.train(
        params,
        dtrain,
        num_boost_round=2000,  # upper bound on boosting rounds
        valid_sets=[dval],
        callbacks=fold_callbacks,
    )

    # Score the hold-out partition at the best iteration found.
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    auc = roc_auc_score(y_val, val_pred)
    print(f"Fold {fold} AUC: {auc:.4f}")

    models.append(model)
    auc_scores.append(auc)

print(f"\n平均AUC: {np.mean(auc_scores):.4f}")

In [None]:
# Predict on the test rows: average the predictions of the k fold models.
X_test = test_set[features]  # build the feature matrix once, not once per model
test_preds = np.mean(
    [m.predict(X_test, num_iteration=m.best_iteration) for m in models],
    axis=0
)

# Export the result.
# NOTE(review): exposure_time was converted to datetime during feature
# engineering, so it is written in pandas' datetime format (unparseable values
# as empty) rather than as the original string — confirm this matches the key
# format the submission is expected to carry.
submission = test_set[['user_id', 'ad_id', 'exposure_time']].copy()
submission['prediction'] = test_preds

submission_path = Path("/work/prediction/submission.csv")
# to_csv does not create missing directories; create the target first so the
# run does not fail at the very last step after all folds have been trained.
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)