# In [None]:  (Jupyter cell marker left over from the notebook export)
import pandas as pd
import numpy as np
import lightgbm as lgb
from collections import defaultdict
from tqdm import tqdm
import warnings
import time

# NOTE: this silences every warning, including pandas deprecation notices.
warnings.filterwarnings('ignore')

# --- 1. Data loading and global feature pre-computation ---
print("--- 1. Loading and Pre-calculating Global Features ---")
start_time = time.time()

# Input files. Kept in one place so the error message below always names the
# paths that are actually read.
TRAIN_PATH = 'data/Antai_hackathon_train.csv'
TEST_PATH = 'data/dianshang_test.csv'
ITEM_ATTR_PATH = 'data/Antai_hackathon_attr.csv'

# Load the purchase logs and the item attribute table.
try:
    train_df = pd.read_csv(TRAIN_PATH, parse_dates=['create_order_time'])
    test_df = pd.read_csv(TEST_PATH, parse_dates=['create_order_time'])
    item_attr_df = pd.read_csv(ITEM_ATTR_PATH)
except FileNotFoundError as err:
    print(f"错误：请确保 {TRAIN_PATH}, {TEST_PATH}, {ITEM_ATTR_PATH} 文件存在！")
    # Stop with a non-zero status; `exit()` is an interactive helper injected
    # by the `site` module and reports success (status 0).
    raise SystemExit(1) from err

# Attach item attributes to every purchase row (rows whose item is missing
# from the attribute table keep NaN attributes because of the left join).
train_df = pd.merge(train_df, item_attr_df, on='item_id', how='left')
test_df = pd.merge(test_df, item_attr_df, on='item_id', how='left')

# Guarantee a datetime dtype even if `parse_dates` left the column unparsed.
train_df['create_order_time'] = pd.to_datetime(train_df['create_order_time'])
test_df['create_order_time'] = pd.to_datetime(test_df['create_order_time'])

# Order each buyer's purchases chronologically for the sequence features below.
train_df = train_df.sort_values(by=['buyer_admin_id', 'create_order_time'])
test_df = test_df.sort_values(by=['buyer_admin_id', 'create_order_time'])


# a. Item-side features: the static attribute table enriched with how often
#    the item, its category and its store occur in the training log.
print("Pre-calculating item features...")
item_pop = train_df['item_id'].value_counts().rename_axis('item_id').reset_index(name='item_pop')
cate_pop = train_df['cate_id'].value_counts().rename_axis('cate_id').reset_index(name='cate_pop')
store_pop = train_df['store_id'].value_counts().rename_axis('store_id').reset_index(name='store_pop')

item_features = (
    item_attr_df.copy()
    .merge(item_pop, on='item_id', how='left')
    .merge(cate_pop, on='cate_id', how='left')
    .merge(store_pop, on='store_id', how='left')
)
# Items/categories/stores never seen in the training log get a count of 0.
item_features = item_features.fillna(0)
# Index by item_id so later joins can look rows up by key.
item_features = item_features.set_index('item_id')

# b. User-side features: one row per buyer, indexed by buyer_admin_id
#    (the groupby key becomes the index of the aggregated frame).
print("Pre-calculating user features...")
purchases_by_buyer = train_df.groupby('buyer_admin_id')
user_features = purchases_by_buyer.agg(
    user_total_purchases=('item_id', 'count'),
    user_unique_items=('item_id', 'nunique'),
    user_unique_cates=('cate_id', 'nunique'),
    user_avg_price=('item_price', 'mean'),
)

# c. Sequential Item-CF recall model.
#    item_cf_dict[a][b] counts how many times item b was the purchase that
#    immediately followed item a in a buyer's time-ordered history.
print("Building ItemCF recall model...")
item_cf_dict = defaultdict(lambda: defaultdict(int))
train_df = train_df.sort_values(by=['buyer_admin_id', 'create_order_time'])
buyer_groups = train_df.groupby('buyer_admin_id')
for user_id, user_group in tqdm(buyer_groups, desc="Building ItemCF"):
    item_sequence = user_group['item_id'].tolist()
    # Walk consecutive (current, next) purchase pairs.
    for current_item, next_item in zip(item_sequence, item_sequence[1:]):
        item_cf_dict[current_item][next_item] += 1

# The 200 most frequently purchased items, used as a popularity fallback.
global_top_items = train_df['item_id'].value_counts().index[:200].tolist()
elapsed_minutes = (time.time() - start_time) / 60
print(f"Preprocessing finished in {elapsed_minutes:.2f} minutes.")


# --- 2. Recall and training-set construction (vectorized) ---
print("\n--- 2. Building Training Set (Vectorized) ---")
start_time = time.time()

# a. Split buyers (not rows) into training and validation groups so that a
#    buyer's whole history lands on one side of the split.
SPLIT_SEED = 42        # fixed seed: the split is identical on every run
TRAIN_FRACTION = 0.8   # share of buyers used for training

split_rng = np.random.default_rng(SPLIT_SEED)
all_train_users = split_rng.permutation(train_df['buyer_admin_id'].unique())
split_idx = int(len(all_train_users) * TRAIN_FRACTION)
train_user_ids, val_user_ids = all_train_users[:split_idx], all_train_users[split_idx:]

train_part = train_df[train_df['buyer_admin_id'].isin(train_user_ids)]
val_part = train_df[train_df['buyer_admin_id'].isin(val_user_ids)]

def generate_candidates_and_features(data, is_training=True, max_candidates=300, cf_top_k=50):
    """Build the (buyer, candidate item) table with ranking features.

    For every buyer in ``data`` a candidate list is recalled from three
    sources, in priority order: the buyer's own purchase history (most recent
    first), the ``cf_top_k`` strongest Item-CF successors of the buyer's most
    recent purchase, and the global most-popular items. The de-duplicated list
    is cut to ``max_candidates``.

    In training mode the row with ``irank == 1`` is the prediction target. It
    is removed from the history *before* recall and feature computation so
    that the target can never describe itself (it is not automatically a
    candidate, it does not count towards ``times_bought``, and it is not used
    as the "last item"). Buyers whose target is not recalled end up with only
    ``label == 0`` rows.

    Args:
        data: Purchase log with at least ``buyer_admin_id``, ``item_id``,
            ``create_order_time``, ``cate_id`` and ``item_price`` columns,
            plus ``irank`` when ``is_training`` is true.
        is_training: When true, hold out the ``irank == 1`` rows as targets
            and add a ``label`` column (1 for the target item, else 0).
        max_candidates: Maximum number of candidate items kept per buyer.
        cf_top_k: Number of Item-CF neighbours recalled per buyer.

    Returns:
        DataFrame with one row per (buyer, candidate item), grouped by buyer
        in ascending ``buyer_admin_id`` order, holding item-side, user-side
        and cross features; missing values are filled with 0.

    Uses the module-level ``item_cf_dict``, ``global_top_items`` and
    ``item_features``.
    """
    # NOTE(review): item_cf_dict and the popularity counts are built from the
    # full training log, so for training buyers they still include the held-out
    # transition/purchase. Confirm whether that residual leakage matters.

    # 0. Separate the prediction target from the observable history.
    if is_training:
        is_target = data['irank'] == 1
        targets = (
            data.loc[is_target, ['buyer_admin_id', 'item_id']]
            .drop_duplicates()  # keeps the label merge strictly one-to-one
            .assign(label=1)
        )
        history = data.loc[~is_target]
    else:
        targets = None
        history = data

    history = history.sort_values(['buyer_admin_id', 'create_order_time'])
    items_by_user = {
        user_id: items.tolist()
        for user_id, items in history.groupby('buyer_admin_id')['item_id']
    }

    # 1. Recall. Buyers are visited in ascending id order so each buyer's rows
    #    are contiguous and line up with `groupby('buyer_admin_id').size()`.
    user_ids = np.sort(data['buyer_admin_id'].dropna().unique())
    candidate_pairs = []
    last_item_by_user = {}
    for user_id in tqdm(user_ids, desc="Generating Candidates"):
        past_items = items_by_user.get(user_id, [])
        # a. Previously purchased items, most recent first.
        ranked = past_items[::-1]
        # b. Item-CF successors of the most recent purchase.
        last_item = past_items[-1] if past_items else None
        if last_item is not None:  # explicit None test: item id 0 is valid
            last_item_by_user[user_id] = last_item
            if last_item in item_cf_dict:
                neighbours = sorted(
                    item_cf_dict[last_item].items(),
                    key=lambda pair: pair[1],
                    reverse=True,
                )[:cf_top_k]
                ranked.extend(item for item, _ in neighbours)
        # c. Global popularity fallback.
        ranked.extend(global_top_items)

        # Ordered de-duplication, so the cut-off drops the weakest source
        # first instead of an arbitrary subset of a set.
        unique_ranked = list(dict.fromkeys(ranked))
        candidate_pairs.extend(
            (user_id, item_id) for item_id in unique_ranked[:max_candidates]
        )

    candidates_df = pd.DataFrame(candidate_pairs, columns=['buyer_admin_id', 'item_id'])

    # 2. Labels (training / validation only).
    if is_training:
        candidates_df = pd.merge(
            candidates_df, targets, on=['buyer_admin_id', 'item_id'], how='left'
        )
        # Plain assignment instead of chained `inplace=True`, which does not
        # modify the frame under pandas Copy-on-Write.
        candidates_df['label'] = candidates_df['label'].fillna(0).astype(int)

    # 3. Feature engineering (vectorized).
    print("Vectorized Feature Engineering...")
    # a. Item-side features (item_features is keyed by item_id).
    candidates_df = pd.merge(candidates_df, item_features, on='item_id', how='left')

    # b. User-side features, computed from the same history that produced the
    #    candidates so they are defined identically for training and test input.
    user_stats = history.groupby('buyer_admin_id').agg(
        user_total_purchases=('item_id', 'count'),
        user_unique_items=('item_id', 'nunique'),
        user_unique_cates=('cate_id', 'nunique'),
        user_avg_price=('item_price', 'mean'),
    ).reset_index()
    candidates_df = pd.merge(candidates_df, user_stats, on='buyer_admin_id', how='left')

    # c. Cross features: how often this buyer bought this item in the history.
    purchase_counts = (
        history.groupby(['buyer_admin_id', 'item_id'])
        .size()
        .reset_index(name='times_bought')
    )
    candidates_df = pd.merge(
        candidates_df, purchase_counts, on=['buyer_admin_id', 'item_id'], how='left'
    )
    candidates_df['times_bought'] = candidates_df['times_bought'].fillna(0)
    candidates_df['has_bought_before'] = (candidates_df['times_bought'] > 0).astype(int)

    # d. Item-CF transition count from the buyer's last item to the candidate.
    #    `.get` is used so missing keys are not inserted into the defaultdict.
    candidates_df['seq_cf_score'] = [
        item_cf_dict.get(last_item_by_user.get(user_id), {}).get(item_id, 0)
        for user_id, item_id in zip(candidates_df['buyer_admin_id'], candidates_df['item_id'])
    ]

    candidates_df = candidates_df.fillna(0)

    return candidates_df

# Build the training and validation ranking tables; both are labelled
# (the validation set needs labels for metric evaluation).
train_data, val_data = (
    generate_candidates_and_features(part, is_training=True)
    for part in (train_part, val_part)
)

elapsed_minutes = (time.time() - start_time) / 60
print(f"Dataset creation finished in {elapsed_minutes:.2f} minutes.")


# --- 3. Model training ---
print("\n--- 3. Model Training (LGBMRanker) ---")
start_time = time.time()

# Columns that identify a row or carry the label; everything else is a feature.
NON_FEATURE_COLS = ['buyer_admin_id', 'item_id', 'label', 'cate_id', 'store_id']

# LightGBM reads `group` as consecutive run lengths, so each buyer's rows must
# be contiguous and in the same order as the sizes below. A stable sort by
# buyer makes that explicit instead of relying on how the frames were built.
train_data = train_data.sort_values('buyer_admin_id', kind='mergesort')
val_data = val_data.sort_values('buyer_admin_id', kind='mergesort')

train_groups = train_data.groupby('buyer_admin_id').size().to_list()
train_features = train_data.drop(columns=NON_FEATURE_COLS)
train_labels = train_data['label'].astype(int)  # ranking labels must be integer-valued

val_groups = val_data.groupby('buyer_admin_id').size().to_list()
# Select by the training column list so both matrices share one column order.
val_features = val_data[train_features.columns]
val_labels = val_data['label'].astype(int)

# "mrr" is not among LightGBM's documented metrics. "map" is used instead:
# with a single relevant item per query, MAP@k equals MRR@k, and it gives
# early stopping a metric to monitor.
ranker = lgb.LGBMRanker(
    objective="lambdarank", metric="map", n_estimators=500, learning_rate=0.05,
    num_leaves=63, verbose=-1, random_state=42, n_jobs=-1
)

ranker.fit(
    train_features, train_labels, group=train_groups,
    eval_set=[(val_features, val_labels)], eval_group=[val_groups],
    eval_at=[30], callbacks=[lgb.early_stopping(20, verbose=True)]
)
print(f"Model training finished in {(time.time() - start_time)/60:.2f} minutes.")


# --- 4. Prediction and submission file ---
print("\n--- 4. Prediction and Submission ---")
start_time = time.time()

TOP_K = 30  # number of predictions written per buyer

# Build test candidates and features with the same function used for training.
test_data = generate_candidates_and_features(test_df, is_training=False)

# Score every (buyer, candidate) row.
test_features = test_data.drop(columns=['buyer_admin_id', 'item_id', 'cate_id', 'store_id'])
test_data['score'] = ranker.predict(test_features)

# Keep each buyer's TOP_K best-scored candidates and number them 1..TOP_K.
submission_df = test_data.sort_values(['buyer_admin_id', 'score'], ascending=[True, False])
submission_df = submission_df.groupby('buyer_admin_id').head(TOP_K)
submission_df = submission_df.assign(
    pred_rank=submission_df.groupby('buyer_admin_id').cumcount() + 1
)

# `pivot` reshapes without aggregating. `pivot_table` defaults to
# aggfunc='mean', which turns item ids into floats ("123.0" in the CSV).
submission_pivot = submission_df.pivot(
    index='buyer_admin_id', columns='pred_rank', values='item_id'
)
# Always emit exactly TOP_K prediction columns, even if no buyer has that many.
submission_pivot = submission_pivot.reindex(columns=range(1, TOP_K + 1))
if pd.api.types.is_integer_dtype(test_data['item_id']):
    # Nullable integers keep ids integral even when some slots are empty.
    submission_pivot = submission_pivot.astype('Int64')
submission_pivot.columns = [f'predict {i}' for i in range(1, TOP_K + 1)]
submission_pivot = submission_pivot.reset_index()

submission_filename = 'lgbm_ranker_submission_optimized.csv'
submission_pivot.to_csv(submission_filename, index=False)

print(f"Submission file created in {(time.time() - start_time)/60:.2f} minutes.")
print(f"Submission file '{submission_filename}' created successfully!")
print("Preview:")
print(submission_pivot.head())