In [None]:
import pickle
import random
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn import model_selection, metrics
from sklearn.preprocessing import QuantileTransformer

# Project-local modules
from transform import transform_pub
from utils import *

## 参数设定

In [None]:
# Seed for Python's built-in RNG, fixed so that reruns are repeatable.
seed = 0
random.seed(seed)

# Number of saved models per algorithm; the inference cell loads
# model_0 .. model_{N_splits - 1} and averages their predictions.
N_splits = 5

# 第一组

In [None]:
# Load the data whose features have already been extracted.
# The validation split is active; to run on the test split, swap in the
# commented-out test_* paths below.
candidates_path = "data/v3/processed/valid_candidate_all.pkl"
paper_ids_path = "data/v3/processed/valid_paper_ids.pkl"
valid_features_path = "data/v3/processed/valid_features.pkl"

# candidates_path = "data/v3/processed/test_candidate_all.pkl"
# paper_ids_path = "data/v3/processed/test_paper_ids.pkl"
# valid_features_path = "data/v3/processed/test_features.pkl"

# Read the three pickles in the same order as the paths are listed.
candidates, paper_ids, valid_features = (
    load_pickle(path)
    for path in (candidates_path, paper_ids_path, valid_features_path)
)

In [None]:
# Row count of every per-paper feature block. These sizes are used later to
# cut the flat prediction vectors back into one slice per paper.
# NOTE: run this cell before the feature-engineering cell, which rebinds
# `valid_features` to the final feature matrix.
indices = [int(np.asarray(block).shape[0]) for block in valid_features]

# Stack all per-paper blocks into one matrix (one row per stacked entry).
new_valid_features = np.vstack(valid_features)
print(new_valid_features.shape)

In [None]:
# Pickled transformer used for scaling (a QuantileTransformer according to
# the original notes).
qt_filename = "data/v3/processed/qt.pkl"
qt = load_pickle(qt_filename)

# Columns dropped based on variance or final score (per the original notes):
# 78 - 6 = 72 columns remain.
remove_features = [0,2,20,24,44,76]
features = np.delete(new_valid_features, remove_features, axis=1)

# Scale the remaining columns with the loaded transformer.
transformed_features = qt.transform(features)

# Column pairs whose element-wise product becomes an extra feature; chosen by
# correlation and feature importance (per the original notes). 16 pairs.
cross_pairs = [
    (36, 10), (1, 5), (4, 6), (35, 32),
    (37, 39), (3, 8), (50, 52), (40, 38),
    (40, 41), (50, 53), (41, 34), (7, 8),
    (34, 37), (22, 23), (3, 7), (18, 10),
]
# One row per pair, i.e. shape (16, n_rows); transposed below before stacking.
cross_features = np.array([
    transformed_features[:, left] * transformed_features[:, right]
    for left, right in cross_pairs
])

# 72 scaled columns + 16 cross columns = 88 columns.
valid_features = np.hstack((transformed_features, cross_features.T))
valid_features.shape

In [None]:
# Standardization: subtract the stored mean and divide by the stored std.
mean_path = "data/v3/processed/mean.pkl"
std_path = "data/v3/processed/std.pkl"

mean = load_pickle(mean_path)
std = load_pickle(std_path)

# Rebuild the un-standardized matrix from `transformed_features` and
# `cross_features` instead of reading `valid_features` back in. The result is
# the same on the first run, and re-running this cell no longer standardizes
# an already standardized matrix a second time.
valid_features = (np.hstack([transformed_features, cross_features.T]) - mean) / std

# Inference

In [None]:
%%time
model_save_path = "data/v3/processed/models/"

# Per model name: chosen candidate -> list of (paper id, probability).
# Filled by the cell that picks the best candidate for every paper.
result_dict = {"rf":defaultdict(list), "xgb":defaultdict(list), "lgbm":defaultdict(list), "cat":defaultdict(list), "ensemble":defaultdict(list)}


def _mean_positive_proba(model_tag):
    """Average the class-1 probability of the N_splits saved `model_tag` models.

    Loads ``model_{i}_{model_tag}.dat`` from ``model_save_path`` for
    ``i = 0 .. N_splits - 1`` and averages ``predict_proba(valid_features)[:, 1]``.

    Returns:
        (prob, model): the averaged probabilities (one value per row of
        ``valid_features``) and the last model that was loaded.
    """
    prob = 0.0
    model = None
    for i in tqdm(range(N_splits), desc=model_tag):
        # SECURITY: unpickling can execute arbitrary code, so only load model
        # files from a trusted source.
        # `with` closes the file handle, which the bare open() call leaked.
        with open(model_save_path + f"model_{i}_{model_tag}.dat", "rb") as model_file:
            model = pickle.load(model_file)
        prob += model.predict_proba(valid_features)[:, 1] / N_splits
    return prob, model


# Same order as before: rf, xgb, lgbm, cat. The last loaded model of each kind
# stays bound to rf / xgb / lgbm / cat, as it was with the hand-written loops.
rf_prob, rf = _mean_positive_proba("rf")
xgb_prob, xgb = _mean_positive_proba("xgb")
lgbm_prob, lgbm = _mean_positive_proba("lgbm")
cat_prob, cat = _mean_positive_proba("cat")

In [None]:
#%%time
# For every paper, pick the highest-scoring candidate under each model and
# under the ensemble, and record (paper id, score) under that candidate.

# Averaged probability vectors, one entry per row of the stacked features.
model_probs = {"rf": rf_prob, "xgb": xgb_prob, "lgbm": lgbm_prob, "cat": cat_prob}

# Start from empty containers so that re-running this cell does not append the
# same picks again (duplicates would inflate the vote counts computed later).
result_dict = {name: defaultdict(list) for name in ("rf", "xgb", "lgbm", "cat", "ensemble")}

tmp_idx = 0  # offset of the current paper's first row in the flat vectors
for num, idx in enumerate(tqdm(indices)):
    if idx == 0:
        # No candidate rows for this paper: nothing to rank (argmax of an
        # empty slice would raise).
        continue

    rows = slice(tmp_idx, tmp_idx + idx)
    paper_probs = {name: np.asarray(prob[rows]) for name, prob in model_probs.items()}

    # NOTE(review): the ensemble score is the SUM of four probabilities, so it
    # ranges over [0, 4], yet the voting cell compares it with the same
    # 0.6-0.8 thresholds as the single-model probabilities. Confirm this is
    # intended (an average would be on the same scale).
    paper_probs["ensemble"] = (
        paper_probs["rf"] + paper_probs["xgb"] + paper_probs["lgbm"] + paper_probs["cat"]
    )

    for name, probs in paper_probs.items():
        best = int(np.argmax(probs))
        # Each model stores the score at its OWN best index; the "cat" entry
        # previously read its probability at lgbm's best index.
        result_dict[name][candidates[num][best]].append((paper_ids[num], probs[best]))

    tmp_idx += idx
    

In [None]:
#%%time
# Multi-threshold voting: each model casts one vote for a (candidate, paper)
# pair per threshold that its score reaches.
thresholds = [0.6, 0.7, 0.8]
models = ['rf', 'xgb', 'lgbm', 'cat', "ensemble"]
# Strict majority of the len(thresholds) * len(models) possible votes
# (3 * 5 = 15 -> 8). Derived so it stays correct if either list changes.
num_dicision_votes = len(thresholds) * len(models) // 2 + 1

result_votes = defaultdict(int)   # (candidate, paper id) -> number of votes
result = defaultdict(list)        # candidate -> accepted paper ids

for model in models:
    # Keys are the candidates chosen by this model (author IDs per the
    # original notes); values are lists of (paper id, score).
    for candidate, picks in result_dict[model].items():
        for paper_id, prob in picks:
            passed = int(sum(prob >= threshold for threshold in thresholds))
            if passed:
                # Only create an entry once at least one threshold is reached.
                result_votes[(candidate, paper_id)] += passed

# Build the submission: keep the pairs that collected enough votes.
for (candidate, paper_id), votes in result_votes.items():
    if votes >= num_dicision_votes:
        result[candidate].append(paper_id)

# Export the result.
dump_json("vote_result_all.json", result)