# 描述
使用xgboost实现rank

In [67]:
import pandas as pd
import numpy as np
from xgboost import DMatrix,train
import xgboost as xgb

In [68]:
# 使用xgboost原生接口实现 https://www.jianshu.com/p/9caef967ec0a
xgb_rank_params = {
    'booster': 'gbtree',
    'eta': 0.1,
    'gamma': 1.0,
    'min_child_weight': 0.1,
    'objective': 'rank:pairwise',
    'eval_metric': 'ndcg',
    'max_depth': 6,
    'num_boost_round': 10
}

#generate training dataset
# 一共2组*每组3条，6条样本，特征维数是2
n_group = 2
n_choice = 3
dtrain = np.random.uniform(0, 100, [n_group * n_choice, 2]) # (6, 2)
dtarget = np.array([np.random.choice([0, 1, 2], 3, False) for i in range(n_group)]).flatten()
# n_group用于表示从前到后每组各自有多少样本，前提是样本中各组是连续的，[3，3]表示一共6条样本中前3条是第一组，后3条是第二组
dgroup = np.array([n_choice for i in range(n_group)]).flatten()

# concate Train data, very import here !
xgbTrain = DMatrix(dtrain, label=dtarget)
xgbTrain.set_group(dgroup)

# generate eval data
dtrain_eval = np.random.uniform(0, 100, [n_group * n_choice, 2])
xgbTrain_eval = DMatrix(dtrain_eval, label=dtarget)
xgbTrain_eval.set_group(dgroup)
evallist = [(xgbTrain, 'train'), (xgbTrain_eval, 'eval')]

# train model
rankModel = train(xgb_rank_params, xgbTrain, num_boost_round=20, evals=evallist)

#test dataset
dtest = np.random.uniform(0, 100, [n_group * n_choice, 2])
dtestgroup = np.array([n_choice for i in range(n_group)]).flatten()
xgbTest = DMatrix(dtest)
xgbTest.set_group(dgroup)

# test
print(rankModel.predict(xgbTest))


Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-ndcg:0.89835	eval-ndcg:0.82624
[1]	train-ndcg:0.89835	eval-ndcg:0.82624
[2]	train-ndcg:1.00000	eval-ndcg:0.82624
[3]	train-ndcg:1.00000	eval-ndcg:0.82624
[4]	train-ndcg:1.00000	eval-ndcg:0.82624
[5]	train-ndcg:1.00000	eval-ndcg:0.82624
[6]	train-ndcg:1.00000	eval-ndcg:0.82624
[7]	train-ndcg:1.00000	eval-ndcg:0.82624
[8]	train-ndcg:1.00000	eval-ndcg:0.82624
[9]	train-ndcg:1.00000	eval-ndcg:0.82624
[10]	train-ndcg:1.00000	eval-ndcg:0.82624
[11]	train-ndcg:1.00000	eval-ndcg:0.82624
[12]	train-ndcg:1.00000	eval-ndcg:0.82624
[13]	train-ndcg:1.00000	eval-ndcg:0.82624
[14]	train-ndcg:1.00000	eval-ndcg:0.82624
[15]	train-ndcg:1.00000	eval-ndcg:0.82624
[16]	train-ndcg:

In [69]:
# 使用sklearn接口实现 https://zhuanlan.zhihu.com/p/384661987

#generate training dataset
# 一共2组*每组3条，6条样本，特征维数是2
n_group = 2
n_choice = 3
df_fea = pd.DataFrame(np.random.uniform(0, 100, size=(n_group * n_choice, 2)), columns=['x1', 'x2'])
df_target = pd.DataFrame(np.array([np.random.choice([0, 1, 2], 3, False) for i in range(n_group)]).flatten(), columns=['label'])
df_id = pd.DataFrame(np.array([i for i in range(n_group) for j in range(n_choice)]), columns=['id'])
df_train = pd.concat([df_id, df_fea, df_target], axis=1)
x_train, y_train = df_train[['x1', 'x2']].values, df_train['label'].values

# generate eval data
df_fea = pd.DataFrame(np.random.uniform(0, 100, size=(n_group * n_choice, 2)), columns=['x1', 'x2'])
df_target = pd.DataFrame(np.array([np.random.choice([0, 1, 2], 3, False) for i in range(n_group)]).flatten(), columns=['label'])
df_id = pd.DataFrame(np.array([i for i in range(n_group) for j in range(n_choice)]), columns=['id'])
df_eval = pd.concat([df_id, df_fea, df_target], axis=1)
x_eval, y_eval = df_eval[['x1', 'x2']].values, df_eval['label'].values

# train model
model = xgb.XGBRanker(booster='gbtree',
                      objective='rank:pairwise',
                      learning_rate=0.001,
                      colsample_bytree=0.9,
                      max_depth=6,
                      n_estimators=20,
                      eval_metric=['ndcg'])
model.fit(x_train, y_train, qid=df_train['id'].values, eval_set=[(x_train, y_train), (x_eval, y_eval)], eval_qid=[df_train['id'].values, df_eval['id'].values], verbose=True)

#test dataset
df_fea = pd.DataFrame(np.random.uniform(0, 100, size=(n_group * n_choice, 2)), columns=['x1', 'x2'])
df_target = pd.DataFrame(np.array([np.random.choice([0, 1, 2], 3, False) for i in range(n_group)]).flatten(), columns=['label'])
df_id = pd.DataFrame(np.array([i for i in range(n_group) for j in range(n_choice)]), columns=['id'])
df_test = pd.concat([df_id, df_fea, df_target], axis=1)
x_test, y_test = df_test[['x1', 'x2']].values, df_test['label'].values

# test
print(model.predict(x_test))


[0]	validation_0-ndcg:0.74262	validation_1-ndcg:0.79344
[1]	validation_0-ndcg:0.79671	validation_1-ndcg:0.82950
[2]	validation_0-ndcg:0.79671	validation_1-ndcg:0.98197
[3]	validation_0-ndcg:0.89835	validation_1-ndcg:0.98197
[4]	validation_0-ndcg:0.89835	validation_1-ndcg:0.98197
[5]	validation_0-ndcg:0.89835	validation_1-ndcg:0.98197
[6]	validation_0-ndcg:0.89835	validation_1-ndcg:1.00000
[7]	validation_0-ndcg:0.89835	validation_1-ndcg:0.89835
[8]	validation_0-ndcg:0.89835	validation_1-ndcg:0.89835
[9]	validation_0-ndcg:0.89835	validation_1-ndcg:0.88032
[10]	validation_0-ndcg:0.89835	validation_1-ndcg:0.88032
[11]	validation_0-ndcg:0.89835	validation_1-ndcg:0.88032
[12]	validation_0-ndcg:0.89835	validation_1-ndcg:0.88032
[13]	validation_0-ndcg:0.89835	validation_1-ndcg:0.72786
[14]	validation_0-ndcg:1.00000	validation_1-ndcg:0.88032
[15]	validation_0-ndcg:1.00000	validation_1-ndcg:0.88032
[16]	validation_0-ndcg:0.98197	validation_1-ndcg:0.88032
[17]	validation_0-ndcg:0.98197	validation