In [None]:
import pandas as pd
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import KFold

In [3]:
# Read the task CSV into a DataFrame; later cells use its 'rank' and
# 'query_id' columns.
# NOTE(review): the '.../' prefix looks like an elided directory - confirm the
# real location of the file before running.
data = pd.read_csv(filepath_or_buffer='.../intern_task.csv')

In [4]:
# Add the "relevance" training label: the column-wide maximum of `rank` minus
# each row's own `rank`, so rows holding the largest rank get relevance 0.
# NOTE(review): this inversion assumes a smaller `rank` means a more relevant
# document - confirm against the dataset description.
data['relevance'] = data['rank'].rsub(data['rank'].max())

In [10]:
# Unique query (session) identifiers. Folds are built over these identifiers
# rather than over rows, so every row of a query lands on one side of a split.
queries = data['query_id'].unique()

# Model initialisation: QueryRMSE is the training loss, NDCG is the evaluation
# metric used for early stopping / best-iteration selection, and NDCG@5 is
# tracked as an additional metric.
# NOTE(review): the same estimator object is re-fitted on every fold;
# presumably each `fit` call retrains from scratch - confirm.
model = CatBoostRanker(
    iterations=5000,
    depth=10,
    loss_function='QueryRMSE',
    use_best_model=True,
    verbose=0,
    custom_metric=['NDCG', 'NDCG:top=5'],
    eval_metric='NDCG',
    task_type='GPU',
    random_state=42,
    early_stopping_rounds=500,
    )

# Cross-validation splitter applied to the array of query ids.
kf = KFold(shuffle=True, random_state=42)

# Columns excluded from the feature matrix: the raw rank, the label derived
# from it, and the grouping key.
non_feature_columns = ['rank', 'relevance', 'query_id']

# (NDCG, NDCG@5) of the best iteration of every fold, kept for the summary.
fold_scores = []

for fold, (train_index, val_index) in enumerate(kf.split(queries)):

    # Query ids assigned to each side of the split.
    train_queries = queries[train_index]
    val_queries = queries[val_index]

    # Select rows by query id; boolean masking keeps the original row order.
    train_data = data[data['query_id'].isin(train_queries)]
    val_data = data[data['query_id'].isin(val_queries)]

    # Build the CatBoost datasets, grouping rows by query id.
    train_pool = Pool(
        data=train_data.drop(columns=non_feature_columns),
        label=train_data['relevance'],
        group_id=train_data['query_id'],
        )

    val_pool = Pool(
        data=val_data.drop(columns=non_feature_columns),
        label=val_data['relevance'],
        group_id=val_data['query_id'],
        )

    # Train the model; the validation fold drives early stopping.
    model.fit(X=train_pool, eval_set=val_pool)

    # NDCG metrics of the best model on the validation fold.
    # NOTE(review): these scores come from the same fold that selects the best
    # iteration, so they are likely optimistic - consider a separate hold-out.
    ndcg = model.best_score_['validation']['NDCG:type=Base']
    ndcg5 = model.best_score_['validation']['NDCG:top=5;type=Base']
    fold_scores.append((ndcg, ndcg5))

    # Adjacent f-strings are used instead of backslash continuations, which
    # leaked the source indentation into the printed text.
    print(f"Fold {fold + 1}.\n"
          f"NDCG:   {ndcg:.5f}\n"
          f"NDCG@5: {ndcg5:.5f}\n")

# Aggregate the per-fold results into a single cross-validation estimate.
mean_ndcg = sum(score[0] for score in fold_scores) / len(fold_scores)
mean_ndcg5 = sum(score[1] for score in fold_scores) / len(fold_scores)
print(f"Mean over {len(fold_scores)} folds.\n"
      f"NDCG:   {mean_ndcg:.5f}\n"
      f"NDCG@5: {mean_ndcg5:.5f}\n")

Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold 1.             
NDCG: 0.97923             
NDCG@5: 0.94391



Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold 2.             
NDCG: 0.97910             
NDCG@5: 0.94850



Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold 3.             
NDCG: 0.98036             
NDCG@5: 0.95425



Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold 4.             
NDCG: 0.97960             
NDCG@5: 0.95300



Default metric period is 5 because NDCG is/are not implemented for GPU
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold 5.             
NDCG: 0.97885             
NDCG@5: 0.94842

