# Тестовое задание VK

In [1]:
import pandas as pd
from catboost import CatBoostRanker, Pool
from sklearn.metrics import ndcg_score
from sklearn.model_selection import train_test_split
from copy import deepcopy
import numpy as np

In [2]:
# Read the training split and the held-out test split from CSV files
# located in the current working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train_df.csv', 'test_df.csv'))

In [3]:
# Preview the first rows to check the column layout: a search_id column,
# numbered feature_* columns and the target column.
train_df.head()

Unnamed: 0,search_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,target
0,758,9,0,0,1,20,3,40,0,3,...,0.204682,0.271755,0.055623,0,0,0,0.38648,0.0,0.0,0
1,758,9,0,0,1,20,3,40,0,3,...,0.195531,0.188787,0.036914,0,0,0,0.10982,0.0,0.0,0
2,758,9,0,0,1,20,3,40,0,3,...,0.148609,0.186517,0.027718,0,0,0,0.03674,0.0,0.0,0
3,758,9,0,0,1,20,3,40,0,3,...,0.223748,0.229039,0.051247,0,0,0,0.0,0.0,0.0,0
4,758,9,0,0,1,20,3,40,0,3,...,0.170935,0.249031,0.042568,0,0,0,0.0,0.0,0.0,0


In [4]:
# Distinct search ids; further down each id is used as the group (query)
# identifier for ranking.
queries = train_df['search_id'].unique()

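Отложим 20% групп (значений `search_id`) из тренировочного датасета для валидации модели. Разбиение делаем по группам, а не по строкам, чтобы все строки одного поискового запроса попадали в одну и ту же выборку.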

In [5]:
# Split the search ids (not the rows) so that every row of one search lands on
# the same side; 20% of the ids are held out for validation and random_state
# is fixed to make the split reproducible.
queries_train, queries_val = train_test_split(queries, test_size=0.2, random_state=42)

In [6]:
# Send every row to the side of the split that its search_id was assigned to.
# NOTE: despite the X_ prefix, these frames still contain the search_id and
# target columns at this point; they are separated in the next cell.
X_train = train_df.loc[lambda frame: frame['search_id'].isin(queries_train)]
X_val = train_df.loc[lambda frame: frame['search_id'].isin(queries_val)]

# Row counts per split, one line each.
print(
    f"Количество строк в train датасете:\t {len(X_train)}",
    f"Количество строк в val датасете:\t {len(X_val)}",
    f"Количество строк в test датасете:\t {len(test_df)}",
    sep="\n",
)

Количество строк в train датасете:	 12120
Количество строк в val датасете:	 2961
Количество строк в test датасете:	 1529


In [7]:
def _split_columns(frame):
    """Break a split's DataFrame into its three model inputs.

    Returns a tuple ``(features, labels, group_ids)`` where ``group_ids`` and
    ``labels`` are the ``.values`` of the 'search_id' and 'target' columns and
    ``features`` is the ``.values`` of every remaining column.
    """
    group_ids = frame['search_id'].values
    labels = frame['target'].values
    features = frame.drop(columns=['search_id', 'target']).values
    return features, labels, group_ids


# NOTE: X_train / X_val are rebound from DataFrames to plain arrays here, and
# queries_train / queries_val from unique ids to one id per row. Re-run the
# previous cell before re-running this one, otherwise the column lookups fail.
X_train, y_train, queries_train = _split_columns(X_train)
X_val, y_val, queries_val = _split_columns(X_val)
X_test, y_test, queries_test = _split_columns(test_df)

In [8]:
# Number of distinct search ids (ranking groups) in each split.
print(
    f"Количество групп в train датасете:\t {np.unique(queries_train).size}",
    f"Количество групп в val датасете:\t {np.unique(queries_val).size}",
    f"Количество групп в test датасете:\t {np.unique(queries_test).size}",
    sep="\n",
)

Количество групп в train датасете:	 800
Количество групп в val датасете:	 200
Количество групп в test датасете:	 100


In [9]:
# Wrap each split in a CatBoost Pool. group_id carries the per-row search id,
# which is what tells CatBoost which rows belong to the same ranking group.
# NOTE(review): CatBoost expects the rows of one group to be stored next to
# each other; this relies on the CSV row order - confirm if the data changes.
train = Pool(X_train, y_train, group_id=queries_train)
val = Pool(X_val, y_val, group_id=queries_val)
test = Pool(X_test, y_test, group_id=queries_test)

Обучим на данных модель градиентного бустинга CatBoostRanker

In [10]:
# Baseline CatBoostRanker settings; fit_model copies them and then adds the
# loss function and any per-run overrides on top.
default_parameters = dict(
    # Upper bound on the number of boosting iterations.
    iterations=2000,
    # Extra metrics to track during training in addition to the loss.
    custom_metric=['NDCG', 'PFound', 'AverageGain:top=10'],
    # Keep the training log out of the cell output.
    verbose=False,
    # Fixed seed so repeated runs are comparable.
    random_seed=0,
    # Stop once the eval set has not improved for this many iterations.
    early_stopping_rounds=100,
)

In [11]:
def fit_model(loss_function, additional_params=None, train_pool=train, test_pool=val):
    """Train a CatBoostRanker from the shared defaults and a chosen objective.

    Args:
        loss_function: Value stored under the 'loss_function' parameter,
            e.g. 'QueryRMSE'.
        additional_params: Optional overrides applied last, so they win over
            both `default_parameters` and `loss_function`. `None` means no
            overrides.
        train_pool: Pool the model is fitted on.
        test_pool: Pool passed to `fit` as `eval_set`.

    Returns:
        The fitted CatBoostRanker instance.

    Note:
        The `train_pool` / `test_pool` defaults are bound to the `train` and
        `val` objects that exist when this cell is executed. After rebuilding
        the pools, re-run this cell or pass the pools explicitly.
    """
    # Work on a deep copy so the shared defaults are never mutated.
    parameters = {**deepcopy(default_parameters), 'loss_function': loss_function}
    parameters.update(additional_params if additional_params is not None else {})

    ranker = CatBoostRanker(**parameters)
    # plot=True renders the interactive training chart in the notebook.
    ranker.fit(train_pool, eval_set=test_pool, plot=True)

    return ranker

In [12]:
# Train the baseline ranker with the QueryRMSE objective, using fit_model's
# default pools (train for fitting, val as the eval set).
model = fit_model('QueryRMSE')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Оценка работы модели: NDCG на train, val и test выборках

In [13]:
# Score every split with the ranker's built-in score method (reported as NDCG).
# `top` is set to the split's total row count, presumably so that the cutoff
# never truncates a group - TODO confirm this is the intended NDCG variant.
# NOTE: val also served as the eval set during fitting (early stopping), so
# only the test score comes from data the training run never saw.
ndcg_train = model.score(X_train, y_train, group_id=queries_train, top=len(y_train))
ndcg_val = model.score(X_val, y_val, group_id=queries_val, top=len(y_val))
ndcg_test = model.score(X_test, y_test, group_id=queries_test, top=len(y_test))

print(
    f"NDCG Score on Train Data: {ndcg_train}",
    f"NDCG Score on Val Data: {ndcg_val}",
    "",
    f"NDCG Score on Test Data: {ndcg_test}",
    sep="\n",
)

NDCG Score on Train Data: 0.9418774235741215
NDCG Score on Val Data: 0.9037034850910662

NDCG Score on Test Data: 0.9050492574016954


In [14]:
# Persist the trained ranker to catboost_model.cbm; the path is relative, so
# the file is written to the current working directory.
model.save_model('catboost_model.cbm')