In [1]:
from catboost import CatBoostRanker, Pool, MetricVisualizer
from copy import deepcopy
import numpy as np
import os
import pandas as pd
from tqdm import tqdm
from collections import Counter
import json

#pd.set_option('display.max_columns', 250)

### Dataset

In [2]:
# Read the raw train and test splits into memory, one list entry per text line.
with open('imat2009_new_split/imat2009_train_new.txt') as train_file:
    train_data = list(train_file)

with open('imat2009_new_split/imat2009_test_new.txt') as test_file:
    test_data = list(test_file)


In [3]:
def extract_vals(line, n_features=245):
    """Parse one raw dataset line into a dense ``{column: value}`` dict.

    The line layout handled here is::

        <relevance> <feature_id>:<value> ... # <query_id>

    Parameters
    ----------
    line : str
        A single line of the dataset; a trailing newline is optional.
    n_features : int, optional
        Number of feature columns to materialise (keys ``1..n_features``).
        Features that do not appear on the line keep the default ``0``.

    Returns
    -------
    dict
        Key ``0`` holds the relevance as ``float``, keys ``1..n_features``
        hold the feature values, and key ``n_features + 1`` holds the query
        id as ``int``.

    Raises
    ------
    ValueError
        If the line has fewer than two tokens, or a token cannot be parsed.
    """
    # Whitespace-splitting discards the line terminator (if any) instead of
    # blindly chopping the last character, which used to truncate the query id
    # of a final line that has no trailing newline.
    items = line.replace(' #', '').split()
    if len(items) < 2:
        raise ValueError('malformed dataset line: {!r}'.format(line))

    row_dict = {0: float(items[0])}
    # Pre-fill every feature slot so all rows share the same set of columns.
    row_dict.update(dict.fromkeys(range(1, n_features + 1), 0))

    for item in items[1:-1]:
        key, val = item.split(':')
        row_dict[int(key)] = float(val)

    row_dict[n_features + 1] = int(items[-1])

    return row_dict


# Parse every raw line into a row dict and assemble one DataFrame per split.
print('generating train dataframe...')
train_df = pd.DataFrame([
    extract_vals(raw_line)
    for raw_line in tqdm(train_data, total=len(train_data), bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}')
])

print('generating test dataframe...')
test_df = pd.DataFrame([
    extract_vals(raw_line)
    for raw_line in tqdm(test_data, total=len(test_data), bar_format='{l_bar}{bar:30}{r_bar}{bar:-10b}')
])

# Preview the first rows of the training frame.
train_df.head(5)


generating train dataframe...


100%|██████████████████████████████| 77714/77714 [00:03<00:00, 22492.96it/s]    


generating test dataframe...


100%|██████████████████████████████| 19576/19576 [00:00<00:00, 23260.24it/s]    


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,237,238,239,240,241,242,243,244,245,246
0,1.0,2.3e-05,0.0,0.0,0.0,0.0,0.0,0.704953,0.550315,0.032294,...,0.0,0.0,0.0,0.0,0.0,0.0,2.3e-05,1.0,2.3e-05,3382
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.273423,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.862745,0.0,3382
2,1.0,0.0,0.0,0.0068,0.051546,0.0,0.0,0.671346,0.0,0.0,...,0.0,0.0,0.0,0.154346,0.0,0.0,0.0,0.811765,0.0,3382
3,1.0,0.0,0.0,0.000862,0.030928,0.0,0.0,0.573946,0.0,0.0,...,0.0,0.0,0.0,0.039509,0.0,0.0,0.0,1.0,0.0,3382
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.261436,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.882353,0.0,3382


In [4]:
# Column 0 is the relevance label and the last column is the query id;
# both are removed from the feature matrix and kept as separate arrays.
y_train = train_df[0].values
queries_train = train_df[train_df.shape[1] - 1].values
X_train = train_df.drop(columns=[0, train_df.shape[1] - 1]).values

y_test = test_df[0].values
queries_test = test_df[test_df.shape[1] - 1].values
X_test = test_df.drop(columns=[0, test_df.shape[1] - 1]).values


### Dataset analysis

In [5]:
train_num_documents = X_train.shape[0]
print('Number of documents (train):', train_num_documents)

test_num_documents = X_test.shape[0]
print('Number of documents (test):', test_num_documents)

# print('Distribution of relevance scores:')
# Counter(y_train).items()

# Normalization of relevance scores.
# Both splits are scaled by the maximum relevance seen in the training labels.
max_relevance = np.max(y_train)
if max_relevance <= 0:
    raise ValueError('cannot normalize relevance scores: max relevance is {!r}'.format(max_relevance))

# Divide out-of-place: y_train / y_test come from `Series.values`, so an
# in-place `/=` would either rewrite the label column inside train_df/test_df
# or fail on a read-only array when pandas Copy-on-Write is enabled.
y_train = y_train / max_relevance
y_test = y_test / max_relevance

train_num_queries = np.unique(queries_train).shape[0]
print('Number of train queries:', train_num_queries)

test_num_queries = np.unique(queries_test).shape[0]
print('Number of test queries:', test_num_queries)

num_features = X_train.shape[1]
print('Number of features:', num_features)


Number of documents (train): 77714
Number of documents (test): 19576
Number of train queries: 7300
Number of test queries: 1824
Number of features: 245


### Creation of CatBoost pools

In [6]:
# Wrap each split in a CatBoost Pool: features, normalized relevance labels,
# and the query id of every document as its group id.
train = Pool(data=X_train, label=y_train, group_id=queries_train)

test = Pool(data=X_test, label=y_test, group_id=queries_test)


### Reducing the problem to a machine learning task

In [7]:
# Baseline settings shared by every training run; fit_model copies this dict
# before applying the per-run loss function and overrides.
default_parameters = dict(
    iterations=2000,
    custom_metric=['NDCG'],
    verbose=False,
    random_seed=0,
)

# Empty module-level dict; fit_model assembles its own parameter dict and
# does not read this one.
parameters = dict()

def fit_model(loss_function, additional_params=None, train_pool=train, test_pool=test):
    """Train a CatBoostRanker with the given loss and return the fitted model.

    The parameter set starts as a deep copy of ``default_parameters``; the
    ``loss_function`` string is stored both as ``loss_function`` and as
    ``train_dir``. Entries of ``additional_params`` (when not ``None``) are
    applied last, so they override any of the above.

    The model is fitted on ``train_pool`` with ``test_pool`` as the evaluation
    set and ``plot=True``.
    """
    params = deepcopy(default_parameters)
    params.update(loss_function=loss_function, train_dir=loss_function)

    if additional_params is not None:
        params.update(additional_params)

    ranker = CatBoostRanker(**params)
    ranker.fit(train_pool, eval_set=test_pool, plot=True)

    return ranker


### 1st variant: PairLogit

In [8]:
# Train a ranker with the PairLogit loss, using fit_model's default pools and
# the shared default parameters.
pair_logit_model = fit_model('PairLogit')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [9]:
# Report the fitted model's best iteration and its best metric values
# (the best_score_ mapping, pretty-printed as JSON).
print('best iteration', pair_logit_model.best_iteration_)
print(json.dumps(pair_logit_model.best_score_, indent=2))


best iteration 1399
{
  "learn": {
    "PairLogit": 0.31398503926498245
  },
  "validation": {
    "NDCG:type=Base": 0.8943728100892232,
    "PairLogit": 0.5564476911949912
  }
}


In [10]:
# Display the full parameter set reported by the fitted model; the output also
# lists values that were not set explicitly in default_parameters.
pair_logit_model.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'PairLogit',
 'iterations': 2000,
 'sampling_frequency': 'PerTree',
 'leaf_estimation_method': 'Newton',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'eval_fraction': 0,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': False,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'subsample': 0.800000011920929,
 'use_best_model': True,
 'random_seed': 0,
 'depth': 6,
 'posterior_sampling': False,
 'border_count': 254,
 'classes_count': 0,
 'auto_class_weights': 'None',
 'sparse_features_conflict_fraction': 0,
 'custom_metric': ['NDCG'],
 'leaf_estimation_backtracking': 'AnyImprovement',
 'best_model_min_trees': 1,
 'model_shrink_rate': 0,
 'min_data_in_leaf': 1,
 'loss_function': 'PairLogit',
 'learning_rate': 0.0

### 2nd variant: YetiRank

In [11]:
# Train a second ranker with the YetiRank loss, using fit_model's default
# pools and the shared default parameters.
yeti_rank_model = fit_model('YetiRank')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))