## Import

In [1]:
import os
import sys
import itertools

In [2]:
import numpy as np
import pandas as pd

In [3]:
module_path = os.path.abspath(os.path.join('..', 'src'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import utils
import metrics
from models.adaRank import AdaRank

## Utils

## Constants

In [4]:
DATA_PATH = 'OHSUMED.csv'

In [5]:
TEST_SIZE = 0.1
TRAIN_SIZE = 1 - TEST_SIZE

## Data processing

In [6]:
documents = pd.read_csv(DATA_PATH)

features_columns = np.array([col for col in documents.columns if col.startswith('feat')])

In [7]:
documents.head(1)

Unnamed: 0,relevent_val,qid,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,...,feat17,feat18,feat19,feat20,feat21,feat22,feat23,feat24,feat25,doc_id
0,2.0,1,3.0,2.079442,0.272727,0.261034,37.330565,11.431241,37.29975,1.138657,...,24.808785,0.393091,57.416517,3.294893,25.0231,3.219799,-3.87098,-3.90273,-3.87512,40626


### train-test split

In [8]:
qid_grid = documents.qid.unique()
nqid = qid_grid.shape[0]

qid_threshold = int(nqid * TRAIN_SIZE)
train_qids = qid_grid[:qid_threshold]
test_qids = qid_grid[qid_threshold:]

assert(train_qids.shape[0] + test_qids.shape[0] == nqid)

In [9]:
train_mask = np.in1d(documents.qid.values, train_qids)

documents_train = documents.loc[train_mask]
documents_test = documents.loc[~train_mask]

### Construct listwise

In [10]:
qids_train, X_train, y_train = utils.construct_listwise(
    documents_train,
    features_columns
)

qids_test, X_test, y_test = utils.construct_listwise(
    documents_test,
    features_columns
)

## AdaRank

In [11]:
from models.adaMetrics import NDCGScorer

In [12]:
adaRank = AdaRank(
    verbose=True,
    max_iter=5000,
    tol=0.000001,
    estop=10,
    scorer=NDCGScorer(k=20)
)

In [13]:
%%time
adaRank.fit(X_train, y_train, qids_train)

1	0.46874805961843297	9	[0.30210799 0.54387771 0.86103913 0.34959007 0.82587332]	train 0.4372	valid 0.4372
2	0.417035742268734	7	[0.34059647 0.54117935 0.73798885 0.34855266 0.78984982]	train 0.4426	valid 0.4426
3	0.4170584820993096	7	[0.34059647 0.54117935 0.73798885 0.34855266 0.78984982]	train 0.4426	valid 0.4426
4	0.4173050930402886	7	[0.34059647 0.54117935 0.73798885 0.34855266 0.78984982]	train 0.4436	valid 0.4436
5	0.41720936079204307	7	[0.34059647 0.54117935 0.73798885 0.34855266 0.78984982]	train 0.4437	valid 0.4437
6	0.41690706852307746	7	[0.34059647 0.54117935 0.73798885 0.34855266 0.78984982]	train 0.4431	valid 0.4431
7	0.4171506074269735	7	[0.34059647 0.54117935 0.73798885 0.34855266 0.78984982]	train 0.4456	valid 0.4456
8	0.4171560204812066	7	[0.34059647 0.54117935 0.73798885 0.34855266 0.78984982]	train 0.4438	valid 0.4438
9	0.41702316557085356	7	[0.34059647 0.54117935 0.73798885 0.34855266 0.78984982]	train 0.4423	valid 0.4423
10	0.4168563326315577	7	[0.34059647 0.54117

CPU times: user 1.35 s, sys: 832 ms, total: 2.18 s
Wall time: 562 ms


16	0.41729442983617043	7	[0.34059647 0.54117935 0.73798885 0.34855266 0.78984982]	train 0.4433	valid 0.4433
17	0.41728535722239923	7	[0.34059647 0.54117935 0.73798885 0.34855266 0.78984982]	train 0.4430	valid 0.4430
18	0.4171270794550289	7	[0.34059647 0.54117935 0.73798885 0.34855266 0.78984982]	train 0.4431	valid 0.4431
19	0.41707451178199173	7	[0.34059647 0.54117935 0.73798885 0.34855266 0.78984982]	train 0.4424	valid 0.4424
20	0.4169430324181998	7	[0.34059647 0.54117935 0.73798885 0.34855266 0.78984982]	train 0.4427	valid 0.4427
21	0.4169931983008315	7	[0.34059647 0.54117935 0.73798885 0.34855266 0.78984982]	train 0.4418	valid 0.4418
22	0.4171660827637524	7	[0.34059647 0.54117935 0.73798885 0.34855266 0.78984982]	train 0.4416	valid 0.4416


AdaRank(estop=10, max_iter=5000,
        scorer=<models.adaMetrics.NDCGScorer object at 0x7f8fc0b3fd90>,
        tol=1e-06, verbose=True)

In [14]:
predictions = adaRank.predict(X_test, qids_test)

predictions = [np.array(predictions[start:end]) for start, end in utils.group_offsets(qids_test)]

In [15]:
ranked_targets = []
for prediction in predictions:
    idx = np.argsort(prediction)[::-1]
    
    ranked_targets.append(y_test[idx])

In [16]:
metrics.ndcg_k(ranked_targets, 10)

0.18951331767445428

In [17]:
metrics.ndcg_k(ranked_targets)

0.5558109581255387

In [18]:
metrics.map_k(ranked_targets)

0.18225424462803755

In [19]:
metrics.mrr(ranked_targets)

5.7272727272727275