In [167]:
# Google file system
# Mount Google Drive at /gdrive (Google Colab only); the Excel datasets read
# further down are located under this mount point.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


We use an implementation of the AdaRank algorithm taken from GitHub. 

In [168]:
import numpy as np
import re
import sys
import tabulate


def group_counts(arr):
    """Return the length of every run of consecutive equal values in ``arr``."""
    # Flag the first element of each run; position 0 always starts a run.
    run_start = np.ones(arr.size, dtype=int)
    run_start[1:] = (arr[1:] != arr[:-1]).astype(int)
    # A trailing sentinel closes the last run, so differences of consecutive
    # boundaries are the run lengths.
    boundaries = np.flatnonzero(np.append(run_start, 1))
    return np.diff(boundaries)


def group_offsets(arr):
    """Yield ``(start, end)`` offsets, end exclusive, for each run of equal values.

    The result is a one-shot ``zip`` iterator over consecutive run boundaries.
    """
    # Flag the first element of each run; position 0 always starts a run.
    run_start = np.ones(arr.size, dtype=int)
    run_start[1:] = (arr[1:] != arr[:-1]).astype(int)
    # The appended sentinel provides the end offset of the final run.
    boundaries = np.flatnonzero(np.append(run_start, 1))
    starts, ends = boundaries[:-1], boundaries[1:]
    return zip(starts, ends)


def load_docno(fname, letor=False):
    """Load docnos from the input in the SVMLight format.

    Parameters
    ----------
    fname : str
        Path of the file to read.
    letor : bool
        When true, the docno is taken from a ``# docid = <value>`` comment;
        otherwise it is the first whitespace-free token after ``#``.

    Returns
    -------
    numpy.ndarray
        The docnos found, in file order. Lines that begin with ``#`` are
        skipped, and lines without a matching comment contribute nothing.
    """
    if letor:
        docno_pattern = re.compile(r'#\s*docid\s*=\s*(\S+)')
    else:
        docno_pattern = re.compile(r'#\s*(\S+)')

    docno = []
    # The context manager closes the file even if reading fails part-way.
    with open(fname) as fin:
        for line in fin:
            if line.startswith('#'):
                continue
            m = docno_pattern.search(line)
            if m is not None:
                docno.append(m.group(1))
    return np.array(docno)


def print_ranking(qid, docno, pred, output=None):
    """Write a table of per-query rankings (qid, docno, rank, score).

    Rows of the same query must be contiguous in ``qid``. Within each query the
    documents are listed by decreasing ``pred`` and the score is rounded to
    three decimals. ``output`` defaults to ``sys.stdout``.
    """
    stream = sys.stdout if output is None else output
    columns = ["qid", "docno", "rank", "score"]
    rows = []
    for start, end in group_offsets(qid):
        # Negating gives a descending sort; adding `start` converts positions
        # inside the slice back into global row indices.
        ranked = start + np.argsort(-pred[start:end])
        rows.extend([qid[i], docno[i], position, round(pred[i], 3)]
                    for position, i in enumerate(ranked, 1))
    stream.write(tabulate.tabulate(rows, columns, tablefmt="pretty"))
    
    


In [169]:
from __future__ import division

import numpy as np


class Scorer(object):
    """Callable that forwards to ``score_func`` with preset keyword arguments."""

    def __init__(self, score_func, **kwargs):
        # Keyword arguments are stored once and re-applied on every call.
        self.kwargs = kwargs
        self.score_func = score_func

    def __call__(self, *args):
        """Call the wrapped function with ``args`` plus the stored keywords."""
        func, options = self.score_func, self.kwargs
        return func(*args, **options)


# Precision
#
def _p_score(y_true, y_pred, k=None):
    order = np.argsort(-y_pred)
    y_true = np.take(y_true, order[:k])
    return np.sum(y_true > 0) / len(y_true)


def p_score(y_true, y_pred, qid, k=None):
    """Return an array with precision@k for each contiguous query group in ``qid``."""
    per_query = []
    for start, end in group_offsets(qid):
        per_query.append(_p_score(y_true[start:end], y_pred[start:end], k=k))
    return np.array(per_query)


class PScorer(Scorer):
    """Scorer bound to ``_p_score``; keyword arguments (e.g. ``k``) are forwarded."""

    def __init__(self, **kwargs):
        super().__init__(_p_score, **kwargs)


# AP (Average Precision)
#
def _ap_score(y_true, y_pred):
    order = np.argsort(-y_pred)
    y_true = np.take(y_true, order)
    pos = 1 + np.where(y_true > 0)[0]
    n_rels = 1 + np.arange(len(pos))
    return np.mean(n_rels / pos) if len(pos) > 0 else 0


def ap_score(y_true, y_pred, qid):
    """Return an array with the average precision of each contiguous query group."""
    per_query = []
    for start, end in group_offsets(qid):
        per_query.append(_ap_score(y_true[start:end], y_pred[start:end]))
    return np.array(per_query)


class APScorer(Scorer):
    """Scorer bound to ``_ap_score`` (single ranked list, no query grouping)."""

    def __init__(self):
        super().__init__(_ap_score)


# DCG/nDCG (Normalized Discounted Cumulative Gain)
#
def _burges_dcg(y_true, y_pred, k=None):
    # order = np.argsort(y_pred)[::-1]
    order = np.argsort(-y_pred)
    y_true = np.take(y_true, order[:k])
    gain = 2 ** y_true - 1
    discounts = np.log2(np.arange(len(gain)) + 2)
    return np.sum(gain / discounts)


def _trec_dcg(y_true, y_pred, k=None):
    order = np.argsort(-y_pred)
    y_true = np.take(y_true, order[:k])
    gain = y_true
    discounts = np.log2(np.arange(len(gain)) + 2)
    return np.sum(gain / discounts)


def _dcg_score(y_true, y_pred, qid, k=None, dcg_func=None):
    """Apply ``dcg_func`` to each contiguous query group and return the scores.

    Negative labels are clipped to 0 before scoring.
    """
    assert dcg_func is not None
    clipped = np.maximum(y_true, 0)
    per_query = []
    for start, end in group_offsets(qid):
        per_query.append(dcg_func(clipped[start:end], y_pred[start:end], k=k))
    return np.array(per_query)


def _ndcg_score(y_true, y_pred, qid, k=None, dcg_func=None):
    """Per-query DCG divided by the DCG of the ideal ordering.

    Negative labels are clipped to 0. Queries whose ideal DCG is 0 get a
    divisor of 1, so their nDCG is 0.
    """
    assert dcg_func is not None
    y_true = np.maximum(y_true, 0)
    dcg = _dcg_score(y_true, y_pred, qid, k=k, dcg_func=dcg_func)

    ideal = []
    for start, end in group_offsets(qid):
        # Ascending labels paired with ascending pseudo-scores: ranking by
        # decreasing score then places the largest label first.
        ascending_labels = np.sort(y_true[start:end])
        pseudo_scores = np.arange(0, end - start)
        ideal.append(dcg_func(ascending_labels, pseudo_scores, k=k))
    idcg = np.array(ideal)

    assert (dcg <= idcg).all()
    idcg[idcg == 0] = 1
    return dcg / idcg


def ndcg_score(y_true, y_pred, qid, k=None, version='burges'):
    """Per-query nDCG@k using the 'burges' (exponential) or 'trec' (linear) gain."""
    assert version in ['burges', 'trec']
    if version == 'burges':
        dcg_func = _burges_dcg
    else:
        dcg_func = _trec_dcg
    return _ndcg_score(y_true, y_pred, qid, k=k, dcg_func=dcg_func)



class NDCGScorer(Scorer):
    """Scorer bound to ``ndcg_score``; keywords such as ``k`` or ``version`` are forwarded."""

    def __init__(self, **kwargs):
        super().__init__(ndcg_score, **kwargs)



In [170]:
"""
AdaRank algorithm
"""
from __future__ import print_function, division

import math
import numpy as np
import sklearn
import sys

from sklearn.utils import check_X_y



class AdaRank(sklearn.base.BaseEstimator):
    """AdaRank algorithm.

    Every feature column is used as a weak ranker. On each boosting round the
    column with the best query-weighted score is chosen, its coefficient is
    increased by ``alpha``, and the query weights are recomputed from the
    per-query scores of the combined ranker.

    Parameters
    ----------
    max_iter : int
        Upper bound on the number of boosting rounds.
    tol : float
        Minimum gain over the best score so far that counts as an improvement.
    estop : int
        Early-stopping patience: training stops on a round where the training
        score did not improve and the validation score has failed to improve
        for at least ``estop`` consecutive rounds.
    verbose : bool
        When true, one progress line per round is printed to stderr.
    scorer : callable or None
        Called as ``scorer(y, pred, qid)``; must return one score per query.
        ``None`` selects ``NDCGScorer(k=10)`` when ``fit`` is called.
        The ``alpha`` formula presumably expects scores in [0, 1] — confirm
        for custom scorers.

    Attributes
    ----------
    coef_ : numpy.ndarray
        Coefficients with the best validation score seen during ``fit``.
    n_iter : int
        Number of boosting rounds executed.
    """

    def __init__(self, max_iter=500, tol=0.0001, estop=1, verbose=False, scorer=None):
        self.max_iter = max_iter
        self.tol = tol
        self.estop = estop
        self.verbose = verbose
        self.scorer = scorer

    @staticmethod
    def _to_dense(X):
        """Return ``X`` as a dense array; only sparse inputs are converted."""
        return X.toarray() if hasattr(X, 'toarray') else np.asarray(X)

    def fit(self, X, y, qid, X_valid=None, y_valid=None, qid_valid=None):
        """Fit a model to the data.

        ``X`` may be dense or sparse. Rows of the same query must be contiguous
        in ``qid``. When no validation data is given, the training data is also
        used for validation. Returns ``self``.
        """
        X, y = check_X_y(X, y, 'csr')
        # check_X_y leaves dense input dense, so convert only when needed.
        X = self._to_dense(X)

        if X_valid is None:
            X_valid, y_valid, qid_valid = X, y, qid
        else:
            X_valid, y_valid = check_X_y(X_valid, y_valid, 'csr')
            X_valid = self._to_dense(X_valid)

        n_queries = np.unique(qid).shape[0]
        weights = np.ones(n_queries, dtype=np.float64) / n_queries
        coef = np.zeros(X.shape[1])
        # Make sure coef_ exists even if no round records an improvement.
        self.coef_ = coef.copy()

        # use nDCG@10 as the default scorer
        if self.scorer is None:
            self.scorer = NDCGScorer(k=10)

        # precompute performance measurements for all weak rankers
        weak_ranker_score = []
        for j in range(X.shape[1]):
            pred = X[:, j].ravel()
            weak_ranker_score.append(self.scorer(y, pred, qid))

        best_perf_train = -np.inf
        best_perf_valid = -np.inf
        # Never filled, so a feature may be selected on several rounds.
        used_fids = []
        # Starts at 0 so that `estop += 1` is valid even if the very first
        # round does not count as an improvement (e.g. a NaN score).
        estop = 0

        self.n_iter = 0
        while self.n_iter < self.max_iter:
            self.n_iter += 1

            best_weighted_average = -np.inf
            best_weak_ranker = None
            for fid, score in enumerate(weak_ranker_score):
                if fid in used_fids:
                    continue
                weighted_average = np.dot(weights, score)
                if weighted_average > best_weighted_average:
                    best_weak_ranker = {'fid': fid, 'score': score}
                    best_weighted_average = weighted_average

            # stop when all the weaker rankers are out
            if best_weak_ranker is None:
                break

            h = best_weak_ranker
            # NOTE(review): the denominator is 0 when the chosen ranker scores
            # exactly 1 on every query; no guard is applied here.
            h['alpha'] = 0.5 * (math.log(np.dot(weights, 1 + h['score']) /
                                         np.dot(weights, 1 - h['score'])))

            # update the ranker
            coef[h['fid']] += h['alpha']

            # score both training and validation data
            score_train = self.scorer(y, np.dot(X, coef), qid)
            perf_train = score_train.mean()

            perf_valid = perf_train
            if X_valid is not X:
                perf_valid = self.scorer(y_valid, np.dot(X_valid, coef), qid_valid).mean()

            if self.verbose:
                print('{n_iter}\t{alpha}\t{fid}\t{score}\ttrain {train:.4f}\tvalid {valid:.4f}'.
                      format(n_iter=self.n_iter, alpha=h['alpha'], fid=h['fid'],
                             score=h['score'][:5], train=perf_train, valid=perf_valid),
                      file=sys.stderr)

            # update the best validation scores
            if perf_valid > best_perf_valid + self.tol:
                estop = 0
                best_perf_valid = perf_valid
                self.coef_ = coef.copy()
            else:
                estop += 1

            # update the best training score
            if perf_train > best_perf_train + self.tol:
                best_perf_train = perf_train
            else:
                # stop if scores on both sets fail to improve
                if estop >= self.estop:
                    break

            # update weights
            new_weights = np.exp(-score_train)
            weights = new_weights / new_weights.sum()

        return self

    def predict(self, X, qid):
        """Make predictions.

        Returns the dot product of ``X`` (dense or sparse) with ``coef_``.
        ``qid`` is accepted for interface symmetry and is not used.
        """
        return np.dot(self._to_dense(X), self.coef_)


Model implementation for our dataset.

Import of the necessary libraries

In [171]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import dump_svmlight_file, load_svmlight_file
import re
import nltk
from nltk import PorterStemmer, WordNetLemmatizer
import string as st
import numpy as np
import os

**FIRST MODEL - ORIGINAL DATASET**

We obtain the data set that we are going to use for the development of the model. 

In [172]:
# Read every sheet of the workbook (sheet_name=None returns a dict of
# DataFrames keyed by sheet name) and stack them into one frame.
sheets_dict = pd.read_excel("/gdrive/My Drive/MASTER/INFO.BIOMÉDICA/loinc_dataset_labels-v2.xlsx", sheet_name=None, skiprows=1, header=1)
all_sheets = list(sheets_dict.values())
# Concatenate once, after collecting all sheets, instead of on every loop
# iteration; ignore_index=True yields the same fresh 0..n-1 index.
df = pd.concat(all_sheets, ignore_index=True)

Download of the `omw-1.4` resource of NLTK:
 - Natural Language ToolKit - allows the development of the AdaRank model. 

In [173]:
import nltk
# Fetch the 'omw-1.4' NLTK resource into the local nltk_data directory.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In order to carry out the AdaRank algorithm, the first thing we do is download the "stopwords" and "wordnet" corpora provided by NLTK. 

Since a lookup error is raised when a corpus is not installed, we use a "try/except" so that whatever is missing gets downloaded correctly.

In [174]:
# Download each corpus only when NLTK cannot find it locally.
for resource_path, package in (('corpora/stopwords', 'stopwords'),
                               ('corpora/wordnet', 'wordnet')):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


The next thing we do is clean the text (column `long_common_name`) so that it can be fed into the model. 


The first thing we want to do is to remove the punctuation marks, for this we use string.punctuation imported earlier, which contains all the punctuation symbols.  
We generate a function to be able to apply it later on the data (we will do this with all the elements necessary for the cleaning of the text). 

In [175]:
def remove_punct(text):
    """Return ``text`` without the characters listed in ``string.punctuation``."""
    kept = []
    for ch in text:
        if ch not in st.punctuation:
            kept.append(ch)
    return "".join(kept)

We need to tokenize each of the words in each document in order to enter them into the model.

In [176]:
def tokenize(text):
    """Split ``text`` on runs of whitespace and lowercase every token.

    Leading or trailing whitespace yields empty-string tokens, as produced by
    ``re.split``.
    """
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return [token.lower() for token in re.split(r'\s+', text)]

Removal of all tokens with a length of 3 characters or fewer, as we assume that they are not sufficiently relevant for training the model.

In [177]:
def remove_small_words(text):
    """Keep only the tokens that are longer than 3 characters."""
    long_enough = []
    for token in text:
        if len(token) > 3:
            long_enough.append(token)
    return long_enough

Removal of all elements called "stopwords" (articles, prepositions, pronouns, etc). To do this we set the language to English, as this is the language in which we have our data. 

It is done by using NLTK corpus stopwords list to match. 

In [178]:
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not NLTK English stopwords."""
    # Load the list once per call and use a set for O(1) membership tests;
    # previously the list was reloaded and scanned for every token.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    return [word for word in text if word not in stopwords]

Using NLTK's PorterStemmer() we can reduce each word to its stem by stripping its morphological suffixes. 

In [179]:
def stemming(text):
    """Return the Porter stem of every token in ``text``."""
    stemmer = PorterStemmer()
    return list(map(stemmer.stem, text))

Using WordNetLemmatizer, again a module of NLTK, we can carry out the "lemmatization" of the words that remain in the text of each of the documents. 

In this way, we obtain all those words that have a semantic relationship with each other. 

In [180]:
def lemmatize(text):
    """Return the WordNet lemma of every token in ``text``."""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in text:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

Finally for each of the words that are in token form we join them together, generating a whole sentence in token form. 

In [181]:
def return_sentences(tokens):
    """Join the tokens into one string separated by single spaces."""
    separator = " "
    return separator.join(tokens)

For all the documents that we have in our DataFrame we have to apply each of these steps, so, in order to simplify the code we generate a function that implements all of them.

In [182]:
def preprocess_text(text):
    """Run the full cleaning pipeline on one document and return a string.

    Steps, in order: punctuation removal, tokenization, short-token removal,
    stopword removal, stemming, lemmatization, re-joining with spaces.
    """
    tokens = tokenize(remove_punct(text))
    tokens = remove_small_words(tokens)
    tokens = remove_stopwords(tokens)
    tokens = stemming(tokens)
    tokens = lemmatize(tokens)
    return return_sentences(tokens)

Convert documents to feature vectors. 

In [183]:
# Clean every long common name, then fit TF-IDF on the cleaned text and keep
# the result as a dense array.
tfidf = TfidfVectorizer()
df['clean_text'] = df['long_common_name'].apply(preprocess_text)
print(df['clean_text'])
X = tfidf.fit_transform(df['clean_text']).toarray()

0                 reactiv protein massvolum serum plasma
1                              bicarbon molesvolum blood
2                                             type blood
3                    trimethoprimsulfamethoxazol suscept
4                    bilirubintot massvolum serum plasma
                             ...                        
196                                  monocyt volum blood
197                           major crossmatch interpret
198                                   ampicillin suscept
199    alanin aminotransferas enzymat activityvolum s...
200                          ampicillinsulbactam suscept
Name: clean_text, Length: 201, dtype: object


We check how correctly a new column has been added in which we have the text completely processed.


In [184]:
# Show the dense TF-IDF matrix and its number of rows.
print(X)
print("long. X:", len(X))

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.43361229 0.57014337 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
long. X: 201


As we can see, what we get is an array with 201 rows (as many rows as the DataFrame has), each row being the TF-IDF vector of one document.

We add a new column called features, in which we have the previous vectorization. (Each document has its own vectorization).  

In addition, we define the train (70%) and test (30%)  sets.

In [185]:
# Store each document's TF-IDF vector in its row (assigned by position).
df['features'] = X.tolist()

#Get train and test set
# Shuffle all rows with a fixed seed, take the first 70% as train and the rest
# as test; each subset is sorted by qid so the rows of a query are contiguous.
# NOTE(review): `df` is rebound to the shuffled frame, so re-running this cell
# would assign X rows by position to an already shuffled frame — confirm the
# cell is only run once per kernel session.
df = df.sample(frac=1, random_state = 2)
train_size = int(0.7 * len(df))
train_set = df[:train_size].sort_values(by=['qid'])
test_set = df[train_size:].sort_values(by=['qid'])

The next thing we will do is convert the DataFrame to numpy arrays, separating the X features from the y labels. 


In [186]:
# DataFrame to numpy array
# `df` was shuffled after the TF-IDF matrix was computed, so X is rebuilt from
# the 'features' column to keep its rows aligned with `qid` and `y` below
# (previously X kept the pre-shuffle row order).
X = np.array(df['features'].values.tolist())
qid = df['qid'].to_numpy()
y = df['Label'].to_numpy()
X_train = np.array(train_set['features'].values.tolist())
qid_train = train_set['qid'].to_numpy()
y_train = train_set['Label'].to_numpy()
X_test = np.array(test_set['features'].values.tolist())
qid_test = test_set['qid'].to_numpy()
y_test = test_set['Label'].to_numpy()

We converted the arrays created just before into "svmlight" format documents, because the implemented AdaRank model uses this type of files. 

In addition, we add a document number to each of the documents in our data in order to identify them. The intermediate files that do not contain document numbers are then deleted. 
- We do this in order to later obtain an adequate ranking of the prediction made by the model, being able to obtain the number of documents located in each position. 
- To simplify the code we use the following function.

In [187]:
def add_docnos(docnos, file, train_docnos, train_file, test_docnos, test_file):
    """Append a document number to every line of three svmlight files.

    Each input file is copied line by line to its output file, with
    ``' ' + str(docno)`` inserted before the line's newline. Output files are
    written to the current working directory.

    Parameters
    ----------
    docnos, train_docnos, test_docnos : sequence
        One entry per line of the corresponding input file, in file order.
    file, train_file, test_file : str
        Paths of the full, train and test input files.

    Returns
    -------
    tuple of str
        ``(out_file, out_train_file, out_test_file)``. The names carry no
        prefix when ``file`` is the original-dataset file and an ``extended_``
        prefix otherwise.

    Raises
    ------
    IndexError
        If an input file has more lines than its docno sequence.
    """
    # Accept both spellings: the 'docnos' name checked historically and the
    # 'doc_numb' name that df_to_svmlight_files passes in. With only the former
    # the original dataset was always written under the 'extended_' names.
    original_inputs = ('loinc_dataset-v2_without_docnos.dat',
                       'loinc_dataset-v2_without_doc_numb.dat')
    prefix = '' if file in original_inputs else 'extended_'
    out_file = prefix + 'loinc_dataset-v2.dat'
    out_train_file = prefix + 'train_loinc_dataset-v2.dat'
    out_test_file = prefix + 'test_loinc_dataset-v2.dat'

    jobs = [
        (file, out_file, docnos),
        (train_file, out_train_file, train_docnos),
        (test_file, out_test_file, test_docnos),
    ]
    for in_path, out_path, data in jobs:
        with open(in_path) as fin, open(out_path, 'w') as fout:
            for index, line in enumerate(fin):
                fout.write(line.replace('\n', ' ' + str(data[index]) + '\n'))
    return out_file, out_train_file, out_test_file

In [188]:
def df_to_svmlight_files(df, X, y, X_train, y_train, X_test, y_test):
    """Dump the full/train/test arrays to svmlight files with document numbers.

    The arrays are first written to intermediate ``*_without_doc_numb.dat``
    files in the current working directory, ``add_docnos`` then produces the
    final files, and the intermediate files are deleted.

    Besides its parameters, this function reads the notebook-level globals
    ``qid``, ``qid_train``, ``qid_test``, ``train_set`` and ``test_set``.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset; its ``doc_numb`` column supplies the document numbers
        and its length selects the file-name prefix.
    X, y, X_train, y_train, X_test, y_test : array-like
        Feature matrices and labels for the full, train and test sets.

    Returns
    -------
    tuple of str
        The three file names returned by ``add_docnos``.
    """
    # 201 is the length of our original loinc dataset
    prefix = '' if len(df) == 201 else 'extended_'
    file = prefix + 'loinc_dataset-v2_without_doc_numb.dat'
    train_file = prefix + 'train_loinc_dataset-v2_without_doc_numb.dat'
    test_file = prefix + 'test_loinc_dataset-v2_without_doc_numb.dat'

    # Numpy arrays into svmlight files
    dump_svmlight_file(X, y, file, query_id=qid)
    dump_svmlight_file(X_train, y_train, train_file, query_id=qid_train)
    dump_svmlight_file(X_test, y_test, test_file, query_id=qid_test)

    # Add docnos to svmlight files
    out_file, out_train_file, out_test_file = add_docnos(
        df['doc_numb'].tolist(), file,
        train_set['doc_numb'].tolist(), train_file,
        test_set['doc_numb'].tolist(), test_file)

    # Remove files without docnos. Only the three intermediates created above
    # are deleted; the earlier scan for 'docnos' in every file name of the
    # working directory never matched these names and could hit unrelated files.
    for intermediate in (file, train_file, test_file):
        if os.path.exists(intermediate):
            os.remove(intermediate)
    return out_file, out_train_file, out_test_file

In [189]:
# Write the svmlight files for the original dataset and keep the returned file names.
file, train_file, test_file  = df_to_svmlight_files(df, X, y, X_train, y_train, X_test, y_test)

Once we have preprocessed all the text and have it in the proper format we perform the AdaRank algorithm. 

We redefine the train and test sets by loading them from the svmlight files obtained.

In [190]:
# Reload both splits from disk; query_id=True makes the loader also return the qid array.
X_train, y_train, qid_train = load_svmlight_file(train_file, query_id=True)
X_test, y_test, qid_test = load_svmlight_file(test_file, query_id=True)

Implementation of the AdaRank algorithm:
- Maximum iterations: 100
- Stop = 10 
- Scorer = NDCG with K = 5.

We use Normalized Discounted Cumulative Gain as the score function because it is a quality ranking measure that is used to measure the effectiveness of different algorithms. 

AdaRank is run for at most 100 iterations, optimizing NDCG@5. When no improvement is made in the previous 10 iterations the algorithm stops.




In [191]:
# Train AdaRank for at most 100 rounds with early-stopping patience 10; the scorer optimized is nDCG@5.
model = AdaRank(max_iter=100, estop=10, scorer=NDCGScorer(k=5)).fit(X_train, y_train, qid_train)

Model prediction using the test set.

In [192]:
# Score every test document with the fitted model.
pred = model.predict(X_test, qid_test)

We obtain the NDCG scores.

In [193]:
# Mean per-query nDCG on the test set at several cut-offs k.
for k in (1, 2, 3, 4, 5, 10, 20, 40):
    score = NDCGScorer(k=k)(y_test, pred, qid_test).mean()
    print('nDCG@{}\t{}'.format(k, score))

nDCG@1	0.6666666666666666
nDCG@2	0.4797939499646728
nDCG@3	0.446458651520288
nDCG@4	0.446458651520288
nDCG@5	0.446458651520288
nDCG@10	0.519261130459676
nDCG@20	0.5601962312253753
nDCG@40	0.5601962312253753


We also compute the AP score and the precision score. 

In [194]:
# The result is stored as `ap_value` so that the `ap_score` function defined
# earlier in the notebook is not overwritten.
# NOTE(review): APScorer is called without qid, so the average precision is
# computed over the whole test set as one ranked list, not per query — confirm
# this is intended.
ap_value = APScorer()(y_test, pred)
print("ap_score:", ap_value)

ap_score: 0.3400857326242435


In order to calculate the precision metric we round the decimal numbers of the predictions obtained to their nearest whole number without decimal places. 

In [202]:
# Round every predicted score to the nearest integer.
round_pred = np.round(pred, decimals = 0)

In [201]:
# NOTE(review): PScorer() is created without `k` and called without qid, so
# `_p_score` keeps every test document and returns the share of labels > 0 in
# y_test; the ordering given by `round_pred` does not affect this value.
prec_score = PScorer()(y_test, round_pred)
print("precision score:", prec_score)

precision score: 0.08196721311475409


We obtain the ranking of the documents in the test set. 

We use the load_docno function already implemented, which returns the document number of each of the documents that make up the test set.

In [203]:
# Read the document numbers back from the test file, in file order.
docno = load_docno(test_file, letor=False)

With the print_ranking function we obtain the ranking of each of the documents in the test set, as well as the score given to each of them.

In [204]:
# Print, for every query, the test documents ordered by decreasing predicted score.
print_ranking(qid_test, docno, pred)

+-----+-------+------+-------+
| qid | docno | rank | score |
+-----+-------+------+-------+
|  1  |  23   |  1   | 1.121 |
|  1  |  53   |  2   | 0.976 |
|  1  |   5   |  3   | 0.819 |
|  1  |  27   |  4   | 0.328 |
|  1  |  33   |  5   | 0.304 |
|  1  |  50   |  6   | 0.281 |
|  1  |  39   |  7   | 0.224 |
|  1  |  47   |  8   | 0.217 |
|  1  |  59   |  9   | 0.203 |
|  1  |  16   |  10  | 0.196 |
|  1  |  44   |  11  | 0.155 |
|  1  |  32   |  12  | 0.145 |
|  1  |   8   |  13  |  0.0  |
|  1  |  34   |  14  |  0.0  |
|  1  |  52   |  15  |  0.0  |
|  1  |  64   |  16  |  0.0  |
|  1  |  40   |  17  |  0.0  |
|  1  |  48   |  18  |  0.0  |
|  1  |  51   |  19  |  0.0  |
|  1  |  38   |  20  |  0.0  |
|  2  |  17   |  1   | 0.328 |
|  2  |  38   |  2   | 0.304 |
|  2  |  37   |  3   | 0.304 |
|  2  |  36   |  4   | 0.303 |
|  2  |  30   |  5   | 0.303 |
|  2  |  14   |  6   | 0.246 |
|  2  |  39   |  7   | 0.229 |
|  2  |  10   |  8   | 0.222 |
|  2  |  44   |  9   | 0.212 |
|  2  | 

**SECOND MODEL - EXTENDED DATASET**

We follow the same steps as above for this new dataset 

In [205]:
# Read every sheet of the extended workbook (sheet_name=None returns a dict of
# DataFrames keyed by sheet name) and stack them into one frame.
sheets_dict = pd.read_excel("/gdrive/My Drive/MASTER/INFO.BIOMÉDICA/extended_loinc_dataset-v2.xlsx", sheet_name=None, skiprows=1, header=1)
all_sheets = list(sheets_dict.values())
# Concatenate once, after collecting all sheets, instead of on every loop
# iteration; ignore_index=True yields the same fresh 0..n-1 index.
df_ext = pd.concat(all_sheets, ignore_index=True)

In [206]:
# Clean every long common name of the extended dataset, then fit TF-IDF on the
# cleaned text and keep the result as a dense array.
tfidf = TfidfVectorizer()
df_ext['clean_text'] = df_ext['long_common_name'].apply(preprocess_text)
print(df_ext['clean_text'])
X = tfidf.fit_transform(df_ext['clean_text']).toarray()

0                 reactiv protein massvolum serum plasma
1                              bicarbon molesvolum blood
2                                             type blood
3                    trimethoprimsulfamethoxazol suscept
4                    bilirubintot massvolum serum plasma
                             ...                        
595                       caroten massvolum serum plasma
596    hepat viru gene mutat detect identifi genotyp ...
597                     lutropin unitsvolum serum plasma
598              25hydroxyvitamin massvolum serum plasma
599                                   zinc massmass hair
Name: clean_text, Length: 600, dtype: object


In [207]:
# Show the dense TF-IDF matrix of the extended dataset and its number of rows.
print(X)
print("long. X:", len(X))

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.85637899 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.57735027]]
long. X: 600


In [208]:
# Store each document's TF-IDF vector in its row (assigned by position).
df_ext['features'] = X.tolist()

#Get train and test set
# Shuffle all rows with a fixed seed, take the first 70% as train and the rest
# as test; each subset is sorted by qid so the rows of a query are contiguous.
# NOTE(review): `df_ext` is rebound to the shuffled frame, so re-running this
# cell would assign X rows by position to an already shuffled frame — confirm
# the cell is only run once per kernel session.
df_ext= df_ext.sample(frac=1, random_state = 2)
train_size = int(0.7 * len(df_ext))
train_set = df_ext[:train_size].sort_values(by=['qid'])
test_set = df_ext[train_size:].sort_values(by=['qid'])

In [209]:
# DataFrame to numpy array
# `df_ext` was shuffled after the TF-IDF matrix was computed, so X is rebuilt
# from the 'features' column to keep its rows aligned with `qid` and `y` below
# (previously X kept the pre-shuffle row order).
X = np.array(df_ext['features'].values.tolist())
qid = df_ext['qid'].to_numpy()
y = df_ext['Label'].to_numpy()
X_train = np.array(train_set['features'].values.tolist())
qid_train = train_set['qid'].to_numpy()
y_train = train_set['Label'].to_numpy()
X_test = np.array(test_set['features'].values.tolist())
qid_test = test_set['qid'].to_numpy()
y_test = test_set['Label'].to_numpy()

In [210]:
# Write the svmlight files for the extended dataset and keep the returned file names.
file, train_file, test_file  = df_to_svmlight_files(df_ext, X, y, X_train, y_train, X_test, y_test)

In [211]:
# NOTE(review): `ranking_file` is not used by any cell visible here — confirm whether it is still needed.
ranking_file = 'ranking_extended.txt'
# Reload both splits from disk; query_id=True makes the loader also return the qid array.
X_train, y_train, qid_train = load_svmlight_file(train_file, query_id=True)
X_test, y_test, qid_test = load_svmlight_file(test_file, query_id=True)

In [212]:
# Train AdaRank on the extended training set: at most 100 rounds, patience 10, scorer nDCG@5.
model = AdaRank(max_iter=100, estop=10, scorer=NDCGScorer(k=5)).fit(X_train, y_train, qid_train)

In [213]:
# Score every document of the extended test set with the fitted model.
pred = model.predict(X_test, qid_test)

In [214]:
# Mean per-query nDCG on the extended test set at several cut-offs k.
for k in (1, 2, 3, 4, 5, 10, 20, 40):
    score = NDCGScorer(k=k)(y_test, pred, qid_test).mean()
    print('nDCG@{}\t{}'.format(k, score))

nDCG@1	0.0
nDCG@2	0.0
nDCG@3	0.0
nDCG@4	0.08802270752419696
nDCG@5	0.15249817539662058
nDCG@10	0.24650686715920522
nDCG@20	0.25682062871869804
nDCG@40	0.33714155969496207


In [215]:
# Read the document numbers back from the extended test file, in file order.
docno = load_docno(test_file, letor=False)

In [216]:
# Print, for every query, the test documents ordered by decreasing predicted score.
print_ranking(qid_test, docno, pred)

+-----+-------+------+-------+
| qid | docno | rank | score |
+-----+-------+------+-------+
|  1  |  62   |  1   | 0.757 |
|  1  |  48   |  2   | 0.611 |
|  1  |  23   |  3   | 0.597 |
|  1  |  57   |  4   | 0.551 |
|  1  |  51   |  5   | 0.476 |
|  1  |  10   |  6   | 0.061 |
|  1  |  50   |  7   | 0.058 |
|  1  |  22   |  8   | 0.058 |
|  1  |   9   |  9   | 0.058 |
|  1  |   1   |  10  | 0.051 |
|  1  |  60   |  11  | 0.05  |
|  1  |  47   |  12  | 0.044 |
|  1  |  59   |  13  | 0.041 |
|  1  |  44   |  14  | 0.032 |
|  1  |  32   |  15  | 0.029 |
|  1  |  46   |  16  | 0.025 |
|  1  |  52   |  17  |  0.0  |
|  1  |  35   |  18  |  0.0  |
|  1  |  37   |  19  |  0.0  |
|  1  |  49   |  20  |  0.0  |
|  1  |  27   |  21  |  0.0  |
|  1  |  20   |  22  |  0.0  |
|  1  |  64   |  23  |  0.0  |
|  2  |  24   |  1   | 0.757 |
|  2  |  23   |  2   | 0.611 |
|  2  |  31   |  3   | 0.596 |
|  2  |   1   |  4   | 0.429 |
|  2  |  15   |  5   | 0.07  |
|  2  |  59   |  6   | 0.061 |
|  2  | 

In [217]:
# The result is stored as `ap_value` so that the `ap_score` function defined
# earlier in the notebook is not overwritten.
# NOTE(review): APScorer is called without qid, so the average precision is
# computed over the whole test set as one ranked list, not per query — confirm
# this is intended.
ap_value = APScorer()(y_test, pred)
print("ap_score:", ap_value)

ap_score: 0.1378849059560028


In [219]:
# Round every predicted score to the nearest integer.
round_pred = np.round(pred, decimals = 0)

In [220]:
# NOTE(review): PScorer() is created without `k` and called without qid, so
# `_p_score` keeps every test document and returns the share of labels > 0 in
# y_test; the ordering given by `round_pred` does not affect this value.
prec_score = PScorer()(y_test, round_pred)
print("precision score:", prec_score)

precision score: 0.07777777777777778
