In [173]:
from __future__ import print_function

import sys
import numpy as np
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split

sys.path.append('..')
import tensorflow as tf
from src.models.supplementary_code_direct_ranker.DirectRanker import directRanker
from src.models.supplementary_code_direct_ranker.helpers import readData, nDCGScorer_cls, MAP_cls

In [174]:
import json
TOTAL_SCORE_PATH = 'data/total_score.json'

In [175]:
x_train, y_train, q_train = readData(data_path="OHSUMED_TRAIN.txt", binary=False, 
                                     at=10, number_features=25, bin_cutoff=0.9, cut_zeros=False)
x_test, y_test, q_test = readData(data_path="OHSUMED_TEST.txt", binary=False, 
                                  at=10, number_features=25, bin_cutoff=0.9, cut_zeros=False)

In [176]:
def lambda_cost(nn, y0):
    return tf.reduce_mean(tf.log(1+tf.exp(nn))-nn)


# Load directRanker, train, and test
dr = directRanker(
    feature_activation=tf.nn.tanh,
    ranking_activation=tf.nn.tanh,
    # max_steps=10000,
    # For debugging
    #cost=lambda_cost,
    max_steps=12000,
    print_step=500,
    start_batch_size=4,
    end_batch_size=6,
    start_qids=20,
    end_qids=96,
    feature_bias=True,
    hidden_layers=[100, 30, 5],
    validation_size=1
)

In [177]:
dr.fit(x_train, y_train, ranking=True)

INFO:tensorflow:Scale of 0 disables regularizer.
step: 0, value: 1.4544665813446045, samples: 4, queries: 20
step: 500, value: 0.9079053997993469, samples: 4, queries: 21
step: 1000, value: 0.7941244840621948, samples: 4, queries: 22
step: 1500, value: 0.8779572248458862, samples: 4, queries: 24
step: 2000, value: 0.824640691280365, samples: 4, queries: 25
step: 2500, value: 0.7783437967300415, samples: 4, queries: 27
step: 3000, value: 0.8378933072090149, samples: 4, queries: 29
step: 3500, value: 0.7188060879707336, samples: 4, queries: 31
step: 4000, value: 0.7363569140434265, samples: 4, queries: 33
step: 4500, value: 0.637319028377533, samples: 4, queries: 36
step: 5000, value: 0.596937358379364, samples: 4, queries: 38
step: 5500, value: 0.698322057723999, samples: 4, queries: 41
step: 6000, value: 0.5675614476203918, samples: 4, queries: 43
step: 6500, value: 0.6824402809143066, samples: 4, queries: 46
step: 7000, value: 0.5559028387069702, samples: 5, queries: 49
step: 7500, va

In [178]:
prediction = []
ideal_rank = []
for i in range(len(x_test)):
    pred_q = dr.predict_proba(x_test[i])
    
    sort_idx = np.argsort(np.concatenate(pred_q))
    sorted_list = y_test[i][sort_idx][::-1] #по мнению модели
    yref = sorted(y_test[i], reverse=True) #идеальное ранжирование для запроса
    
    prediction.append(sorted_list)
    ideal_rank.append(yref)

In [179]:
prediction = [x.flatten() for x in prediction]

In [160]:
g = lambda x: 2 ** x - 1
d = lambda i: 1 / np.log2(i + 1)
def _dcg_k(ranked_target, k=None):
    if k is None:
        k = ranked_target.shape[0]

    ranked_target = ranked_target[:k]
    return np.sum(
        g(ranked_target) * d(np.arange(1, k + 1))
    )


def _ndcg_k(ranked_target, k=None):
    dcg_value = _dcg_k(ranked_target, k)

    ideal_dcg = _dcg_k(
        np.sort(ranked_target)[::-1],
        k
    )

    if ideal_dcg == 0:
        return 1
    return dcg_value / ideal_dcg


def dcg_k(ranked_target_list, k=None):
    scores = []
    for ranked_target in ranked_target_list:

        scores.append(
            _dcg_k(ranked_target, k)
        )

    return np.mean(scores)


def ndcg_k(ranked_target_list, k=None):
    scores = []
    for ranked_target in ranked_target_list:

        scores.append(
            _ndcg_k(ranked_target, k)
        )

    return np.mean(scores)


def _precision_at_k(ranked_target):
    return np.cumsum(ranked_target / 2) / np.arange(1, ranked_target.shape[0] + 1)


def _aprecision_at_k(ranked_target, k=None):
    if k is None:
        k = ranked_target.shape[0]

    precisions = _precision_at_k(ranked_target)[:k]
    ranked_target = ranked_target[:k]

    result = ranked_target * 1. / ranked_target.sum()

    return np.sum(result * precisions)


def map_k(ranked_target_list, k=None):
    scores = []
    for ranked_target in ranked_target_list:

        scores.append(
            _aprecision_at_k(ranked_target, k)
        )

    return np.mean(scores)


def mrr(ranked_target_list):
    total_score = 0
    for ranked_target in ranked_target_list:
        score = ranked_target.argmax()
        if ranked_target.max() != 2.:
            score = ranked_target.shape[0]

        total_score += score

    return total_score / len(ranked_target_list)



In [180]:
ndcg_k(prediction)

0.5846116029437558

In [181]:
map_k(prediction)

0.24437291502214054

In [182]:
mrr(prediction)

4.1

---

In [183]:
model_metrics = {
    'name': 'DirectRanker',
    'scores': {
        'ndcg_n': ndcg_k(prediction),
        'map_n': map_k(prediction),
        'mrr': mrr(prediction),
    }
}

In [185]:
import os
total_scores = []
if os.path.exists(TOTAL_SCORE_PATH):
    with open(TOTAL_SCORE_PATH) as input_stream:
        total_scores = json.load(input_stream)
    
total_scores.append(model_metrics)
with open(TOTAL_SCORE_PATH, 'w') as output_stream:
    json.dump(total_scores, output_stream)