## Import

In [1]:
import os
import sys
import itertools
import pickle
import json

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.svm import SVC

In [4]:
# Put the sibling ``src`` directory (resolved against the current working
# directory) on the import path, once, so the project-local modules imported
# right below can be found.
module_path = os.path.abspath(os.path.join('..', 'src'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import utils
import metrics

In [5]:
import functools

## Utils

## Constants

In [6]:
# Input dataset (CSV), relative to the notebook's working directory.
DATA_PATH = 'data/OHSUMED.csv'
# Pickle file used to persist / restore the fitted RankSVM model.
SVM_MODEL_PATH = 'data/rankSVM.pickle'

# When True the model is loaded from SVM_MODEL_PATH; when False it is trained
# in this notebook and then saved to SVM_MODEL_PATH.
SVM_LOAD = True

# JSON file holding a list of per-model metric records; this notebook appends its own.
TOTAL_SCORE_PATH = 'data/total_score.json'

In [7]:
# Share of the unique query ids kept out of training (the split is made per query id).
TEST_SIZE = 0.7
# Remaining share of the unique query ids, used to build the training set.
TRAIN_SIZE = 1 - TEST_SIZE

## Data processing

In [8]:
# Load the full dataset into memory.
documents = pd.read_csv(DATA_PATH)

# Names of the feature columns: every column whose header starts with 'feat'.
features_columns = np.array(
    documents.columns[documents.columns.str.startswith('feat')].tolist()
)

In [9]:
# Peek at the first row to check the column layout.
documents.head(1)

Unnamed: 0,relevent_val,qid,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,...,feat17,feat18,feat19,feat20,feat21,feat22,feat23,feat24,feat25,doc_id
0,2.0,1,3.0,2.079442,0.272727,0.261034,37.330565,11.431241,37.29975,1.138657,...,24.808785,0.393091,57.416517,3.294893,25.0231,3.219799,-3.87098,-3.90273,-3.87512,40626


### Train-test split

In [10]:
# Split by query id rather than by row, so all documents of a query stay on the same side.
qid_grid = documents['qid'].unique()
nqid = len(qid_grid)

# The leading TRAIN_SIZE share of the query ids goes to training, the rest to testing.
qid_threshold = int(nqid * TRAIN_SIZE)
qid_test_threshold = qid_threshold
train_qids, test_qids = qid_grid[:qid_threshold], qid_grid[qid_threshold:]

# Sanity check: the two parts together cover every query id.
assert len(train_qids) + len(test_qids) == nqid

In [11]:
# Row-level mask: True for every document whose query id is one of the training queries.
# np.isin is used instead of np.in1d, which is deprecated as of NumPy 2.0; for this
# 1-D input both return the same boolean array.
train_mask = np.isin(documents.qid.values, train_qids)

# Complementary row subsets: training queries vs. all remaining (held-out) queries.
documents_train = documents.loc[train_mask]
documents_test = documents.loc[~train_mask]

### Pairwise processing

In [12]:
# Build the training inputs from the training queries with the project-local helper.
# NOTE(review): presumably X_train holds feature differences of document pairs and
# y_train the pair preference labels — confirm against utils.construct_pairwise.
# This runs even when SVM_LOAD is True, although the result is only consumed by the fit cell.
X_train, y_train = utils.construct_pairwise(documents_train, features_columns)

## RankSVM

In [13]:
# A new linear-kernel SVC is created only when training was requested;
# otherwise the name stays None until the pickled model is loaded.
rankSVM = None if SVM_LOAD else SVC(kernel='linear', verbose=100, C=.1)

In [14]:
%%time
# Fit only when a fresh model was requested; with SVM_LOAD set this cell does nothing.
if not SVM_LOAD:
    rankSVM.fit(X_train, y_train)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 6.68 µs


In [15]:
if SVM_LOAD:
    # Restore the previously saved model instead of training.
    # NOTE: pickle.load can execute arbitrary code embedded in the file — only
    # load model files that come from a trusted source.
    with open(SVM_MODEL_PATH, 'rb') as input_stream:
        rankSVM = pickle.load(input_stream)
else:
    # Persist the freshly trained model. The context manager guarantees the file
    # is flushed and closed (the bare open() used before leaked the handle).
    with open(SVM_MODEL_PATH, 'wb') as output_stream:
        pickle.dump(rankSVM, output_stream)

In [16]:
# Take 10% of all queries (by count) for the evaluation run.
# A dedicated name is used so the TEST_SIZE split constant is not silently
# rebound to a different meaning halfway through the notebook.
EVAL_QUERY_FRACTION = 0.1

n_eval_queries = int(nqid * EVAL_QUERY_FRACTION)
held_out_qids = np.unique(documents_test.qid.values)
# Evaluate on the last `n_eval_queries` held-out query ids. The zero case is
# guarded explicitly because `arr[-0:]` would select every query, not none.
test_nqid = held_out_qids[-n_eval_queries:] if n_eval_queries > 0 else held_out_qids[:0]
# np.isin replaces the deprecated np.in1d; identical result for 1-D input.
test_mask = np.isin(documents_test.qid.values, test_nqid)
doc_test = documents_test[test_mask]

In [17]:
def cmp(a, b):
    """Compare two documents with the RankSVM model.

    The feature values of ``b`` are subtracted from those of ``a`` (columns
    listed in the global ``features_columns``), the difference is reshaped
    into a single-sample 2-D array and passed to the global ``rankSVM``.

    Args:
        a: Row of the documents frame, indexable by ``features_columns``.
        b: Row of the documents frame, indexable by ``features_columns``.

    Returns:
        ``1`` when the model's prediction for ``a - b`` is positive,
        otherwise ``-1``. ``0`` is never returned.

    NOTE(review): whether a positive prediction means "a is more relevant"
    depends on the label convention of utils.construct_pairwise — confirm
    that the resulting sort direction is the intended one.
    """
    feature_diff = (a[features_columns] - b[features_columns]).values.reshape(1, -1)
    return 1 if rankSVM.predict(feature_diff) > 0 else -1

In [18]:
def q_predict(docs_q):
    """Order the documents of one query with the pairwise comparator ``cmp``.

    Args:
        docs_q: DataFrame holding the documents of a single query.

    Returns:
        A list with one row (pandas Series) per document of ``docs_q``,
        sorted ascending according to ``cmp``.
    """
    ordering_key = functools.cmp_to_key(cmp)
    rows = []
    for position in range(len(docs_q)):
        rows.append(docs_q.iloc[position, :])
    rows.sort(key=ordering_key)
    return rows

In [None]:
# One ranked document list per evaluation query, in the order of test_nqid.
predict_forallq = [
    q_predict(doc_test[doc_test['qid'] == q])
    for q in test_nqid
]

In [None]:
# For every query, the relevance labels of its documents in predicted order.
# The loop variable is deliberately NOT called `q_predict`: that name belongs to
# the ranking function, and overwriting it made the prediction cell fail on re-run.
ranked_target_list = [
    np.array([doc.relevent_val for doc in ranked_docs])
    for ranked_docs in predict_forallq
]

In [None]:
# Display the score of the project-local metrics.ndcg_k over the per-query ranked labels.
# NOTE(review): the cutoff k is the helper's default, not visible here — confirm.
metrics.ndcg_k(ranked_target_list)

In [None]:
# Display the score of the project-local metrics.map_k over the per-query ranked labels.
# NOTE(review): the cutoff k and the relevance threshold are the helper's defaults — confirm.
metrics.map_k(ranked_target_list)

In [None]:
# Display the score of the project-local metrics.mrr over the per-query ranked labels.
metrics.mrr(ranked_target_list)

In [None]:
# Record for this model: its name plus the three ranking scores, computed
# in the order NDCG, MAP, MRR on the same ranked label lists.
model_metrics = {
    'name': 'RankSVM',
    'scores': {
        label: score_fn(ranked_target_list)
        for label, score_fn in (
            ('ndcg_n', metrics.ndcg_k),
            ('map_n', metrics.map_k),
            ('mrr', metrics.mrr),
        )
    },
}

In [None]:
# Load the scores recorded so far, if the score file already exists.
total_scores = []
if os.path.exists(TOTAL_SCORE_PATH):
    with open(TOTAL_SCORE_PATH) as input_stream:
        total_scores = json.load(input_stream)

# Replace any earlier record of this model instead of appending a duplicate, so
# re-running the notebook leaves exactly one entry per model name.
total_scores = [
    entry for entry in total_scores
    if not (isinstance(entry, dict) and entry.get('name') == model_metrics['name'])
]
total_scores.append(model_metrics)

# Serialize before opening the file for writing: if a score is not
# JSON-serializable the existing file is left untouched rather than truncated.
serialized_scores = json.dumps(total_scores)
with open(TOTAL_SCORE_PATH, 'w') as output_stream:
    output_stream.write(serialized_scores)