## Import

In [1]:
import os
import sys
import itertools

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.svm import SVC

In [4]:
# Put the sibling ``../src`` directory on the import path (only once) so the
# project-local ``utils`` module imported below can be resolved.
module_path = os.path.abspath(os.path.join(os.pardir, 'src'))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
import utils

In [108]:
import functools

## Utils

## Constants

In [5]:
# Location of the OHSUMED data set as a CSV file. The path is relative, so it
# is resolved against the notebook's working directory.
DATA_PATH = 'OHSUMED.csv'

In [71]:
# Fraction of the unique query ids held out for testing; the remainder is the
# training fraction used by the query-level split below.
# NOTE(review): TEST_SIZE is reassigned to 0.1 in a later cell, so running the
# notebook's cells out of order can change which value is in effect.
TEST_SIZE = 0.7
TRAIN_SIZE = 1 - TEST_SIZE

## Data processing

In [72]:
# Load the whole document table from disk.
documents = pd.read_csv(DATA_PATH)

# Collect the names of the feature columns (every column whose name begins
# with 'feat') into a NumPy array of column labels.
features_columns = np.array(
    list(filter(lambda name: name.startswith('feat'), documents.columns))
)

In [73]:
# Peek at the first row to check the column layout.
documents.head(1)

Unnamed: 0,relevent_val,qid,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,...,feat17,feat18,feat19,feat20,feat21,feat22,feat23,feat24,feat25,doc_id
0,2.0,1,3.0,2.079442,0.272727,0.261034,37.330565,11.431241,37.29975,1.138657,...,24.808785,0.393091,57.416517,3.294893,25.0231,3.219799,-3.87098,-3.90273,-3.87512,40626


### train-test split

In [74]:
# Split at the query level: the first TRAIN_SIZE share of the distinct query
# ids (in order of first appearance in the table) goes to training and the
# remaining ids go to testing.
qid_grid = documents['qid'].unique()
nqid = len(qid_grid)

qid_threshold = int(nqid * TRAIN_SIZE)
qid_test_threshold = qid_threshold  # alias of the cut point; unused in the cells visible here
train_qids, test_qids = qid_grid[:qid_threshold], qid_grid[qid_threshold:]

# Every query id must land in exactly one of the two partitions.
assert len(train_qids) + len(test_qids) == nqid

In [75]:
# Boolean row mask marking the documents whose query id belongs to the
# training queries. np.isin is the supported replacement for np.in1d, which
# is deprecated as of NumPy 2.0; for 1-D input the result is the same.
train_mask = np.isin(documents.qid.values, train_qids)

documents_train = documents.loc[train_mask]
documents_test = documents.loc[~train_mask]

# The two partitions together must account for every document exactly once.
assert len(documents_train) + len(documents_test) == len(documents)

### pairwise processing

In [76]:
# Build the pairwise training set from the training documents with the
# project helper. The cells below show labels in {-1, 1} and rows of 25 values.
# NOTE(review): presumably each row is a feature difference between two
# documents of the same query -- confirm against utils.construct_pairwise.
X_train, y_train = utils.construct_pairwise(documents_train, features_columns)

In [77]:
# Check which distinct label values the pairwise construction produced.
np.unique(y_train)

array([-1.,  1.])

In [115]:
# Shape of a single training example.
X_train[0].shape

(25,)

## RankSVM

In [23]:
# Linear-kernel support vector classifier used as the pairwise ranker, with
# C = 0.1 and LibSVM's verbose output switched on.
# NOTE(review): the recorded fit below took about 44 minutes; scikit-learn's
# documentation suggests LinearSVC for large linear problems -- consider it,
# but confirm the results stay comparable before switching.
rankSVM = SVC(C=0.1, kernel='linear', verbose=100)

In [24]:
%%time
# Train on the pairwise data; the recorded run took roughly 44 minutes of wall time.
rankSVM.fit(X_train, y_train)

[LibSVM]CPU times: user 41min 41s, sys: 50.3 s, total: 42min 31s
Wall time: 44min 24s


SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=100)

In [28]:
import pickle

# Persist the fitted model so the long training run does not have to be
# repeated in order to reuse it.
model_path = 'rankSVM.pickle'
# A context manager guarantees the file is flushed and closed even if
# pickling fails; the bare open() call used before never closed its handle.
with open(model_path, 'wb') as model_file:
    pickle.dump(rankSVM, model_file)

In [98]:
# Take 10% of all queries (counted against nqid) for evaluation, drawn from
# the held-out test queries.
# NOTE(review): this rebinds the notebook-level TEST_SIZE, which was 0.7 when
# the train/test split was computed.
TEST_SIZE = 0.1
# Keep the count under its own name instead of reusing `test_nqid` first for
# an int and then for an array of ids.
n_eval_qids = int(nqid * TEST_SIZE)
# np.unique returns the ids sorted, so these are the smallest test query ids.
test_nqid = np.unique(documents_test.qid.values)[:n_eval_qids]
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
test_mask = np.isin(documents_test.qid.values, test_nqid)
doc_test = documents_test.loc[test_mask]

In [134]:
def cmp(a, b):
    """Pairwise comparator backed by the trained ``rankSVM`` model.

    The feature values of ``b`` are subtracted from those of ``a`` (columns
    listed in ``features_columns``), the difference is reshaped into a single
    sample, and the model's prediction for it decides the result.

    Parameters
    ----------
    a, b : pandas.Series
        Document rows that can be indexed by ``features_columns``.

    Returns
    -------
    int
        ``1`` when the model's prediction for ``a - b`` is greater than 0,
        otherwise ``-1``.

    Notes
    -----
    NOTE(review): the comparator never returns 0, and nothing here guarantees
    ``cmp(a, b) == -cmp(b, a)``; confirm the resulting sort order is well
    defined for the fitted model.
    """
    feature_diff = a[features_columns] - b[features_columns]
    label = rankSVM.predict(feature_diff.values.reshape(1, -1))
    return 1 if label > 0 else -1

In [135]:
def q_predict(docs_q):
    """Order the rows of ``docs_q`` with the ``cmp`` comparator.

    Parameters
    ----------
    docs_q : pandas.DataFrame
        Rows to rank; each row is passed to ``cmp`` as a Series.

    Returns
    -------
    list of pandas.Series
        The rows of ``docs_q`` in ascending order under ``cmp``, i.e. a row
        for which ``cmp`` returns 1 against another row is placed after it.
    """
    rows = [docs_q.iloc[pos] for pos in range(len(docs_q))]
    rows.sort(key=functools.cmp_to_key(cmp))
    return rows

In [137]:
# Rank the documents of every evaluation query; the i-th entry of
# ``predict_forallq`` is the sorted row list for the i-th id in ``test_nqid``.
predict_forallq = []
for q in test_nqid:
    docs_q = doc_test.loc[doc_test['qid'] == q]
    predict_forallq += [q_predict(docs_q)]

In [136]:
# This is what the prediction looks like for a single query (qid 32).
docs_q = doc_test.loc[doc_test['qid'] == 32]
q_predict(docs_q)

[relevent_val        1.000000
 qid                32.000000
 feat1               2.000000
 feat2               1.386294
 feat3               0.666667
 feat4               0.575364
 feat5              16.986361
 feat6               5.129234
 feat7              16.970689
 feat8               2.299437
 feat9              12.951908
 feat10             10.736858
 feat11              0.000000
 feat12              0.000000
 feat13              0.000000
 feat14              0.000000
 feat15             15.162743
 feat16              4.764712
 feat17             13.128457
 feat18              0.000000
 feat19              0.000000
 feat20              0.000000
 feat21             18.602100
 feat22              2.923274
 feat23             -3.942780
 feat24             -2.705590
 feat25             -2.705590
 doc_id          12472.000000
 Name: 3857, dtype: float64, relevent_val         2.000000
 qid                 32.000000
 feat1                2.000000
 feat2                1.386294
 feat3  