## Import

In [36]:
import os
import sys
import itertools
import json

In [27]:
import numpy as np
import pandas as pd

import sklearn.datasets

In [4]:
module_path = os.path.abspath(os.path.join('..', 'src'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import utils
import metrics

## Utils

## Constants

In [57]:
DATA_PATH = 'OHSUMED.csv'

TRAIN_DATA_PATH = 'data/OHSUMED_train.txt'
TEST_DATA_PATH = 'data/OHSUMED_test.txt'
OUTPUT_PATH = 'data/svm_map_output'

In [29]:
TEST_SIZE = 0.1
TRAIN_SIZE = 1 - TEST_SIZE

## Data processing

In [30]:
documents = pd.read_csv(DATA_PATH)

documents.drop(['doc_id'], axis=1, inplace=True)
features_columns = np.array([col for col in documents.columns if col.startswith('feat')])

In [31]:
documents.head(1)

Unnamed: 0,relevent_val,qid,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,...,feat16,feat17,feat18,feat19,feat20,feat21,feat22,feat23,feat24,feat25
0,2.0,1,3.0,2.079442,0.272727,0.261034,37.330565,11.431241,37.29975,1.138657,...,9.340024,24.808785,0.393091,57.416517,3.294893,25.0231,3.219799,-3.87098,-3.90273,-3.87512


### train-test split

In [32]:
qid_grid = documents.qid.unique()
nqid = qid_grid.shape[0]

qid_threshold = int(nqid * TRAIN_SIZE)
train_qids = qid_grid[:qid_threshold]
test_qids = qid_grid[qid_threshold:]

assert(train_qids.shape[0] + test_qids.shape[0] == nqid)

In [33]:
train_mask = np.in1d(documents.qid.values, train_qids)

documents_train = documents.loc[train_mask]
documents_test = documents.loc[~train_mask]

## SVM MAP

In [34]:
target_train = documents_train.relevent_val.values
qids_train = documents_train.qid.values
documents_train.drop(['relevent_val', 'qid'], axis=1, inplace=True)

with open(TRAIN_DATA_PATH, 'wb') as output_stream:
    sklearn.datasets.dump_svmlight_file(
        documents_train,
        target_train,
        output_stream,
        query_id=qids_train
    )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [35]:
target_test = documents_test.relevent_val.values
qids_test = documents_test.qid.values
documents_test.drop(['relevent_val', 'qid'], axis=1, inplace=True)

with open(TEST_DATA_PATH, 'wb') as output_stream:
    sklearn.datasets.dump_svmlight_file(
        documents_test,
        target_test,
        output_stream,
        query_id=qids_test
    )

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [39]:
predictions = None
with open(OUTPUT_PATH) as input_stream:
    predictions = np.array(json.load(input_stream))

In [53]:
qids_test_grid = np.unique(qids_test)

ranked_targets = []
for qid in qids_test_grid:
    mask = qids_test == qid
    
    target = target_test[mask]
    prediction = predictions[mask]
    
    idx = np.argsort(prediction)[::-1]
    
    ranked_targets.append(target[idx])

In [54]:
metrics.ndcg_k(ranked_targets)

0.6820564876811456

In [55]:
metrics.map_k(ranked_targets)

0.3248096165928001

In [56]:
metrics.mrr(ranked_targets)

3.6363636363636362