## Import

In [26]:
import os
import sys
import itertools
import json

In [2]:
import numpy as np
import pandas as pd

In [3]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

In [4]:
from sklearn.svm import SVC

In [5]:
module_path = os.path.abspath(os.path.join('..', 'src'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import utils
import metrics
from models.listNet import ListNet, model_train

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Utils

## Constants

In [24]:
DATA_PATH = 'data/OHSUMED.csv'

TOTAL_SCORE_PATH = 'data/total_score.json'

In [8]:
TEST_SIZE = 0.1
TRAIN_SIZE = 1 - TEST_SIZE

## Data processing

In [9]:
documents = pd.read_csv(DATA_PATH)

features_columns = np.array([col for col in documents.columns if col.startswith('feat')])

In [10]:
documents.head(1)

Unnamed: 0,relevent_val,qid,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,...,feat17,feat18,feat19,feat20,feat21,feat22,feat23,feat24,feat25,doc_id
0,2.0,1,3.0,2.079442,0.272727,0.261034,37.330565,11.431241,37.29975,1.138657,...,24.808785,0.393091,57.416517,3.294893,25.0231,3.219799,-3.87098,-3.90273,-3.87512,40626


### train-test split

In [11]:
qid_grid = documents.qid.unique()
nqid = qid_grid.shape[0]

qid_threshold = int(nqid * TRAIN_SIZE)
train_qids = qid_grid[:qid_threshold]
test_qids = qid_grid[qid_threshold:]

assert(train_qids.shape[0] + test_qids.shape[0] == nqid)

In [12]:
train_mask = np.in1d(documents.qid.values, train_qids)

documents_train = documents.loc[train_mask]
documents_test = documents.loc[~train_mask]

### pairwise processing

In [13]:
X_train, y_train = utils.construct_pairwise(documents_train, features_columns)

## ListNet

In [14]:
INPUT_SIZE = X_train.shape[1]
NUNIT = 1

listNet = ListNet(
    INPUT_SIZE,
    NUNIT,
    device
)
optimizer = optim.Adam(listNet.parameters(), lr=0.001)

In [15]:
train = utils.RankDataset(
    documents_train,
    device,
    features_columns,
)

test = utils.RankDataset(
    documents_test,
    device,
    features_columns,
)

In [16]:
model_train(
    listNet,
    optimizer,
    train,
    metrics.cross_entropy,
    100,
)

0 Loss:  5.457131476151316
10 Loss:  4.91885986328125
20 Loss:  4.916736482319079
30 Loss:  4.915906404194079
40 Loss:  4.915295731393915
50 Loss:  4.914847604851974
60 Loss:  4.914523154810856
70 Loss:  4.91428865131579
80 Loss:  4.914123856393915
90 Loss:  4.914003392269737


In [17]:
X_list, y_list = zip(*list(test))

In [18]:
prediction_list = listNet.predict_qid(X_list)

ranked_targets = [y[idx].numpy() for idx, y in zip(prediction_list, y_list)]

In [19]:
metrics.ndcg_k(ranked_targets)

0.6524911346903617

In [20]:
metrics.map_k(ranked_targets)

0.3141322021544454

In [21]:
metrics.mrr(ranked_targets)

4.818181818181818

In [22]:
model_metrics = {
    'name': 'ListNet',
    'scores': {
        'ndcg_n': metrics.ndcg_k(ranked_targets),
        'map_n': metrics.map_k(ranked_targets),
        'mrr': metrics.mrr(ranked_targets),
    }
}

In [27]:
total_scores = []
if os.path.exists(TOTAL_SCORE_PATH):
    with open(TOTAL_SCORE_PATH) as input_stream:
        total_scores = json.load(input_stream)
    
total_scores.append(model_metrics)
with open(TOTAL_SCORE_PATH, 'w') as output_stream:
    json.dump(total_scores, output_stream)