DSSM Training

In [8]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [22]:
# Load the training-query index file. header=None means the file has no header
# row, so pandas numbers the columns 0 and 1.
query = pd.read_csv(
    "~/data/queries_train_indices.csv",
    header=None,
)
display(query)
# Per-column count of missing values (shown as the cell's result).
query.isna().sum()

Unnamed: 0,0,1
0,121352,1924 2363
1,634306,29570 321 193 198
2,920825,131 8242 1880 36968
3,510633,4711 38 31
4,737889,39755 99
...,...,...
808726,633855,752 703 2420 321
808727,1059728,6398 36572 136
808728,210839,1087 22
808729,908165,4 685 45818 233


0        0
1    15181
dtype: int64

In [2]:
def sparse_to_dense(idx, vocab_len):
    """Turn a list of vocabulary indices into a dense count vector.

    Args:
        idx: Sequence of integer positions in ``[0, vocab_len)``. May be empty
            and may contain repeats.
        vocab_len: Length of the returned vector.

    Returns:
        A 1-D float32 tensor of length ``vocab_len`` where entry ``i`` is the
        number of times ``i`` occurs in ``idx`` (repeated indices are summed,
        not clipped to 1).
    """
    # One row of coordinates because the target tensor is 1-D: shape (1, nnz).
    index_tensor = torch.as_tensor(idx, dtype=torch.long).unsqueeze(0)
    value_tensor = torch.ones(index_tensor.shape[1], dtype=torch.float32)
    # torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor
    # constructor; to_dense() sums the values of duplicate coordinates.
    return torch.sparse_coo_tensor(index_tensor, value_tensor, (vocab_len,)).to_dense()

In [1]:
# test case: small hand-written fixtures, keyed by the ids 1 and 2
# (presumably query ids, with 11..24 being passage ids; confirm against the callers).
top_dict = {
    1: [11, 12, 13, 14],
    2: [21, 22, 23, 24],
}
rating_dict = {
    1: {11: 3, 12: 2, 13: 1},
    2: {21: 2, 23: 1},
}
query_test_dict = {
    1: [0, 5, 2],
    2: [1, 3, 4],
}
passage_dict = {
    11: [0, 1, 2, 3, 4],
    12: [0, 3, 3, 4],
    13: [0, 1],
    14: [1, 1, 3],
    21: [1, 2, 1],
    22: [0, 2, 5],
    23: [1, 2, 2],
    24: [0, 0, 5],
}
result_dict = {
    1: {11: 0.9, 12: 0, 13: 0.5, 14: 0.1},
    2: {21: 0.3, 22: 0.6, 23: 0.9, 24: 0},
}

In [28]:
import math
import random  # used for tie-breaking below; imported here so the cell is self-contained


def get_ndcg_precision_rr(true_dict, test_dict, rank):
    """Compute NDCG@rank, precision@rank and reciprocal rank for one query.

    Args:
        true_dict: Mapping ``pid -> graded relevance`` for the judged passages.
            A pid that is absent is treated as relevance 0.
        test_dict: Mapping ``pid -> score`` for the candidates being ranked;
            higher scores rank first.
        rank: Cutoff ``k``. The DCG sums stop at ``min(rank, len(test_dict))``;
            precision is always divided by the requested ``rank``.

    Returns:
        tuple: ``(ndcg, precision, rr)``.

        * ``ndcg``: DCG of the top-``rank`` candidates divided by the DCG of
          the best possible ordering of *all* candidates in ``test_dict``
          (0 when no candidate has positive relevance). Judged passages that
          are not in ``test_dict`` do not contribute to the ideal.
        * ``precision``: number of top-``rank`` candidates present in
          ``true_dict``, divided by ``rank``.
        * ``rr``: ``1 / position`` of the first candidate (searching the whole
          ranking, not only the top ``rank``) present in ``true_dict``;
          ``NaN`` when there is none.

    Raises:
        ZeroDivisionError: if ``rank`` is 0.
    """
    # A random secondary key breaks score ties, so the insertion order of
    # test_dict cannot favour (or penalise) tied candidates. The same ranking
    # is used for all three metrics so they describe one consistent ordering.
    ranking = sorted(
        test_dict.items(),
        key=lambda item: (item[1], [-1, 1][random.randrange(2)]),
        reverse=True,
    )
    ranked_pids = [pid for pid, _score in ranking]
    cutoff = max(0, min(rank, len(ranked_pids)))
    top_pids = ranked_pids[:cutoff]

    # Reciprocal rank: first judged candidate anywhere in the ranking.
    rr = float("NaN")
    for position, pid in enumerate(ranked_pids, start=1):
        if pid in true_dict:
            rr = 1 / position
            break

    # Precision numerator: judged candidates inside the cutoff.
    num_positive = sum(1 for pid in top_pids if pid in true_dict)

    # DCG of the actual top-`rank` ordering.
    cumulative_gain = sum(
        true_dict.get(pid, 0) / math.log2(2 + i) for i, pid in enumerate(top_pids)
    )

    # Ideal DCG: best `cutoff` relevances over ALL candidates. Building the
    # ideal only from the retrieved top-`rank` would ignore relevant candidates
    # ranked below the cutoff and inflate NDCG.
    ideal_relevances = sorted(
        (true_dict.get(pid, 0) for pid in test_dict), reverse=True
    )[:cutoff]
    ideal_gain = sum(
        relevance / math.log2(2 + i) for i, relevance in enumerate(ideal_relevances)
    )

    ndcg = 0
    if ideal_gain != 0:
        ndcg = cumulative_gain / ideal_gain
    return ndcg, num_positive / rank, rr

In [29]:
import random
# Smoke test for get_ndcg_precision_rr on a single query.
# Note: this rebinds result_dict / rating_dict as flat dicts (pid -> score, pid -> rating).
result_dict = {
    1: 10,
    2: 9,
    3: 8,
    4: 7,
    5: 6,
    6: 5,
}
rating_dict = {
    1: 3,
    2: 2,
    3: 3,
    5: 1,
    6: 2,
    7: 3,
    8: 2,
}
get_ndcg_precision_rr(rating_dict, result_dict, 6)

(0.9608081943360617, 0.8333333333333334, 1.0)

In [71]:
import torch
# Sanity check: two orthogonal one-hot row vectors (shape (1, 3) each) give cosine similarity 0.
torch.cosine_similarity(
    torch.FloatTensor([[0, 1, 0]]),
    torch.FloatTensor([[1, 0, 0]]),
)

tensor([0.])

In [4]:
import sys
import os
import torch
sys.path.insert(0, '/home/jianx/search-exposure/')
from load_data import obj_reader
from load_data import obj_writer

# Index of the GPU this notebook holds; rebound further down with the index
# returned by select_device().
CURRENT_GPU = 0
# NOTE(review): GPU_ROOT is not defined in any cell visible here, so this cell
# raises NameError on a fresh kernel unless it is set elsewhere. TODO: define it
# in a configuration cell before this point.
# Create the usage file with four zero counters when it does not exist yet
# (presumably one counter per GPU on the machine; confirm).
if not os.path.exists(GPU_ROOT):
    obj_writer([0,0,0,0], GPU_ROOT)

In [31]:
def select_device():
    """Choose the torch device, taking the GPU with the lowest usage counter.

    When CUDA is unavailable the CPU device is returned and nothing is read or
    written. Otherwise the usage list stored at ``GPU_ROOT`` is read, the
    counter of the chosen GPU is incremented, the updated list is printed and
    written back.

    Returns:
        tuple: ``(device, gpu_index)``. ``gpu_index`` is the position of the
        chosen GPU in the usage list, or ``-1`` when the CPU is used.

    Raises:
        ValueError: if the stored usage list is empty.
    """
    if not torch.cuda.is_available():
        return torch.device("cpu"), -1

    gpu_usage_list = obj_reader(GPU_ROOT)
    if len(gpu_usage_list) == 0:
        raise ValueError("GPU usage list is empty; cannot select a GPU")

    # Index of the smallest counter; ties resolve to the lowest index. Using the
    # builtin min() avoids both shadowing it and relying on a magic sentinel.
    argmin = min(range(len(gpu_usage_list)), key=lambda i: gpu_usage_list[i])
    gpu_usage_list[argmin] += 1
    print(gpu_usage_list)
    device = torch.device("cuda:" + str(argmin))
    obj_writer(gpu_usage_list, GPU_ROOT)
    return device, argmin

def cleanup_gpu_list():
    """Release the slot this notebook holds in the GPU usage list.

    Reads the usage list stored at ``GPU_ROOT``, decrements the counter at
    index ``CURRENT_GPU`` and writes the list back. Nothing is read or written
    when ``CURRENT_GPU`` is negative, which is what ``select_device()`` returns
    for the CPU: indexing with -1 would otherwise decrement the last GPU's
    counter. The counter is never taken below zero, so calling this twice
    cannot leave a negative usage count.
    """
    if CURRENT_GPU < 0:
        return
    gpu_usage_list = obj_reader(GPU_ROOT)
    gpu_usage_list[CURRENT_GPU] = max(0, gpu_usage_list[CURRENT_GPU] - 1)
    obj_writer(gpu_usage_list, GPU_ROOT)
# Claim a device for this notebook and remember which GPU slot (if any) was taken.
device, CURRENT_GPU = select_device()
print(CURRENT_GPU)
# torch.cuda.get_device_name() only accepts CUDA devices, so it is skipped when
# select_device() fell back to the CPU.
if device.type == "cuda":
    print(torch.cuda.get_device_name(device))
else:
    print(device)

[4, 3, 3, 3]
0
Tesla P100-PCIE-16GB


In [29]:
# Release this notebook's GPU slot: updates the CURRENT_GPU entry of the usage
# list stored at GPU_ROOT. Run once, when the GPU is no longer needed.
cleanup_gpu_list()




def evaluate_results(results, qrels):
    """Compute MRR, NCG and NDCG@10 averaged over the judged queries.

    Reads the module-level flag ``task_docs`` (not defined in this cell), which
    selects the cumulative-gain cutoff (100 when truthy, 1000 otherwise) and
    the relevance threshold used for MRR (gain > 1 when truthy, gain >= 1
    otherwise).

    Args:
        results: Mapping ``qid -> ranked list of docs``; the first element of
            each doc entry is looked up in the query's judgements.
        qrels: Mapping ``qid -> {doc_id: gain}``. Unjudged docs get gain 0.

    Returns:
        tuple: ``(mrr, ncg, ndcg)``. Each sum is divided by ``len(qrels)``, so
        judged queries that are missing from ``results`` count as 0. Queries in
        ``results`` without judgements are skipped. ``(0.0, 0.0, 0.0)`` is
        returned when ``qrels`` is empty.
    """
    num_queries = len(qrels)
    if num_queries == 0:
        # Nothing to average over; avoids a ZeroDivisionError below.
        return 0.0, 0.0, 0.0

    dcg_cutoff = 10
    cg_cutoff = 100 if task_docs else 1000

    mrr = 0
    ncg = 0
    ndcg = 0
    for qid, docs in results.items():
        if qid not in qrels:
            continue
        qrels_q = qrels[qid]
        gains = [qrels_q.get(doc[0], 0) for doc in docs]
        ideal_gains = sorted(qrels_q.values(), reverse=True)

        # Slicing clamps to the list length, matching min(len(...), cutoff).
        cg = sum(gains[:cg_cutoff])
        dcg = sum(gain / math.log2(i + 2) for i, gain in enumerate(gains[:dcg_cutoff]))
        ideal_cg = sum(ideal_gains[:cg_cutoff])
        ideal_dcg = sum(
            gain / math.log2(i + 2) for i, gain in enumerate(ideal_gains[:dcg_cutoff])
        )
        ncg += cg / ideal_cg if ideal_cg > 0 else 0
        ndcg += dcg / ideal_dcg if ideal_dcg > 0 else 0

        # Position (1-based) of the first relevant doc, or None when there is
        # none; an explicit search replaces the former catch-all except.
        if task_docs:
            first_hit = next(
                (pos for pos, gain in enumerate(gains, start=1) if gain > 1), None
            )
        else:
            first_hit = next(
                (pos for pos, gain in enumerate(gains, start=1) if gain >= 1), None
            )
        if first_hit is not None:
            mrr += 1 / first_hit

    mrr /= num_queries
    ncg /= num_queries
    ndcg /= num_queries
    return mrr, ncg, ndcg