In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import os
import torch.nn.functional as F 
from collections import defaultdict

In [2]:
from metrics import NDCG
from load_mslr import get_time, DataLoader

In [3]:
class Model(nn.Module):
    """Feed-forward scorer that maps a feature vector to a single value.

    ``model_structure`` lists the layer widths, starting with the input
    width. Every consecutive pair of widths becomes a ``Linear`` layer
    followed by ``LeakyReLU``; a final ``Linear`` layer maps the last width
    to one output. The output is passed through ``ReLU6`` and multiplied by
    ``sigma``.
    """

    def __init__(self, model_structure, sigma=1.0):
        super().__init__()
        layers = []
        # Walk the widths pairwise: (w0, w1), (w1, w2), ...
        for fan_in, fan_out in zip(model_structure[:-1], model_structure[1:]):
            layers += [nn.Linear(fan_in, fan_out), nn.LeakyReLU()]
        # Projection from the last hidden width down to one score.
        layers.append(nn.Linear(model_structure[-1], 1))
        self.model = nn.Sequential(*layers)
        self.sigma = sigma
        self.activation = nn.ReLU6()

    def forward(self, inputs):
        """Return the ReLU6-clamped network output scaled by ``sigma``."""
        return self.activation(self.model(inputs)) * self.sigma

In [4]:
# --- run configuration ---
precision = torch.float32
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
lr = 1e-4

# --- network, optimizer and learning-rate schedule ---
model_structure = [136, 64, 16]  # layer widths handed to Model
sigma = 1.0
model = Model(model_structure, sigma=sigma).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# StepLR multiplies the learning rate by gamma once every step_size scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.75)

# --- NDCG helper used while training ---
ndcg_gain_in_train = 'exp2'  # or 'identity'
ideal_dcg = NDCG(512, ndcg_gain_in_train)

In [5]:
# Resolve the paths of the training and validation splits.
data_dir = 'data/mslr-web10k/'
train_file = "train.txt"
valid_file = "vali.txt"
train_data = os.path.join(data_dir, train_file)
valid_data = os.path.join(data_dir, valid_file)

# Build a loader per split and load it: training first, then validation.
train_loader = DataLoader(train_data)
df_train = train_loader.load()
valid_loader = DataLoader(valid_data)
df_valid = valid_loader.load()

2020-04-21 13:10:55 load from pickle file data/mslr-web10k/train.pkl
2020-04-21 13:10:55 load from pickle file data/mslr-web10k/vali.pkl


In [6]:
def eval_cross_entropy_loss(model, device, loader, epoch, sigma=1.0):
    """
    Print and return the average pairwise cross entropy loss over ``loader``.

    formula in https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/MSR-TR-2010-82.pdf

    C = 0.5 * (1 - S_ij) * sigma * (si - sj) + log(1 + exp(-sigma * (si - sj)))
    when S_ij = 1:  C = log(1 + exp(-sigma(si - sj)))
    when S_ij = -1: C = log(1 + exp(-sigma(sj - si)))
    sigma can change the shape of the curve

    Args:
        model: module that is put in eval mode and called on one query's
            features; its output is differenced against its own transpose,
            so it is expected to be a column of scores.
        device: torch device the tensors are created on / moved to.
        loader: object exposing ``get_num_pairs()``, a ``df`` attribute and
            ``generate_batch_per_query(df)`` yielding ``(X, Y)`` numpy pairs.
        epoch: value echoed in the printed message only.
        sigma: scale applied to the score differences.

    Returns:
        The summed cost divided by ``loader.get_num_pairs()`` as a Python
        float, or NaN when the loader reports zero pairs.

    Raises:
        FloatingPointError: if the cost of a query is inf or NaN.
        AssertionError: if the number of pairs visited differs from
            ``loader.get_num_pairs()``.
    """
    model.eval()
    total_pairs = loader.get_num_pairs()
    total_cost = 0.0
    pairs_in_compute = 0  # plain int, so large totals stay exact
    with torch.no_grad():
        for X, Y in loader.generate_batch_per_query(loader.df):
            Y = Y.reshape(-1, 1)
            rel_diff = Y - Y.T
            pos_mask = rel_diff > 0
            num_pos_pairs = int(pos_mask.sum())
            # skip negative sessions, no relevant info:
            if num_pos_pairs == 0:
                continue
            neg_mask = rel_diff < 0
            # rel_diff is antisymmetric, so there are as many negative pairs
            # as positive ones
            pairs_in_compute += 2 * num_pos_pairs

            pos_pairs = torch.tensor(pos_mask.astype(np.float32), device=device)
            neg_pairs = torch.tensor(neg_mask.astype(np.float32), device=device)
            Sij = pos_pairs - neg_pairs
            # only calculate the different pairs
            diff_pairs = pos_pairs + neg_pairs

            X_tensor = torch.Tensor(X).to(device)
            y_pred = model(X_tensor)
            y_pred_diff = y_pred - y_pred.t()

            # -logsigmoid(x) = -log(1 / (1 + exp(-x))) = log(1 + exp(-x)),
            # so every entry below is exactly the C of the docstring.
            C = 0.5 * (1 - Sij) * sigma * y_pred_diff - F.logsigmoid(sigma * y_pred_diff)
            cost = torch.sum(C * diff_pairs).item()
            if not np.isfinite(cost):
                raise FloatingPointError(
                    "non-finite pairwise cross entropy cost ({}) at epoch {}".format(cost, epoch))
            total_cost += cost

    assert total_pairs == pairs_in_compute, (
        "loader reports {} pairs but {} were visited".format(total_pairs, pairs_in_compute))
    # With no comparable pairs at all the average is undefined.
    avg_cost = total_cost / total_pairs if total_pairs else float('nan')
    print("Epoch {}: pairwise corss entropy loss {:.6f}, total_paris {}".format(
            epoch, avg_cost, total_pairs))
    return avg_cost

In [7]:
def eval_ndcg_at_k(inference_model, device, df_valid, valid_loader, batch_size, k_list, epoch):
    """Score the validation data and report the mean per-query NDCG at each cutoff.

    Args:
        inference_model: module put in eval mode and called on each feature batch.
        device: torch device the feature tensors are moved to.
        df_valid: data handed to ``valid_loader.generate_query_batch``.
        valid_loader: object whose ``generate_query_batch(df_valid, batch_size)``
            yields ``(qid, rel, x)`` triples; batches whose ``x`` is ``None``
            or has zero rows are skipped.
        batch_size: forwarded to ``generate_query_batch``.
        k_list: cutoffs; one ``NDCG(k)`` metric is built per entry.
        epoch: accepted for call-site symmetry; not read by this function.

    Returns:
        dict mapping each ``k`` in ``k_list`` to the mean NDCG@k over the
        queries whose ``maxDCG`` is non-zero and whose NDCG is not NaN.
        A cutoff with no such query maps to NaN.
    """
    print("Eval Phase evaluate NDCG @ {}".format(k_list))
    ndcg_metrics = {k: NDCG(k) for k in k_list}
    qids, rels, scores = [], [], []
    inference_model.eval()
    with torch.no_grad():
        for qid, rel, x in valid_loader.generate_query_batch(df_valid, batch_size):
            if x is None or x.shape[0] == 0:
                continue
            # Call the module itself rather than .forward() so registered hooks run.
            y_tensor = inference_model(torch.Tensor(x).to(device))
            # reshape(-1) keeps a one-row batch 1-D.
            scores.append(y_tensor.cpu().numpy().reshape(-1))
            qids.append(qid)
            rels.append(rel)

    session_ndcgs = defaultdict(list)
    if scores:  # np.hstack raises on an empty list
        result_df = pd.DataFrame({
            'qid': np.hstack(qids),
            'rel': np.hstack(rels),
            'score': np.hstack(scores),
        })
        # One pass over the frame; sort=False keeps first-appearance query order.
        for _, result_qid in result_df.groupby('qid', sort=False):
            rel_rank = result_qid.sort_values('score', ascending=False).rel.values
            for k, ndcg in ndcg_metrics.items():
                if ndcg.maxDCG(rel_rank) == 0:
                    continue
                ndcg_k = ndcg.evaluate(rel_rank)
                if not np.isnan(ndcg_k):
                    session_ndcgs[k].append(ndcg_k)

    ndcg_result = {
        k: np.mean(session_ndcgs[k]) if session_ndcgs[k] else np.nan
        for k in k_list
    }
    ndcg_result_print = ", ".join(["NDCG@{}: {:.5f}".format(k, ndcg_result[k]) for k in k_list])
    print("evaluate {}".format(ndcg_result_print))
    return ndcg_result

In [25]:
# Grab the first (X, Y) pair yielded for the validation data so it can be
# inspected interactively. batch_size is only assigned in this cell; the
# statement below does not read it.
batch_size = 200
X, Y = next(valid_loader.generate_batch_per_query(df_valid))

In [46]:
def apply_solver(pairwise_scores, ofe_score_min_cap=0, ofe_score_max_cap=1, score_divisor=10):
    """Turn a matrix of pairwise scores into one aggregate score per row.

    Steps:
      1. clip every entry to ``[ofe_score_min_cap, ofe_score_max_cap]``;
      2. rescale that range linearly onto ``[-3, 3]``;
      3. apply the logistic sigmoid;
      4. on a copy, overwrite the main diagonal with 0.5;
      5. sum each row of that copy and divide by ``score_divisor``.

    Args:
        pairwise_scores: 2-D array-like of scores. It is not modified.
        ofe_score_min_cap: lower clipping bound.
        ofe_score_max_cap: upper clipping bound; must exceed the lower bound.
        score_divisor: constant the row sums are divided by. Defaults to 10;
            pass e.g. the number of rows to normalise by matrix size instead.

    Returns:
        Tuple ``(scores, ofe_score_diag, ofe_score)``: the per-row scores,
        the sigmoid matrix with its diagonal set to 0.5, and the sigmoid
        matrix with its diagonal untouched.

    Raises:
        ValueError: if ``ofe_score_max_cap`` is not greater than
            ``ofe_score_min_cap`` (the rescaling would divide by zero or
            flip sign).
    """
    if not ofe_score_max_cap > ofe_score_min_cap:
        raise ValueError(
            "ofe_score_max_cap ({}) must be greater than ofe_score_min_cap ({})".format(
                ofe_score_max_cap, ofe_score_min_cap))
    # Accept lists / nested sequences as well as ndarrays.
    pairwise_scores = np.asarray(pairwise_scores)
    cap_range = ofe_score_max_cap - ofe_score_min_cap
    ofe_tree_sum_clipped = pairwise_scores.clip(ofe_score_min_cap, ofe_score_max_cap)
    ofe_tree_sum_norm = 6 * (ofe_tree_sum_clipped - ofe_score_min_cap) / cap_range - 3
    ofe_score = 1 / (1 + np.exp(-ofe_tree_sum_norm))
    # Work on a copy so the untouched sigmoid matrix can be returned as well.
    ofe_score_diag = ofe_score.copy()
    np.fill_diagonal(ofe_score_diag, 0.5)
    scores = ofe_score_diag.sum(axis=1) / score_divisor
    return scores, ofe_score_diag, ofe_score

In [50]:
# Quick manual check of apply_solver on a random 5x5 matrix. The diagonal is
# set to 0.5 before the call; the returned tuple is shown as the cell output.
# NOTE(review): no random seed is set in the visible cells, so the displayed
# numbers will differ from run to run.
a = np.random.rand(5,5)
np.fill_diagonal(a, 0.5)
apply_solver(a)

(array([0.25350825, 0.24352709, 0.16855005, 0.2261107 , 0.25504928]),
 array([[0.5       , 0.69938651, 0.582792  , 0.30517379, 0.44773022],
        [0.85577158, 0.5       , 0.157737  , 0.40811404, 0.5136483 ],
        [0.70748361, 0.04809095, 0.5       , 0.06596003, 0.36396593],
        [0.24138335, 0.89969618, 0.2447741 , 0.5       , 0.37525341],
        [0.46686904, 0.15403587, 0.59950058, 0.83008734, 0.5       ]]),
 array([[0.5       , 0.69938651, 0.582792  , 0.30517379, 0.44773022],
        [0.85577158, 0.5       , 0.157737  , 0.40811404, 0.5136483 ],
        [0.70748361, 0.04809095, 0.5       , 0.06596003, 0.36396593],
        [0.24138335, 0.89969618, 0.2447741 , 0.5       , 0.37525341],
        [0.46686904, 0.15403587, 0.59950058, 0.83008734, 0.5       ]]))

In [None]:
# for i in range(100):
#     model.train()
#     model.zero_grad()

#     count = 0
#     batch_size = 200
#     grad_batch, y_pred_batch = [], []

#     for X, Y in train_loader.generate_batch_per_query():
#         if np.sum(Y) == 0:
#             # negative session, cannot learn useful signal
#             continue
#         N = 1.0 / ideal_dcg.maxDCG(Y)

#         X_tensor = torch.tensor(X, dtype=precision, device=device)
#         y_pred = model(X_tensor)
#         y_pred_batch.append(y_pred)
#         # compute the rank order of each document
#         rank_df = pd.DataFrame({"Y": Y, "doc": np.arange(Y.shape[0])})
#         rank_df = rank_df.sort_values("Y").reset_index(drop=True)
#         rank_order = rank_df.sort_values("doc").index.values + 1

#         with torch.no_grad():
#             pos_pairs_score_diff = 1.0 + torch.exp(sigma * (y_pred - y_pred.t()))

#             Y_tensor = torch.tensor(Y, dtype=precision, device=device).view(-1, 1)
#             rel_diff = Y_tensor - Y_tensor.t()
#             pos_pairs = (rel_diff > 0).type(precision)
#             neg_pairs = (rel_diff < 0).type(precision)
#             Sij = pos_pairs - neg_pairs
#             if ndcg_gain_in_train == "exp2":
#                 gain_diff = torch.pow(2.0, Y_tensor) - torch.pow(2.0, Y_tensor.t())
#             elif ndcg_gain_in_train == "identity":
#                 gain_diff = Y_tensor - Y_tensor.t()

#             rank_order_tensor = torch.tensor(rank_order, dtype=precision, device=device).view(-1, 1)
#             decay_diff = 1.0 / torch.log2(rank_order_tensor + 1.0) - 1.0 / torch.log2(rank_order_tensor.t() + 1.0)

#             delta_ndcg = torch.abs(N * gain_diff * decay_diff)
#             lambda_update = sigma * (0.5 * (1 - Sij) - 1 / pos_pairs_score_diff) * delta_ndcg
#             lambda_update = torch.sum(lambda_update, 1, keepdim=True)

#             assert lambda_update.shape == y_pred.shape
#             grad_batch.append(lambda_update)

#         count += 1
#         if count % batch_size == 0:
#             for grad, y_pred in zip(grad_batch, y_pred_batch):
#                 y_pred.backward(grad / batch_size)

#             optimizer.step()
#             optimizer.zero_grad()
#             grad_batch, y_pred_batch = [], []  # grad_batch, y_pred_batch used for gradient_acc
            
#             # loss = F.mse_loss(y_pred, Y_tensor).item()
#             # print('queries: ', count, '| loss: ', loss, ' (not a good metric)')
            
#     if i % 5 == 0: # validate model every 5 epoch
#         print(get_time(), "eval for epoch: {}".format(i))
#         eval_cross_entropy_loss(model, device, valid_loader, i)
#         eval_ndcg_at_k(model, device, df_valid, valid_loader, 100000, [10, 30], i)

#     # optimizer.step()
#     print(get_time(), "training dataset at epoch {}, total queries: {}".format(i, count))
#     scheduler.step()

In [None]:
# print(get_time(), "eval for epoch: {}".format(i))
# eval_cross_entropy_loss(model, device, valid_loader, i)
# eval_ndcg_at_k(model, device, df_valid, valid_loader, 100000, [10, 30], i)

In [None]:
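# # save model (torch.save requires a destination path or file object as its second argument)
# torch.save(model, "model.pt")  # TODO: choose the real output path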