In [84]:
# https://github.com/houchenyu/L2R/blob/master/RankNet.py
import pandas as pd
import torch
from torch.nn import functional as F
import torch.nn as nn
import numpy as np
import time
df_small = pd.read_csv('train_cleaned_small.csv')
df_val = pd.read_csv('val_cleaned_small.csv')

In [85]:
df_small = pd.read_csv('df_train_new.csv')

In [86]:
def dcg(scores):
    scores = np.array(scores,dtype = float)
    num = 2**scores-1
    for i in range(len(num)):
        num[i] /= np.log2(i+2)
    return np.sum(num)


def ndcg_k(scores, k):
    top_k = scores[:k]
    ideal_top_k = sorted(scores)[::-1][:k]
    ndcg = dcg(top_k)
    indcg = dcg(ideal_top_k)
    return ndcg/indcg

In [87]:
class RankNet(nn.Module):
    
    def __init__(self,input_dim, hidden_dim1, hidden_dim2):
        super(RankNet, self).__init__()
  
        self.model = nn.Sequential(
            torch.nn.Linear(input_dim, hidden_dim1),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim1,hidden_dim2),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim2, 1),
        )
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, input_1, input_2):
        s1 = self.model(input_1)
        s2 = self.model(input_2)
        out = self.sigmoid(s1-s2)
        return out

    def predict(self, x):
        s = self.model(x)
        n = s.data.numpy()[0]
        return n

In [88]:
def query_document(df):      
    query_doc = {}
    for idx, row in df.iterrows():
        query_id = row['query_id']
        if query_id in query_doc:
            query_doc[query_id].append(idx)
        else:
            query_doc[query_id] = []
            query_doc[query_id].append(idx)
    return query_doc

In [89]:
def find_winning_pairs(labels):
    n = len(labels)
    winning_pairs = []
    for i in range(n):
        for j in range(n):
            if labels[i] > labels[j]:
                winning_pairs.append((i,j))
    return winning_pairs

In [90]:
def split_documents(winning_pairs_dict,df):
    winning_idx = []
    losing_idx = []
    for query in winning_pairs_dict: ## loop over queries
        start_idx = df[df['query_id']==query].index[0]
        for pair in winning_pairs_dict[query]: #iterate over winning pairs
            win_idx, lose_idx = pair # (E,g, (3,0) )
            win_idx += start_idx
            lose_idx += start_idx
            winning_idx.append(win_idx)
            losing_idx.append(lose_idx)
    return winning_idx, losing_idx

In [75]:
t0 = time.time()

model = RankNet(input_dim = 136, hidden_dim1 = 512, hidden_dim2 = 256)
query_doc = query_document(df_small)
query_idx = query_doc.keys() ## all query_ids

true_labels = []
query_ids = []
for qid in query_idx:
    query_ids.append(qid)
    true_labels.append(df_small.iloc[query_doc[qid]]['relevance_label'].tolist())
                      
winning_pairs_dict = {}
for idx, labels in enumerate(true_labels):
    winning_pairs_dict[query_ids[idx]] = find_winning_pairs(labels)
      
winning_idx, losing_idx = split_documents(winning_pairs_dict ,df_small)  #index of winning and losing documents
winning_doc = np.array(df_small.iloc[winning_idx])
losing_doc = np.array(df_small.iloc[losing_idx])

X1 = torch.tensor(winning_doc[:, 2:])
X2 = torch.tensor(losing_doc[:, 2:])
y = torch.tensor(np.ones((X1.shape[0], 1)))

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fun = torch.nn.BCELoss()

print('Traning………………\n')

ndcg_regret = []
for idx, query in enumerate(query_ids):
    for i in range(1):
        X1_batch = torch.Tensor(winning_doc[winning_doc[:,1]==query][:,2:])
        X2_batch = torch.Tensor(losing_doc[losing_doc[:,1]==query][:,2:])
        y_batch = torch.Tensor(np.ones((X1_batch.shape[0], 1)))
        
        ## Make prediction first to calculate Regret
        true_labels = np.array(df_small.iloc[query_doc[query]]['relevance_label'])
        X_test = torch.Tensor(np.array(df_small.iloc[query_doc[query],2:]))
        y_pred = [model.predict(x.data) for x in X_test]
        rank_pred = np.argsort(y_pred)[::-1].astype(int)
        score_pred = true_labels[rank_pred]
        ndcg = ndcg_k(score_pred, k = 10)
        ndcg_regret.append(ndcg)
        
        # Gradient update step
        optimizer.zero_grad()
        y_pred = model(X1_batch, X2_batch)
        loss = loss_fun(y_pred, y_batch) ### What we can do is also check in real-time if it was wrong or not (online lR)
        loss.backward()
        optimizer.step()     
t1 = time.time()
print('Time:', t1-t0)
torch.save(model.state_dict(), 'parameters.pkl')

Traning………………



  return ndcg/indcg


Time: 31.82490038871765


In [82]:
np.nanmean(ndcg_regret)

0.3842177715533297

In [83]:
k = 10

model = RankNet(136, 512, 256)
model.load_state_dict(torch.load('parameters.pkl'))


query_doc = query_document(df_small)
query_idx = query_doc.keys()
ndcg_list = []
for q in query_idx:
    true_labels = np.array(df_small.iloc[query_doc[q]]['relevance_label'])
    X_test = torch.Tensor(np.array(df_small.iloc[query_doc[q],2:]))
    y_pred = [model.predict(x.data) for x in X_test]
    rank_pred = np.argsort(y_pred)[::-1].astype(int)
    score_pred = true_labels[rank_pred]
    ndcg = ndcg_k(score_pred, k = 10)
    ndcg_list.append(ndcg)
print("NDCG k = {} : {}".format(10, np.nanmean(ndcg_list)))

  return ndcg/indcg


NDCG k = 10 : 0.40775672848574485
