# Learning to Rank
______


## Dataset

**About**
The datasets are machine learning data, in which queries and urls are represented by IDs. The datasets consist of feature vectors extracted from query-url pairs along with relevance judgment labels:
- Reference: https://www.microsoft.com/en-us/research/project/mslr/

**Labels**
- The relevance judgments take 5 values from 0 (irrelevant) to 4 (perfectly relevant). 
- The larger value the relevance label has, the more relevant the query-url pair is.

**Features**:
- A query-url pair is represented by a 136-dimensional feature vector
- Each row corresponds to a query-url pair.
- First column is relevance label of the pair, 
- Second column is query id, and the following columns are features. 

In [40]:
import sys
import pandas as pd
import numpy as np
import argparse
import lightgbm as lgb
from sklearn.metrics import ndcg_score

In [2]:
def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default=None, type=str)
    parser.add_argument("--num_leaves", type=int, default=10)
    parser.add_argument("--learning_rate", type=float, default=0.20)
    parser.add_argument("--reg_lambda", type=float, default=2)
    return parser

sys.argv = ['-','--data_dir','data']
args = get_args().parse_args()

In [14]:
def get_data(data_path):
    dfs = {
        "train": pd.read_csv(f"{data_path}/train.txt", delimiter=" "),
        "valid": pd.read_csv(f"{data_path}/vali.txt", delimiter=" "),
        "test": pd.read_csv(f"{data_path}/test.txt", delimiter=" "),
    }

    for df in dfs.values():
        df.columns = np.arange(len(df.columns))
        # delete columns where all data is missing
        df.drop(columns=df.columns[df.isna().all()].tolist(), inplace=True)
        
    return dfs

In [100]:
def train_test_split(dfs):
    
    split = {}
    split["X_train"] = dfs["train"].iloc[:, 1:]
    split["X_valid"] = dfs["valid"].iloc[:, 1:]
    split["X_test"] = dfs["test"].iloc[:, 1:]
    
    # In order to use the Light GBM framework, we need to 
    # create variables group_train and group_vali, which contain
    # number of examples for each query ID. This will allow LGBMRanker 
    # to group examples by query during training.

    # Train
    g = split["X_train"].groupby(by=1)
    size = g.size()
    group_train = size.to_list()

    # Validation
    g = split["X_valid"].groupby(by=1)
    size = g.size()
    group_valid = size.to_list()

    # Relevance Labels
    y_train = dfs["train"].iloc[:, 0]
    y_valid = dfs["valid"].iloc[:, 0]
    y_test = dfs["test"].iloc[:, 0]
    
    return split["X_train"], split["X_valid"], split["X_test"], group_train, group_valid, y_train, y_test, y_valid, 

In [85]:
def preprocess(X):
    """ Preprocess Train/Validation/Test Dataset 
    
    Task:
    1. Remove the irrelevant information at the beginning of each feature value
    2. Drop the query ID column since it is not a feature
    """
    X = X.astype(str).applymap(lambda x: x.split(":")[-1]).astype(float)
    return X

In [105]:
def train_model(args, X_train, X_valid, y_train, y_valid, group_train, group_valid):
    """ Train LightGBM Ranker Model """ 
    
    gbm = lgb.LGBMRanker(
        n_estimators=10000,
        num_leaves=args.num_leaves,
        learning_rate=args.learning_rate,
        reg_lambda=args.reg_lambda,
    )

    gbm.fit(
        X_train,
        y_train,
        group=group_train,
        eval_group=[group_valid],
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=150,
        verbose=0
    )
    
    return gbm

In [110]:
def evaluate_model(model, X_test, y_test):
    
    """
    Evalute model results
    
    Compute Normalized Discounted Cumulative Gain.

    Sum the true scores ranked in the order induced by the predicted scores,
    after applying a logarithmic discount. Then divide by the best possible
    score (Ideal DCG, obtained for a perfect ranking) to obtain a score between
    0 and 1.

    This ranking metric yields a high value if true labels are ranked high by
    `y_score`.
    
    """
    
    # ground truth by sorting it according to our model's predictions.
    true_relevance = y_test.sort_values(ascending=False)

    # prediction
    test_pred = model.predict(X_test)

    # Save Results
    y_test = pd.DataFrame({"relevance_score": y_test, "predicted_ranking": test_pred})
    relevance_score = y_test.sort_values("predicted_ranking", ascending=False)
    
    # Normalized Discounted Cumulative Gain
    score = ndcg_score([true_relevance.to_numpy()], [relevance_score["relevance_score"].to_numpy()])
    print(f"nDCG score: {score:.2f}")

### Main

In [101]:
# Load Data
dfs = get_data(args.data_dir)

# Split Train/Test
X_train, X_valid, X_test, group_train, group_valid, y_train, y_valid, y_test = train_test_split(dfs)

# Preprocess 
X_train = preprocess(X_train)
X_valid = preprocess(X_valid)
X_test = preprocess(X_test)

In [106]:
# train model
model = train_model(args, X_train, X_valid, y_train, y_valid, group_train, group_valid)

In [111]:
# evaluate model
evaluate_model(model,X_test,y_test)

nDCG score: 0.86
