## Online LTR - results evaluation

In [1]:
import pandas as pd
import json
import numpy as np

In [2]:
# def aggregate_results(model_name, save=True):
#     aggregated_metrics = {}
#     for i in range(10):
#         with open(f"./outputs/{model_name}_{i}.json", "r") as reader:
#             result = json.load(reader)
#             for metric, (v, std) in result["test_metrics"].items():
#                 aggregated_metrics.setdefault(metric, []).append(v)

#     aggregated_stats = {
#         metric: {"mean": np.mean(vals), "std": np.std(vals)}
#         for metric, vals in aggregated_metrics.items()
#     }

#     if save:
#         with open(f"outputs/{model_name}_avg.json", "w") as writer:
#             json.dump(aggregated_stats, writer, indent=4)

#     return aggregated_stats


# # Compute aggregated metrics
# biased = aggregate_results("biased_listNet", save=False)
# unbiased = aggregate_results("unbiased_listNet", save=False)

# # Display selected metrics
# print_metrics = ["ndcg", "ndcg@20", "precision@05", "recall@20"]
# print_biased = {metric: biased[metric] for metric in print_metrics if metric in biased}
# print_unbiased = {
#     metric: unbiased[metric] for metric in print_metrics if metric in unbiased
# }

# data = []
# for model_name, model_results in zip(
#     ["biased", "unbiased"], [print_biased, print_unbiased]
# ):
#     for metric, stats in model_results.items():
#         data.append([model_name, metric, stats["mean"], stats["std"]])

# df = pd.DataFrame(data, columns=["Model", "Metric", "Mean", "Std"])
# pd.set_option("display.precision", 3)
# df

## Logging policy impact and modifications

In [3]:
from argparse import Namespace
from ltr.utils import create_results
from ltr.train import train_unbiased_listNet
from ltr.utils import seed
from ltr.model import CLTRModel
from ltr.logging_policy import LoggingPolicy
from ltr.dataset import load_data

# Load the dataset once up front; the (currently commented-out) training cells
# further down pass `data` as the first argument to `create_results`.
data = load_data()

### BM25 based logging policy

In [4]:
# # NOTE: this idea has been dropped for now because of different indexing
# # IMPORTANT: DO NOT CHANGE ANY OF THE FILE PATHS.

# COLLECTION_PATH = "./data/collection.tsv"
# QUERIES_PATH = "./data/queries.tsv"
# TRAIN_PATH = "./data/train_pairs_graded.tsv"
# DEV_PATH = "./data/dev_pairs_graded.tsv"
# TEST_PATH = "./data/test_pairs_graded.tsv"
# STOP_WORDS_PATH = "./data/common_words"


# from ltr.dataset import Queries, Documents, Preprocess

# prp = Preprocess(STOP_WORDS_PATH)

# documents = Documents(prp)
# documents.process_documents(COLLECTION_PATH)
# queries = Queries(prp)
# queries.preprocess_queries(QUERIES_PATH)


# from ltr.dataset import FeatureExtraction
# from ltr.logging_policy import LoggingPolicy

# feature_extractor = FeatureExtraction({}, documents, queries)
# logging_policy = LoggingPolicy(policy_path="data/", feature_extractor=feature_extractor)


# # Gather the clicks on the SERP for query 20
# for i in range(10):
#     clicked_docs = np.where(logging_policy.gather_clicks(20))[0]
#     clicked_positions = logging_policy.query_positions(20)[clicked_docs]
#     print(
#         f"clicks for session {i+1} on documents",
#         clicked_docs,
#         "on positions",
#         clicked_positions,
#     )

### Noise and topk evaluation

In [5]:
# # NOTE: comment this cell out again once the training runs have finished
# # NOISES = [0.05, 0.1, 0.2, 0.3]
# # noise_name_list = ["005", "010", "020", "030"]
# # TOPKS = [5, 10, 20, 30]
# NOISES = [0.4]
# noise_name_list = ["040"]
# TOPKS = [5]
# data = load_data()

# logging_policy = LoggingPolicy(policy_path="data/")

# seed(42)
# for topk in TOPKS:
#     for n_name, noise in enumerate(NOISES):
#         print("Training Model", topk, noise)
#         params = Namespace(
#             epochs=20,
#             lr=1e-4,
#             batch_size=1,
#             propensity=logging_policy.propensity,
#             metrics={"ndcg@10", "precision@10", "recall@10"},
#             log_policy_noise=noise,
#             log_policy_topk=topk,
#         )
#         for i in range(5):
#             print("Training Model", i)
#             unbiased_net = CLTRModel(15, width=20)
#             create_results(
#                 data,
#                 unbiased_net,
#                 train_unbiased_listNet,
#                 unbiased_net,
#                 f"./outputs/logging_policy/unbiased_listNet_top{topk}_noise{noise_name_list[n_name]}_{i}.json",
#                 params,
#             )

In [6]:
import os
import json
import numpy as np
import pandas as pd
import re


def aggregate_results_from_directory(
    directory, metrics=("ndcg", "precision@05", "recall@20")
):
    """Summarise test metrics across runs for each (top-k, noise) setting.

    Every file in ``directory`` whose name is exactly
    ``unbiased_listNet_top<K>_noise<N>_<run>.json`` is loaded, and the values
    stored under its ``"test_metrics"`` key are grouped by ``(K, N)``. For each
    group the mean and the population standard deviation (``np.std``) over the
    runs are reported.

    NOTE: a later cell of this notebook defines another function with this same
    name (for the clipping experiments); whichever definition was executed last
    is the one in effect.

    Args:
        directory: Path of the folder containing the per-run JSON files.
        metrics: Names of the metrics to keep; all other entries of
            ``"test_metrics"`` are ignored. A single string is treated as one
            metric name.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Noise`` and ``TopK`` (both
        integers parsed from the file name, so ``noise005`` becomes ``5``)
        followed by ``<metric>_mean`` / ``<metric>_std`` columns, sorted by
        ``Noise`` then ``TopK``. When no file contributes any metric, an empty
        frame with just the ``Noise`` and ``TopK`` columns is returned.
    """
    if isinstance(metrics, str):
        # Avoid substring matching against a bare string.
        metrics = (metrics,)

    # The dot is escaped and the whole name must match, so stray files such as
    # "..._0.json.bak" are not mistaken for result files.
    pattern = re.compile(r"unbiased_listNet_top(\d+)_noise(\d+)_\d+\.json")
    aggregated_metrics = {}

    for filename in os.listdir(directory):
        match = pattern.fullmatch(filename)
        if match is None:
            continue

        topk, noise = match.groups()
        key = (int(topk), int(noise))

        with open(os.path.join(directory, filename), "r") as reader:
            result = json.load(reader)

        # Each entry is a (value, std) pair; only the value is aggregated.
        for metric, (value, _run_std) in result["test_metrics"].items():
            if metric not in metrics:
                continue
            aggregated_metrics.setdefault(key, {}).setdefault(metric, []).append(
                value
            )

    if not aggregated_metrics:
        # Sorting an empty, column-less frame would raise KeyError.
        return pd.DataFrame(columns=["Noise", "TopK"])

    aggregated_stats = []
    for (topk, noise), per_metric in aggregated_metrics.items():
        row = {"Noise": noise, "TopK": topk}
        for metric, vals in per_metric.items():
            row[f"{metric}_mean"] = np.mean(vals)
            row[f"{metric}_std"] = np.std(vals)
        aggregated_stats.append(row)

    df = pd.DataFrame(aggregated_stats)
    return df.sort_values(by=["Noise", "TopK"]).reset_index(drop=True)

In [7]:
# Show three decimals in the rendered table.
pd.set_option("display.precision", 3)

# Summarise the runs stored by the noise / top-k sweep.
directory = "./outputs/logging_policy"
df = aggregate_results_from_directory(directory)
df

Unnamed: 0,Noise,TopK,ndcg_mean,ndcg_std,precision@05_mean,precision@05_std,recall@20_mean,recall@20_std
0,5,5,0.772,0.023,0.256,0.006,0.893,0.009
1,5,10,0.779,0.009,0.256,0.004,0.891,0.014
2,5,20,0.794,0.006,0.263,0.005,0.9,0.006
3,5,30,0.795,0.009,0.266,0.005,0.905,0.002
4,10,5,0.76,0.024,0.251,0.008,0.887,0.011
5,10,10,0.778,0.01,0.257,0.003,0.885,0.011
6,10,20,0.781,0.021,0.262,0.006,0.894,0.006
7,10,30,0.769,0.023,0.255,0.009,0.891,0.006
8,20,5,0.715,0.032,0.234,0.011,0.855,0.017
9,20,10,0.744,0.019,0.242,0.006,0.857,0.023


### Adding Gaussian noise

In [8]:
# # # NOTE: comment this cell out again once the training runs have finished
# logging_policy = LoggingPolicy(policy_path="data/")

# # NOISE_STDS = [0.05, 0.1, 0.2, 0.3]
# # noise_name_list = ["005", "010", "020", "030"]
# # TOPKS = [10, 20, 30]
# NOISE_STDS = [0.4]
# noise_name_list = ["040"]
# TOPKS = [5]
# data = load_data()

# seed(42)
# for topk in TOPKS:
#     for n_name, std in enumerate(NOISE_STDS):
#         print("Training Model", topk, std)
#         params = Namespace(
#             epochs=20,
#             lr=1e-4,
#             batch_size=1,
#             propensity=logging_policy.propensity,
#             metrics={"ndcg@10", "precision@10", "recall@10"},
#             gaussian_noise=True,
#             gaussian_noise_std=std,
#         )
#         for i in range(5):
#             print("Training Model", i)
#             unbiased_net = CLTRModel(15, width=20)
#             create_results(
#                 data,
#                 unbiased_net,
#                 train_unbiased_listNet,
#                 unbiased_net,
#                 f"./outputs/logging_policy/gaussian_noise/unbiased_listNet_top{topk}_noise{noise_name_list[n_name]}_{i}.json",
#                 params,
#             )

In [9]:
# Show three decimals in the rendered table.
pd.set_option("display.precision", 3)

# Summarise the runs stored by the Gaussian-noise sweep.
directory = "./outputs/logging_policy/gaussian_noise"
df = aggregate_results_from_directory(directory)
df

Unnamed: 0,Noise,TopK,ndcg_mean,ndcg_std,precision@05_mean,precision@05_std,recall@20_mean,recall@20_std
0,5,10,0.796,0.015,0.264,0.006,0.901,0.005
1,5,20,0.798,0.003,0.264,0.001,0.902,0.008
2,5,30,0.794,0.012,0.267,0.006,0.902,0.004
3,10,10,0.789,0.01,0.263,0.002,0.901,0.003
4,10,20,0.793,0.006,0.263,0.003,0.897,0.009
5,10,30,0.801,0.007,0.269,0.004,0.9,0.004
6,20,10,0.775,0.013,0.257,0.007,0.89,0.002
7,20,20,0.785,0.005,0.26,0.002,0.893,0.008
8,20,30,0.794,0.009,0.264,0.004,0.9,0.007
9,30,10,0.774,0.026,0.258,0.011,0.897,0.008


## Propensity clipping impact and modifications

In [10]:
# # # NOTE: comment this cell out again once the training runs have finished
# logging_policy = LoggingPolicy(policy_path="data/")

# CLIPS = ["no_clip", "sigmoid_clip"]
# # CLIPS = ["log_clip", "percentile_clip", "default_clip"]

# seed(42)
# for clip in CLIPS:
#     print("Training Model", clip)
#     params = Namespace(
#         epochs=10, #20,
#         lr=1e-4,
#         batch_size=1,
#         propensity=logging_policy.propensity,
#         metrics={"ndcg@10", "precision@10", "recall@10"},
#         clip=clip,
#     )
#     for i in range(5):
#         print("Training Model", i)
#         unbiased_net = CLTRModel(15, width=20)
#         create_results(
#             data,
#             unbiased_net,
#             train_unbiased_listNet,
#             unbiased_net,
#             f"./outputs/logging_policy/clipping_10epochs/unbiased_listNet_{clip}_{i}.json",
#             params,
#             )

In [11]:
import os
import json
import numpy as np
import pandas as pd
import re

def aggregate_results_from_directory(
    directory, metrics=("ndcg", "precision@05", "recall@20")
):
    """Summarise test metrics across runs for each propensity-clipping mode.

    Every file in ``directory`` whose name is exactly
    ``unbiased_listNet_<mode>_clip_<run>.json`` is loaded, and the values
    stored under its ``"test_metrics"`` key are grouped by ``<mode>``. For each
    mode the mean and the population standard deviation (``np.std``) over the
    runs are reported.

    NOTE: this redefines the function of the same name from the earlier
    noise / top-k cell. After this cell has run, re-executing the earlier
    summary cells will use this clipping version instead.

    Args:
        directory: Path of the folder containing the per-run JSON files.
        metrics: Names of the metrics to keep; all other entries of
            ``"test_metrics"`` are ignored. A single string is treated as one
            metric name.

    Returns:
        A ``pandas.DataFrame`` with a ``Clipping`` column (the ``<mode>`` part
        of the file name, e.g. ``"log"`` for ``log_clip``) followed by
        ``<metric>_mean`` / ``<metric>_std`` columns, sorted by ``Clipping``.
        When no file contributes any metric, an empty frame with just the
        ``Clipping`` column is returned.
    """
    if isinstance(metrics, str):
        # Avoid substring matching against a bare string.
        metrics = (metrics,)

    # The dot is escaped and the whole name must match, so stray files such as
    # "..._0.json.bak" are not mistaken for result files.
    pattern = re.compile(r"unbiased_listNet_(\w+)_clip_(\d+)\.json")
    aggregated_metrics = {}

    for filename in os.listdir(directory):
        match = pattern.fullmatch(filename)
        if match is None:
            continue

        clip_mode = match.group(1)

        with open(os.path.join(directory, filename), "r") as reader:
            result = json.load(reader)

        # Each entry is a (value, std) pair; only the value is aggregated.
        for metric, (value, _run_std) in result["test_metrics"].items():
            if metric not in metrics:
                continue
            aggregated_metrics.setdefault(clip_mode, {}).setdefault(
                metric, []
            ).append(value)

    if not aggregated_metrics:
        # Sorting an empty, column-less frame would raise KeyError.
        return pd.DataFrame(columns=["Clipping"])

    aggregated_stats = []
    for clip, per_metric in aggregated_metrics.items():
        row = {"Clipping": clip}
        for metric, vals in per_metric.items():
            row[f"{metric}_mean"] = np.mean(vals)
            row[f"{metric}_std"] = np.std(vals)
        aggregated_stats.append(row)

    df = pd.DataFrame(aggregated_stats)
    return df.sort_values(by=["Clipping"]).reset_index(drop=True)


In [12]:
# Show three decimals in the rendered table.
pd.set_option("display.precision", 3)

# Summarise the clipping runs stored under the 10-epoch output folder.
directory = "./outputs/logging_policy/clipping_10epochs"
df = aggregate_results_from_directory(directory)
df

Unnamed: 0,Clipping,ndcg_mean,ndcg_std,precision@05_mean,precision@05_std,recall@20_mean,recall@20_std
0,default,0.768,0.016,0.252,0.007,0.888,0.003
1,log,0.778,0.018,0.257,0.009,0.884,0.015
2,no,0.775,0.02,0.256,0.011,0.879,0.018
3,percentile,0.747,0.034,0.244,0.015,0.866,0.028
4,sigmoid,0.75,0.033,0.244,0.015,0.87,0.026


In [13]:
# Emit the current summary table as LaTeX source.
# Note: the `display.precision` option set earlier does not affect this text
# output — the table printed below carries six decimal places.
print(df.to_latex())

\begin{tabular}{llrrrrrr}
\toprule
 & Clipping & ndcg_mean & ndcg_std & precision@05_mean & precision@05_std & recall@20_mean & recall@20_std \\
\midrule
0 & default & 0.768413 & 0.016043 & 0.252383 & 0.006864 & 0.887868 & 0.002872 \\
1 & log & 0.777713 & 0.018286 & 0.257461 & 0.009491 & 0.883707 & 0.015405 \\
2 & no & 0.774678 & 0.019834 & 0.255679 & 0.010605 & 0.879346 & 0.018129 \\
3 & percentile & 0.747253 & 0.034418 & 0.243653 & 0.015267 & 0.865939 & 0.028186 \\
4 & sigmoid & 0.749913 & 0.032993 & 0.244187 & 0.014501 & 0.870093 & 0.025526 \\
\bottomrule
\end{tabular}



In [14]:
# Show three decimals in the rendered table.
pd.set_option("display.precision", 3)

# Summarise the clipping runs stored under the main clipping output folder.
directory = "./outputs/logging_policy/clipping"
df = aggregate_results_from_directory(directory)
df

Unnamed: 0,Clipping,ndcg_mean,ndcg_std,precision@05_mean,precision@05_std,recall@20_mean,recall@20_std
0,default,0.776,0.014,0.257,0.006,0.883,0.011
1,log,0.796,0.009,0.262,0.003,0.898,0.004
2,no,0.794,0.009,0.262,0.004,0.897,0.006
3,percentile,0.779,0.019,0.257,0.006,0.9,0.005
4,sigmoid,0.777,0.015,0.258,0.006,0.883,0.011
