In [1]:
%pip install -U elasticsearch eland "eland[scikit-learn]" xgboost tqdm -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/opt/python@3.11/bin/python3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [8]:
# Name of the Elasticsearch index used throughout this notebook
# (created, bulk-indexed into, and searched by the cells that follow).
INDEX_NAME = "movies"

In [10]:
from elasticsearch import Elasticsearch, helpers
from dotenv import load_dotenv
import os

# Load variables from a .env file into the process environment.
load_dotenv()

# Connection settings; either value is None when the variable is not set.
ELASTICSEARCH_URL = os.environ.get("ELASTICSEARCH_URL")
ELASTICSEARCH_API_KEY = os.environ.get("ELASTICSEARCH_API_KEY")

# Single client instance shared by every cell that talks to Elasticsearch.
es_client = Elasticsearch(ELASTICSEARCH_URL, api_key=ELASTICSEARCH_API_KEY)

## Indexing the movies data

In [11]:
# Create the index with an explicit mapping unless it already exists, so this
# cell can be re-run without failing on an existing index.
try:
    # A single full-text field; it is the only field the documents carry.
    mappings = {"properties": {"text": {"type": "text"}}}

    index_exists = es_client.indices.exists(index=INDEX_NAME)

    if not index_exists:
        print(f"Index {INDEX_NAME} does not exist, creating it...")
        # Pass the mapping as a keyword argument (same call style as the
        # `query=` / `rescore=` searches later on) instead of the deprecated
        # catch-all `body=` parameter.
        es_client.indices.create(index=INDEX_NAME, mappings=mappings)
    else:
        print(f"Index {INDEX_NAME} already exists, skipping creation...")

except Exception as e:
    # Best effort: report the problem and let the notebook continue.
    print(e)

Index movies does not exist, creating it...


In [12]:
# Example corpus as (document id, text) pairs; expanded below into the
# {"_id": ..., "text": ...} records the indexing helper expects.
_DOCUMENTS = [
    ("doc1", "DiCaprio performance in The Revenant was breathtaking."),
    ("doc2", "Inception shows Leonardo DiCaprio in one of his most iconic roles."),
    ("doc3", "Brad Pitt delivers a solid performance in this crime thriller."),
    ("doc4", "An action-packed adventure with stunning visual effects."),
    ("doc5", "A heartbreaking story of love and loss that made me cry for hours."),
    ("doc6", "One of the saddest movies ever made -- bring tissues!"),
    ("doc7", "A lighthearted comedy that will make you laugh."),
    ("doc8", "A science-fiction epic full of action and excitement."),
]

dataset = [{"_id": doc_id, "text": doc_text} for doc_id, doc_text in _DOCUMENTS]


def build_data(dataset, index_name, pipeline_id=None):
    """Yield one bulk-index action per document in ``dataset``.

    Args:
        dataset: Iterable of dicts, each with an ``"_id"`` and a ``"text"`` key.
        index_name: Name of the index every action targets.
        pipeline_id: Optional ingest pipeline id. When given, it is attached
            to each action under the ``"pipeline"`` key; when ``None``
            (the default) the actions carry no pipeline entry.

    Yields:
        dict: An action with ``_index``, ``_id`` and ``_source`` (holding only
        the document's ``text``), plus ``pipeline`` when requested.
    """
    for doc in dataset:
        action = {
            "_index": index_name,
            "_id": doc["_id"],
            "_source": {"text": doc["text"]},
        }

        # Previously this argument was accepted but ignored.
        if pipeline_id is not None:
            action["pipeline"] = pipeline_id

        yield action


# Bulk-index the example documents and report how many succeeded / failed.
try:
    # raise_on_error=False makes helpers.bulk return the list of per-document
    # errors instead of raising, so the `failed` report below is reachable.
    success, failed = helpers.bulk(
        es_client,
        build_data(dataset, INDEX_NAME),
        raise_on_error=False,
    )
    # Refresh so the documents are searchable immediately; the feature
    # extraction further down queries this index right after indexing.
    es_client.indices.refresh(index=INDEX_NAME)
    print(f"Successfully indexed {success} documents")
    if failed:
        print(f"Failed to index {len(failed)} documents")
except Exception as e:
    # Connection / transport problems still end up here.
    print(e)

Successfully indexed 8 documents


In [None]:
import pandas as pd

# Judgment list for LTR training, grouped by query:
#   query_id -> (query text, [(doc_id, document text, grade), ...])
# `grade` is the label the ranker is later trained on.
_JUDGED_QUERIES = {
    "query1": (
        "DiCaprio performance",
        [
            ("doc1", "DiCaprio's performance in The Revenant was breathtaking.", 1),
            ("doc2", "Inception shows Leonardo DiCaprio in one of his most iconic roles.", 1),
            ("doc3", "Brad Pitt delivers a solid performance in this crime thriller.", 0),
            ("doc4", "An action-packed adventure with stunning visual effects.", 0),
        ],
    ),
    "query2": (
        "sad movies that make you cry",
        [
            ("doc5", "A heartbreaking story of love and loss that made me cry for hours.", 1),
            ("doc6", "One of the saddest movies ever made — bring tissues!", 1),
            ("doc7", "A lighthearted comedy that will make you laugh", 0),
            ("doc8", "A science-fiction epic full of action and excitement.", 0),
        ],
    ),
}

# Flatten into one record per (query, document) pair.
judgments_data = [
    {
        "query_id": judged_query_id,
        "query": judged_query_text,
        "doc_id": judged_doc_id,
        "text": judged_doc_text,
        "grade": judged_grade,
    }
    for judged_query_id, (judged_query_text, judged_docs) in _JUDGED_QUERIES.items()
    for judged_doc_id, judged_doc_text, judged_grade in judged_docs
]

In [35]:
from eland.ml.ltr import LTRModelConfig, QueryFeatureExtractor

# Feature: the score of a `match` query on the `text` field. The `{{query}}`
# placeholder is a template parameter; the cells that extract features and
# rescore pass its value under the "query" key.
text_bm25_extractor = QueryFeatureExtractor(
    feature_name="text_bm25_score",
    query={"match": {"text": "{{query}}"}},
)

feature_extractors = [
    text_bm25_extractor,
    # Examples of further extractors (disabled: this index has no `content`
    # or `popularity` field):
    #
    # QueryFeatureExtractor(
    #     feature_name="content_bm25_score",
    #     query={"match": {"content": "{{query}}"}},
    # ),
    #
    # A script_score query can expose a field value directly as a feature:
    # QueryFeatureExtractor(
    #     feature_name="popularity",
    #     query={
    #         "script_score": {
    #             "query": {"exists": {"field": "popularity"}},
    #             "script": {"source": "return doc['popularity'].value;"},
    #         }
    #     },
    # ),
]

# Bundle the extractors into the config used for feature logging and for
# uploading the trained model.
ltr_config = LTRModelConfig(feature_extractors)

In [None]:
from eland.ml.ltr import FeatureLogger

# Create the feature logger in this cell: it is used in the loop below, and
# relying on a later cell to define it breaks a top-to-bottom run.
feature_logger = FeatureLogger(es_client, INDEX_NAME, ltr_config)

judgments_df = pd.DataFrame(judgments_data)

# Extract features for all query-document pairs, one query at a time
for query_id in judgments_df["query_id"].unique():
    query_mask = judgments_df["query_id"] == query_id

    # Every row of a query carries the same query text; take the first
    query_text = judgments_df.loc[query_mask, "query"].iloc[0]

    # Judged documents for this query
    doc_ids = judgments_df.loc[query_mask, "doc_id"].tolist()

    # Extract features from Elasticsearch
    features = feature_logger.extract_features(
        query_params={"query": query_text}, doc_ids=doc_ids
    )

    # Write each feature into a column named after it, so the columns line up
    # with `ltr_config.feature_names` (used to select the training matrix).
    # NOTE(review): assumes the value list is ordered like
    # `ltr_config.feature_names` - with the single extractor here that holds.
    for doc_id, feature_values in features.items():
        row_mask = query_mask & (judgments_df["doc_id"] == doc_id)
        for feature_name, feature_value in zip(
            ltr_config.feature_names, feature_values
        ):
            judgments_df.loc[row_mask, feature_name] = feature_value


# Rows without an extracted score show up as NaN in the feature column.
judgments_df

Unnamed: 0,query_id,query,doc_id,text,grade,text_bm25_score
0,query1,DiCaprio performance,doc1,DiCaprio's performance in The Revenant was bre...,1,2.858068
1,query1,DiCaprio performance,doc2,Inception shows Leonardo DiCaprio in one of hi...,1,1.196118
2,query1,DiCaprio performance,doc3,Brad Pitt delivers a solid performance in this...,0,1.246927
3,query1,DiCaprio performance,doc4,An action-packed adventure with stunning visua...,0,
4,query2,sad movies that make you cry,doc5,A heartbreaking story of love and loss that ma...,1,2.653032
5,query2,sad movies that make you cry,doc6,One of the saddest movies ever made — bring ti...,1,1.821567
6,query2,sad movies that make you cry,doc7,A lighthearted comedy that will make you laugh,0,5.17495
7,query2,sad movies that make you cry,doc8,A science-fiction epic full of action and exci...,0,


In [37]:
from eland.ml.ltr import FeatureLogger

# Logger bound to the client, the index, and the LTR feature configuration.
feature_logger = FeatureLogger(es_client, INDEX_NAME, ltr_config)

# Spot check: feature values for two documents under one example query.
sample_query_params = {"query": "DiCaprio performance"}
sample_doc_ids = ["doc1", "doc2"]
feature_logger.extract_features(query_params=sample_query_params, doc_ids=sample_doc_ids)

{'doc1': [2.8580675], 'doc2': [1.1961181]}

In [None]:
from xgboost import XGBRanker
from sklearn.model_selection import GroupShuffleSplit

# Fixed seed so the train/eval split (and therefore the trained model) is the
# same on every run of the notebook.
SPLIT_RANDOM_STATE = 42

# Create the ranker model:
ranker = XGBRanker(
    objective="rank:ndcg",
    eval_metric=["ndcg@10"],
    early_stopping_rounds=20,
)

# Shaping training and eval data in the expected format.
X = judgments_df[ltr_config.feature_names]
y = judgments_df["grade"]
groups = judgments_df["query_id"]

# Split the dataset in two parts respectively used for training and evaluation
# of the model. Splitting by group keeps all rows of a query on the same side.
group_preserving_splitter = GroupShuffleSplit(
    n_splits=1, train_size=0.7, random_state=SPLIT_RANDOM_STATE
).split(X, y, groups)
train_idx, eval_idx = next(group_preserving_splitter)

# The splitter yields positional indices, so select rows with .iloc; .loc only
# happens to work while the frame has a default 0..n-1 index.
train_features, eval_features = X.iloc[train_idx], X.iloc[eval_idx]
train_target, eval_target = y.iloc[train_idx], y.iloc[eval_idx]
train_query_groups, eval_query_groups = groups.iloc[train_idx], groups.iloc[eval_idx]

# Training the model. `group` / `eval_group` are the number of rows per query,
# ordered by query id; this matches the row order because the judgments are
# listed query by query in ascending query id.
ranker.fit(
    X=train_features,
    y=train_target,
    group=train_query_groups.value_counts().sort_index().values,
    eval_set=[(eval_features, eval_target)],
    eval_group=[eval_query_groups.value_counts().sort_index().values],
    verbose=True,
)

['text_bm25_score']
[0]	validation_0-ndcg@10:1.00000
[1]	validation_0-ndcg@10:1.00000
[2]	validation_0-ndcg@10:1.00000
[3]	validation_0-ndcg@10:1.00000
[4]	validation_0-ndcg@10:1.00000
[5]	validation_0-ndcg@10:1.00000
[6]	validation_0-ndcg@10:1.00000
[7]	validation_0-ndcg@10:1.00000
[8]	validation_0-ndcg@10:1.00000
[9]	validation_0-ndcg@10:1.00000
[10]	validation_0-ndcg@10:1.00000
[11]	validation_0-ndcg@10:1.00000
[12]	validation_0-ndcg@10:1.00000
[13]	validation_0-ndcg@10:1.00000
[14]	validation_0-ndcg@10:1.00000
[15]	validation_0-ndcg@10:1.00000
[16]	validation_0-ndcg@10:1.00000
[17]	validation_0-ndcg@10:1.00000
[18]	validation_0-ndcg@10:1.00000
[19]	validation_0-ndcg@10:1.00000
[20]	validation_0-ndcg@10:1.00000


In [40]:
from eland.ml import MLModel

# Id under which the trained ranker is stored; the rescorer refers to it.
LEARNING_TO_RANK_MODEL_ID = "ltr-model-xgboost"

# Upload the trained ranker together with its feature configuration.
# es_if_exists="replace": presumably overwrites a model already stored under
# this id so the cell can be re-run - confirm against the eland docs.
ltr_import_options = {
    "es_client": es_client,
    "model": ranker,
    "model_id": LEARNING_TO_RANK_MODEL_ID,
    "ltr_model_config": ltr_config,
    "es_if_exists": "replace",
}
MLModel.import_ltr_model(**ltr_import_options)

<eland.ml.ml_model.MLModel at 0x11a67c790>

## Using the rescorer

In [42]:
query = "DiCaprio performance"

# Baseline for comparison: the same search without the rescorer.
search_fields = ["text"]
bm25_query = {
    "multi_match": {
        "query": query,
        "fields": search_fields,
    }
}

bm25_search_response = es_client.search(index=INDEX_NAME, query=bm25_query)

# Show (text, score, id) for every hit, in ranked order.
bm25_hits = bm25_search_response["hits"]["hits"]
[(hit["_source"]["text"], hit["_score"], hit["_id"]) for hit in bm25_hits]

[('DiCaprio performance in The Revenant was breathtaking.', 2.8580675, 'doc1'),
 ('Brad Pitt delivers a solid performance in this crime thriller.',
  1.2469268,
  'doc3'),
 ('Inception shows Leonardo DiCaprio in one of his most iconic roles.',
  1.1961181,
  'doc2')]

In [44]:
# Rescore definition: apply the uploaded LTR model to the first-pass hits,
# passing the user query as the model's `query` template parameter.
ltr_model_reference = {
    "model_id": LEARNING_TO_RANK_MODEL_ID,
    "params": {"query": query},
}
ltr_rescorer = {
    "learning_to_rank": ltr_model_reference,
    "window_size": 100,
}

# Same BM25 query as before, now with the rescorer attached.
rescored_search_response = es_client.search(
    index=INDEX_NAME,
    query=bm25_query,
    rescore=ltr_rescorer,
)

# Show (text, score, id) for every hit, in rescored order.
rescored_hits = rescored_search_response["hits"]["hits"]
[(hit["_source"]["text"], hit["_score"], hit["_id"]) for hit in rescored_hits]

[('DiCaprio performance in The Revenant was breathtaking.', 0.5, 'doc1'),
 ('Inception shows Leonardo DiCaprio in one of his most iconic roles.',
  0.5,
  'doc2'),
 ('Brad Pitt delivers a solid performance in this crime thriller.',
  0.5,
  'doc3')]