In [1]:
def average_precision(relevant_positions):
    """
    Calculate Average Precision (AP) for a single query.
    
    :param relevant_positions: A list of positions of relevant items for a single query.
    :return: Average Precision for the query.
    """
    if not relevant_positions:
        return 0.0
    precision_sum = 0.0
    num_hits = 0.0
    for rank, is_relevant in enumerate(relevant_positions, start=1):
        if is_relevant:
            num_hits += 1
            precision_sum += num_hits / rank
    return precision_sum / len(relevant_positions)

def mean_average_precision(queries):
    """
    Calculate Mean Average Precision (MAP) for a set of queries.
    
    :param queries: A dictionary where each key is a query ID and the value is a list of booleans indicating whether each item is relevant.
    :return: MAP score.
    """
    map_sum = 0.0
    num_queries = 0.0
    for query_id, relevant_positions in queries.items():
        map_sum += average_precision(relevant_positions)
        num_queries += 1
    return map_sum / num_queries

# Example usage
queries = {
    "query1": [True, True, False, True, False], # Relevant items at positions 1, 2, and 4
    "query2": [True, False, True, True, True], # Relevant items at positions 1, 3, 4, and 5
    # Add more queries as needed
}

map_score = mean_average_precision(queries)
print(f"Mean Average Precision (MAP): {map_score}")


Mean Average Precision (MAP): 0.5966666666666667


In [2]:
def mrr(ranks):
    """
    Calculate Mean Reciprocal Rank (MRR) for a list of ranks.
    
    :param ranks: A list of ranks for relevant items.
    :return: MRR score.
    """
    reciprocal_ranks = [1.0 / rank for rank in ranks if rank > 0]
    return sum(reciprocal_ranks) / len(reciprocal_ranks)

# Example usage
ranks = [3, 1, 2] # Relevant items at positions 1, 2, and 3
mrr_score = mrr(ranks)
print(f"Mean Reciprocal Rank (MRR): {mrr_score}")


Mean Reciprocal Rank (MRR): 0.611111111111111


In [3]:
def mrr(recommendations):
    """
    Calculate Mean Reciprocal Rank (MRR) for a list of recommendations.
    
    :param recommendations: A list of lists, where each sublist contains the positions of relevant items for a query.
    :return: MRR score.
    """
    reciprocal_ranks = []
    for positions in recommendations:
        if positions: # Check if there's at least one relevant item
            first_relevant_position = positions[0]
            reciprocal_ranks.append(1.0 / first_relevant_position)
    return sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0.0

# Example usage
recommendations = [
    [3, 1, 2, 4], # Query 1: Relevant items at positions 1, 2, and 3
    [2, 4, 1, 3], # Query 2: Relevant items at positions 1, 2, and 4
    [1, 3, 4, 2], # Query 3: Relevant items at positions 1, 3, and 4
]

mrr_score = mrr(recommendations)
print(f"Mean Reciprocal Rank (MRR): {mrr_score}")


Mean Reciprocal Rank (MRR): 0.611111111111111


In [4]:
def average_precision(relevant_positions):
    """
    Calculate Average Precision (AP) for a single query.
    
    :param relevant_positions: A list of positions of relevant items for a single query.
    :return: Average Precision for the query.
    """
    if not relevant_positions:
        return 0.0
    precision_sum = 0.0
    num_hits = 0.0
    for rank, is_relevant in enumerate(relevant_positions, start=1):
        if is_relevant:
            num_hits += 1
            precision_sum += num_hits / rank
    return precision_sum / len(relevant_positions)

def mean_average_precision(recommendations):
    """
    Calculate Mean Average Precision (MAP) for a list of recommendations.
    
    :param recommendations: A list of lists, where each sublist contains the positions of relevant items for a query.
    :return: MAP score.
    """
    map_sum = 0.0
    num_queries = 0.0
    for positions in recommendations:
        map_sum += average_precision(positions)
        num_queries += 1
    return map_sum / num_queries if num_queries else 0.0

# Example usage
recommendations = [
    [3, 1, 2, 4], # Query 1: Relevant items at positions 1, 2, and 3
    [2, 4, 1, 3], # Query 2: Relevant items at positions 1, 2, and 4
    [1, 3, 4, 2], # Query 3: Relevant items at positions 1, 3, and 4
]

map_score = mean_average_precision(recommendations)
print(f"Mean Average Precision (MAP): {map_score}")


Mean Average Precision (MAP): 1.0


In [5]:
# Relevant results for query #1, #2, and #3
actual_relevant = [
    [2, 4, 5, 7],
    [1, 4, 5, 7],
    [5, 8]
]

# Number of queries
Q = len(actual_relevant)

# Calculate the reciprocal of the first actual relevant rank
cumulative_reciprocal = 0
for i in range(Q):
    first_result = actual_relevant[i][0]
    reciprocal = 1 / first_result
    cumulative_reciprocal += reciprocal
    print(f"query #{i+1} = 1/{first_result} = {reciprocal}")

# Calculate MRR
mrr = 1/Q * cumulative_reciprocal

# Generate results
print("MRR =", round(mrr, 2))


query #1 = 1/2 = 0.5
query #2 = 1/1 = 1.0
query #3 = 1/5 = 0.2
MRR = 0.57


In [6]:
def average_precision(relevant_positions):
    """
    Calculate Average Precision (AP) for a single query.
    
    :param relevant_positions: A list of positions of relevant items for a single query.
    :return: Average Precision for the query.
    """
    if not relevant_positions:
        return 0.0
    precision_sum = 0.0
    num_hits = 0.0
    for rank, is_relevant in enumerate(relevant_positions, start=1):
        if is_relevant:
            num_hits += 1
            precision_sum += num_hits / rank
    return precision_sum / len(relevant_positions)

def mean_average_precision(recommendations):
    """
    Calculate Mean Average Precision (MAP) for a list of recommendations.
    
    :param recommendations: A list of lists, where each sublist contains the positions of relevant items for a query.
    :return: MAP score.
    """
    map_sum = 0.0
    num_queries = 0.0
    for positions in recommendations:
        map_sum += average_precision(positions)
        num_queries += 1
    return map_sum / num_queries if num_queries else 0.0

# Example usage
recommendations = [
    [3, 1, 2, 4], # Query 1: Relevant items at positions 1, 2, and 3
    [2, 4, 1, 3], # Query 2: Relevant items at positions 1, 2, and 4
    [1, 3, 4, 2], # Query 3: Relevant items at positions 1, 3, and 4
]

map_score = mean_average_precision(recommendations)
print(f"Mean Average Precision (MAP): {map_score}")


Mean Average Precision (MAP): 1.0


In [7]:
pip install rank-eval


Collecting rank-eval
  Downloading rank_eval-0.1.3-py3-none-any.whl.metadata (6.8 kB)
Collecting numba>=0.54.1 (from rank-eval)
  Downloading numba-0.59.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.7 kB)
Collecting tabulate (from rank-eval)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting llvmlite<0.43,>=0.42.0dev0 (from numba>=0.54.1->rank-eval)
  Downloading llvmlite-0.42.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (4.8 kB)
Downloading rank_eval-0.1.3-py3-none-any.whl (17 kB)
Downloading numba-0.59.0-cp311-cp311-macosx_11_0_arm64.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Downloading llvmlite-0.42.0-cp311-cp311-macosx_11_0_arm64.whl (28.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.8/28.8 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling c

In [11]:
from rank_eval import Qrels, Run, evaluate

qrels = Qrels()
qrels.add_multi(
    q_ids=["q_1", "q_2"],
    doc_ids=[
        ["doc_12", "doc_25"],  # q_1 relevant documents
        ["doc_11", "doc_2"],  # q_2 relevant documents
    ],
    scores=[
        [5, 3],  # q_1 relevance judgements
        [6, 1],  # q_2 relevance judgements
    ],
)

run = Run()
run.add_multi(
    q_ids=["q_1", "q_2"],
    doc_ids=[
        ["doc_12", "doc_23", "doc_25", "doc_36", "doc_32", "doc_35"],
        ["doc_12", "doc_11", "doc_25", "doc_36", "doc_2",  "doc_35"],
    ],
    scores=[
        [0.9, 0.8, 0.7, 0.6, 0.5, 0.4],
        [0.9, 0.8, 0.7, 0.6, 0.5, 0.4],
    ],
)

In [12]:
# Compute score for a single metric
evaluate(qrels, run, "ndcg@5")

0.7861261099276952

In [13]:
# Compute scores for multiple metrics at once
evaluate(qrels, run, ["map@5", "mrr"])

{'map@5': 0.6416666666666666, 'mrr': 0.75}

In [14]:
# Computed metric scores are saved in the Run object
run.mean_scores

{'ndcg@5': 0.7861261099276952, 'map@5': 0.6416666666666666, 'mrr': 0.75}

In [15]:

# Access scores for each query
dict(run.scores)

{'ndcg@5': {'q_1': 0.9430144683295216, 'q_2': 0.6292377515258687},
 'map@5': {'q_1': 0.8333333333333333, 'q_2': 0.45},
 'mrr': {'q_1': 1.0, 'q_2': 0.5}}

In [18]:
pip install FlagEmbedding

Collecting FlagEmbedding
  Downloading FlagEmbedding-1.2.5.tar.gz (37 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting datasets (from FlagEmbedding)
  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting accelerate>=0.20.1 (from FlagEmbedding)
  Downloading accelerate-0.27.2-py3-none-any.whl.metadata (18 kB)
Collecting pyarrow-hotfix (from datasets->FlagEmbedding)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->FlagEmbedding)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->FlagEmbedding)
  Downloading xxhash-3.4.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess (from datasets->FlagEmbedding)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m16.4 MB/s[0m et

In [19]:
from FlagEmbedding import FlagReranker

# Initialize the reranker with a specified model
reranker = FlagReranker('BAAI/bge-reranker-large', use_fp16=True)

# Define query-passage pairs
pairs = [
    ['who is Isaac Newton?', 'Newton is known for his work on gravity'],
    ['who is Isaac Newton?', 'Isaac Newton was an English mathematician, physicist, and astronomer, widely recognized as one of the most influential scientists of all time.']
]

# Compute relevance scores for the defined pairs
scores = reranker.compute_score(pairs)
print(scores)

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

[-1.22265625, 7.8984375]


In [20]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the tokenizer and model for reranking
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-large')
model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-large')
model.eval()

# Define query-passage pairs
pairs = [
    ['who is Isaac Newton?', 'Newton is known for his work on gravity'],
    ['who is Isaac Newton?', 'Isaac Newton was an English mathematician, physicist, and astronomer, widely recognized as one of the most influential scientists of all time.']
]

# Compute relevance scores using the transformer model
with torch.no_grad():
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
    scores = model(**inputs, return_dict=True).logits.view(-1, ).float()
    print(scores)

tensor([-1.2149,  7.8965])


In [21]:
pip install llama_index

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting llama_index
  Downloading llama_index-0.10.16-py3-none-any.whl.metadata (8.8 kB)
Collecting llama-index-agent-openai<0.2.0,>=0.1.4 (from llama_index)
  Downloading llama_index_agent_openai-0.1.5-py3-none-any.whl.metadata (695 bytes)
Collecting llama-index-cli<0.2.0,>=0.1.2 (from llama_index)
  Downloading llama_index_cli-0.1.7-py3-none-any.whl.metadata (1.6 kB)
Collecting llama-index-core<0.11.0,>=0.10.16 (from llama_index)
  Downloading llama_index_core-0.10.16.post1-py3-none-any.whl.metadata (3.6 kB)
Collecting llama-index-embeddings-openai<0.2.0,>=0.1.5 (from llama_index)
  Downloading llama_index_embeddings_openai-0.1.6-py3-none-any.whl.metadata (654 bytes)
Collecting llama-index-indices-managed-llama-cloud<0.2.0,>=0.1.2 (from llama_index)
  Downloading llama_index_indices_managed_llama_cloud-0.1.3-py3-none-any.whl.metadata (3.9 kB)
Collecting llama-index-legacy<0.10.0,>=0.9.48 (from llama_index)
  Downloading llama_index_legacy-0.9.48-py3-none-any.whl.metadata (8.5 kB)


In [23]:
pip install llama-index-postprocessor-flag-embedding-reranker

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting llama-index-postprocessor-flag-embedding-reranker
  Downloading llama_index_postprocessor_flag_embedding_reranker-0.1.2-py3-none-any.whl.metadata (714 bytes)
Downloading llama_index_postprocessor_flag_embedding_reranker-0.1.2-py3-none-any.whl (2.9 kB)
Installing collected packages: llama-index-postprocessor-flag-embedding-reranker
Successfully installed llama-index-postprocessor-flag-embedding-reranker-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [25]:
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker
from llama_index.core.schema import NodeWithScore, QueryBundle, TextNode

# Step 1: Import and Initialize
reranker = FlagEmbeddingReranker(
    top_n=3,
    model="BAAI/bge-reranker-large",
    use_fp16=False
)

# Step 2: Prepare Nodes and Query
documents = [
    "Deep Learning is a subset of machine learning that focuses on learning representations of data.",
    "Convolutional Neural Networks (CNNs) are commonly used in computer vision tasks.",
    "Recurrent Neural Networks (RNNs) are often used in sequential data analysis, such as natural language processing.",
    "Transformers have gained popularity in NLP tasks due to their ability to handle long-range dependencies efficiently."
]

nodes = [NodeWithScore(node=TextNode(text=doc)) for doc in documents]
query = "What are the different types of neural networks?"

# Step 3: Re-Rank Nodes
query_bundle = QueryBundle(query_str=query)
ranked_nodes = reranker._postprocess_nodes(nodes, query_bundle)

# Step 4: Analyze Results
for node in ranked_nodes:
    print(node.node.get_content(), "-> Score:", node.score)


Convolutional Neural Networks (CNNs) are commonly used in computer vision tasks. -> Score: 4.203321933746338
Recurrent Neural Networks (RNNs) are often used in sequential data analysis, such as natural language processing. -> Score: 3.46016788482666
Transformers have gained popularity in NLP tasks due to their ability to handle long-range dependencies efficiently. -> Score: -0.8046585917472839
